Outline
1. Load and insepct data
2. Select predictors
3. Reconstruct lagged dataset


*   Add new columns to the raw data frame


*   Only keep relevant columns and create a smaller dataframe


*   L = 3, 7


4. Train/test split


*   80/20 rule
*   Create X_train, Y_train, X_test, Y_test


5. Modeling

*   SFS search and select features--it doesn't produce a model. SFS object contains: best features + MSE score
*   sfs.k_feature_names_, sfs.k_score_
*   feature selection only happens on training data

  *   Liner Regression
  *   Ridge Regression: must standardize feature first. This makes all predictors comparable in scale, so Ridge’s penalty treats them fairly.

  *   Lasso Regression

6. Evaluation

L = 3

In [5]:
import pandas as pd
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import mean_squared_error
import numpy as np




df = pd.read_csv("bitcoin.csv")

# create lag features. L = 3
# btc_trade_volume
df['btc_trade_volume_lag1'] = df['btc_trade_volume'].shift(1)
df['btc_trade_volume_lag2'] = df['btc_trade_volume'].shift(2)
df['btc_trade_volume_lag3'] = df['btc_trade_volume'].shift(3)

# btc_n_transactions
df['btc_n_transactions_lag1'] = df['btc_n_transactions'].shift(1)
df['btc_n_transactions_lag2'] = df['btc_n_transactions'].shift(2)
df['btc_n_transactions_lag3'] = df['btc_n_transactions'].shift(3)

# btc_estimated_transaction_volume
df['btc_estimated_transaction_volume_usd_lag1'] = df['btc_estimated_transaction_volume_usd'].shift(1)
df['btc_estimated_transaction_volume_usd_lag2'] = df['btc_estimated_transaction_volume_usd'].shift(2)
df['btc_estimated_transaction_volume_usd_lag3'] = df['btc_estimated_transaction_volume_usd'].shift(3)

# btc_output_volume
df['btc_output_volume_lag1'] = df['btc_output_volume'].shift(1)
df['btc_output_volume_lag2'] = df['btc_output_volume'].shift(2)
df['btc_output_volume_lag3'] = df['btc_output_volume'].shift(3)


# print(len(df))= 2920
# create a list of selected features
cols = ['Date',
    'btc_trade_volume_lag1', 'btc_trade_volume_lag2', 'btc_trade_volume_lag3',
    'btc_n_transactions_lag1', 'btc_n_transactions_lag2', 'btc_n_transactions_lag3',
    'btc_estimated_transaction_volume_usd_lag1', 'btc_estimated_transaction_volume_usd_lag2', 'btc_estimated_transaction_volume_usd_lag3',
    'btc_output_volume_lag1', 'btc_output_volume_lag2', 'btc_output_volume_lag3',
    'btc_market_price'

]
cols_feature = [
    'btc_trade_volume_lag1', 'btc_trade_volume_lag2', 'btc_trade_volume_lag3',
    'btc_n_transactions_lag1', 'btc_n_transactions_lag2', 'btc_n_transactions_lag3',
    'btc_estimated_transaction_volume_usd_lag1', 'btc_estimated_transaction_volume_usd_lag2', 'btc_estimated_transaction_volume_usd_lag3',
    'btc_output_volume_lag1', 'btc_output_volume_lag2', 'btc_output_volume_lag3'

]
df_model = df[cols]
df_model = df[cols].copy()
df_model = df_model.ffill()
df_model = df_model.iloc[3:]
print('df_model_len', len(df_model))


# split train/test dataset 80/20 rule
training_set = df_model[:2333]
test_set = df_model[2333:]

X_train = training_set[cols_feature]
Y_train = training_set['btc_market_price']


X_test = test_set[cols_feature]
Y_test = test_set['btc_market_price']
avg_price = Y_test.mean()
print("avg_Y:", avg_price)

# Compute mean of training target
avg_train_price = Y_train.mean()

# Predict that same value for every test row
y_pred_baseline = [avg_train_price] * len(Y_test)


baseline_rmse = np.sqrt(mean_squared_error(Y_test, y_pred_baseline))
print("Baseline RMSE:", baseline_rmse)

# Model selection: Regression model: rmse 1400.88
model = LinearRegression()

# SFS: indices/names of the best features + MSE score
sfs = SFS(estimator = model,
          k_features = (1, 12),
          forward = True,
          floating = True,
          scoring = 'neg_mean_squared_error',
          cv = 0)

sfs = sfs.fit(X_train, Y_train)
# print(sfs.k_feature_names_)
# ('btc_trade_volume_lag1', 'btc_trade_volume_lag2', 'btc_trade_volume_lag3', 'btc_n_transactions_lag1', 'btc_n_transactions_lag2', 'btc_n_transactions_lag3', 'btc_estimated_transaction_volume_usd_lag1', 'btc_estimated_transaction_volume_usd_lag2', 'btc_estimated_transaction_volume_usd_lag3', 'btc_output_volume_lag1', 'btc_output_volume_lag2', 'btc_output_volume_lag3')
# print(sfs.k_score_)
# -18406.278969987245

# train model, .fit() updates in place
model.fit(X_train, Y_train)
# predict test model
Y_pred = model.predict(X_test)


rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
print('linear regression',rmse) # 1400.8817946211177

# Model selection: Ridge regression RMSE: 1461
# Create the scaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
ridge = RidgeCV(alphas = alphas, scoring = 'neg_mean_squared_error')
ridge.fit(X_train_scaled, Y_train)
y_pred_ridge = ridge.predict(X_test_scaled)
rmse_ridge = np.sqrt(mean_squared_error(Y_test, y_pred_ridge))
print("Ridge rmse", rmse_ridge)
print("Best alpha", ridge.alpha_)

# Model selection: Lasso regression RMSE: 2233.9,
# Uses built-in cross-validation to automatically search α.
lasso = LassoCV(alphas=None, cv=5, max_iter = 10000)
lasso.fit(X_train_scaled, Y_train)
y_pred_lasso = lasso.predict(X_test_scaled)
rmse_lasso = np.sqrt(mean_squared_error(Y_test, y_pred_lasso))
print("RMSE Lasso", rmse_lasso)
print("Best alpha for lasso", lasso.alpha_)



df_model_len 2917
avg_Y: 3699.163465893723
Baseline RMSE: 5570.131857095667
linear regression 1384.6128707518333
Ridge rmse 1443.43137231305
Best alpha 100.0
RMSE Lasso 2227.6616751890497
Best alpha for lasso 41.28128541531293


L = 7

In [6]:
# create lag feature. L = 7

# btc_trade_volume
df['btc_trade_volume_lag4'] = df['btc_trade_volume'].shift(4)
df['btc_trade_volume_lag5'] = df['btc_trade_volume'].shift(5)
df['btc_trade_volume_lag6'] = df['btc_trade_volume'].shift(6)
df['btc_trade_volume_lag7'] = df['btc_trade_volume'].shift(7)

# btc_n_transactions
df['btc_n_transactions_lag4'] = df['btc_n_transactions'].shift(4)
df['btc_n_transactions_lag5'] = df['btc_n_transactions'].shift(5)
df['btc_n_transactions_lag6'] = df['btc_n_transactions'].shift(6)
df['btc_n_transactions_lag7'] = df['btc_n_transactions'].shift(7)


# btc_estimated_transaction_volume
df['btc_estimated_transaction_volume_usd_lag4'] = df['btc_estimated_transaction_volume_usd'].shift(4)
df['btc_estimated_transaction_volume_usd_lag5'] = df['btc_estimated_transaction_volume_usd'].shift(5)
df['btc_estimated_transaction_volume_usd_lag6'] = df['btc_estimated_transaction_volume_usd'].shift(6)
df['btc_estimated_transaction_volume_usd_lag7'] = df['btc_estimated_transaction_volume_usd'].shift(7)

# btc_output_volume
df['btc_output_volume_lag4'] = df['btc_output_volume'].shift(4)
df['btc_output_volume_lag5'] = df['btc_output_volume'].shift(5)
df['btc_output_volume_lag6'] = df['btc_output_volume'].shift(6)
df['btc_output_volume_lag7'] = df['btc_output_volume'].shift(7)

cols_l7 = ['btc_trade_volume_lag1', 'btc_trade_volume_lag2', 'btc_trade_volume_lag3','btc_trade_volume_lag4','btc_trade_volume_lag5', 'btc_trade_volume_lag6','btc_trade_volume_lag7',
    'btc_n_transactions_lag1', 'btc_n_transactions_lag2', 'btc_n_transactions_lag3','btc_n_transactions_lag4', 'btc_n_transactions_lag5', 'btc_n_transactions_lag6', 'btc_n_transactions_lag7',
    'btc_estimated_transaction_volume_usd_lag1', 'btc_estimated_transaction_volume_usd_lag2', 'btc_estimated_transaction_volume_usd_lag3','btc_estimated_transaction_volume_usd_lag4', 'btc_estimated_transaction_volume_usd_lag5', 'btc_estimated_transaction_volume_usd_lag6', 'btc_estimated_transaction_volume_usd_lag7',
    'btc_output_volume_lag1', 'btc_output_volume_lag2', 'btc_output_volume_lag3','btc_output_volume_lag4', 'btc_output_volume_lag5','btc_output_volume_lag6','btc_output_volume_lag7',
    'btc_market_price'
]

cols_l7_features = ['btc_trade_volume_lag1', 'btc_trade_volume_lag2', 'btc_trade_volume_lag3','btc_trade_volume_lag4','btc_trade_volume_lag5', 'btc_trade_volume_lag6','btc_trade_volume_lag7',
    'btc_n_transactions_lag1', 'btc_n_transactions_lag2', 'btc_n_transactions_lag3','btc_n_transactions_lag4', 'btc_n_transactions_lag5', 'btc_n_transactions_lag6', 'btc_n_transactions_lag7',
    'btc_estimated_transaction_volume_usd_lag1', 'btc_estimated_transaction_volume_usd_lag2', 'btc_estimated_transaction_volume_usd_lag3','btc_estimated_transaction_volume_usd_lag4', 'btc_estimated_transaction_volume_usd_lag5', 'btc_estimated_transaction_volume_usd_lag6', 'btc_estimated_transaction_volume_usd_lag7',
    'btc_output_volume_lag1', 'btc_output_volume_lag2', 'btc_output_volume_lag3','btc_output_volume_lag4', 'btc_output_volume_lag5','btc_output_volume_lag6','btc_output_volume_lag7'
]

df_model_l7 = df[cols_l7]
df_model_l7 = df[cols_l7].copy()
df_model_l7 = df_model_l7.ffill()
df_model_l7 = df_model_l7.iloc[7:]
print('l7 len', len(df_model_l7))

# split train/test dataset
l7_training_set = df_model_l7[:2330]
l7_test_set = df_model_l7[2330:]

l7_X_train = l7_training_set[cols_l7_features]
l7_Y_train = l7_training_set['btc_market_price']

l7_X_test = l7_test_set[cols_l7_features]
l7_Y_test = l7_test_set['btc_market_price']
l7_avg_price = l7_Y_test.mean()
print("ly_avg_Y:", l7_avg_price)

# Model selection: linear regression rmse: 978.4579140290379
l7_model = LinearRegression()

# SFS: indices/names of the best features + MSE score
l7_sfs = SFS(estimator = l7_model,
          k_features = (1, 12),
          forward = True,
          floating = True,
          scoring = 'neg_mean_squared_error',
          cv = 0)

l7_sfs = l7_sfs.fit(l7_X_train, l7_Y_train)

# train model, .fit() updates in place
l7_model.fit(l7_X_train, l7_Y_train)
# predict test model
l7_Y_pred = l7_model.predict(l7_X_test)

l7_rmse = np.sqrt(mean_squared_error(l7_Y_test, l7_Y_pred))
print('l7_linear regression',l7_rmse) # 1380.522304285302

# Model selection: Ridge regression rmse: 1150.732401497503

# Apply the scaler

scaler.fit(l7_X_train)
l7_X_train_scaled = scaler.transform(l7_X_train)
l7_X_test_scaled = scaler.transform(l7_X_test)


alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
l7_ridge = RidgeCV(alphas = alphas, scoring = 'neg_mean_squared_error')
l7_ridge.fit(l7_X_train_scaled, l7_Y_train)
l7_y_pred_ridge = l7_ridge.predict(l7_X_test_scaled)
l7_rmse_ridge = np.sqrt(mean_squared_error(l7_Y_test, l7_y_pred_ridge))
print("l7 Ridge rmse", l7_rmse_ridge)
print("Best alpha", l7_ridge.alpha_)


# Model selection: Lasso regression RMSE: 2191.4097959377427
# Uses built-in cross-validation to automatically search α.
l7_lasso = LassoCV(alphas=None, cv=5, max_iter = 10000)
l7_lasso.fit(l7_X_train_scaled, l7_Y_train)
l7_y_pred_lasso = l7_lasso.predict(l7_X_test_scaled)
l7_rmse_lasso = np.sqrt(mean_squared_error(l7_Y_test, l7_y_pred_lasso))
print("RMSE Lasso", l7_rmse_lasso)
print("Best alpha for lasso", l7_lasso.alpha_)


l7 len 2913
ly_avg_Y: 3704.3499228678115
l7_linear regression 978.4579140290379
l7 Ridge rmse 1150.732401497503
Best alpha 1000.0
RMSE Lasso 2191.4097959377427
Best alpha for lasso 47.732381857618385
