In [1]:
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, BayesianRidge
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import datetime
import lightgbm as lgb
pio.templates.default = 'plotly_dark'

In [2]:
def score_metrics(y_true, y_pred):
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mean_squared_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred)
    }

In [3]:
df = pd.read_csv('../data/processed/processed_data.csv')
df['TimeStamp'] = pd.to_datetime(df['TimeStamp'])
df.sort_values(by='TimeStamp', ascending=True, inplace=True)
final_test = df[df['TimeStamp'] > datetime.datetime(year=2021, month=6, day=14)]
df.drop(final_test.index, axis=0, inplace=True)
df.drop('TimeStamp', axis=1, inplace=True)
df.head()

Unnamed: 0,open,high,low,CurrentClose,Volume_ETH,NextClose
28137,733.04,735.99,731.7,734.64,2785.61,733.04
28136,734.64,734.65,722.0,731.32,10826.46,734.64
28135,731.32,732.0,728.44,728.44,2889.59,731.32
28134,728.44,739.3,725.52,735.21,9822.41,728.44
28133,735.21,736.84,730.0,732.1,5581.58,735.21


In [4]:
X = df.drop('NextClose', axis=1)
y = df['NextClose']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [5]:
model = LinearRegression()
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 0.09320341060899731, 'mse': 0.12187958299040064, 'rmse': 0.3491125649277044, 'r2': 0.9999997524544006}
test_metrics: {'mae': 0.09801295080674373, 'mse': 0.13248641993754945, 'rmse': 0.36398684033567674, 'r2': 0.999999736813406}


In [6]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 2.4126028772461416, 'mse': 16.932531188803853, 'rmse': 4.114915696439461, 'r2': 0.9999656088946205}
test_metrics: {'mae': 2.5745804060781516, 'mse': 20.060387600578355, 'rmse': 4.478882405308087, 'r2': 0.9999601496886279}


In [7]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 0.08973252035772866, 'mse': 0.21248676247699178, 'rmse': 0.4609628645313978, 'r2': 0.9999995684251481}
test_metrics: {'mae': 0.2361413262004218, 'mse': 1.4746862979999906, 'rmse': 1.214366624211976, 'r2': 0.9999970705098365}


In [8]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 3.5692449184226693e-16, 'mse': 2.343891285225206e-29, 'rmse': 4.8413750993134235e-15, 'r2': 1.0}
test_metrics: {'mae': 0.30127116334567045, 'mse': 1.660092449646854, 'rmse': 1.2884457495940036, 'r2': 0.9999967021972684}


In [9]:
model = KNeighborsRegressor()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 9.400229559623835, 'mse': 1248.2252171455693, 'rmse': 35.33023092403401, 'r2': 0.9974647709488034}
test_metrics: {'mae': 12.282691729391313, 'mse': 2879.81954189437, 'rmse': 53.66395011452633, 'r2': 0.9942791880333982}


In [10]:
model = MLPRegressor()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 3.330416479704726, 'mse': 43.928476227208776, 'rmse': 6.627856080755585, 'r2': 0.9999107783214309}
test_metrics: {'mae': 3.341018961454742, 'mse': 41.47090810802172, 'rmse': 6.4397909987841775, 'r2': 0.9999176173145856}


In [11]:
model = Ridge(alpha=.5)
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 0.09320405614749308, 'mse': 0.12187958301117693, 'rmse': 0.3491125649574603, 'r2': 0.9999997524544005}
test_metrics: {'mae': 0.09801366775189113, 'mse': 0.1324867981656294, 'rmse': 0.3639873598981555, 'r2': 0.9999997368126546}


In [12]:
model = ElasticNet()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 0.09072658035919912, 'mse': 0.12234495495916343, 'rmse': 0.3497784369556869, 'r2': 0.999999751509199}
test_metrics: {'mae': 0.09513763230752736, 'mse': 0.13122627101575765, 'rmse': 0.3622516680648381, 'r2': 0.9999997393167139}


In [13]:
model = BayesianRidge()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 0.0932042034055375, 'mse': 0.12187958302173449, 'rmse': 0.34911256497258086, 'r2': 0.9999997524544005}
test_metrics: {'mae': 0.09801383129387642, 'mse': 0.13248688444331358, 'rmse': 0.3639874784155543, 'r2': 0.9999997368124832}


In [14]:
model = HuberRegressor()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

train_metrics: {'mae': 0.1415496184783459, 'mse': 0.1501022800076135, 'rmse': 0.38743035504153966, 'r2': 0.999999695132212}
test_metrics: {'mae': 0.1436957756402744, 'mse': 0.15719290837771774, 'rmse': 0.3964756088055326, 'r2': 0.9999996877335339}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [15]:
model = lgb.LGBMRegressor(
#     num_leaves=31,
#     learning_rate=0.05,
#     n_estimators=20
)

model.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    early_stopping_rounds=5
)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_metrics = score_metrics(y_train, train_preds)
test_metrics = score_metrics(y_test, test_preds)

print(f'train_metrics: {train_metrics}\ntest_metrics: {test_metrics}')

[1]	valid_0's l1: 414.076	valid_0's l2: 408089
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 372.714	valid_0's l2: 330786
[3]	valid_0's l1: 335.496	valid_0's l2: 268247
[4]	valid_0's l1: 301.975	valid_0's l2: 217408
[5]	valid_0's l1: 271.861	valid_0's l2: 176407
[6]	valid_0's l1: 244.723	valid_0's l2: 143066
[7]	valid_0's l1: 220.279	valid_0's l2: 116017
[8]	valid_0's l1: 198.303	valid_0's l2: 94156.4
[9]	valid_0's l1: 178.539	valid_0's l2: 76424.2
[10]	valid_0's l1: 160.732	valid_0's l2: 62022.2
[11]	valid_0's l1: 144.738	valid_0's l2: 50365.1
[12]	valid_0's l1: 130.343	valid_0's l2: 40926.7
[13]	valid_0's l1: 117.387	valid_0's l2: 33258.3
[14]	valid_0's l1: 105.711	valid_0's l2: 27031.2
[15]	valid_0's l1: 95.2035	valid_0's l2: 21987.6
[16]	valid_0's l1: 85.7506	valid_0's l2: 17893.7
[17]	valid_0's l1: 77.2403	valid_0's l2: 14575.9
[18]	valid_0's l1: 69.5885	valid_0's l2: 11883.8
[19]	valid_0's l1: 62.7164	valid_0's l2: 9700.1
[20]	valid_0's l1: 56.52	v

## Choose best model

In [16]:
model = LinearRegression()
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)
train_metrics = score_metrics(y_true=y_train, y_pred=train_preds)
test_metrics = score_metrics(y_true=y_test, y_pred=test_preds)

print(train_metrics)
print(test_metrics)

{'mae': 0.09320341060899731, 'mse': 0.12187958299040064, 'rmse': 0.3491125649277044, 'r2': 0.9999997524544006}
{'mae': 0.09801295080674373, 'mse': 0.13248641993754945, 'rmse': 0.36398684033567674, 'r2': 0.999999736813406}


actuals minus model prediction means that when the result is negative, we overpredicted the price. When it is positive, we under predicted.


So on highly negative residuals, the model believed the price be higher than it actually was. In these scenarios, since the model believed the price would go up, we would have bought with the intention of selling at the higher price.

In [17]:
train_residuals = y_train - train_preds
test_residuals = y_test - test_preds
residuals = pd.concat(
    (train_residuals.reset_index(drop=True), test_residuals.reset_index(drop=True)), 
    axis=1, 
    ignore_index=True
)
residuals.columns = ['train', 'test']

In [18]:
px.box(pd.melt(residuals), x='variable', y='value')

In [19]:
test = X_test.copy()
test['NextClose'] = y_test
test['ModelPrediction'] = test_preds

residuals = []
actions = []
for row in test.iterrows():
    idx, batch = row
    open_, high, low, current_close, volume_eth, next_close, model_prediction = batch
    residuals.append(next_close - model_prediction)
    current_diff = model_prediction - current_close
    action = 'do_nothing'
    if abs(current_diff) > test_metrics['mae']:
        if model_prediction < current_close:
            action = 'sell'
        else:
            action = 'buy'
            
    actions.append(action)
    
test['Residuals'] = residuals
test['action'] = actions

In [20]:
final_preds = model.predict(final_test.drop(['NextClose', 'TimeStamp'], axis=1))
final_test['ModelPrediction'] = final_preds
residuals = []
actions = []
for row in final_test.iterrows():
    idx, batch = row
    _, open_, high, low, current_close, volume_eth, next_close, model_prediction = batch
    residuals.append(next_close - model_prediction)
    current_diff = model_prediction - current_close
    action = 'do_nothing'
    if abs(current_diff) > test_metrics['mae']:
        if model_prediction < current_close:
            action = 'sell'
        else:
            action = 'buy'
            
    actions.append(action)
    
final_test['Residuals'] = residuals
final_test['action'] = actions



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [21]:
px.line(final_test, y='Residuals', color='action')

## Simulating

Say we start with a certain amount of money, and we buy or sell in increments of 10% of our current sum of money. How much money would we net after our final test?

In [22]:
initial_money = 100
percent_of_total_money_to_move = 0.10
eth_wallet_balance = 0.0
total_assets = 0.0
amount_of_eth_to_exchange = 0.0
amount_of_usd_to_exchange = 0.0

trading_history = []
for step, (idx, values) in enumerate(final_test.iterrows()):
    
    if step == 0:
        total_money = initial_money
    
    current_close = values['CurrentClose']
    next_close = values['NextClose']
    model_prediction = values['ModelPrediction']
    residuals = values['Residuals']
    action = values['action']
    action = 'do_nothing'
    if abs(model_prediction - current_close) > (test_metrics['mae'] / 3):
        if model_prediction - current_close > 0:
            action = 'buy'
        else:
            action = 'sell'
    
    amount_of_usd_to_exchange = percent_of_total_money_to_move * total_money
    amount_of_eth_to_exchange = amount_of_usd_to_exchange / current_close
    
    if total_money <= 0:
        break
    else:
        
        if action == 'sell':
            if amount_of_eth_to_exchange > eth_wallet_balance:
                amount_of_eth_to_exchange = eth_wallet_balance

            total_money += amount_of_eth_to_exchange * current_close
            eth_wallet_balance -= amount_of_eth_to_exchange
            assert eth_wallet_balance >= 0, f'eth_wallet_balance is negative: {eth_wallet_balance}'

        elif action == 'buy':
            if amount_of_usd_to_exchange > total_money:
                amount_of_usd_to_exchange = total_money

            total_money -= amount_of_usd_to_exchange
            eth_wallet_balance += amount_of_eth_to_exchange

        else:
            pass
        
        total_assets = eth_wallet_balance * current_close + total_money
    
        actions_this_turn = {
            'action': action,
            'amount_of_eth_to_exchange': amount_of_eth_to_exchange,
            'amount_of_usd_to_exchange': amount_of_usd_to_exchange,
            'eth_wallet_balance': eth_wallet_balance,
            'total_money': total_money,
            'total_assets': total_assets
        }
        trading_history.append(actions_this_turn)
        
trading_history = pd.DataFrame(trading_history)
trading_history.head()

Unnamed: 0,action,amount_of_eth_to_exchange,amount_of_usd_to_exchange,eth_wallet_balance,total_money,total_assets
0,sell,0.0,10.0,0.0,100.0,100.0
1,buy,0.004003,10.0,0.004003,90.0,100.0
2,buy,0.003633,9.0,0.007636,81.0,99.915705
3,sell,0.003254,8.1,0.004381,89.1,100.005805
4,sell,0.003567,8.91,0.000815,98.01,100.045029


In [23]:
trading_history['TimeStamp'] = final_test['TimeStamp'].values

In [24]:
fig = px.line(
    data_frame=trading_history, 
    x='TimeStamp',
    y='total_assets', 
    color='action', 
    title='Total Assets [In USD]'
)
fig.update_layout(
    yaxis_title="Total Assets [$USD]",
    legend_title="Buy or Sell Action", 
    yaxis_tickprefix = '$', 
    yaxis_tickformat = ',.'
)

fig.show()

In [25]:
fig = px.line(
    data_frame=trading_history, 
    x='TimeStamp',
    y='eth_wallet_balance', 
    title='Total ETH '
)
fig.update_layout(
    yaxis_title="Total ETH",
)

fig.show()

In [26]:
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(
        x=trading_history['TimeStamp'], 
        y=trading_history['eth_wallet_balance'], 
        name="ETH Wallet Balance"
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=trading_history['TimeStamp'], 
        y=final_test['CurrentClose'].values, 
        name="ETH Price"
    ),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="ETH Wallet vs Price of ETH",
)

# Set x-axis title
fig.update_xaxes(title_text="TimeStamp")

# Set y-axes titles
fig.update_yaxes(title_text="ETH Wallet Balance", secondary_y=False)
fig.update_yaxes(title_text="Price of ETH [In $USD]", secondary_y=True, )

fig.show()


In [27]:
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(
        x=trading_history['TimeStamp'], 
        y=trading_history['eth_wallet_balance'], 
        name="ETH Wallet Balance"
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=trading_history['TimeStamp'], 
        y=trading_history['total_assets'], 
        name="Total Assets"
    ),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="ETH Wallet vs Total Assets",
)

# Set x-axis title
fig.update_xaxes(title_text="TimeStamp")

# Set y-axes titles
fig.update_yaxes(title_text="ETH Wallet Balance", secondary_y=False)
fig.update_yaxes(title_text="Total Assets [In $USD]", secondary_y=True, )

fig.show()


## Saving Model

In [137]:
from joblib import dump

In [138]:
dump(model, '../models/lgb_model.joblib')

['../models/lgb_model.joblib']