In [25]:
import wandb
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import datetime
pio.templates.default = 'plotly_dark'

In [11]:
def score_metrics(y_true, y_pred):
    MAE = mean_absolute_error(y_true=y_true, y_pred=y_pred)
    MSE = mean_squared_error(y_true=y_true, y_pred=y_pred)
    R2 = r2_score(y_true=y_true, y_pred=y_pred)
    RMSE = np.sqrt(MSE)
    return {
        'mae': MAE,
        'rmse': RMSE,
        'r2': R2
    }

In [12]:
df = pd.read_csv('../data/processed/eth_hourly.csv')
df['TimeStamp'] = pd.to_datetime(df['TimeStamp'])
df.sort_values(by='TimeStamp', ascending=True, inplace=True)
final_test = df[df['TimeStamp'] > datetime.datetime(year=2021, month=5, day=1)]
df.drop(final_test.index, axis=0, inplace=True)
df.drop('TimeStamp', axis=1, inplace=True)
df.head()

Unnamed: 0,open,high,low,CurrentClose,Volume_USD,NextClose
0,733.12,736.48,731.19,733.04,4246576.84,734.64
1,733.04,735.99,731.7,734.64,2044880.32,731.32
2,734.64,734.65,722.0,731.32,7891317.14,728.44
3,731.32,732.0,728.44,728.44,2111099.12,735.21
4,728.44,739.3,725.52,735.21,7197617.75,732.1


In [13]:
df['PriceIncreased'] = df['NextClose'] > df['CurrentClose']
df['PriceIncreased'] = df['PriceIncreased'].astype(int)
df.drop('NextClose', axis=1, inplace=True)

In [14]:
X = df.drop('PriceIncreased', axis=1)
y = df['PriceIncreased']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [23]:
wandb.init(project="ETH-Price")
model = LogisticRegression()
model.fit(X_train, y_train)
labels = ['not_increased', 'increased']
test_preds = model.predict(X_test)
test_probas = model.predict_proba(X_test)

wandb.sklearn.plot_summary_metrics(model=model, X=X_train, y=y_train, X_test=X_test, y_test=y_test)
wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test, test_preds, test_probas, labels)
wandb.sklearn.plot_confusion_matrix(y_true=y_test, y_pred=test_preds, labels=labels)
wandb.log({'model_name': 'LogisticRegression'})
wandb.finish()

VBox(children=(Label(value=' 0.47MB of 0.47MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_runtime,19
_timestamp,1624364848
_step,8


0,1
_runtime,▁▂▃▄▅▅▆▇█
_timestamp,▁▂▃▄▅▅▆▇█
_step,▁▂▃▄▅▅▆▇█


  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Classifier.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


VBox(children=(Label(value=' 0.47MB of 0.47MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99959237394…

0,1
_runtime,24
_timestamp,1624364920
_step,10
model_name,LogisticRegression


0,1
_runtime,▁▂▃▄▄▄▅▇▇██
_timestamp,▁▂▃▄▄▄▅▇▇██
_step,▁▂▂▃▄▅▅▆▇▇█


In [24]:
wandb.init(project="ETH-Price")
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
labels = ['not_increased', 'increased']
test_preds = model.predict(X_test)
test_probas = model.predict_proba(X_test)

wandb.sklearn.plot_summary_metrics(model=model, X=X_train, y=y_train, X_test=X_test, y_test=y_test)
wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test, test_preds, test_probas, labels)
wandb.sklearn.plot_confusion_matrix(y_true=y_test, y_pred=test_preds, labels=labels)
wandb.log({'model_name': 'GradientBoostingClassifier'})
wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Classifier.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99832902809…

0,1
_runtime,56
_timestamp,1624365050
_step,10
model_name,GradientBoostingClas...


0,1
_runtime,▁▁▅▅▆▆▇▇███
_timestamp,▁▁▅▅▆▆▇▇███
_step,▁▂▂▃▄▅▅▆▇▇█


In [26]:
wandb.init(project="ETH-Price")
model = RandomForestClassifier()
model.fit(X_train, y_train)
labels = ['not_increased', 'increased']
test_preds = model.predict(X_test)
test_probas = model.predict_proba(X_test)

wandb.sklearn.plot_summary_metrics(model=model, X=X_train, y=y_train, X_test=X_test, y_test=y_test)
wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test, test_preds, test_probas, labels)
wandb.sklearn.plot_confusion_matrix(y_true=y_test, y_pred=test_preds, labels=labels)
wandb.log({'model_name': 'RandomForestClassifier'})
wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Classifier.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_runtime,108
_timestamp,1624365161
_step,10
model_name,RandomForestClassifi...


0,1
_runtime,▁▁▆▆▇▇▇████
_timestamp,▁▁▆▆▇▇▇████
_step,▁▂▂▃▄▅▅▆▇▇█


## Choose best model

In [32]:
model = RandomForestClassifier(n_estimators=25, max_depth=30)
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

print(classification_report(y_true=y_train, y_pred=train_preds))
print(classification_report(y_true=y_test, y_pred=test_preds))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      9098
           1       0.97      0.95      0.96      9076

    accuracy                           0.96     18174
   macro avg       0.96      0.96      0.96     18174
weighted avg       0.96      0.96      0.96     18174

              precision    recall  f1-score   support

           0       0.50      0.53      0.51      3893
           1       0.50      0.48      0.49      3896

    accuracy                           0.50      7789
   macro avg       0.50      0.50      0.50      7789
weighted avg       0.50      0.50      0.50      7789

