# Import packages

In [1]:
# ! pip install xgboost # Uncomment this line if you haven't installed xgboost 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score, classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import xgboost as xgb
import re

# Prepare data

In [2]:
# Read financial data we generated in file 1. 
financial_data = pd.read_csv("yfinance_2016_2019.csv")
financial_data.Date = pd.to_datetime(financial_data.Date)

In [3]:
# Merge with sentiment score
sentiment = pd.read_csv("date_and_score.csv")
sentiment.Date = pd.to_datetime(sentiment.Date)

merged_df = pd.merge(financial_data, sentiment, on=["Date"])

# output_file_name = "yfinance_2016_2019_sentiment.csv"   # ** Uncomment this line ONLY when you wish to write the file.
# merged_df.to_csv(output_file_name, index=False)

In [4]:
df = pd.read_csv("yfinance_2016_2019_sentiment.csv")

# Set the label 
we're going to predict: whether tomorrow's  price will be higher 

In [5]:
label = df.btc_change_tmr

# Define a function for assessing model performance

In [6]:
# Define a function to evaluate model performance 
def model_performance(true, pred, name=None, print_=False, confusion=False):
    accuracy = accuracy_score(true, pred)
    precision = precision_score(true, pred)
    f1 = f1_score(true, pred)
    recall = recall_score(true, pred)
    roc_auc = roc_auc_score(true, pred)
    
    if print_:
        print('Accuracy : ', accuracy)
        print('Precision : ', precision)
        print('F1 Score : ', f1)
        print('Recall : ', recall)
        print('ROC AUC : ', roc_auc)


    if confusion:
        fig, ax = plot_confusion_matrix(conf_mat=confusion_matrix(true, pred))
        plt.show()

    return [name, accuracy, precision, f1, recall, roc_auc]

# PREDICTION WITH SENTIMENT ANALYSIS SCORE 

## Select features to use for prediction (asset change data plus sentiment analysis score)

In [7]:
features = df[['btc_change_ytd', 'oil_change_ytd', 
                   'ethereum_change_ytd', 'euro_change_ytd', 
                   'gold_change_ytd', 'tether_change_ytd', 
                   'dow_change_ytd', 'sp_change_ytd', 
                   'nasdaq_change_ytd','tenyear_change_ytd','score']]

## Split test and train data

In [8]:
# Split the data into train (80%) and test (20%) set.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=1)

## Model 1: random forest 

In [9]:
# Baseline model: RF
rf = RandomForestClassifier(n_estimators=100, min_samples_split=50, random_state=1).fit(X_train,y_train)

In [10]:
preds_rf = pd.Series(rf.predict(X_test), index=y_test.index)
perf_rf = model_performance(y_test, preds_rf, name='Random Forest', print_=True)

Accuracy :  0.512280701754386
Precision :  0.5387931034482759
F1 Score :  0.6426735218508999
Recall :  0.7961783439490446
ROC AUC :  0.4801204219745223


In [11]:
# Report model performance in a table 
columns = ['Model', 'Accuracy', 'Precision', 'F1_Score', 'Recall', 'AUC']
model_metrics = []

model_metrics.append(perf_rf)
results_df = pd.DataFrame(columns=columns, data=model_metrics)
results_df.head()

Unnamed: 0,Model,Accuracy,Precision,F1_Score,Recall,AUC
0,Random Forest,0.512281,0.538793,0.642674,0.796178,0.48012


## Model 2: xgboost

In [12]:
# create model instance
bst = xgb.XGBClassifier(random_state=1)
# fit model
bst.fit(X_train, y_train)
# make predictions
preds_bst = bst.predict(X_test)
preds_bst = pd.Series(preds_bst, index=y_test.index, name="predictions")

In [13]:
xgboost_perf = model_performance(y_test, preds_bst, name='XGBoost', print_=True)

model_metrics.append(xgboost_perf)
results_df = pd.DataFrame(columns=columns, data=model_metrics)
results_df.head()

Accuracy :  0.512280701754386
Precision :  0.5511363636363636
F1 Score :  0.5825825825825827
Recall :  0.6178343949044586
ROC AUC :  0.5003234474522293


Unnamed: 0,Model,Accuracy,Precision,F1_Score,Recall,AUC
0,Random Forest,0.512281,0.538793,0.642674,0.796178,0.48012
1,XGBoost,0.512281,0.551136,0.582583,0.617834,0.500323


# Model 3: random forest w/ parameter optimization

In [14]:
rf = RandomForestClassifier(random_state = 42)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [15]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [16]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state = 42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [17]:
best_rf = rf_random.best_estimator_.fit(X_train, y_train)

preds_best_rf = best_rf.predict(X_test)

preds_best_rf = pd.Series(preds_best_rf, index=y_test.index, name="predictions")

best_rf_perf = model_performance(y_test, preds_best_rf, name='Best RF', print_=True)

model_metrics.append(best_rf_perf)
results_df = pd.DataFrame(columns=columns, data=model_metrics)
results_df.head()

Accuracy :  0.512280701754386
Precision :  0.5428571428571428
F1 Score :  0.6212534059945504
Recall :  0.7261146496815286
ROC AUC :  0.4880573248407643


Unnamed: 0,Model,Accuracy,Precision,F1_Score,Recall,AUC
0,Random Forest,0.512281,0.538793,0.642674,0.796178,0.48012
1,XGBoost,0.512281,0.551136,0.582583,0.617834,0.500323
2,Best RF,0.512281,0.542857,0.621253,0.726115,0.488057
