# 3. Financial Model (Daily)
Daniel Ruiz, MSc in Data Science and Business Analytics (DSBA), Bocconi University

Reference codes (alphabetically):
- https://www.dezyre.com/recipes/plot-roc-curve-in-python

## 3.1. Loading packages

In [1]:
# import packages
import csv
import pandas as pd
import numpy as np
import os
import random
import time
from numpy import array, hstack, vstack

# graphs
import matplotlib.pyplot as plt
import seaborn as sns

# classification
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC#, NuSVC, SVC

In [2]:
def return_confusion(company,y_test, predicted_classes):

    # confusion matrix
    cm = confusion_matrix(y_test,predicted_classes)
    TN, FP, FN, TP = cm.flatten()
    total = TN+FP+FN+TP

    # class 1
    prec1 = TP / (TP+FP)
    reca1 = TP / (TP+FN)
    fone1 = 2*(prec1*reca1)/(prec1+reca1)
    # class 0
    prec0 = TN / (TN+FN)
    reca0 = TN / (TN+FP)
    fone0 = 2*(prec0*reca0)/(prec0+reca0)

    # global / weighted
    accuw = TP/total +TN/total
    precw = prec0*(TN+FP)/(total) + prec1*(TP+FN)/(total)
    recaw = reca0*(TN+FP)/(total) + reca1*(TP+FN)/(total)
    fonew = fone0*(TN+FP)/(total) + fone1*(TP+FN)/(total)

    # list
    sup = [company, TN, FP, FN, TP, prec1, reca1, fone1, prec0, reca0, fone0, precw, recaw, fonew, accuw]

    return sup

def plot_roc(company_name,classifier_name,classifier,X_test,y_test,folder='Models_Daily/Shallow/'):

    sns.set()
    
    # predictions
    y_score = classifier.predict_proba(X_test)[:,1]
    fpr, tpr, thold = roc_curve(y_test, y_score)
    
    # plot ROC curves
    plt.subplots(1, figsize=(8,8))
    plt.title('ROC Curve - {} - {} - {:.5}%'.format(company_name,
                                                    classifier_name,
                                                    str(100*roc_auc_score(y_test, y_score))),
             fontweight='bold',fontsize=16)
    plt.plot(fpr, tpr)
    plt.plot([0, 1], ls="--")
    plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    
    # save
    plt.savefig(folder+'ROC_'+company_name+'_'+classifier_name+'.png')
    plt.close()
    
def run_models(folder,variables):
    
    name = 'performance_'+time.strftime('%Y-%m-%d_%H-%M',time.gmtime())
    
    with open(folder+name+'.csv', 'w', encoding="utf-8") as csvFile:

        csvWriter = csv.writer(csvFile)

        for company in my_companies:

            df = pd.read_pickle('Dataset_ToModel_Daily/'+company+'.pkl')

            # select variables
            X = df[variables]

            # standardize (z-score)
            X = (X-X.mean())/X.std()

            #X = array(X)
            y = df['l_close_to_open_lag0']>=0
            #array(y)

            # train, test
            train=int(0.75*len(X))
            X_train, X_test = X[:train], X[train:]
            y_train, y_test = y[:train], y[train:]

            # Regressions
            skc_logistic = LogisticRegression(solver='lbfgs',max_iter=2000).fit(X_train, y_train)
            skc_logistic_CV = LogisticRegressionCV(cv=5,solver='lbfgs',max_iter=2000).fit(X_train, y_train)
            skc_nb_bernoulli = BernoulliNB().fit(X_train, y_train)
            skc_nb_gaussian = GaussianNB().fit(X_train, y_train)

            # no probabilities
            skc_svc_linear_inf = LinearSVC().fit(X_train, y_train)
            skc_dt_onetree = DecisionTreeClassifier().fit(X_train, y_train)
            skc_dt_randforest = RandomForestClassifier(n_estimators=100,bootstrap=True,max_features='sqrt').fit(X_train, y_train)
            #skc_nb_mult_inf = MultinomialNB().fit(X_train, y_train)
            #skc_svc_nu_inf = NuSVC(max_iter=2000).fit(X_train, y_train)

            classifiers = [['Logistic',skc_logistic],
                           ['Logistic-CV',skc_logistic_CV],
                           ['NB-Bernoulli',skc_nb_bernoulli],
                           ['NB-Gaussian',skc_nb_gaussian],
                           ['DT-OneTree',skc_dt_onetree],
                           ['DT-RandomForest',skc_dt_randforest]]

            for classifier in classifiers:

                # performance
                sup=[classifier[0]]+return_confusion(company,y_test,classifier[1].predict(X_test))
                csvWriter.writerow(sup)

                # ROC curve
                plot_roc(company,classifier[0],classifier[1],X_test,y_test,folder)

# 3.2  Loading and preparing data

In [3]:
# sample
df = pd.read_pickle('Dataset_ToModel_Daily/br_americanas.pkl')
print(df.columns)

df

Index(['l_delta_volume_lag0', 'l_close_to_close_lag0', 'l_open_to_close_lag0',
       'l_close_to_open_lag0', 'l_delta_volume_lag1', 'l_close_to_close_lag1',
       'l_open_to_close_lag1', 'l_close_to_open_lag1', 'final_pos_on_comp',
       'final_neg_on_comp', 'avg_pos_on_comp', 'final_pos_off_comp',
       'final_neg_off_comp', 'avg_pos_off_comp', 'final_pos_comp',
       'final_neg_comp', 'avg_pos_comp', 'final_pos_on_news',
       'final_neg_on_news', 'avg_pos_on_news', 'final_pos_off_news',
       'final_neg_off_news', 'avg_pos_off_news', 'final_pos_news',
       'final_neg_news', 'avg_pos_news'],
      dtype='object')


Unnamed: 0,l_delta_volume_lag0,l_close_to_close_lag0,l_open_to_close_lag0,l_close_to_open_lag0,l_delta_volume_lag1,l_close_to_close_lag1,l_open_to_close_lag1,l_close_to_open_lag1,final_pos_on_comp,final_neg_on_comp,...,avg_pos_comp,final_pos_on_news,final_neg_on_news,avg_pos_on_news,final_pos_off_news,final_neg_off_news,avg_pos_off_news,final_pos_news,final_neg_news,avg_pos_news
2020-01-13,-0.064198,0.019454,0.01606,0.003394,-0.014769,-0.001887,-0.006777,0.00489,9.0,3.0,...,0.75,3011.0,2194.0,0.578482,-0.752856,-1.278097,1.059146,3011.0,2194.0,0.578482
2020-01-14,0.358362,0.032084,0.027648,0.004436,-0.064198,0.019454,0.01606,0.003394,2.0,4.0,...,0.333333,2511.0,1453.0,0.633451,-0.556208,-0.938519,0.589386,2511.0,1453.0,0.633451
2020-01-15,0.106029,-0.013365,-0.006524,-0.006841,0.358362,0.032084,0.027648,0.004436,6.0,1.0,...,0.857143,2016.0,1393.0,0.591376,-0.776312,-0.974089,-0.28686,2016.0,1393.0,0.591376
2020-01-16,-0.311824,0.012647,-0.001794,0.014441,0.106029,-0.013365,-0.006524,-0.006841,14.0,5.0,...,0.736842,2138.0,1726.0,0.553313,-0.67376,-1.10404,0.685889,2138.0,1726.0,0.553313
2020-01-17,-0.101943,0.017088,0.011717,0.005372,-0.311824,0.012647,-0.001794,0.014441,9.0,1.0,...,0.9,2731.0,1775.0,0.606081,-0.571754,-1.203162,1.484772,2731.0,1775.0,0.606081
2020-01-20,-0.176129,0.023719,0.029738,-0.006019,-0.101943,0.017088,0.011717,0.005372,4.0,1.0,...,0.8,2290.0,1681.0,0.576681,-0.552389,-0.931879,0.582727,2290.0,1681.0,0.576681
2020-01-21,0.606271,0.0,0.005184,-0.005184,-0.176129,0.023719,0.029738,-0.006019,4.0,3.0,...,0.571429,5641.0,2465.0,0.695904,-0.483658,-1.04191,1.190727,5641.0,2465.0,0.695904
2020-01-22,-0.742092,-0.017035,-0.027664,0.010629,0.606271,0.0,0.005184,-0.005184,10.0,2.0,...,0.833333,4520.0,1733.0,0.722853,-0.251553,-0.611272,0.673213,4520.0,1733.0,0.722853
2020-01-23,0.628774,0.018757,0.02403,-0.005273,-0.742092,-0.017035,-0.027664,0.010629,29.0,5.0,...,0.852941,4906.0,3515.0,0.582591,0.15238,-0.902949,2.284583,4906.0,3515.0,0.582591
2020-01-24,0.037935,-0.003792,-0.003792,0.0,0.628774,0.018757,0.02403,-0.005273,8.0,3.0,...,0.727273,5917.0,8584.0,0.408041,0.902697,-0.171148,1.899295,5917.0,8584.0,0.408041


In [4]:
# set seeds
seed_value = 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
#tf.random.set_seed(seed_value)

# companies
my_companies = ['br_embraer',                
                'br_americanas',
                'br_pontofrio',
                'br_petrobras',
                'br_bradesco',
                'br_renner',
                'br_gol',
                'br_magazineluiza',
                'br_itau',
                'us_abercrombie',
                'us_boeing',
                'us_beyondmeat',
                'us_morganstanley',
                'us_jpmorgan',
                'us_exxonmobil',
                'us_americanair',
                'us_cocacola',
                'us_tesla']

vars_comp = ['final_pos_off_comp',
             'avg_pos_off_comp',
             'final_pos_on_comp',
             'avg_pos_on_comp']

vars_news = ['final_pos_off_news',
             'avg_pos_off_news',
             'final_pos_on_news',
             'avg_pos_on_news']

vars_finn = ['l_close_to_close_lag1',
             'l_delta_volume_lag1']

# news only
folder='Models_Daily/Shallow/News/'
run_models(folder,vars_news+vars_finn)

# comp only
folder='Models_Daily/Shallow/Comp/'
run_models(folder,vars_comp+vars_finn)

# comp and news
folder='Models_Daily/Shallow/Comp_News/'
run_models(folder,vars_news+vars_comp+vars_finn)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  del sys.path[0]
  # This is added back by InteractiveShell

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is ad

______________________
## Pairs

In [5]:
folder='Models_Daily/Shallow/Pairs/'
name = 'performance_'+time.strftime('%Y-%m-%d_%H-%M',time.gmtime())

with open(folder+name+'.csv', 'w', encoding="utf-8") as csvFile:

    csvWriter = csv.writer(csvFile)
    
    for company in my_companies:


        # train, test
        train=int(0.75*len(X))
        X_train, X_test = X[:train], X[train:]
        y_train, y_test = y[:train], y[train:]

        # Regressions
        skc_logistic = LogisticRegression(solver='lbfgs',max_iter=2000).fit(X_train, y_train)
        skc_logistic_CV = LogisticRegressionCV(cv=5,solver='lbfgs',max_iter=2000).fit(X_train, y_train)
        skc_nb_bernoulli = BernoulliNB().fit(X_train, y_train)

        # no probabilities
        skc_svc_linear_inf = LinearSVC().fit(X_train, y_train)
        skc_dt_onetree = DecisionTreeClassifier().fit(X_train, y_train)
        skc_dt_randforest = RandomForestClassifier(n_estimators=100,bootstrap=True,max_features='sqrt').fit(X_train, y_train)
        #skc_nb_mult_inf = MultinomialNB().fit(X_train, y_train)
        #skc_svc_nu_inf = NuSVC(max_iter=2000).fit(X_train, y_train)
        
        classifiers = [['Logistic',skc_logistic],
                       ['Logistic-CV',skc_logistic_CV],
                       ['NB-Bernoulli',skc_nb_bernoulli],
                       ['NB-Gaussian',skc_nb_gaussian],
                       ['DT-OneTree',skc_dt_onetree],
                       ['DT-RandomForest',skc_dt_randforest]]

        for classifier in classifiers:

            # performance
            sup=[classifier[0]]+return_confusion(company,y_test,classifier[1].predict(X_test))
            csvWriter.writerow(sup)
            
            # ROC curve
            plot_roc(company,classifier[0],classifier[1],X_test,y_test,folder)

NameError: name 'X' is not defined

In [None]:
pair=['us_jpmorgan','us_morganstanley']

# select variables
df0 = pd.read_pickle('Dataset_ToModel_Daily/'+pair[0]+'.pkl')
df1 = pd.read_pickle('Dataset_ToModel_Daily/'+pair[1]+'.pkl')

df=df1/df0

# select variables
X = df[['final_pos_off_comp',
        'avg_pos_off_comp',
        'final_pos_off_news',
        'avg_pos_off_news',
        'final_pos_on_comp',
        'avg_pos_on_comp',
        'final_pos_on_news',
        'avg_pos_on_news',
        'l_close_to_close_lag1',
        'l_delta_volume_lag1']]

# standardize (z-score)
X = (X-X.mean())/X.std()

#X = array(X)
y = df['l_close_to_close_lag0']>=0

In [None]:
df1.index

In [None]:
from pandas import read_csv
import matplotlib.pyplot as plt

# load dataset
values = df.values
# specify columns to plot
groups = [i for i in range(len(df.columns))]

i = 1
# plot each column

plt.figure(figsize=(16,16))

for group in groups:
    plt.subplot(len(groups), 1, i)
    plt.plot(values[:, group])
    plt.title(df.columns[group], y=0.5, loc='right')
    i += 1
    
plt.show()

In [None]:
# select variables

y1 = df1['l_close_to_close_lag0']
X1 = df1[['final_pos_off_comp',
          'avg_pos_off_comp',
          'final_pos_off_news',
          'avg_pos_off_news',
          'final_pos_on_comp',
          'avg_pos_on_comp',
          'final_pos_on_news',
          'avg_pos_on_news',
          'l_close_to_close_lag1',
          'l_delta_volume_lag1']]

# binary
model_bin = prepare_model_bin(3)
#model_bin.fit(X,,epochs=20,verbose=0) #validation_data=(X_valid, to_categorical(y_valid)))


model_bin_train = model_bin.fit(X_train,
                                to_categorical(y_train>=0),
                                epochs=100,
                                verbose=0,
                                validation_data=(X_valid, to_categorical(y_valid>=0)>0))

sup = return_confusion(company,y_test>0,model_bin.predict(X_test)[:,1]>=0.50)
#csvWriter.writerow(sup)

#save_graphs(model_bin_train,company,'model_daily_bin/')