# 3. Financial Model (Daily)
Daniel Ruiz, MSc in Data Science and Business Analytics (DSBA), Bocconi University

Reference code (in alphabetical order):
- https://www.datacamp.com/community/tutorials/convolutional-neural-networks-python
- https://machinelearningmastery.com/how-to-develop-convolutional-neural-network-models-for-time-series-forecasting/
- https://machinelearningmastery.com/tensorflow-tutorial-deep-learning-with-tf-keras/
- https://pythonprogramming.net/convolutional-neural-network-deep-learning-python-tensorflow-keras/
- https://stackoverflow.com/questions/32419510/how-to-get-reproducible-results-in-keras
- https://towardsdatascience.com/how-to-use-convolutional-neural-networks-for-time-series-classification-56b1b0a07a57


## 3.1. Loading packages

In [1]:
# import packages
import csv
import pandas as pd
import numpy as np
import os
import random
import time
from numpy import array, hstack, vstack

# graphs
import matplotlib.pyplot as plt
import seaborn as sns

# neural networks
from keras.models import Input, Model
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
import tensorflow as tf

Using TensorFlow backend.


In [2]:
def split_sequences(sequences, n_steps):
    """Cut a multivariate series into overlapping windows of ``n_steps`` rows.

    Every column except the last is treated as an input variable and the
    last column as the target. Each sample pairs a window of input rows
    with the target value found on the final row of that window.

    Returns a tuple ``(X, y)`` of numpy arrays built from the collected
    windows and targets (both empty when no complete window fits).
    """
    n_rows = len(sequences)
    # number of window start positions: a window must end inside the data,
    # and there can never be more starts than rows
    n_windows = min(max(n_rows - n_steps + 1, 0), n_rows)

    windows = [sequences[start:start + n_steps, :-1]
               for start in range(n_windows)]
    targets = [sequences[start + n_steps - 1, -1]
               for start in range(n_windows)]
    return array(windows), array(targets)

def split_data(df,perc_train,perc_valid):
    """Split ``df`` sequentially into train, validation and test parts.

    The first ``perc_train`` share of the rows goes to the training part,
    the next ``perc_valid`` share to the validation part and whatever is
    left to the test part. Order is preserved (no shuffling).

    Parameters
    ----------
    df : any object supporting ``len()`` and slicing along its first axis
        (list, numpy array, pandas DataFrame, ...).
    perc_train, perc_valid : non-negative fractions whose sum is at most 1.

    Returns
    -------
    (df_train, df_valid, df_test) : slices of ``df``.
    """
    assert perc_train + perc_valid <= 1
    assert perc_train >= 0
    assert perc_valid >= 0
    # cut points are computed on the argument itself, not on a global
    n_rows = len(df)
    train = int(n_rows*perc_train)
    valid = int(n_rows*(perc_train+perc_valid))
    df_train = df[:train]
    df_valid = df[train:valid]
    df_test = df[valid:]
    return df_train, df_valid, df_test

#------------------------------------------------------------------------
def return_confusion(company,y_test, predicted_classes):
    """Compute binary classification metrics from true and predicted classes.

    Parameters
    ----------
    company : label stored as the first element of the returned row.
    y_test : true classes (booleans or 0/1 values); flattened and cast to int.
    predicted_classes : predicted classes, same convention as ``y_test``.

    Returns
    -------
    list
        ``[company, TN, FP, FN, TP, prec1, reca1, fone1, prec0, reca0,
        fone0, precw, recaw, fonew, accuw]`` where the ``1``/``0`` suffixes
        are per-class precision, recall and F1, and the ``w`` suffix marks
        support-weighted averages (``accuw`` is the accuracy). A metric whose
        denominator is empty is reported as NaN.
    """

    def _ratio(num, den):
        # an undefined metric (empty denominator) is reported as NaN
        return num / den if den else float('nan')

    # flatten so that (n, 1) model outputs are accepted, and cast to 0/1
    actual = np.asarray(y_test).ravel().astype(int)
    predicted = np.asarray(predicted_classes).ravel().astype(int)

    # confusion matrix; fixing the labels guarantees a 2x2 result even when
    # one of the classes never occurs in the data
    cm = confusion_matrix(actual, predicted, labels=[0, 1])
    TN, FP, FN, TP = cm.flatten()
    total = TN+FP+FN+TP

    # class 1
    prec1 = _ratio(TP, TP+FP)
    reca1 = _ratio(TP, TP+FN)
    fone1 = _ratio(2*(prec1*reca1), prec1+reca1)
    # class 0
    prec0 = _ratio(TN, TN+FN)
    reca0 = _ratio(TN, TN+FP)
    fone0 = _ratio(2*(prec0*reca0), prec0+reca0)

    # global / weighted (each class weighted by its number of true samples)
    accuw = _ratio(TP, total) + _ratio(TN, total)
    precw = _ratio(prec0*(TN+FP), total) + _ratio(prec1*(TP+FN), total)
    recaw = _ratio(reca0*(TN+FP), total) + _ratio(reca1*(TP+FN), total)
    fonew = _ratio(fone0*(TN+FP), total) + _ratio(fone1*(TP+FN), total)

    # list
    sup = [company, TN, FP, FN, TP, prec1, reca1, fone1, prec0, reca0, fone0, precw, recaw, fonew, accuw]

    return sup

#------------------------------------------------------------------------
def save_graphs(model_train,company,folder='estimation_daily/'):
    """Save one PNG per metric comparing its training and validation curves.

    Parameters
    ----------
    model_train : object exposing a ``history`` mapping from metric name to a
        per-epoch list of values (presumably the object returned by a Keras
        ``fit`` call - confirm against the caller).
    company : label used as the file-name prefix.
    folder : path prefix, concatenated as-is with the file name, so it must
        end with a path separator.

    Each figure is written to ``folder + company + '_' + metric + '.png'``.
    """
    history = model_train.history

    # training metrics are the keys without the 'val_' prefix
    metricas = [key for key in history if key[:4]!='val_']

    # apply the seaborn style once, before any figure is created
    sns.set(font_scale=1.5)

    for metrica in metricas:
        met = history[metrica]
        epochs = range(len(met))
        plt.figure(figsize=(15,5))
        # colour is given only through ``c``; also passing a 'b' format
        # string makes matplotlib warn about a redundantly defined colour
        plt.plot(epochs, met, c='orange', label='Training '+metrica,lw=1)
        # the validation curve is drawn only when the history contains it
        val_met = history.get('val_'+metrica)
        if val_met is not None:
            plt.plot(epochs, val_met, c='green', label='Validation '+metrica,lw=1)
        plt.title('Training and validation '+metrica)
        plt.legend(loc='center left')
        plt.savefig(folder+company+'_'+metrica+'.png')
        plt.close()
        
#------------------------------------------------------------------------
def find_cut(model,X,y):
    """Scan probability thresholds and report the best and worst accuracy.

    Column 1 of ``model.predict(X)`` is compared against every threshold
    ``cut/100`` for integer ``cut`` in ``range(sup_min, sup_max)``, where the
    bounds are derived from the smallest and largest prediction and clamped
    so that ``sup_min >= 33`` and ``sup_max <= 67``.

    Parameters
    ----------
    model : object with a ``predict`` method returning a 2-D array.
    X : input forwarded to ``model.predict``.
    y : true classes (0/1 or booleans), one per prediction; flattened first.

    Returns
    -------
    (max_acc, best_cut, min_acc, worst_cut)
        Accuracies are fractions; cuts are integer percentages. On ties the
        highest cut wins. If the threshold range is empty nothing is
        evaluated and ``(0, sup_min, 1, sup_min)`` is returned.
    """
    predictions = model.predict(X)[:,1]
    # flatten so a column vector of targets is compared element by element
    y = np.asarray(y).ravel()

    total = len(predictions)
    max_acc=0
    min_acc=1

    # candidate thresholds, in percent
    sup_min = max(int(min(predictions)*100)+1,33)
    sup_max = min(int(max(predictions)*100)-1,67)
    # both defaults are set up front so the return is defined even when
    # the range below is empty
    best_cut = sup_min
    worst_cut = sup_min

    for cut in range(sup_min,sup_max):

        hits = np.count_nonzero((predictions>=(cut/100))==y)

        acc = hits/total

        if acc >= max_acc:
            best_cut = cut
            max_acc = acc

        if acc <= min_acc:
            worst_cut = cut
            min_acc = acc

    return max_acc, best_cut, min_acc, worst_cut

#------------------------------------------------------------------------
def prepare_model_bin(n_variables,n_steps=3,n_features=1,num_classes=2):
    """Build and compile a multi-input 1-D CNN with a softmax output.

    One identical convolutional branch (Conv1D -> MaxPooling1D -> Dropout ->
    Flatten) is created per input variable, each with its own
    ``(n_steps, n_features)`` input. The flattened branches are concatenated
    and passed through a 50-unit dense layer, dropout and a
    ``num_classes``-unit softmax layer.

    Returns the compiled model (Adam optimizer, MSE loss).
    """
    # one input head and one convolutional branch per variable
    inputs, branches = [], []
    for _ in range(n_variables):
        head_input = Input(shape=(n_steps, n_features))
        features = Conv1D(filters=16, kernel_size=2, activation='relu')(head_input)
        features = MaxPooling1D(pool_size=2)(features)
        features = Dropout(0.25)(features)
        features = Flatten()(features)
        inputs.append(head_input)
        branches.append(features)

    # join the branches and classify
    merged = concatenate(branches)
    hidden = Dense(50, activation='relu')(merged)
    hidden = Dropout(0.3)(hidden)
    output = Dense(num_classes, activation='softmax')(hidden)
    model = Model(inputs=inputs, outputs=output)

    # the loss is MSE computed on the softmax output; the extra metrics are
    # only tracked, not optimized
    tracked = ['accuracy','mse','msle','mae', 'mape', 'cosine']
    model.compile(optimizer='adam', loss='mse', metrics=tracked)

    return model

#------------------------------------------------------------------------
def prepare_model_cont(n_variables,n_steps=3,n_features=1):
    """Build and compile a multi-input 1-D CNN with a single linear output.

    One identical convolutional branch (Conv1D -> MaxPooling1D -> Dropout ->
    Flatten) is created per input variable, each with its own
    ``(n_steps, n_features)`` input. The flattened branches are concatenated
    and passed through a 50-unit dense layer, dropout and a one-unit dense
    layer without activation.

    Returns the compiled model (Adam optimizer, MSE loss).
    """
    # one input head and one convolutional branch per variable
    inputs, branches = [], []
    for _ in range(n_variables):
        head_input = Input(shape=(n_steps, n_features))
        features = Conv1D(filters=16, kernel_size=2, activation='relu')(head_input)
        features = MaxPooling1D(pool_size=2)(features)
        features = Dropout(0.25)(features)
        features = Flatten()(features)
        inputs.append(head_input)
        branches.append(features)

    # join the branches and regress a single value
    merged = concatenate(branches)
    hidden = Dense(50, activation='relu')(merged)
    hidden = Dropout(0.3)(hidden)
    output = Dense(1)(hidden)
    model = Model(inputs=inputs, outputs=output)

    # continuous prediction: MSE loss; the extra metrics are only tracked
    tracked = ['accuracy','mse','msle','mae', 'mape', 'cosine']
    model.compile(optimizer='adam', loss='mse', metrics=tracked)

    return model

## 3.2. Loading and preparing data

In [3]:
# set seeds
# The same value seeds Python's `random`, NumPy and TensorFlow, aiming for
# repeatable runs.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so
# assigning it here only affects child processes, not this session.
seed_value = 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# companies
# Full list of company identifiers ('br_' and 'us_' prefixed names).
my_companies = ['br_embraer',                
                'br_americanas',
                'br_pontofrio',
                'br_petrobras',
                'br_bradesco',
                'br_renner',
                'br_gol',
                'br_magazineluiza',
                'br_itau',
                'us_abercrombie',
                'us_boeing',
                'us_beyondmeat',
                'us_morganstanley',
                'us_jpmorgan',
                'us_exxonmobil',
                'us_americanair',
                'us_cocacola',
                'us_tesla',
                'us_wsj']

# NOTE: this reassignment replaces the full list above, so only this single
# company is used from here on; remove it to process every company.
my_companies=['us_cocacola']

In [13]:
# Train the continuous CNN for every company, write one row of test-set
# metrics per company to a time-stamped CSV and save the training curves.
folder = 'Models_Daily/CNN_Cont/'
name = 'performance_'+time.strftime('%Y-%m-%d_%H-%M',time.gmtime())

# make sure the output directory exists before anything is written into it
os.makedirs(folder, exist_ok=True)

# newline='' leaves line-ending handling to the csv module; without it extra
# blank rows appear on platforms that translate '\n' on write
with open(folder+name+'.csv', 'w', encoding="utf-8", newline='') as csvFile:

    csvWriter = csv.writer(csvFile)

    for company in my_companies:

        print(company)

        # open data
        df = pd.read_pickle('Dataset_ToModel_daily/'+company+'.pkl')

        # select variables
        X = df[['final_pos_off','final_neg_off','avg_pos_off']]
        y = df['l_close_to_close']

        # data format: one column per input variable, target as last column
        in_seqs = [array(X[col]).reshape((len(y), 1)) for col in X.columns]
        out_seq = [array(y).reshape((len(y), 1))]
        dataset = hstack(tuple(in_seqs+out_seq))
        n_steps=3
        X, y = split_sequences(dataset, n_steps)
        # one (samples, n_steps, 1) array per variable, matching the
        # multi-input model
        X = [X[:, :, i].reshape(X.shape[0], X.shape[1], 1) for i in range(X.shape[2])]

        # train, valid, test (sequential split, no shuffling)
        perc_train=0.50
        perc_valid=0.25
        train=int(perc_train*len(X[0]))
        valid=int((perc_train+perc_valid)*len(X[0]))
        # X
        X_train = [x_var[:train] for x_var in X]
        X_valid = [x_var[train:valid] for x_var in X]
        X_test = [x_var[valid:] for x_var in X]
        # y
        y_train = y[:train]
        y_valid = y[train:valid]
        y_test = y[valid:]

        # model: one input branch per selected variable
        model_cont = prepare_model_cont(len(X), n_steps=n_steps)
        model_cont_train = model_cont.fit(X_train,
                                          y_train,
                                          epochs=100,
                                          verbose=0,
                                          validation_data=(X_valid, y_valid))
        # evaluate the sign of the prediction against the sign of the target;
        # ravel turns the (n, 1) model output into a flat vector
        sup = return_confusion(company,y_test>0,(model_cont.predict(X_test)>0).ravel())

        # save data
        csvWriter.writerow(sup)
        save_graphs(model_cont_train,company,folder)



# binary
# Direction classifier fitted on the train/validation/test arrays left in
# scope by the loop above (i.e. those of the last company processed).
model_bin = prepare_model_bin(3)

# Class 1 means a non-negative target; the same rule labels the training,
# validation and test sets. num_classes=2 keeps two output columns even if a
# split happens to contain a single class.
model_bin_train = model_bin.fit(X_train,
                                to_categorical(y_train>=0, num_classes=2),
                                epochs=100,
                                verbose=0,
                                validation_data=(X_valid, to_categorical(y_valid>=0, num_classes=2)))

sup = return_confusion(company,y_test>=0,model_bin.predict(X_test)[:,1]>=0.50)

# NOTE: the binary results are not persisted; the CSV writer used above is
# already closed at this point. The two disabled calls are kept for reference.
#csvWriter.writerow(sup)

#save_graphs(model_bin_train,company,'model_daily_bin/')