# 3. Financial Model (Intraday)
Daniel Ruiz, MSc in Data Science and Business Analytics (DSBA), Bocconi University

Reference codes (alphabetically):
- https://www.datacamp.com/community/tutorials/convolutional-neural-networks-python
- https://machinelearningmastery.com/how-to-develop-convolutional-neural-network-models-for-time-series-forecasting/
- https://machinelearningmastery.com/tensorflow-tutorial-deep-learning-with-tf-keras/
- https://pythonprogramming.net/convolutional-neural-network-deep-learning-python-tensorflow-keras/
- https://stackoverflow.com/questions/32419510/how-to-get-reproducible-results-in-keras
- https://towardsdatascience.com/how-to-use-convolutional-neural-networks-for-time-series-classification-56b1b0a07a57


## 3.1. Loading packages

In [1]:
# import packages
import csv
import pandas as pd
import numpy as np
import os
import random
import time
from numpy import array, hstack, vstack

# graphs
import matplotlib.pyplot as plt
import seaborn as sns

# neural networks
from keras.models import Input, Model
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
import tensorflow as tf

Using TensorFlow backend.


In [2]:
x = pd.DataFrame(['hello'],columns=['my'])


# 3.2  Loading and preparing data

In [5]:
def prepare_data(company, print_steps=False):

    if print_steps:
        print(company)

    # 1) ---------------------------------------
    # load twitter data
    df_twitter = pd.read_pickle('Dataset_TS_02/'+company+'.pkl')

    # get variables
    df_twitter['final_neg']=1-df_twitter['final_pos']
    sup1 = df_twitter.groupby(['block_15'])['final_pos','final_neg'].sum()
    sup2 = df_twitter.groupby(['block_15'])['final_pos'].mean()
    df_twitter = sup1.merge(sup2,on='block_15', how='left')
    df_twitter.columns = [['count_pos','count_neg','avg_pos']]

    if print_steps:
        print('Twitter DF (block 15)', df_twitter.shape)
        print('1-ok!')
        
    # 2) ---------------------------------------
    # load finacial data
    df_stocks = pd.read_pickle('Dataset_TS_FIN_01/'+company+'.pkl')
    df_stocks = df_stocks.sort_values(['datetime'])

    # log
    df_stocks['high'] = np.log(df_stocks['high'])
    df_stocks['low'] = np.log(df_stocks['low'])
    df_stocks['open'] = np.log(df_stocks['open'])
    df_stocks['close'] = np.log(df_stocks['close'])
    
    #XXX
    #df_stocks['volume'] = [int(vol.replace(".","")) for vol in df_stocks.volume]
    df_stocks['volume'] = np.log(df_stocks['volume'])

    # Tweets [14:15:00,14:29:59] -> Prediction Open of [14:30:00]
    df_stocks['delta_open'] = df_stocks['open'].shift(-1)-df_stocks['open']
    df_stocks['delta_open_pos']=df_stocks['delta_open']>=0
    
    # Volume [14:15:00,14:29:59] -> 14:29:59 - 14:14:59
    df_stocks['delta_close'] = df_stocks['close']-df_stocks['close'].shift(1)
    df_stocks['delta_volume'] = df_stocks['volume']-df_stocks['volume'].shift(1)
    #df_stocks['delta_close_pos']=df_stocks['delta_open']>=0

    # cleaning first and last block of the day
    df_stocks = df_stocks[df_stocks.date_adj==df_stocks.date_adj.shift(-1)]
    df_stocks = df_stocks[df_stocks.date_adj==df_stocks.date_adj.shift(1)]
    
    if print_steps:
        print('Stocks DF (block 15)', df_stocks.shape)
        print('2-ok!')

    # 3) ---------------------------------------
    # merge the data
    XY = df_stocks.merge(df_twitter, on='block_15', how='left')
    XY = XY.rename(columns={('count_pos',): 'count_pos',
                            ('count_neg',): 'count_neg',
                            ('delta_open_pos',): 'delta_open_pos',
                            ('avg_pos',): 'avg_pos',
                            ('delta_volume',): 'delta_volume',
                            ('delta_close',): 'delta_close'},)
    
    if print_steps:
        print(XY.columns)
        
    # filtering data
    XY = XY[XY.block_on==True]
    used_variables = ['count_pos','count_neg','avg_pos','delta_close','delta_volume']
    n_X_vars = len(used_variables)
    XY = XY[['date_adj','block_15','delta_open_pos']+used_variables]

    # sort by datetime
    XY = XY.sort_values(['block_15'])
    XY = XY.reset_index(drop=True)

    # filling in NAs: n_tweets = 0, avg_pos = avg, demeaning
    try:
        XY['count_pos'] = XY['count_pos'].fillna(0)
        XY['count_neg'] = XY['count_neg'].fillna(0)
        XY['avg_pos'] = XY['avg_pos'].fillna(XY['avg_pos'].mean())- XY['avg_pos'].mean()
    except:
        pass

    if print_steps:
        print('3-ok!')

    # 4) ---------------------------------------
    # convert to NN readable format
    
    
    # split a multivariate sequence into samples
    def split_sequences(sequences, n_steps):
        X, y = list(), list()
        for i in range(len(sequences)):
            # find the end of this pattern
            end_ix = i + n_steps
            # check if we are beyond the dataset
            if end_ix > len(sequences):
                break
            # gather input and output parts of the pattern
            seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
            X.append(seq_x)
            y.append(seq_y)
        return array(X), array(y)
    
    # past n_teps blocks of 15 minutes
    n_steps = 3
    n_features = 1

    X_all = np.empty((0,n_steps,n_X_vars), int)
    y_all = np.empty((0), int)

    # for all dates
    sup_dates = XY.date_adj.unique().tolist()
    for sup_date in sup_dates:
        sup_XY = XY[XY.date_adj==sup_date].T.values.tolist()

        # x
        seqs = []
        for i in range(n_X_vars):
            seq = array(sup_XY[3+i])
            seq = seq.reshape((len(seq), 1))
            seqs.append(seq)
        # y
        seq = array(sup_XY[2])
        seq = seq.reshape((len(seq), 1))
        seqs.append(seq)

        # convert into input/output
        dataset = hstack(tuple(seqs))
        X, y = split_sequences(dataset, n_steps)

        # stack
        X_all = vstack((X_all,X))
        y_all = hstack((y_all,y))
    
    if print_steps:
        print('4-ok!')
    
    # 5) ---------------------------------------
    
    # separate input data (n_features = time-series per head)
    X=[]
    for i in range(n_X_vars):
        X.append(X_all[:, :, i].reshape(X_all.shape[0], X_all.shape[1], n_features))

    # Train, Test, Valid = 60, 20, 20
    sup = int(np.ceil(len(y_all)*0.2))
    X_train, X_valid, X_test = [], [], []
    
    for i in range(n_X_vars):
        X_train.append(X[i][:len(y_all)-2*sup])
        X_valid.append(X[i][len(y_all)-2*sup:len(y_all)-sup])
        X_test.append(X[i][len(y_all)-sup:len(y_all)])

    # y
    y_train = y_all[:len(y_all)-2*sup]
    y_valid = y_all[len(y_all)-2*sup:len(y_all)-sup]
    y_test = y_all[len(y_all)-sup:len(y_all)]
    
    if print_steps:
        print('5-ok!')
    
    return[[X_train,X_valid,X_test],[y_train,y_valid,y_test]]

#------------------------------------------------------------------------
def prepare_model(n_variables,n_steps=3,n_features=1,num_classes=2,print_steps=False):
    
    def make_cnn(n_steps=n_steps,n_features=n_features):
        visible = Input(shape=(n_steps, n_features))
        cnn = Conv1D(filters=16, kernel_size=2, activation='relu')(visible)
        cnn = MaxPooling1D(pool_size=2)(cnn)
        cnn = Dropout(0.25)(cnn)
        cnn = Flatten()(cnn)
        return visible, cnn

    # generate input models for each variable
    visibles, cnns = [], []
    for i in range(n_variables):
    # add element
        visibles.append([])
        cnns.append([])
        visibles[-1], cnns[-1] = make_cnn()

    # merge input models
    merge = concatenate(cnns)
    dense = Dense(50, activation='relu')(merge)
    dense = Dropout(0.3)(dense)
    m_output = Dense(num_classes, activation='softmax')(dense)
    model = Model(inputs=visibles, outputs=m_output)

    # for continuous predictions
    metricas = ['accuracy','mse','msle','mae', 'mape', 'cosine']
    model.compile(optimizer='adam', loss='mse', metrics=metricas)

    #
    #model.compile(loss=keras.losses.categorical_crossentropy,
    #              optimizer=keras.optimizers.Adam(),
    #              metrics=['accuracy'])
    
    if print_steps:
        # visualize the model
        print(model.summary())
        
    return(model)

#------------------------------------------------------------------------
def save_graphs(model_train,company,folder='intraday_estimation/'):
    
    # performance in validation and test set
    metricas = [i for i in list(model_train.history.keys()) if i[:4]!='val_']
    for metrica in metricas:
        met = model_train.history[metrica]
        val_met = model_train.history['val_'+metrica]
        epochs = range(len(met))
        plt.figure(figsize=(15,5))
        sns.set(font_scale=1.5)
        plt.plot(epochs, met, 'b', c='orange', label='Training '+metrica,lw=1)
        plt.plot(epochs, val_met, 'b', c='green', label='Validation '+metrica,lw=1)
        plt.title('Training and validation '+metrica)
        plt.legend(loc='center left')
        plt.savefig(folder+company+'_'+metrica+'.png')
        plt.close()
        
#------------------------------------------------------------------------
def find_cut(model,X,y):
                
    predictions = model.predict(X)[:,1]

    total = len(predictions)
    max_acc=0
    min_acc=1

    # loop
    sup_min = max(int(min(predictions)*100)+1,33)
    sup_max = min(int(max(predictions)*100)-1,67)
    best_cut = sup_min

    for cut in range(sup_min,sup_max): 

        hits = sum((predictions>=(cut/100))==y)

        acc = hits/total

        if acc >= max_acc:
            best_cut = cut
            max_acc = acc

        if acc <= min_acc:
            worst_cut = cut
            min_acc = acc
            
    return max_acc, best_cut, min_acc, worst_cut
        
#------------------------------------------------------------------------
def return_confusion(model, X_test, y_test, print_steps=False, cut=50):

    # predicted class = most likely (MECE)
    predicted_classes = array(model.predict(X_test)[:,1]>=(cut/100),dtype=int)

    # overall performance:
    if print_steps:
        print("Of {} observations tested:".format(len(predicted_classes)))
        correct = np.where(predicted_classes==y_test)[0]
        print("- Classified {} correctly".format(len(correct)))
        incorrect = np.where(predicted_classes!=y_test)[0]
        print("- Classified {} incorrectly".format(len(incorrect)))

    # confusion matrix
    # 0,0 = TN = True Negatives
    # 0,1 = FP = False Positives
    # 1,0 = FN = False Negatives
    # 0,1 = TP = True Positives
    cm = confusion_matrix(y_test,predicted_classes)
    TN, FP, FN, TP = cm.flatten()
    total = TN+FP+FN+TP

    # class 1
    prec1 = TP / (TP+FP)
    reca1 = TP / (TP+FN)
    fone1 = 2*(prec1*reca1)/(prec1+reca1)
    # class 0
    prec0 = TN / (TN+FN)
    reca0 = TN / (TN+FP)
    fone0 = 2*(prec0*reca0)/(prec0+reca0)

    # global / weighted
    accuw = TP/total +TN/total
    precw = prec0*(TN+FP)/(total) + prec1*(TP+FN)/(total)
    recaw = reca0*(TN+FP)/(total) + reca1*(TP+FN)/(total)
    fonew = fone0*(TN+FP)/(total) + fone1*(TP+FN)/(total)

    # list
    sup = [company, TN, FP, FN, TP, prec1, reca1, fone1, prec0, reca0, fone0, precw, recaw, fonew, accuw]

    return sup

# 3.2. Running the model

In [9]:
# set seeds
seed_value = 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# companies
my_companies = ['br_embraer',                
                'br_americanas',
                'br_pontofrio',
                'br_petrobras',
                'br_bradesco',
                'br_renner',
                'br_gol',
                'br_magazineluiza',
                'br_itau',
                'us_abercrombie',
                'us_boeing',
                'us_beyondmeat',
                'us_morganstanley',
                'us_jpmorgan',
                'us_exxonmobil',
                'us_americanair',
                'us_cocacola',
                'us_tesla',
                'us_wsj']


folder='intraday_estimation/'
with open(folder+'performance_intraday_'+time.strftime('%Y-%m-%d_%H-%M',time.gmtime())+'.csv', 'w', encoding="utf-8") as csvFile:

    csvWriter = csv.writer(csvFile)

    for company in my_companies:

        # prepare data
        X, y = prepare_data(company,print_steps=False)

        # unpacking
        X_train, X_valid, X_test = X
        y_train, y_valid, y_test = y

        # one hot enconding
        y_train_one_hot = to_categorical(y_train)
        y_valid_one_hot = to_categorical(y_valid)
        y_test_one_hot = to_categorical(y_test)

        # 1st input model -> count_pos
        # 2nd input model -> count_neg
        # 3rd input model -> avg_pos
        # 4th input model -> delta volume (lag)

        model = prepare_model(n_variables=len(X_train))

        # fit model (verbose = print steps)
        model_train = model.fit(X_train,
                                y_train_one_hot,
                                epochs=100,
                                verbose=0,
                                validation_data=(X_valid, y_valid_one_hot))

        save_graphs(model_train,company,folder)

        # test set final loss and accuracy
        test_eval = model.evaluate(X_test,y_test_one_hot,verbose=0)

        # hit an miss
        #max_acc, best_cut, min_acc, worst_cut = find_cut(model,X_valid,y_valid)
        #my_cut=best_cut        
        my_cut=50
        
        sup = return_confusion(model,
                               X_test,
                               y_test,
                               print_steps=False,
                               cut=my_cut)
        csvWriter.writerow(sup)
        
        # print
        print(company)
        print('Test loss:', test_eval[0])
        print('Test accuracy:', test_eval[1])
        print('my cut:',my_cut)
        print(sup)

AttributeError: 'str' object has no attribute 'log'