In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date

import math
import matplotlib.pyplot as plt

In [25]:
merged_df = pd.read_csv('data/r_results.csv')
n_seq = 63

merged_df['Exchange.Date'] = merged_df['Exchange.Date'].apply(lambda x: date(1900, 1, 1) + timedelta(int(x)))
merged_df.index = merged_df['Exchange.Date']

merged_df.head()

Unnamed: 0_level_0,Exchange.Date,Close,logreturns,forecast,lower,upper
Exchange.Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2003-01-01,2003-01-01,100.0,0.0,100.0,100.0,100.0
2003-01-04,2003-01-04,100.995362,0.009904,100.995362,100.995362,100.995362
2003-01-05,2003-01-05,101.623083,0.006196,101.623083,101.623083,101.623083
2003-01-08,2003-01-08,101.623083,0.0,101.623083,101.623083,101.623083
2003-01-09,2003-01-09,102.392342,0.007541,102.392342,102.392342,102.392342


In [26]:
merged_df['error'] = merged_df['forecast'] - merged_df['Close']
merged_df['abs_error'] = np.abs(merged_df['forecast'] - merged_df['Close'])

merged_df['actual_up'] = merged_df['Close'].diff(1) > 0
merged_df['forecast_up'] = merged_df['forecast'].diff(1) > 0

def confusion(actual, forecast):
    if (actual and forecast):
        return 'TP'
    
    if (actual and not forecast):
        return 'FN'
    
    if (not actual and forecast):
        return 'FP'
    
    if (not actual and not forecast):
        return 'TN'
    
    return False

merged_df['confusion'] = merged_df.apply(lambda x: confusion(x['actual_up'], x['forecast_up']), axis=1)

merged_df.tail()

Unnamed: 0_level_0,Exchange.Date,Close,logreturns,forecast,lower,upper,error,abs_error,actual_up,forecast_up,confusion
Exchange.Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-10-22,2016-10-22,671.89,0.005567,657.637739,255.792032,1690.777438,-14.252261,14.252261,True,True,TP
2016-10-23,2016-10-23,671.08,-0.001206,658.20537,251.688108,1721.314181,-12.87463,12.87463,False,True,FP
2016-10-26,2016-10-26,668.82,-0.003373,658.77349,247.648335,1752.414414,-10.04651,10.04651,False,True,FP
2016-10-27,2016-10-27,668.02,-0.001197,659.342101,243.671832,1784.088057,-8.677899,8.677899,False,True,FP
2016-10-28,2016-10-28,663.33,-0.007046,659.911202,239.757723,1816.345223,-3.418798,3.418798,False,True,FP


# Evaluating at t = 1

In [27]:
# New dataframe that only contains the number of periods to evaluate (1,3,5,21,63)
def new_df(n_periods):
    start = -n_seq
    end = -n_seq + n_periods if n_periods != 63 else None
    df = merged_df[start : end]
    return df

In [34]:
# Calculating RMSE and MAE
def evaluate(n_periods):
    df = new_df(n_periods)
    print(df.head(1))

    mape = ((df['abs_error'] / df['Close']).sum() / n_periods) * 100
    rmse = math.sqrt(pow(df['error'].sum(), 2) / n_periods)
    print(f"{n_periods}, RMSE: {round(rmse, 3)}, MAPE: {round(mape, 3)}%")

evaluate(1) # 1 day
evaluate(3) # half a week
evaluate(5) # week
evaluate(21) # month
evaluate(63) # quarter

              Exchange.Date   Close  logreturns    forecast       lower  \
Exchange.Date                                                             
2016-08-03       2016-08-03  622.77    0.002234  622.745083  614.850489   

                    upper     error  abs_error  actual_up  forecast_up  \
Exchange.Date                                                            
2016-08-03     630.741043 -0.024917   0.024917       True         True   

              confusion  
Exchange.Date            
2016-08-03           TP  
1, RMSE: 0.025, MAPE: 0.004%


In [52]:
# Creating confusion matrix
def confusion_matrix(df):
    conf = pd.DataFrame(columns=['P', 'N'], index=['P', 'N'])
    conf.loc['P', 'P'] = len(df[df['confusion'] == 'TP'])
    conf.loc['P', 'N'] = len(df[df['confusion'] == 'FN'])
    conf.loc['N', 'P'] = len(df[df['confusion'] == 'FP'])
    conf.loc['N', 'N'] = len(df[df['confusion'] == 'TN'])
    return conf

confusion = confusion_matrix(new_df(21))
precision = confusion.iloc[0, 0] / (confusion.iloc[0,0] + confusion.iloc[1,0])
recall = confusion.iloc[0, 0] / (confusion.iloc[0,0] + confusion.iloc[0,1])
f_score = 2*precision*recall/(precision+recall)

print(confusion)
print(f'precision: {int(precision*100)}%, recall: {int(recall*100)}%, f-score: {round(f_score, 3)}')

    P  N
P  13  0
N   8  0
precision: 61%, recall: 100%, f-score: 0.765


# Plotting

In [None]:
# Without confidence intercal
plot_df = merged_df[-n_seq - n_seq*2 :]
plt.figure(figsize=(12, 6))
plt.plot(plot_df['forecast'], label="forecast")
plt.plot(plot_df['Close'], label="actual")
plt.legend()

In [None]:
# With confidence interval

# plot_df = merged_df[-n_seq - (n_seq * 2):]
plot_df = merged_df

x = plot_df.index
y = plot_df['forecast']

plt.figure(figsize=(12, 10))
plt.plot(x,y)
plt.fill_between(x, plot_df['lower'], plot_df['upper'], color='blue', alpha=0.1)
plt.plot(plot_df['Close'][:-n_seq], label="actual")
plt.ylim(90, 800)

# Cross-validation

In [39]:
# Creating dataframe columns (error, absolute error, actual_up, forecast_up och confusion (TP, FP, TN, FN))

r_cross_df = pd.read_csv('data/r_cross_val.csv')

# adding first row of data based on last row of test data
new_data = []
new_data.insert(0, {'time':0, 'Close': 621.38, 'forecast': 621.38, 'lower': 621.38, 'upper': 621.38})
r_cross_df = pd.concat([pd.DataFrame(new_data), r_cross_df], ignore_index=True)

# creating error and up columns 
r_cross_df['error'] = r_cross_df['forecast'] - r_cross_df['Close']
r_cross_df['abs_error'] = np.abs(r_cross_df['forecast'] - r_cross_df['Close'])
r_cross_df['actual_up'] = r_cross_df['Close'].diff(1) > 0
r_cross_df['forecast_up'] = r_cross_df['forecast'].diff(1) > 0

def confusion(actual, forecast):
    if (actual and forecast):
        return 'TP'
    
    if (actual and not forecast):
        return 'FN'
    
    if (not actual and forecast):
        return 'FP'
    
    if (not actual and not forecast):
        return 'TN'
    
    return False

r_cross_df['confusion'] = r_cross_df.apply(lambda x: confusion(x['actual_up'], x['forecast_up']), axis=1)

r_cross_df.head(1)

Unnamed: 0,time,Close,forecast,lower,upper,error,abs_error,actual_up,forecast_up,confusion
0,0,621.38,621.38,621.38,621.38,0.0,0.0,False,False,TN


In [71]:
def cross_evaluate(df, n_periods):
    df = df[-63:-63+n_periods] if n_periods < 63 else df.tail(63)
    mape = ((df["abs_error"] / df["Close"]).sum() / n_periods) * 100
    rmse = math.sqrt(pow(df["error"].sum(), 2) / n_periods)

    tp = len(df[df['confusion'] == 'TP'])
    fp = len(df[df['confusion'] == 'FP'])
    fn = len(df[df['confusion'] == 'FN'])

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0 # if else för att undvika division by zero errror
    recall = tp / (tp + fn) if (tp + fn > 0) else 0
    fscore = (2*precision*recall)/(precision+recall) if (precision + recall > 0) else 0

    return mape, rmse, precision, recall, fscore

cross_df = pd.DataFrame(columns=[
    "mape_1", 
    "mape_3",
    "mape_5",
    "mape_21",
    "mape_63",
    "rmse_1",
    "rmse_3",
    "rmse_5",
    "rmse_21",
    "rmse_63",
    'precision_1',
    'precision_3',
    'precision_5',
    'precision_21',
    'precision_63',
    'recall_1',
    'recall_3',
    'recall_5',
    'recall_21',
    'recall_63',
    'fscore_1',
    'fscore_3',
    'fscore_5',
    'fscore_21',
    'fscore_63',
])

len_forecasts = 1000
for i in range(len_forecasts):
    cross_merged_df = r_cross_df[i+1 : i+63+1] # to avoid first row 1 is added
    one = cross_evaluate(cross_merged_df, 1)
    three = cross_evaluate(cross_merged_df, 3)
    five = cross_evaluate(cross_merged_df, 5)
    twentyone = cross_evaluate(cross_merged_df, 21)
    sixtythree = cross_evaluate(cross_merged_df, 63)

    cross_df = cross_df.append({
        'mape_1': one[0],
        'mape_3': three[0],
        'mape_5': five[0],
        'mape_21': twentyone[0],
        'mape_63': sixtythree[0],
        'rmse_1': one[1],
        'rmse_3': three[1],
        'rmse_5': five[1],
        'rmse_21': twentyone[1],
        'rmse_63': sixtythree[1],
        'precision_1': one[2],
        'precision_3': three[2],
        'precision_5': five[2],
        'precision_21': twentyone[2],
        'precision_63': sixtythree[2],
        'recall_1': one[3],
        'recall_3': three[3],
        'recall_5': five[3],
        'recall_21': twentyone[3],
        'recall_63': sixtythree[3],
        'fscore_1': one[4],
        'fscore_3': three[4],
        'fscore_5': five[4],
        'fscore_21': twentyone[4],
        'fscore_63': sixtythree[4],
    }, ignore_index=True)

cross_df.head(1) # notera hur raden här är identisk med resultatet när vi inte körde korsvalidering

Unnamed: 0,mape_1,mape_3,mape_5,mape_21,mape_63,rmse_1,rmse_3,rmse_5,rmse_21,rmse_63,...,recall_1,recall_3,recall_5,recall_21,recall_63,fscore_1,fscore_3,fscore_5,fscore_21,fscore_63
0,0.004001,0.705933,0.712958,0.541432,1.063077,0.024917,7.524653,9.852218,2.323509,43.548246,...,1.0,1.0,1.0,1.0,1.0,1.0,0.5,0.75,0.764706,0.74


In [72]:
cross_df.iloc[:,10:].describe()

Unnamed: 0,precision_1,precision_3,precision_5,precision_21,precision_63,recall_1,recall_3,recall_5,recall_21,recall_63,fscore_1,fscore_3,fscore_5,fscore_21,fscore_63
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.563,0.569833,0.5695,0.567693,0.569007,0.563,0.91,0.992,1.0,1.0,0.563,0.671967,0.695288,0.715262,0.724984
std,0.496263,0.306632,0.234211,0.128532,0.025059,0.496263,0.286325,0.089129,0.0,0.0,0.496263,0.284387,0.207152,0.110059,0.020339
min,0.0,0.0,0.0,0.25,0.532258,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.4,0.694737
25%,0.0,0.333333,0.4,0.47619,0.548387,0.0,1.0,1.0,1.0,1.0,0.0,0.5,0.571429,0.645161,0.708333
50%,1.0,0.666667,0.6,0.6,0.564516,1.0,1.0,1.0,1.0,1.0,1.0,0.8,0.75,0.75,0.721649
75%,1.0,0.666667,0.8,0.666667,0.596774,1.0,1.0,1.0,1.0,1.0,1.0,0.8,0.888889,0.8,0.747475
max,1.0,1.0,1.0,0.761905,0.612903,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.864865,0.76


In [74]:
n = cross_df.count()[0]
mean = cross_df.mean()
upper = cross_df.mean() + 1.96 * cross_df.std() / math.sqrt(n)
lower = cross_df.mean() - 1.96 * cross_df.std() / math.sqrt(n)

ci_df = pd.DataFrame(columns=['measure', 'mean', 'lower', 'upper'])

for i in range(25):
    ci_df = ci_df.append({
        'measure': cross_df.columns[i],
        'mean': mean[i],
        'lower': lower[i],
        'upper': upper[i]
    }, ignore_index=True)

ci_df

Unnamed: 0,measure,mean,lower,upper
0,mape_1,1.567065,1.497895,1.636234
1,mape_3,1.56761,1.502146,1.633073
2,mape_5,1.568478,1.506176,1.63078
3,mape_21,1.574839,1.5266,1.623077
4,mape_63,1.577985,1.546782,1.609188
5,rmse_1,10.306489,9.841756,10.771221
6,rmse_3,17.489946,16.703597,18.276296
7,rmse_5,22.238871,21.249562,23.228179
8,rmse_21,44.151222,42.468187,45.834257
9,rmse_63,74.172557,71.987043,76.358071
