In [2]:
%reset -f

import os
import numpy as np
import pandas as pd
import re
from datetime import datetime as DT
## sklearns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix,\
    f1_score, accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_validate
    
sMyPath = r'C:\UCSD_ML_Capstone'
os.chdir(sMyPath)


In [3]:
#### Import Data
##### Feature Generation #####
### Return the file names
# from os import listdir
# from os.path import isfile, join

datapath = sMyPath + r'\01_Input\01_03_ProcessedData\Tiingo_Stock_daily_Transformed'
# AvaTickers_df = pd.read_csv(sMyPath + r"\01_Input\01_01_DataCodes\Stock_Techonology_Semiconductors.csv")
# AvaTickers  = AvaTickers_df..values
AvaTickers_df = pd.read_csv(sMyPath + r"\01_Input\01_01_DataCodes\sNA_Select_Tickers.csv")
AvaTickers = AvaTickers_df[(AvaTickers_df.Sector=='Technology')&(AvaTickers_df.Industry=='Semiconductors')].Tickers.values

In [4]:
#### 0. Functions Define ####
def RandomForestCustomize(y, X, n_estimators = 100, max_depth=6,\
                      criterion = 'gini', ifcv=False, cvsize = 10):

    # X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    cutpoint = int(np.floor(X.shape[0] * 0.8))
    X_train = X.iloc[:cutpoint,:]
    X_test = X.iloc[cutpoint:,:]
    y_train = y.iloc[:cutpoint]
    y_test = y.iloc[cutpoint:]
    
    RFClf = RandomForestClassifier(n_estimators=n_estimators,\
                                   max_depth=max_depth, criterion=criterion)
    RFClf.fit(X_train, y_train)
    
    y_test_pred = RFClf.predict(X_test)

    Conf_Mat_tmp = confusion_matrix(y_test, y_test_pred)
    Reports_tmp = classification_report(y_test, y_test_pred)
    Accu_tmp = accuracy_score(y_test, y_test_pred)
    try:
        F1_tmp = f1_score(y_test, y_test_pred)
    except ValueError:
        F1_tmp = 0

    if ifcv:
        cv_results = cross_validate(RFClf, X, y, cv=cvsize)
        cv_score = cv_results['test_score'].mean()
    else:
        cv_score = 0
        
    return([Accu_tmp, Conf_Mat_tmp, Reports_tmp, F1_tmp, cv_score])

In [5]:
#### 1. Data clean and process ####
Var_Q = 5/100

TickerStart = 0
iTicker = TickerStart
Conf_Mat_df = list()
Reports_df = list()
Accu_df = list()
F1_df = list()
cv_score_df = list()
ifcv, cvsize = (True, 10)


In [6]:
S_time = DT.now()
iTicker = 0 

  
Loop_start = DT.now()

Ticker = AvaTickers[iTicker]

DataRaw = pd.read_pickle(datapath + r'\\'+ Ticker + r'_tsfresh.zip')
df = DataRaw.loc[:,[not bool(re.search('duplicate', x))\
                    for x in DataRaw.columns]].copy()

df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(axis = 1)

### 1. Data initialization
## Daily
# Return
Ret_D_clf2 = df.Ret_D.copy()
Ret_D_clf2[df.Ret_D >= 0] = 1
Ret_D_clf2[df.Ret_D < 0] = 0

Ret_D_clf3 = df.Ret_D.copy()
Ret_D_clf3[df.Ret_D >= df.Ret_D.mean() + df.Ret_D.std()] = 1
Ret_D_clf3[df.Ret_D <= df.Ret_D.mean() - df.Ret_D.std()] = 2
Ret_D_clf3[(df.Ret_D > df.Ret_D.mean() - df.Ret_D.std())\
           & (df.Ret_D<=df.Ret_D.mean() + df.Ret_D.std())] = 0

Ret_D_clfVaR = df.Ret_D.copy()
Ret_D_clfVaR[df.Ret_D > df.Ret_D.quantile(Var_Q)] = 0
Ret_D_clfVaR[df.Ret_D <= df.Ret_D.mean() - df.Ret_D.std()] = 1
# Volatility
Vol_D_clf2 = df.Vol_D.copy()
Vol_D_clf2[df.Vol_D >= df.Vol_D.mean()] = 1
Vol_D_clf2[df.Vol_D < df.Vol_D.mean()] = 0

Vol_D_clf3 = df.Vol_D.copy()
Vol_D_clf3[df.Vol_D >= np.quantile(df.Vol_D, 0.95)] = 1
Vol_D_clf3[df.Vol_D <= np.quantile(df.Vol_D, 0.05)] = 2
Vol_D_clf3[(df.Vol_D > np.quantile(df.Vol_D, 0.05))\
           & (df.Vol_D<=np.quantile(df.Vol_D, 0.95))] = 0

## Weekly
# Return
Ret_W_clf2 = df.Ret_W.copy()
Ret_W_clf2[df.Ret_W >= 0] = 1
Ret_W_clf2[df.Ret_W < 0] = 0

Ret_W_clf3 = df.Ret_W.copy()
Ret_W_clf3[df.Ret_W >= df.Ret_W.mean() + df.Ret_W.std()] = 1
Ret_W_clf3[df.Ret_W <= df.Ret_W.mean() - df.Ret_W.std()] = 2
Ret_W_clf3[(df.Ret_W > df.Ret_W.mean() - df.Ret_W.std())\
           & (df.Ret_W<=df.Ret_W.mean() + df.Ret_W.std())] = 0

Ret_W_clfVaR = df.Ret_W.copy()
Ret_W_clfVaR[df.Ret_W > df.Ret_W.quantile(Var_Q)] = 0
Ret_W_clfVaR[df.Ret_W <= df.Ret_W.mean() - df.Ret_W.std()] = 1
# Volatility
Vol_W_clf2 = df.Vol_W.copy()
Vol_W_clf2[df.Vol_W >= df.Vol_W.mean()] = 1
Vol_W_clf2[df.Vol_W < df.Vol_W.mean()] = 0

Vol_W_clf3 = df.Vol_W.copy()
Vol_W_clf3[df.Vol_W >= np.quantile(df.Vol_W, 0.95)] = 1
Vol_W_clf3[df.Vol_W <= np.quantile(df.Vol_W, 0.05)] = 2
Vol_W_clf3[(df.Vol_W > np.quantile(df.Vol_W, 0.05))\
           & (df.Vol_W<=np.quantile(df.Vol_W, 0.95))] = 0

## Monthly
# Return
Ret_M_clf2 = df.Ret_M.copy()
Ret_M_clf2[df.Ret_M >= 0] = 1
Ret_M_clf2[df.Ret_M < 0] = 0

Ret_M_clf3 = df.Ret_M.copy()
Ret_M_clf3[df.Ret_M >= df.Ret_M.mean() + df.Ret_M.std()] = 1
Ret_M_clf3[df.Ret_M <= df.Ret_M.mean() - df.Ret_M.std()] = 2
Ret_M_clf3[(df.Ret_M > df.Ret_M.mean() - df.Ret_M.std())\
           & (df.Ret_M<=df.Ret_M.mean() + df.Ret_M.std())] = 0

Ret_M_clfVaR = df.Ret_M.copy()
Ret_M_clfVaR[df.Ret_M > df.Ret_M.quantile(Var_Q)] = 0
Ret_M_clfVaR[df.Ret_M <= df.Ret_M.mean() - df.Ret_M.std()] = 1
# Volatility
Vol_M_clf2 = df.Vol_M.copy()
Vol_M_clf2[df.Vol_M >= df.Vol_M.mean()] = 1
Vol_M_clf2[df.Vol_M < df.Vol_M.mean()] = 0

Vol_M_clf3 = df.Vol_M.copy()
Vol_M_clf3[df.Vol_M >= np.quantile(df.Vol_M, 0.95)] = 1
Vol_M_clf3[df.Vol_M <= np.quantile(df.Vol_M, 0.05)] = 2
Vol_M_clf3[(df.Vol_M > np.quantile(df.Vol_M, 0.05))\
           & (df.Vol_M<=np.quantile(df.Vol_M, 0.95))] = 0

## Construct lag ys
df['Ret_D_clf2'] = Ret_D_clf2
df['Ret_D_clf3'] = Ret_D_clf3
df['Ret_D_clfVaR'] = Ret_D_clfVaR
df['Vol_D_clf2'] = Vol_D_clf2
df['Vol_D_clf3'] = Vol_D_clf3

df['Ret_W_clf2'] = Ret_W_clf2
df['Ret_W_clf3'] = Ret_W_clf3
df['Ret_W_clfVaR'] = Ret_W_clfVaR
df['Vol_W_clf2'] = Vol_W_clf2
df['Vol_W_clf3'] = Vol_W_clf3

df['Ret_M_clf2'] = Ret_M_clf2
df['Ret_M_clf3'] = Ret_M_clf3
df['Ret_M_clfVaR'] = Ret_M_clfVaR
df['Vol_M_clf2'] = Vol_M_clf2
df['Vol_M_clf3'] = Vol_M_clf3

## Construct ys
df['y_Ret_D_clf2'] = Ret_D_clf2.shift(-1)
df['y_Ret_D_clf3'] = Ret_D_clf3.shift(-1)
df['y_Ret_D_clfVaR'] = Ret_D_clfVaR.shift(-1)
df['y_Vol_D_clf2'] = Vol_D_clf2.shift(-1)
df['y_Vol_D_clf3'] = Vol_D_clf3.shift(-1)

df['y_Ret_W_clf2'] = Ret_W_clf2.shift(-5)
df['y_Ret_W_clf3'] = Ret_W_clf3.shift(-5)
df['y_Ret_W_clfVaR'] = Ret_W_clfVaR.shift(-5)
df['y_Vol_W_clf2'] = Vol_W_clf2.shift(-5)
df['y_Vol_W_clf3'] = Vol_W_clf3.shift(-5)

df['y_Ret_M_clf2'] = Ret_M_clf2.shift(-21)
df['y_Ret_M_clf3'] = Ret_M_clf3.shift(-21)
df['y_Ret_M_clfVaR'] = Ret_M_clfVaR.shift(-21)
df['y_Vol_M_clf2'] = Vol_M_clf2.shift(-21)
df['y_Vol_M_clf3'] = Vol_M_clf3.shift(-21)

df = df.dropna()

y_features = ['y_Ret_D_clf2', 'y_Ret_D_clf3', 'y_Ret_D_clfVaR','y_Vol_D_clf2','y_Vol_D_clf3',
              'y_Ret_W_clf2', 'y_Ret_W_clf3', 'y_Ret_W_clfVaR','y_Vol_D_clf2','y_Vol_D_clf3',
              'y_Ret_M_clf2', 'y_Ret_M_clf3', 'y_Ret_M_clfVaR','y_Vol_D_clf2','y_Vol_D_clf3']


#### 2. RUn the models ####
X = df.copy()
X = X.drop(y_features,axis = 1)
Conf_Mat_list = list()
Reports_list = list()
Accu_list = list()
F1_list = list()
cv_score_list = list()


iy = 0
for iy in range(len(y_features)):
    print(r"Running " + Ticker + r" " + str(iy+1) + r"/" + str(len(y_features)))
    # y = df.y_Ret_W_clf2
    # y = df.y_Ret_W_clf3
    # y = df.y_Ret_W_clfVaR
    y = df[y_features[iy]]

    [Accu_tmp, Conf_Mat_tmp, Reports_tmp, F1_tmp, cv_score] = \
        RandomForestCustomize(y, X, n_estimators = 100, max_depth=6,\
                  criterion = 'gini', ifcv=ifcv, cvsize = cvsize)

    Accu_list.append(Accu_tmp)
    Conf_Mat_list.append(Conf_Mat_tmp)
    Reports_list.append(Reports_tmp)
    F1_list.append(F1_tmp)
    cv_score_list.append(cv_score)


Accu_list = pd.Series(Accu_list)
Conf_Mat_list = pd.Series(Conf_Mat_list)
Reports_list = pd.Series(Reports_list)
F1_list = pd.Series(F1_list)
cv_score_list = pd.Series(cv_score_list)


indexnames = ['D_PosiNega', 'D_BullBear', 'D_VaR', 'D_Vol2', 'D_Vol3',
              'W_PosiNega', 'W_BullBear', 'W_VaR', 'W_Vol2', 'W_Vol3',
              'M_PosiNega', 'M_BullBear', 'M_VaR', 'M_Vol2', 'M_Vol3']

Accu_list.name = Ticker + r'_Accuracy_Score'
Accu_list.index = indexnames
Conf_Mat_list.name = Ticker + r'_Confusion_Matrix'
Conf_Mat_list.index = indexnames
Reports_list.name = Ticker + r'_Clf_Report'
Reports_list.index = indexnames
F1_list.name = Ticker + r'_F1_Score'
F1_list.index = indexnames
cv_score_list.name = Ticker + r'_CV_Score'
cv_score_list.index = indexnames


# Output
if len(Conf_Mat_df) < 1:
    Accu_df = Accu_list
    Conf_Mat_df = Conf_Mat_list
    Reports_df = Reports_list
    F1_df = F1_list
    cv_score_df = cv_score_list
else:
    Conf_Mat_df = pd.concat([Conf_Mat_df, Conf_Mat_list], axis=1)
    Reports_df = pd.concat([Reports_df, Reports_list], axis=1)
    Accu_df = pd.concat([Accu_df, Accu_list], axis=1)
    F1_df = pd.concat([F1_df, F1_list], axis=1)
    cv_score_df = pd.concat([cv_score_df, cv_score_list], axis=1)

## Estimate finish time
Loop_end = DT.now()
LoopTime = Loop_end-Loop_start
TotalTime = Loop_end-S_time
AvgTime = (TotalTime)/(iTicker -TickerStart +1)
EstEnd = DT.now() + (len(AvaTickers) - iTicker - TickerStart - 1)*AvgTime

# AvgSeries[iTicker] = AvgTime
# EstEndSeries[iTicker] = EstEnd
print('Done No.'+ str(iTicker+1) + " " +Ticker+ ";\n" +
      "Loop time:" + str(LoopTime) + "\n" +
      "Total time:" + str(TotalTime) + "\n" +
      "Average time:" + str(AvgTime) + "\n" +
      "Estimate end:" + str(EstEnd) + '\n'  )


Running ADI 1/15
Running ADI 2/15


  _warn_prf(average, modifier, msg_start, len(result))


Running ADI 3/15
Running ADI 4/15
Running ADI 5/15


  _warn_prf(average, modifier, msg_start, len(result))


Running ADI 6/15
Running ADI 7/15
Running ADI 8/15
Running ADI 9/15
Running ADI 10/15


  _warn_prf(average, modifier, msg_start, len(result))


Running ADI 11/15
Running ADI 12/15
Running ADI 13/15
Running ADI 14/15
Running ADI 15/15


  _warn_prf(average, modifier, msg_start, len(result))


Done No.1 ADI;
Loop time:0:33:28.976279
Total time:0:33:28.976279
Average time:0:33:28.976279
Estimate end:2022-01-01 07:20:08.945107



In [7]:
Conf_Mat_df

D_PosiNega                    [[73, 187], [86, 212]]
D_BullBear     [[379, 0, 0], [89, 0, 0], [90, 0, 0]]
D_VaR                            [[468, 0], [89, 1]]
D_Vol2                        [[369, 10], [166, 13]]
D_Vol3         [[477, 0, 0], [58, 0, 0], [23, 0, 0]]
W_PosiNega                    [[47, 200], [40, 271]]
W_BullBear    [[383, 0, 1], [79, 11, 0], [83, 0, 1]]
W_VaR                           [[451, 23], [75, 9]]
W_Vol2                          [[376, 3], [172, 7]]
W_Vol3         [[477, 0, 0], [58, 0, 0], [23, 0, 0]]
M_PosiNega                    [[55, 137], [81, 285]]
M_BullBear     [[394, 0, 1], [87, 9, 0], [67, 0, 0]]
M_VaR                           [[461, 30], [58, 9]]
M_Vol2                          [[379, 0], [178, 1]]
M_Vol3         [[474, 3, 0], [58, 0, 0], [23, 0, 0]]
Name: ADI_Confusion_Matrix, dtype: object

In [8]:
Reports_df

D_PosiNega                  precision    recall  f1-score   ...
D_BullBear                  precision    recall  f1-score   ...
D_VaR                       precision    recall  f1-score   ...
D_Vol2                      precision    recall  f1-score   ...
D_Vol3                      precision    recall  f1-score   ...
W_PosiNega                  precision    recall  f1-score   ...
W_BullBear                  precision    recall  f1-score   ...
W_VaR                       precision    recall  f1-score   ...
W_Vol2                      precision    recall  f1-score   ...
W_Vol3                      precision    recall  f1-score   ...
M_PosiNega                  precision    recall  f1-score   ...
M_BullBear                  precision    recall  f1-score   ...
M_VaR                       precision    recall  f1-score   ...
M_Vol2                      precision    recall  f1-score   ...
M_Vol3                      precision    recall  f1-score   ...
Name: ADI_Clf_Report, dtype: object

In [9]:
Accu_df

D_PosiNega    0.510753
D_BullBear    0.679211
D_VaR         0.840502
D_Vol2        0.684588
D_Vol3        0.854839
W_PosiNega    0.569892
W_BullBear    0.707885
W_VaR         0.824373
W_Vol2        0.686380
W_Vol3        0.854839
M_PosiNega    0.609319
M_BullBear    0.722222
M_VaR         0.842294
M_Vol2        0.681004
M_Vol3        0.849462
Name: ADI_Accuracy_Score, dtype: float64

In [10]:
F1_df

D_PosiNega    0.608321
D_BullBear    0.000000
D_VaR         0.021978
D_Vol2        0.128713
D_Vol3        0.000000
W_PosiNega    0.693095
W_BullBear    0.000000
W_VaR         0.155172
W_Vol2        0.074074
W_Vol3        0.000000
M_PosiNega    0.723350
M_BullBear    0.000000
M_VaR         0.169811
M_Vol2        0.011111
M_Vol3        0.000000
Name: ADI_F1_Score, dtype: float64

In [11]:
cv_score_df

D_PosiNega    0.498038
D_BullBear    0.620193
D_VaR         0.813937
D_Vol2        0.520573
D_Vol3        0.866272
W_PosiNega    0.523120
W_BullBear    0.673380
W_VaR         0.825757
W_Vol2        0.548608
W_Vol3        0.870215
M_PosiNega    0.495552
M_BullBear    0.645058
M_VaR         0.784897
M_Vol2        0.521719
M_Vol3        0.864838
Name: ADI_CV_Score, dtype: float64