In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [2]:
def cv_train(model, df, cols, over, under):
    """Resample one training fold and fit ``model`` on the resampled data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)``
        Fitted in place on the resampled fold.
    df : pandas.DataFrame
        Training fold. Must contain a ``labels`` column; every other column
        is used as a feature.
    cols : list
        Not used by this function; kept so existing callers keep working.
    over : float
        ``sampling_strategy`` passed to ``SMOTE``. A value of ``0.0`` skips
        over-sampling entirely.
    under : float
        ``sampling_strategy`` passed to ``RandomUnderSampler``.

    Returns
    -------
    The same ``model`` object that was passed in, after fitting.
    """
    X = df.drop(['labels'], axis=1)
    y = df['labels']

    if over == 0.0:
        # No over-sampling requested: under-sample the raw fold directly.
        X_fit, y_fit = X, y
    else:
        smote = SMOTE(sampling_strategy=over, n_jobs=4)
        X_fit, y_fit = smote.fit_resample(X, y)

    # Under-sampling is always applied, after the optional SMOTE step.
    under_sampler = RandomUnderSampler(sampling_strategy=under)
    X_fit, y_fit = under_sampler.fit_resample(X_fit, y_fit)

    model.fit(X_fit, y_fit)
    # Return the estimator we were given rather than whatever a global named
    # `clf` happens to hold in the notebook namespace.
    return model

def cv_test(model, df_test, cols):
    """Score a fitted model on one held-out fold.

    Label value 1 is treated as the positive class and 0 as the negative
    class when counting true/false positives and false negatives.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``
    df_test : pandas.DataFrame
        Held-out fold. Must contain a ``labels`` column; every other column
        is passed to ``model.predict``.
    cols : list
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    (accuracy, precision, recall) : tuple of float
        Precision is reported as 0.0 when the model predicts no positives,
        and recall as 0.0 when the fold contains no positives, instead of
        raising ``ZeroDivisionError``.
    """
    Xtest = df_test.drop(['labels'], axis=1)
    ytest = np.asarray(df_test['labels'].values)
    ypred = np.asarray(model.predict(Xtest))
    acc = accuracy_score(ytest, ypred)

    # Vectorized confusion counts (true negatives are not needed below).
    predicted_pos = ypred == 1
    actual_pos = ytest == 1
    TP = int(np.sum(predicted_pos & actual_pos))
    FP = int(np.sum(predicted_pos & (ytest == 0)))
    FN = int(np.sum((ypred == 0) & actual_pos))

    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    return acc, precision, recall


def cv(filename, cols):
    """Sweep SMOTE / under-sampling ratios with 5-fold CV and print mean scores.

    Reads the first 10,000 rows of ``filename`` from the ``D:\\Stock Data``
    directory. The over-sampling ratio takes five values starting at 0.2 in
    steps of 1/5; for each one the under-sampling ratio starts at the
    over-sampling ratio and increases in steps of 1/5. A fresh default
    ``RandomForestClassifier`` is trained per fold via ``cv_train`` and scored
    via ``cv_test``; the fold-averaged accuracy, precision and recall are
    printed, rounded to 3 decimals. Nothing is returned.
    """
    df = pd.read_csv('D:\\Stock Data\\'+filename, usecols=cols).iloc[:10000]
    kf5 = KFold(n_splits=5, shuffle=False)
    for over_step in range(5):
        over_ratio = over_step/5 + 0.2
        # Fewer under-sampling steps are tried as the over-sampling ratio grows.
        for under_step in range(5 - over_step):
            under_ratio = over_ratio + under_step/5
            print('over sampling = ', over_ratio, 'under sampling = ', under_ratio)
            acc, pre, rec = np.zeros(5), np.zeros(5), np.zeros(5)
            for fold, (train_rows, test_rows) in enumerate(kf5.split(df)):
                forest = RandomForestClassifier()
                cv_train(forest, df.iloc[train_rows], cols=cols, over=over_ratio, under=under_ratio)
                acc[fold], pre[fold], rec[fold] = cv_test(forest, df.iloc[test_rows], cols)
            fold_means = [np.round(np.mean(scores), 3) for scores in (acc, pre, rec)]
            print('Accuracy, Precision, Recall: ', *fold_means)

In [3]:
def train(filename, cols, data_dir='D:\\Stock Data\\'):
    """Fit a random forest on one CSV file and return it.

    Parameters
    ----------
    filename : str
        CSV file name, appended directly to ``data_dir``.
    cols : list of str
        Columns to read. Must include ``labels`` (the target); every other
        column is used as a feature.
    data_dir : str, optional
        Directory prefix, including its trailing separator. Defaults to the
        location this notebook was written against.

    Returns
    -------
    RandomForestClassifier
        Fitted with ``n_estimators=80`` and ``min_samples_split=3``.
    """
    file = data_dir + filename
    print('Start Training ', file)
    df = pd.read_csv(file, usecols=cols)
    X = df.drop(['labels'], axis=1)
    y = df['labels']
    clf = RandomForestClassifier(n_estimators=80, min_samples_split=3)
    clf.fit(X, y)
    print('Finished Training ', file)
    return clf

def test(model, filename, cols):
    path = 'D:\\Stock Data\\'
    file = path + filename
    print('Start Testing ', file)
    df_test = pd.read_csv(file, usecols=cols)
    Xtest = df_test.drop(['labels'], axis=1)
    ytest = df_test['labels']
    ypred = clf.predict(Xtest)
    acc = accuracy_score(ypred, ytest)
    TP, FP, TN, FN = 0, 0, 0, 0
    for i in range(len(ypred)):
        if ypred[i] == 1 and ytest[i] == 1:
            TP += 1
        elif ypred[i] == 1 and ytest[i] == 0:
            FP += 1
        elif ypred[i] == 0 and ytest[i] == 0:
            TN += 1    
        elif ypred[i] == 0 and ytest[i] == 1:
            FN += 1
    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    print('Finished Testing ', file)
    print('Accuracy, Precision, Recall: ', np.round(acc, 3), np.round(precision, 3), np.round(recall, 3))
          

In [10]:
# Input files for the single-year (2020) runs.
filename1 = 'data2020_train1.csv'
filename2 = 'data2020_under.csv'
test_filename = 'data2020_test1.csv'

# Columns read from every CSV. 'labels' is the target column that the
# train/test helpers split off; the remaining columns are the features.
# Commented-out alternative that additionally reads the two past-1y columns:
# cols = ['past_1y_mean', 'past_1y_vol', 'PAST_1y_p90', 'leverage', 'leverage_mkt',
#         'bm', 'roe', 'lag_baspread', 'lag_liquidity', 'lag_size', 'labels']
cols = [
    'PAST_1y_p90',
    'leverage',
    'leverage_mkt',
    'bm',
    'roe',
    'lag_baspread',
    'lag_liquidity',
    'lag_size',
    'labels',
]

In [11]:
# Single year, original data: fit on the 2020 training file (filename1) and
# score the fitted forest on the 2020 test file.
clf = train(filename1, cols)
test(clf, test_filename, cols)

Start Training  D:\Stock Data\data2020_train1.csv
Finished Training  D:\Stock Data\data2020_train1.csv
Start Testing  D:\Stock Data\data2020_test1.csv
Finished Testing  D:\Stock Data\data2020_test1.csv
Accuracy, Precision, Recall:  0.818 0.35 0.095


In [17]:
# Single year, under-sampling data: fit on the 2020 '_under' file (filename2)
# and score the fitted forest on the same 2020 test file as the previous run.
clf = train(filename2, cols)
test(clf, test_filename, cols)

Start Training  D:\Stock Data\data2020_under.csv
Finished Training  D:\Stock Data\data2020_under.csv
Start Testing  D:\Stock Data\data2020_test1.csv
Finished Testing  D:\Stock Data\data2020_test1.csv
Accuracy, Precision, Recall:  0.617 0.247 0.626


In [13]:
# Predict using prior year.
# Parallel lists: entry k of file_list1 is trained on, entry k of
# test_filelist1 is evaluated on.
file_list1 = [
    'data2015_under.csv',
    'data2016_under.csv',
    'data2017_under.csv',
]
test_filelist1 = [
    'data2015_test1.csv',
    'data2016_test1.csv',
    'data2017_test1.csv',
]

In [14]:
# Train on each file in file_list1 and evaluate on the matching test file.
# Iterating the paired lists directly avoids a hard-coded count that has to be
# kept in sync with the list length.
for train_file, test_file in zip(file_list1, test_filelist1):
    clf = train(train_file, cols)
    test(clf, test_file, cols)

Start Training  D:\Stock Data\data2015_under.csv
Finished Training  D:\Stock Data\data2015_under.csv
Start Testing  D:\Stock Data\data2015_test1.csv
Finished Testing  D:\Stock Data\data2015_test1.csv
Accuracy, Precision, Recall:  0.557 0.137 0.557
Start Training  D:\Stock Data\data2016_under.csv
Finished Training  D:\Stock Data\data2016_under.csv
Start Testing  D:\Stock Data\data2016_test1.csv
Finished Testing  D:\Stock Data\data2016_test1.csv
Accuracy, Precision, Recall:  0.574 0.137 0.58
Start Training  D:\Stock Data\data2017_under.csv
Finished Training  D:\Stock Data\data2017_under.csv
Start Testing  D:\Stock Data\data2017_test1.csv
Finished Testing  D:\Stock Data\data2017_test1.csv
Accuracy, Precision, Recall:  0.553 0.108 0.562


In [15]:
# Year-by-year inputs for 2000-2020: the '_under' training file and the
# '_test1' file for each year. Both lists are generated from the same year
# range so they cannot drift out of alignment or pick up a typo in one entry.
file_list2 = ['data%d_under.csv' % year for year in range(2000, 2021)]
test_file_list2 = ['data%d_test1.csv' % year for year in range(2000, 2021)]

In [17]:
# Train on each file in file_list2 and evaluate on the matching test file.
# zip() replaces the hard-coded range(21) so the loop follows the list length.
for train_file, test_file in zip(file_list2, test_file_list2):
    clf = train(train_file, cols)
    test(clf, test_file, cols)

Start Training  D:\Stock Data\data2000_under.csv
Finished Training  D:\Stock Data\data2000_under.csv
Start Testing  D:\Stock Data\data2000_test1.csv
Finished Testing  D:\Stock Data\data2000_test1.csv
Accuracy, Precision, Recall:  0.571 0.145 0.578
Start Training  D:\Stock Data\data2001_under.csv
Finished Training  D:\Stock Data\data2001_under.csv
Start Testing  D:\Stock Data\data2001_test1.csv
Finished Testing  D:\Stock Data\data2001_test1.csv
Accuracy, Precision, Recall:  0.588 0.122 0.576
Start Training  D:\Stock Data\data2002_under.csv
Finished Training  D:\Stock Data\data2002_under.csv
Start Testing  D:\Stock Data\data2002_test1.csv
Finished Testing  D:\Stock Data\data2002_test1.csv
Accuracy, Precision, Recall:  0.581 0.133 0.577
Start Training  D:\Stock Data\data2003_under.csv
Finished Training  D:\Stock Data\data2003_under.csv
Start Testing  D:\Stock Data\data2003_test1.csv
Finished Testing  D:\Stock Data\data2003_test1.csv
Accuracy, Precision, Recall:  0.573 0.106 0.563
Start Tr

In [5]:
# Year-by-year inputs for 2000-2020 using the '_train1' training files, paired
# with the '_test1' file of the same year. Both lists are generated from the
# same year range so they cannot drift out of alignment.
file_list3 = ['data%d_train1.csv' % year for year in range(2000, 2021)]
test_file_list3 = ['data%d_test1.csv' % year for year in range(2000, 2021)]

In [7]:
# Train on each file in file_list3 and evaluate on the matching test file.
# zip() replaces the hard-coded range(21) so the loop follows the list length.
for train_file, test_file in zip(file_list3, test_file_list3):
    clf = train(train_file, cols)
    test(clf, test_file, cols)

Start Training  D:\Stock Data\data2000_train1.csv
Finished Training  D:\Stock Data\data2000_train1.csv
Start Testing  D:\Stock Data\data2000_test1.csv
Finished Testing  D:\Stock Data\data2000_test1.csv
Accuracy, Precision, Recall:  0.883 0.219 0.018
Start Training  D:\Stock Data\data2001_train1.csv
Finished Training  D:\Stock Data\data2001_train1.csv
Start Testing  D:\Stock Data\data2001_test1.csv
Finished Testing  D:\Stock Data\data2001_test1.csv
Accuracy, Precision, Recall:  0.907 0.236 0.014
Start Training  D:\Stock Data\data2002_train1.csv
Finished Training  D:\Stock Data\data2002_train1.csv
Start Testing  D:\Stock Data\data2002_test1.csv
Finished Testing  D:\Stock Data\data2002_test1.csv
Accuracy, Precision, Recall:  0.896 0.218 0.014
Start Training  D:\Stock Data\data2003_train1.csv
Finished Training  D:\Stock Data\data2003_train1.csv
Start Testing  D:\Stock Data\data2003_test1.csv
Finished Testing  D:\Stock Data\data2003_test1.csv
Accuracy, Precision, Recall:  0.915 0.17 0.007
S

In [8]:
# Rebinds file_list3 / test_file_list3 to the 1998 and 1999 '_under' / '_test1'
# files; any earlier contents of these two names are replaced.
file_list3 = [
    'data1998_under.csv',
    'data1999_under.csv',
]
test_file_list3 = [
    'data1998_test1.csv',
    'data1999_test1.csv',
]

In [9]:
# Train on each file currently in file_list3 and evaluate on the matching
# test file. zip() replaces the hard-coded range(2) so the loop follows the
# list length.
for train_file, test_file in zip(file_list3, test_file_list3):
    clf = train(train_file, cols)
    test(clf, test_file, cols)

Start Training  D:\Stock Data\data1998_under.csv
Finished Training  D:\Stock Data\data1998_under.csv
Start Testing  D:\Stock Data\data1998_test1.csv
Finished Testing  D:\Stock Data\data1998_test1.csv
Accuracy, Precision, Recall:  0.574 0.153 0.579
Start Training  D:\Stock Data\data1999_under.csv
Finished Training  D:\Stock Data\data1999_under.csv
Start Testing  D:\Stock Data\data1999_test1.csv
Finished Testing  D:\Stock Data\data1999_test1.csv
Accuracy, Precision, Recall:  0.577 0.153 0.579


In [None]:
# CSV used by the cross-validation cells below (read from 'D:\Stock Data\').
filename = 'data2020_under.csv'

In [216]:
# Cross-validation of over- and under-sampling ratios: cv() prints the
# fold-averaged accuracy, precision and recall for each ratio pair it tries.
cv(filename, cols=cols)

over sampling =  0.2 under sampling =  0.2
Accuracy, Precision, Recall:  0.889 0.157 0.014
over sampling =  0.2 under sampling =  0.4
Accuracy, Precision, Recall:  0.867 0.118 0.045
over sampling =  0.2 under sampling =  0.6000000000000001
Accuracy, Precision, Recall:  0.82 0.131 0.134
over sampling =  0.2 under sampling =  0.8
Accuracy, Precision, Recall:  0.735 0.116 0.236
over sampling =  0.2 under sampling =  1.0
Accuracy, Precision, Recall:  0.643 0.117 0.377
over sampling =  0.4 under sampling =  0.4
Accuracy, Precision, Recall:  0.875 0.123 0.035
over sampling =  0.4 under sampling =  0.6000000000000001
Accuracy, Precision, Recall:  0.839 0.112 0.081
over sampling =  0.4 under sampling =  0.8
Accuracy, Precision, Recall:  0.786 0.112 0.157
over sampling =  0.4 under sampling =  1.0
Accuracy, Precision, Recall:  0.725 0.109 0.233
over sampling =  0.6000000000000001 under sampling =  0.6000000000000001
Accuracy, Precision, Recall:  0.851 0.122 0.072
over sampling =  0.600000000000

In [221]:
# No oversampling: 5-fold CV on the first 10,000 rows with over=0.0 (cv_train
# then skips SMOTE) and the under-sampling ratio fixed at 1.0.
df = pd.read_csv('D:\\Stock Data\\'+filename, usecols=cols).iloc[:10000]
kf5 = KFold(n_splits=5, shuffle=False)
acc, pre, rec = (np.zeros(5) for _ in range(3))
for fold, (train_rows, test_rows) in enumerate(kf5.split(df)):
    clf = RandomForestClassifier()
    cv_train(clf, df.iloc[train_rows], cols=cols, over=0.0, under=1.0)
    acc[fold], pre[fold], rec[fold] = cv_test(clf, df.iloc[test_rows], cols)
fold_means = [np.round(np.mean(scores), 3) for scores in (acc, pre, rec)]
print('Accuracy, Precision, Recall: ', *fold_means)

Accuracy, Precision, Recall:  0.546 0.118 0.529


In [223]:
# Cross-validation of the number of trees: n_estimators runs from 20 to 200 in
# steps of 20, each scored with 5-fold CV (no SMOTE, under-sampling ratio 1.0).
df = pd.read_csv('D:\\Stock Data\\'+filename, usecols=cols).iloc[:10000]
kf5 = KFold(n_splits=5, shuffle=False)
for n_trees in range(20, 201, 20):
    print('number of estimators: ', n_trees)
    acc, pre, rec = (np.zeros(5) for _ in range(3))
    for fold, (train_rows, test_rows) in enumerate(kf5.split(df)):
        clf = RandomForestClassifier(n_estimators=n_trees)
        cv_train(clf, df.iloc[train_rows], cols=cols, over=0.0, under=1.0)
        acc[fold], pre[fold], rec[fold] = cv_test(clf, df.iloc[test_rows], cols)
    fold_means = [np.round(np.mean(scores), 3) for scores in (acc, pre, rec)]
    print('Accuracy, Precision, Recall: ', *fold_means)

number of estimators:  20
Accuracy, Precision, Recall:  0.576 0.117 0.476
number of estimators:  40
Accuracy, Precision, Recall:  0.56 0.113 0.479
number of estimators:  60
Accuracy, Precision, Recall:  0.545 0.116 0.518
number of estimators:  80
Accuracy, Precision, Recall:  0.554 0.118 0.518
number of estimators:  100
Accuracy, Precision, Recall:  0.539 0.111 0.5
number of estimators:  120
Accuracy, Precision, Recall:  0.555 0.116 0.503
number of estimators:  140
Accuracy, Precision, Recall:  0.536 0.109 0.489
number of estimators:  160
Accuracy, Precision, Recall:  0.547 0.112 0.49
number of estimators:  180
Accuracy, Precision, Recall:  0.524 0.11 0.511
number of estimators:  200
Accuracy, Precision, Recall:  0.542 0.114 0.508


In [225]:
# Cross-validation of min_samples_split: values 2 through 5, each scored with
# 5-fold CV (no SMOTE, under-sampling ratio 1.0).
df = pd.read_csv('D:\\Stock Data\\'+filename, usecols=cols).iloc[:10000]
kf5 = KFold(n_splits=5, shuffle=False)
for min_split in range(2, 6):
    print('number of sample split: ', min_split)
    acc, pre, rec = (np.zeros(5) for _ in range(3))
    for fold, (train_rows, test_rows) in enumerate(kf5.split(df)):
        clf = RandomForestClassifier(min_samples_split=min_split)
        cv_train(clf, df.iloc[train_rows], cols=cols, over=0.0, under=1.0)
        acc[fold], pre[fold], rec[fold] = cv_test(clf, df.iloc[test_rows], cols)
    fold_means = [np.round(np.mean(scores), 3) for scores in (acc, pre, rec)]
    print('Accuracy, Precision, Recall: ', *fold_means)

number of sample split:  2
Accuracy, Precision, Recall:  0.54 0.113 0.505
number of sample split:  3
Accuracy, Precision, Recall:  0.534 0.117 0.541
number of sample split:  4
Accuracy, Precision, Recall:  0.527 0.112 0.521
number of sample split:  5
Accuracy, Precision, Recall:  0.536 0.115 0.522
