In [10]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [59]:
def train(filename, cols, k=5, path='D:\\Stock Data\\'):
    """Fit a k-nearest-neighbours classifier on one CSV file.

    Parameters
    ----------
    filename : str
        Name of the CSV file; it is appended verbatim to ``path``.
    cols : list of str
        Columns to read from the file. Must include ``'labels'``, which is
        used as the target; every other loaded column is used as a feature.
    k : int, default 5
        Number of neighbours handed to ``KNeighborsClassifier``.
    path : str, optional
        Directory prefix for ``filename``. It is concatenated as-is, so it
        must end with a path separator. Defaults to the data directory the
        notebook was originally written against.

    Returns
    -------
    KNeighborsClassifier
        The fitted classifier.
    """
    file = path + filename
    print('Start Training ', file)
    df = pd.read_csv(file, usecols=cols)
    X = df.drop(columns=['labels'])
    y = df['labels']
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X, y)
    print('Finished Training ', file)
    return clf

def test(model, filename, cols):
    path = 'D:\\Stock Data\\'
    file = path + filename
    print('Start Testing ', file)
    df_test = pd.read_csv(file, usecols=cols)
    Xtest = df_test.drop(['labels'], axis=1)
    ytest = df_test['labels']
    ypred = clf.predict(Xtest)
    acc = accuracy_score(ypred, ytest)
    TP, FP, TN, FN = 0, 0, 0, 0
    for i in range(len(ypred)):
        if ypred[i] == 1 and ytest[i] == 1:
            TP += 1
        elif ypred[i] == 1 and ytest[i] == 0:
            FP += 1
        elif ypred[i] == 0 and ytest[i] == 0:
            TN += 1    
        elif ypred[i] == 0 and ytest[i] == 1:
            FN += 1
    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    print('Finished Testing ', file)
    print('Accuracy, Precision, Recall: ', np.round(acc, 3), np.round(precision, 3), np.round(recall, 3))

def cv_test(model, df_test):
    """Score a fitted classifier on one held-out fold.

    Label ``1`` is treated as the positive class and ``0`` as the negative.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``. This argument is the model
        that is evaluated; no global classifier is consulted.
    df_test : pandas.DataFrame
        Fold data containing a ``'labels'`` column; all remaining columns
        are passed to ``model.predict`` as features.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``. Precision (or recall) is ``0.0``
        when the fold has no predicted (or no actual) positives, instead of
        raising ``ZeroDivisionError``.
    """
    Xtest = df_test.drop(columns=['labels'])
    ytest = df_test['labels'].values
    ypred = np.asarray(model.predict(Xtest))
    acc = accuracy_score(ytest, ypred)

    TP = int(np.sum((ypred == 1) & (ytest == 1)))
    FP = int(np.sum((ypred == 1) & (ytest == 0)))
    FN = int(np.sum((ypred == 0) & (ytest == 1)))

    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    return acc, precision, recall

In [60]:
# Files used for the single train/test run.
# (Alternative training file: 'data2019_train1.csv'.)
filename, test_filename = 'data2020_under.csv', 'data2019_test1.csv'

# Columns loaded from every CSV: the feature columns followed by the target.
cols = [
    'PAST_1y_p90',
    'leverage',
    'leverage_mkt',
    'bm',
    'roe',
    'lag_baspread',
    'lag_liquidity',
    'lag_size',
    'labels',
]

In [62]:
# Fit on the selected training file with 8 neighbours, then print the
# metrics obtained on the selected test file.
clf = train(filename=filename, cols=cols, k=8)
test(model=clf, filename=test_filename, cols=cols)

Start Training  D:\Stock Data\data2020_under.csv
Finished Training  D:\Stock Data\data2020_under.csv
Start Testing  D:\Stock Data\data2019_test1.csv
Finished Testing  D:\Stock Data\data2019_test1.csv
Accuracy, Precision, Recall:  0.68 0.111 0.303


In [64]:
# One training file and one test file per year, 2000 through 2020 inclusive.
file_list2 = ['data{}_train1.csv'.format(year) for year in range(2000, 2021)]
test_file_list2 = ['data{}_test1.csv'.format(year) for year in range(2000, 2021)]

In [65]:
# For every year: fit on that year's training file (k=8) and print the
# metrics on the test file at the same position in the test list.
for i, train_file in enumerate(file_list2):
    clf = train(train_file, cols, k=8)
    test(clf, test_file_list2[i], cols)

Start Training  D:\Stock Data\data2000_train1.csv
Finished Training  D:\Stock Data\data2000_train1.csv
Start Testing  D:\Stock Data\data2000_test1.csv
Finished Testing  D:\Stock Data\data2000_test1.csv
Accuracy, Precision, Recall:  0.887 0.203 0.003
Start Training  D:\Stock Data\data2001_train1.csv
Finished Training  D:\Stock Data\data2001_train1.csv
Start Testing  D:\Stock Data\data2001_test1.csv
Finished Testing  D:\Stock Data\data2001_test1.csv
Accuracy, Precision, Recall:  0.909 0.259 0.002
Start Training  D:\Stock Data\data2002_train1.csv
Finished Training  D:\Stock Data\data2002_train1.csv
Start Testing  D:\Stock Data\data2002_test1.csv
Finished Testing  D:\Stock Data\data2002_test1.csv
Accuracy, Precision, Recall:  0.899 0.215 0.002
Start Training  D:\Stock Data\data2003_train1.csv
Finished Training  D:\Stock Data\data2003_train1.csv
Start Testing  D:\Stock Data\data2003_test1.csv
Finished Testing  D:\Stock Data\data2003_test1.csv
Accuracy, Precision, Recall:  0.917 0.188 0.001


In [66]:
# One '_under' training file and one test file per year, 2000 through 2020 inclusive.
file_list3 = ['data{}_under.csv'.format(year) for year in range(2000, 2021)]
test_file_list3 = ['data{}_test1.csv'.format(year) for year in range(2000, 2021)]

In [67]:
# For every year: fit on that year's '_under' training file (k=8) and print
# the metrics on the test file at the same position in the test list.
for i, train_file in enumerate(file_list3):
    clf = train(train_file, cols, k=8)
    test(clf, test_file_list3[i], cols)

Start Training  D:\Stock Data\data2000_under.csv
Finished Training  D:\Stock Data\data2000_under.csv
Start Testing  D:\Stock Data\data2000_test1.csv
Finished Testing  D:\Stock Data\data2000_test1.csv
Accuracy, Precision, Recall:  0.623 0.128 0.405
Start Training  D:\Stock Data\data2001_under.csv
Finished Training  D:\Stock Data\data2001_under.csv
Start Testing  D:\Stock Data\data2001_test1.csv
Finished Testing  D:\Stock Data\data2001_test1.csv
Accuracy, Precision, Recall:  0.636 0.107 0.412
Start Training  D:\Stock Data\data2002_under.csv
Finished Training  D:\Stock Data\data2002_under.csv
Start Testing  D:\Stock Data\data2002_test1.csv
Finished Testing  D:\Stock Data\data2002_test1.csv
Accuracy, Precision, Recall:  0.628 0.115 0.406
Start Training  D:\Stock Data\data2003_under.csv
Finished Training  D:\Stock Data\data2003_under.csv
Start Testing  D:\Stock Data\data2003_test1.csv
Finished Testing  D:\Stock Data\data2003_test1.csv
Accuracy, Precision, Recall:  0.632 0.094 0.401
Start Tr

In [48]:
# Cross-validation over the number of neighbours (1 through 10), using
# 5 unshuffled folds of the file named by `filename`.
df = pd.read_csv('D:\\Stock Data\\' + filename, usecols=cols)
kf5 = KFold(n_splits=5, shuffle=False)
for n in range(1, 11):
    print('number of nearest neighbor: ', n)
    # Per-fold scores for this value of n.
    acc = np.zeros(5)
    pre = np.zeros(5)
    rec = np.zeros(5)
    for i, (train_ind, test_ind) in enumerate(kf5.split(df)):
        fold_train = df.iloc[train_ind]
        X = fold_train.drop(['labels'], axis=1)
        y = fold_train['labels']
        clf = KNeighborsClassifier(n_neighbors=n)
        clf.fit(X, y)
        acc[i], pre[i], rec[i] = cv_test(clf, df.iloc[test_ind])
    # Report the mean of each metric across the folds.
    print('Accuracy, Precision, Recall: ',
          np.round(acc.mean(), 3), np.round(pre.mean(), 3), np.round(rec.mean(), 3))

number of nearest neighbor:  1
Accuracy, Precision, Recall:  0.82 0.119 0.116
number of nearest neighbor:  2
Accuracy, Precision, Recall:  0.888 0.131 0.015
number of nearest neighbor:  3
Accuracy, Precision, Recall:  0.873 0.129 0.039
number of nearest neighbor:  4
Accuracy, Precision, Recall:  0.893 0.14 0.007
number of nearest neighbor:  5
Accuracy, Precision, Recall:  0.889 0.139 0.015
number of nearest neighbor:  6
Accuracy, Precision, Recall:  0.895 0.144 0.003
number of nearest neighbor:  7
Accuracy, Precision, Recall:  0.894 0.144 0.006
number of nearest neighbor:  8
Accuracy, Precision, Recall:  0.896 0.145 0.001
number of nearest neighbor:  9
Accuracy, Precision, Recall:  0.895 0.146 0.002
number of nearest neighbor:  10
Accuracy, Precision, Recall:  0.896 0.139 0.0
