In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [4]:
#Online training of a list of files
def train_svm(file_list, cols, chunksize=1e6, scaledata=False):
    for file in file_list:
        path = 'D:\\Stock Data\\' + file
        print('Training data file: ' + file)
        for df in tqdm(pd.read_csv(path, chunksize=chunksize, usecols=cols)):
            Xtrain = df.drop(['labels'], axis=1)
            ytrain = df['labels']
            if len(ytrain) < 10:
                continue
            if scaledata == True:
                Xtrain = scale(Xtrain)
            clf = SGDClassifier(loss='hinge')
            clf.partial_fit(Xtrain, ytrain, classes=np.array([1,  0]))
    print('Finished training...')
    return clf

#test SGDClassifier model
def test_svm(model, test_file_list, cols):
    n = len(test_file_list)
    acc, pre, rec = np.zeros(n), np.zeros(n), np.zeros(n)
    for k, file in enumerate(test_file_list):
        path = 'D:\\Stock Data\\' + file
        print('Testing data file: ' + file)
        df_test = pd.read_csv(path, usecols=cols)
        Xtest = df_test.drop(['labels'], axis=1)
        ytest = df_test['labels']
        ypred = model.predict(Xtest)
        acc[k] = accuracy_score(ypred, ytest)
        TP, FP, TN, FN = 0, 0, 0, 0
        for i in range(len(ypred)):
            if ypred[i] == 1 and ytest[i] == 1:
                TP += 1
            elif ypred[i] == 1 and ytest[i] == 0:
                FP += 1
            elif ypred[i] == 0 and ytest[i] == 0:
                TN += 1    
            elif ypred[i] == 0 and ytest[i] == 1:
                FN += 1
        pre[k] = TP/(TP + FP)
        rec[k] = TP/(TP + FN)
        print('Accuracy, Precision, Recall: ', np.round(np.mean(acc), 3), \
                    np.round(np.mean(pre), 3), np.round(np.mean(rec), 3))
    print('Finished Testing...')

#train individual file with SVC package    
def train_svm2(file_list, cols):
    for file in file_list:
        trainfile = 'D:\\Stock Data\\' + file+"_under.csv"
        testfile = 'D:\\Stock Data\\' + file+"_test1.csv"
        print('Training data file: ' + file)
        df = pd.read_csv(trainfile, usecols=cols)
        Xtrain = df.drop(['labels'], axis=1)
        ytrain = df['labels']
        clf = SVC()
        clf.fit(Xtrain, ytrain)
        test_svm2(clf, testfile, cols)
    print("Done")

#test SVC model
def test_svm2(model, testfile, cols):
    print('Testing data file: ' + testfile)
    df_test = pd.read_csv(testfile, usecols=cols)
    Xtest = df_test.drop(['labels'], axis=1)
    ytest = df_test['labels']
    ypred = model.predict(Xtest)
    acc[k] = accuracy_score(ypred, ytest)
    TP, FP, TN, FN = 0, 0, 0, 0
    for i in range(len(ypred)):
        if ypred[i] == 1 and ytest[i] == 1:
            TP += 1
        elif ypred[i] == 1 and ytest[i] == 0:
            FP += 1
        elif ypred[i] == 0 and ytest[i] == 0:
            TN += 1    
        elif ypred[i] == 0 and ytest[i] == 1:
            FN += 1
    pre[k] = TP/(TP + FP)
    rec[k] = TP/(TP + FN)
    print('Accuracy, Precision, Recall: ', np.round(np.mean(acc), 3), \
                np.round(np.mean(pre), 3), np.round(np.mean(rec), 3))

#scale data
def scale(data):
    scaler = StandardScaler()
    scaler.fit(data)
    scaled_df = scaler.transform(data)
    scaled_df = pd.DataFrame(scaled_df)
    return scaled_df

In [5]:
cols = ['PAST_1y_p90', 'leverage', 'leverage_mkt', 'bm', 'roe', 'lag_baspread', 'lag_liquidity', 'lag_size', 'labels']

In [52]:
#original data vs undersampling
file_list = ['data2020_train1.csv']
test_file_list = ['data2020_test1.csv']
for i in range(10):
    clf = train_svm(file_list=file_list, cols=cols, scaledata=False)
    test_svm(clf, test_file_list, cols=cols)

0it [00:00, ?it/s]

Training data file: data2020_train1.csv


1it [00:00,  3.90it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.723 0.156 0.148
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.95it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.779 0.141 0.062
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.93it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.7 0.174 0.208
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.80it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.594 0.155 0.318
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.74it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.684 0.115 0.132
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.95it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.812 0.113 0.017
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.74it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.655 0.113 0.154
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.73it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.773 0.125 0.058
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.87it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.765 0.101 0.05
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.92it/s]


Finished training...
Testing data file: data2020_test1.csv
Accuracy, Precision, Recall:  0.312 0.18 0.869
Finished Testing...


In [53]:
for i in range(10):
    clf = train_svm(file_list=file_list, cols=cols, scaledata=True)
    test_svm(clf, test_file_list, cols=cols)

0it [00:00, ?it/s]

Training data file: data2020_train1.csv


1it [00:00,  3.51it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.826 0.13 0.007
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.58it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.754 0.124 0.076
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.56it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.829 0.164 0.004
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.36it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.828 0.13 0.004
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.39it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.644 0.221 0.443
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.42it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.83 0.137 0.002
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.53it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.831 0.114 0.001
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.34it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.779 0.124 0.051
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.52it/s]


Finished training...
Testing data file: data2020_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.829 0.14 0.004
Finished Testing...
Training data file: data2020_train1.csv


1it [00:00,  3.49it/s]


Finished training...
Testing data file: data2020_test1.csv
Accuracy, Precision, Recall:  0.769 0.222 0.149
Finished Testing...


In [54]:
file_list = ['data2020_under.csv']
test_file_list = ['data2020_test1.csv']
for i in range(10):
    clf = train_svm(file_list=file_list, cols=cols, scaledata=False)
    test_svm(clf, test_file_list, cols=cols)

1it [00:00, 10.13it/s]

Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00, 10.55it/s]

Accuracy, Precision, Recall:  0.677 0.126 0.156
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00, 10.44it/s]

Accuracy, Precision, Recall:  0.191 0.168 0.962
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.73it/s]

Accuracy, Precision, Recall:  0.632 0.213 0.442
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00, 10.44it/s]

Accuracy, Precision, Recall:  0.386 0.191 0.818
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00, 10.55it/s]

Accuracy, Precision, Recall:  0.206 0.171 0.97
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00, 10.55it/s]

Accuracy, Precision, Recall:  0.296 0.175 0.855
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.83it/s]

Accuracy, Precision, Recall:  0.183 0.169 0.984
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00, 10.55it/s]

Accuracy, Precision, Recall:  0.177 0.168 0.988
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.64it/s]

Accuracy, Precision, Recall:  0.664 0.118 0.154
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv





Accuracy, Precision, Recall:  0.39 0.137 0.495
Finished Testing...


In [57]:
for i in range(10):
    clf = train_svm(file_list=file_list, cols=cols, scaledata=True)
    test_svm(clf, test_file_list, cols=cols)

1it [00:00,  9.12it/s]

Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.46it/s]

Accuracy, Precision, Recall:  0.17 0.168 0.999
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.55it/s]

Accuracy, Precision, Recall:  0.815 0.109 0.014
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.46it/s]

Accuracy, Precision, Recall:  0.173 0.168 0.995
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  8.50it/s]

Accuracy, Precision, Recall:  0.435 0.193 0.74
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.55it/s]

Accuracy, Precision, Recall:  0.23 0.173 0.943
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  8.80it/s]

Accuracy, Precision, Recall:  0.328 0.182 0.861
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.64it/s]

Accuracy, Precision, Recall:  0.21 0.171 0.957
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  8.64it/s]

Accuracy, Precision, Recall:  0.286 0.178 0.898
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv



1it [00:00,  9.55it/s]

Accuracy, Precision, Recall:  0.183 0.169 0.984
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv





Accuracy, Precision, Recall:  0.296 0.178 0.884
Finished Testing...


In [6]:
#past 20 years exclude two financial crisis and covid
file_list2 = ['data2000_train1.csv', 'data2001_train1.csv', 'data2002_train1.csv', 'data2003_train1.csv', \
              'data2004_train1.csv', 'data2005_train1.csv', 'data2006_train1.csv', 'data2007_train1.csv', \
              'data2010_train1.csv', 'data2011_train1.csv', 'data2012_train1.csv', 'data2013_train1.csv', \
              'data2014_train1.csv', 'data2015_train1.csv', 'data2016_train1.csv', 'data2017_train1.csv', \
              'data2018_train1.csv']
test_file_list2 = ['data2000_test1.csv', 'data2001_test1.csv', 'data2002_test1.csv', 'data2003_test1.csv', \
              'data2004_test1.csv', 'data2005_test1.csv', 'data2006_test1.csv', 'data2007_test1.csv', \
              'data2010_test1.csv', 'data2011_test1.csv', 'data2012_test1.csv', 'data2013_test1.csv', \
              'data2014_test1.csv', 'data2015_test1.csv', 'data2016_test1.csv', 'data2017_test1.csv', \
              'data2018_test1.csv']
#features and labels
# cols = ['past_1y_mean', 'past_1y_vol', 'PAST_1y_p90',\
#             'leverage', 'leverage_mkt', 'bm', 'roe', 'lag_baspread', 'lag_liquidity', 'lag_size', 'labels']
cols = ['PAST_1y_p90', 'leverage', 'leverage_mkt', 'bm', 'roe', 'lag_baspread', 'lag_liquidity', 'lag_size', 'labels']

0it [00:00, ?it/s]

Training data file: data2000_train1.csv


2it [00:01,  1.62it/s]


Finished training...
Testing data file: data2000_test1.csv
Accuracy, Precision, Recall:  0.871 0.105 0.02
Finished Testing...


In [29]:
clf_f1 = train_svm(file_list=file_list2, cols=cols, scaledata=False)
test_svm(clf_f1, test_file_list2, cols=cols)

7it [00:00, 62.67it/s]

Training data file: data2000_train1.csv


101it [00:01, 57.80it/s]
7it [00:00, 62.67it/s]

Training data file: data2001_train1.csv


94it [00:01, 60.69it/s]
7it [00:00, 61.57it/s]

Training data file: data2002_train1.csv


100it [00:01, 59.33it/s]
7it [00:00, 58.98it/s]

Training data file: data2003_train1.csv


97it [00:01, 60.30it/s]
7it [00:00, 61.57it/s]

Training data file: data2004_train1.csv


95it [00:01, 58.40it/s]
6it [00:00, 56.76it/s]

Training data file: data2005_train1.csv


93it [00:01, 58.68it/s]
7it [00:00, 61.57it/s]

Training data file: data2006_train1.csv


91it [00:01, 59.60it/s]
7it [00:00, 62.11it/s]

Training data file: data2007_train1.csv


90it [00:01, 59.60it/s]
7it [00:00, 61.03it/s]

Training data file: data2010_train1.csv


82it [00:01, 60.41it/s]
12it [00:00, 57.33it/s]

Training data file: data2011_train1.csv


82it [00:01, 59.58it/s]
7it [00:00, 62.11it/s]

Training data file: data2012_train1.csv


80it [00:01, 59.02it/s]
7it [00:00, 62.67it/s]

Training data file: data2013_train1.csv


80it [00:01, 60.49it/s]
7it [00:00, 61.57it/s]

Training data file: data2014_train1.csv


81it [00:01, 59.41it/s]
6it [00:00, 58.98it/s]

Training data file: data2015_train1.csv


81it [00:01, 58.60it/s]
6it [00:00, 58.98it/s]

Training data file: data2016_train1.csv


80it [00:01, 59.91it/s]
6it [00:00, 57.85it/s]

Training data file: data2017_train1.csv


78it [00:01, 58.54it/s]
7it [00:00, 61.03it/s]

Training data file: data2018_train1.csv


78it [00:01, 59.43it/s]


Finished training...
Testing data file: data2000_test1.csv
Accuracy, Precision, Recall:  0.047 0.006 0.006
Testing data file: data2001_test1.csv
Accuracy, Precision, Recall:  0.094 0.011 0.014
Testing data file: data2002_test1.csv
Accuracy, Precision, Recall:  0.142 0.017 0.019
Testing data file: data2003_test1.csv
Accuracy, Precision, Recall:  0.192 0.022 0.024
Testing data file: data2004_test1.csv
Accuracy, Precision, Recall:  0.242 0.027 0.029
Testing data file: data2005_test1.csv
Accuracy, Precision, Recall:  0.291 0.032 0.034
Testing data file: data2006_test1.csv
Accuracy, Precision, Recall:  0.34 0.039 0.039
Testing data file: data2007_test1.csv
Accuracy, Precision, Recall:  0.388 0.045 0.044
Testing data file: data2010_test1.csv
Accuracy, Precision, Recall:  0.438 0.049 0.048
Testing data file: data2011_test1.csv
Accuracy, Precision, Recall:  0.487 0.056 0.052
Testing data file: data2012_test1.csv
Accuracy, Precision, Recall:  0.538 0.061 0.057
Testing data file: data2013_test1.

In [30]:
clf_f1t = train_svm(file_list=file_list2, cols=cols, scaledata=True)
test_svm(clf_f1t, test_file_list2, cols=cols)

6it [00:00, 53.71it/s]

Training data file: data2000_train1.csv


101it [00:01, 52.25it/s]
6it [00:00, 54.20it/s]

Training data file: data2001_train1.csv


94it [00:01, 51.62it/s]
6it [00:00, 53.71it/s]

Training data file: data2002_train1.csv


100it [00:01, 52.50it/s]
6it [00:00, 54.20it/s]

Training data file: data2003_train1.csv


97it [00:01, 52.21it/s]
6it [00:00, 50.55it/s]

Training data file: data2004_train1.csv


95it [00:01, 52.57it/s]
6it [00:00, 51.42it/s]

Training data file: data2005_train1.csv


93it [00:01, 52.86it/s]
6it [00:00, 51.42it/s]

Training data file: data2006_train1.csv


91it [00:01, 52.53it/s]
6it [00:00, 54.20it/s]

Training data file: data2007_train1.csv


90it [00:01, 52.59it/s]
5it [00:00, 45.99it/s]

Training data file: data2010_train1.csv


82it [00:01, 50.91it/s]
6it [00:00, 53.71it/s]

Training data file: data2011_train1.csv


82it [00:01, 52.98it/s]
6it [00:00, 51.86it/s]

Training data file: data2012_train1.csv


80it [00:01, 53.05it/s]
6it [00:00, 54.20it/s]

Training data file: data2013_train1.csv


80it [00:01, 52.36it/s]
6it [00:00, 54.20it/s]

Training data file: data2014_train1.csv


81it [00:01, 51.90it/s]
6it [00:00, 52.77it/s]

Training data file: data2015_train1.csv


81it [00:01, 52.81it/s]
6it [00:00, 51.42it/s]

Training data file: data2016_train1.csv


80it [00:01, 51.29it/s]
6it [00:00, 54.20it/s]

Training data file: data2017_train1.csv


78it [00:01, 52.95it/s]
6it [00:00, 53.24it/s]

Training data file: data2018_train1.csv


78it [00:01, 52.10it/s]


Finished training...
Testing data file: data2000_test1.csv
Accuracy, Precision, Recall:  0.045 0.008 0.013
Testing data file: data2001_test1.csv
Accuracy, Precision, Recall:  0.091 0.015 0.024
Testing data file: data2002_test1.csv
Accuracy, Precision, Recall:  0.138 0.021 0.033
Testing data file: data2003_test1.csv
Accuracy, Precision, Recall:  0.185 0.027 0.043
Testing data file: data2004_test1.csv
Accuracy, Precision, Recall:  0.231 0.033 0.054
Testing data file: data2005_test1.csv
Accuracy, Precision, Recall:  0.277 0.039 0.064
Testing data file: data2006_test1.csv
Accuracy, Precision, Recall:  0.322 0.046 0.077
Testing data file: data2007_test1.csv
Accuracy, Precision, Recall:  0.364 0.054 0.091
Testing data file: data2010_test1.csv
Accuracy, Precision, Recall:  0.409 0.06 0.104
Testing data file: data2011_test1.csv
Accuracy, Precision, Recall:  0.453 0.068 0.118
Testing data file: data2012_test1.csv
Accuracy, Precision, Recall:  0.5 0.073 0.13
Testing data file: data2013_test1.csv

In [12]:
file_list3 = ['data2000_under.csv', 'data2001_under.csv', \
              'data2002_under.csv', 'data2003_under.csv', 'data2004_under.csv', 'data2005_under.csv', \
              'data2006_under.csv', 'data2007_under.csv', 'data2008_under.csv', 'data2011_under.csv', \
              'data2012_under.csv', 'data2013_under.csv', 'data2014_under.csv', 'data2015_under.csv', \
              'data2016_under.csv', 'data2017_under.csv', 'data2018_under.csv']
test_file_list3 = ['data2000_test1.csv', 'data2001_test1.csv', 'data2002_test1.csv', 'data2003_test1.csv', \
              'data2004_test1.csv', 'data2005_test1.csv', 'data2006_test1.csv', 'data2007_test1.csv', \
              'data2008_test1.csv', 'data2011_test1.csv', 'data2012_test1.csv', 'data2013_test1.csv', \
              'data2014_test1.csv', 'data2015_test1.csv', 'data2016_test1.csv', 'data2017_test1.csv', \
              'data2018_test1.csv']
#'data1997_under.csv', 'data1998_under.csv', 'data1999_under.csv', 'data2019_under.csv', 'data2020_under.csv'

In [13]:
clf_u = train_svm(file_list=file_list3, cols=cols, scaledata=False)
test_svm(clf_u, test_file_list3, cols=cols)

6it [00:00, 59.57it/s]

Training data file: data2000_under.csv


23it [00:00, 59.74it/s]
7it [00:00, 63.23it/s]

Training data file: data2001_under.csv


17it [00:00, 62.21it/s]
7it [00:00, 63.81it/s]

Training data file: data2002_under.csv


20it [00:00, 61.14it/s]
7it [00:00, 62.67it/s]

Training data file: data2003_under.csv


16it [00:00, 61.94it/s]
7it [00:00, 63.81it/s]

Training data file: data2004_under.csv


17it [00:00, 61.54it/s]
7it [00:00, 60.51it/s]

Training data file: data2005_under.csv


18it [00:00, 61.60it/s]
7it [00:00, 62.11it/s]

Training data file: data2006_under.csv


19it [00:00, 59.91it/s]
6it [00:00, 58.41it/s]

Training data file: data2007_under.csv


21it [00:00, 58.82it/s]
7it [00:00, 64.39it/s]

Training data file: data2008_under.csv


29it [00:00, 59.34it/s]
7it [00:00, 62.67it/s]

Training data file: data2011_under.csv


19it [00:00, 62.46it/s]
12it [00:00, 64.34it/s]

Training data file: data2012_under.csv



7it [00:00, 63.23it/s]

Training data file: data2013_under.csv


15it [00:00, 62.15it/s]
6it [00:00, 57.30it/s]

Training data file: data2014_under.csv


16it [00:00, 58.55it/s]
7it [00:00, 64.99it/s]

Training data file: data2015_under.csv


19it [00:00, 64.80it/s]
7it [00:00, 62.11it/s]

Training data file: data2016_under.csv


17it [00:00, 57.98it/s]
7it [00:00, 61.57it/s]

Training data file: data2017_under.csv


14it [00:00, 58.98it/s]
12it [00:00, 57.07it/s]

Training data file: data2018_under.csv


18it [00:00, 58.98it/s]


Finished training...
Testing data file: data2000_test1.csv
Accuracy, Precision, Recall:  0.008 0.007 0.058
Testing data file: data2001_test1.csv
Accuracy, Precision, Recall:  0.015 0.012 0.114
Testing data file: data2002_test1.csv
Accuracy, Precision, Recall:  0.022 0.018 0.17
Testing data file: data2003_test1.csv
Accuracy, Precision, Recall:  0.029 0.023 0.227
Testing data file: data2004_test1.csv
Accuracy, Precision, Recall:  0.036 0.028 0.284
Testing data file: data2005_test1.csv
Accuracy, Precision, Recall:  0.042 0.033 0.341
Testing data file: data2006_test1.csv
Accuracy, Precision, Recall:  0.05 0.039 0.398
Testing data file: data2007_test1.csv
Accuracy, Precision, Recall:  0.057 0.046 0.456
Testing data file: data2008_test1.csv
Accuracy, Precision, Recall:  0.068 0.056 0.513
Testing data file: data2011_test1.csv
Accuracy, Precision, Recall:  0.076 0.062 0.57
Testing data file: data2012_test1.csv
Accuracy, Precision, Recall:  0.083 0.067 0.626
Testing data file: data2013_test1.cs

In [14]:
clf_ut = train_svm(file_list=file_list3, cols=cols, scaledata=True)
test_svm(clf_ut, test_file_list3, cols=cols)

6it [00:00, 55.19it/s]

Training data file: data2000_under.csv


23it [00:00, 53.63it/s]
6it [00:00, 55.70it/s]

Training data file: data2001_under.csv


17it [00:00, 54.81it/s]
6it [00:00, 56.23it/s]

Training data file: data2002_under.csv


20it [00:00, 54.64it/s]
6it [00:00, 53.71it/s]

Training data file: data2003_under.csv


16it [00:00, 52.43it/s]
6it [00:00, 52.77it/s]

Training data file: data2004_under.csv


17it [00:00, 49.99it/s]
6it [00:00, 52.31it/s]

Training data file: data2005_under.csv


18it [00:00, 53.24it/s]
6it [00:00, 55.70it/s]

Training data file: data2006_under.csv


19it [00:00, 55.22it/s]
6it [00:00, 50.55it/s]

Training data file: data2007_under.csv


21it [00:00, 52.12it/s]
6it [00:00, 56.22it/s]

Training data file: data2008_under.csv


29it [00:00, 55.28it/s]
6it [00:00, 54.69it/s]

Training data file: data2011_under.csv


19it [00:00, 50.53it/s]
6it [00:00, 54.69it/s]

Training data file: data2012_under.csv


12it [00:00, 54.94it/s]
6it [00:00, 55.19it/s]

Training data file: data2013_under.csv


15it [00:00, 54.89it/s]
6it [00:00, 54.20it/s]

Training data file: data2014_under.csv


16it [00:00, 51.92it/s]
6it [00:00, 52.31it/s]

Training data file: data2015_under.csv


19it [00:00, 53.82it/s]
6it [00:00, 55.70it/s]

Training data file: data2016_under.csv


17it [00:00, 52.94it/s]
11it [00:00, 52.31it/s]

Training data file: data2017_under.csv


14it [00:00, 51.99it/s]
6it [00:00, 51.86it/s]

Training data file: data2018_under.csv


18it [00:00, 52.31it/s]


Finished training...
Testing data file: data2000_test1.csv
Accuracy, Precision, Recall:  0.008 0.007 0.058
Testing data file: data2001_test1.csv
Accuracy, Precision, Recall:  0.015 0.012 0.114
Testing data file: data2002_test1.csv
Accuracy, Precision, Recall:  0.023 0.018 0.17
Testing data file: data2003_test1.csv
Accuracy, Precision, Recall:  0.03 0.023 0.227
Testing data file: data2004_test1.csv
Accuracy, Precision, Recall:  0.036 0.028 0.284
Testing data file: data2005_test1.csv
Accuracy, Precision, Recall:  0.043 0.034 0.341
Testing data file: data2006_test1.csv
Accuracy, Precision, Recall:  0.05 0.039 0.399
Testing data file: data2007_test1.csv
Accuracy, Precision, Recall:  0.057 0.046 0.457
Testing data file: data2008_test1.csv
Accuracy, Precision, Recall:  0.068 0.056 0.514
Testing data file: data2011_test1.csv
Accuracy, Precision, Recall:  0.076 0.063 0.572
Testing data file: data2012_test1.csv
Accuracy, Precision, Recall:  0.082 0.067 0.628
Testing data file: data2013_test1.cs

In [64]:
#train individual year with undersampled data
file_list3 = ['data2000_under.csv', 'data2001_under.csv', 'data2002_under.csv', 'data2003_under.csv', \
                  'data2004_under.csv', 'data2005_under.csv', 'data2006_under.csv', 'data2007_under.csv', \
                  'data2008_under.csv', 'data2009_under.csv', 'data2010_under.csv', 'data2011_under.csv', \
                  'data2012_under.csv', 'data2013_under.csv', 'data2014_under.csv', 'data2015_under.csv', \
                  'data2016_under.csv', 'data2017_under.csv', 'data2018_under.csv', 'data2019_under.csv', \
                  'data2020_under.csv']
test_file_list3 = ['data2000_test1.csv', 'data2001_test1.csv', 'data2002_test1.csv', 'data2003_test1.csv', \
                          'data2004_test1.csv', 'data2005_test1.csv', 'data2006_test1.csv', 'data2007_test1.csv', \
                          'data2008_test1.csv', 'data2009_test1.csv', 'data2010_test1.csv', 'data2011_test1.csv', \
                          'data2012_test1.csv', 'data2013_test1.csv', 'data2014_test1.csv', 'data2015_test1.csv', \
                          'data2016_test1.csv', 'data2017_test1.csv', 'data2018_test1.csv', 'data2019_test1.csv', \
                          'data2020_test1.csv']

In [65]:
for i in range(len(file_list3)):
    file_list33 = [file_list3[i]]
    test_file_list33 = [test_file_list3[i]]
    clf = train_svm(file_list=file_list33, cols=cols, scaledata=True)
    test_svm(clf, test_file_list33, cols=cols)

0it [00:00, ?it/s]

Training data file: data2000_under.csv


1it [00:00,  3.32it/s]


Finished training...
Testing data file: data2000_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.114 0.112 0.998
Finished Testing...
Training data file: data2001_under.csv


1it [00:00,  4.44it/s]


Finished training...
Testing data file: data2001_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.862 0.119 0.081
Finished Testing...
Training data file: data2002_under.csv


1it [00:00,  3.73it/s]


Finished training...
Testing data file: data2002_test1.csv


1it [00:00,  4.80it/s]

Accuracy, Precision, Recall:  0.858 0.121 0.066
Finished Testing...
Training data file: data2003_under.csv


1it [00:00,  4.75it/s]


Finished training...
Testing data file: data2003_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.815 0.095 0.146
Finished Testing...
Training data file: data2004_under.csv


1it [00:00,  4.46it/s]


Finished training...
Testing data file: data2004_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.091 0.089 0.998
Finished Testing...
Training data file: data2005_under.csv


1it [00:00,  4.27it/s]


Finished training...
Testing data file: data2005_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.103 0.096 0.992
Finished Testing...
Training data file: data2006_under.csv


1it [00:00,  3.92it/s]


Finished training...
Testing data file: data2006_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.731 0.117 0.254
Finished Testing...
Training data file: data2007_under.csv


1it [00:00,  3.49it/s]


Finished training...
Testing data file: data2007_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.779 0.142 0.181
Finished Testing...
Training data file: data2008_under.csv


1it [00:00,  2.54it/s]


Finished training...
Testing data file: data2008_test1.csv


1it [00:00,  5.33it/s]

Accuracy, Precision, Recall:  0.305 0.163 0.808
Finished Testing...
Training data file: data2009_under.csv


1it [00:00,  5.28it/s]


Finished training...
Testing data file: data2009_test1.csv


1it [00:00,  5.80it/s]

Accuracy, Precision, Recall:  0.362 0.081 0.638
Finished Testing...
Training data file: data2010_under.csv


1it [00:00,  5.76it/s]


Finished training...
Testing data file: data2010_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.823 0.096 0.147
Finished Testing...
Training data file: data2011_under.csv


1it [00:00,  3.96it/s]


Finished training...
Testing data file: data2011_test1.csv


1it [00:00,  6.15it/s]

Accuracy, Precision, Recall:  0.152 0.116 0.956
Finished Testing...
Training data file: data2012_under.csv
Finished training...
Testing data file: data2012_test1.csv



1it [00:00,  5.12it/s]

Accuracy, Precision, Recall:  0.561 0.079 0.462
Finished Testing...
Training data file: data2013_under.csv


1it [00:00,  5.09it/s]


Finished training...
Testing data file: data2013_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.266 0.094 0.79
Finished Testing...
Training data file: data2014_under.csv


1it [00:00,  4.64it/s]


Finished training...
Testing data file: data2014_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.186 0.102 0.909
Finished Testing...
Training data file: data2015_under.csv


1it [00:00,  4.13it/s]


Finished training...
Testing data file: data2015_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.167 0.114 0.945
Finished Testing...
Training data file: data2016_under.csv


1it [00:00,  4.48it/s]


Finished training...
Testing data file: data2016_test1.csv


1it [00:00,  5.42it/s]

Accuracy, Precision, Recall:  0.605 0.123 0.451
Finished Testing...
Training data file: data2017_under.csv


1it [00:00,  5.39it/s]


Finished training...
Testing data file: data2017_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.338 0.092 0.741
Finished Testing...
Training data file: data2018_under.csv


1it [00:00,  4.30it/s]


Finished training...
Testing data file: data2018_test1.csv


0it [00:00, ?it/s]

Accuracy, Precision, Recall:  0.157 0.112 0.944
Finished Testing...
Training data file: data2019_under.csv


1it [00:00,  4.58it/s]


Finished training...
Testing data file: data2019_test1.csv


1it [00:00,  9.64it/s]

Accuracy, Precision, Recall:  0.13 0.103 0.968
Finished Testing...
Training data file: data2020_under.csv
Finished training...
Testing data file: data2020_test1.csv





Accuracy, Precision, Recall:  0.176 0.169 0.992
Finished Testing...
