In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

from sklearn.datasets import load_svmlight_file

In [2]:
# Selects which exported feature file is loaded below.
# NOTE(review): presumably the "_wt" file carries the time-derived features
# that the other export omits -- confirm against the feature-export step.
without_time = False

# Each export has its own feature count; N_FEAT is handed to the svmlight
# loader as n_features, so the count and the path must be chosen together.
N_FEAT, svmlight_file = (
    (36765, "../cleaned_data/features_svmlight_eventCounts.train")
    if without_time
    else (36807, "../cleaned_data/features_svmlight_eventCounts_wt.train")
)

In [3]:
def get_data_from_svmlight(svmlight_file):
    """Load a svmlight-format file and return its features and labels.

    The file is read with ``load_svmlight_file`` using the module-level
    ``N_FEAT`` as ``n_features``.

    Args:
        svmlight_file: path of the svmlight file to read.

    Returns:
        A ``(features, labels)`` pair: the first and second elements of the
        value returned by ``load_svmlight_file``.
    """
    loaded = load_svmlight_file(svmlight_file, n_features=N_FEAT)
    features, labels = loaded[0], loaded[1]
    return features, labels

In [4]:
# Single seed shared by the train/test split, the KFold splitter and every
# classifier below, so that re-running the notebook reproduces the same numbers.
RANDOM_STATE = 6250

In [5]:
# Load the full feature matrix and label vector from the configured svmlight file.
X, Y = get_data_from_svmlight(svmlight_file)

In [6]:
# Hold out 20% of the rows as a test set; the split is seeded so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_STATE)

In [None]:
# Cross-validation

# https://scikit-learn.org/stable/modules/cross_validation.html

# Logistic Regression
# https://www.datacamp.com/tutorial/understanding-logistic-regression-python

# SVM (support vector machine)
# https://scikit-learn.org/dev/versions.html
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html


# decision tree
# https://scikit-learn.org/stable/modules/tree.html

# random forest
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [7]:
#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def logistic_regression_pred(X_train, Y_train, X_test):
    """Fit a logistic-regression classifier and predict labels for X_test.

    The classifier uses library defaults apart from ``random_state``, which
    is set to the module-level ``RANDOM_STATE``.

    Args:
        X_train: training features.
        Y_train: training labels.
        X_test: features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    # fit() returns the fitted estimator, so fitting and predicting can be chained.
    return LogisticRegression(random_state=RANDOM_STATE).fit(X_train, Y_train).predict(X_test)


#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def svm_pred(X_train, Y_train, X_test):
    """Fit a linear SVM classifier and predict labels for X_test.

    ``LinearSVC`` is used with library defaults apart from ``random_state``,
    which is set to the module-level ``RANDOM_STATE``.

    Args:
        X_train: training features.
        Y_train: training labels.
        X_test: features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    # fit() returns the fitted estimator, so fitting and predicting can be chained.
    return LinearSVC(random_state=RANDOM_STATE).fit(X_train, Y_train).predict(X_test)


#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def decisionTree_pred(X_train, Y_train, X_test):
    """Fit a depth-limited decision tree and predict labels for X_test.

    The tree is capped at ``max_depth=5`` and seeded with the module-level
    ``RANDOM_STATE``.

    Args:
        X_train: training features.
        Y_train: training labels.
        X_test: features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    tree_model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
    return tree_model.fit(X_train, Y_train).predict(X_test)


#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def randomForest_pred(X_train, Y_train, X_test):
    """Fit a random forest and predict labels for X_test.

    The forest has 100 trees and is seeded with the module-level
    ``RANDOM_STATE``. No ``max_depth`` is passed, so tree depth is left at
    the library default rather than being capped.

    Args:
        X_train: training features.
        Y_train: training labels.
        X_test: features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    forest_model = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100)
    return forest_model.fit(X_train, Y_train).predict(X_test)

In [8]:
def classification_metrics(Y_pred, Y_true):
    """Score predictions against ground truth with five standard metrics.

    Note the argument order: predictions first, ground truth second. Each
    scorer is called as ``scorer(Y_true, Y_pred)`` with library defaults.
    The AUC is computed from whatever is passed as ``Y_pred``; the callers
    in this notebook pass hard class predictions, not continuous scores.

    Args:
        Y_pred: predicted labels.
        Y_true: ground-truth labels.

    Returns:
        A 5-tuple ``(accuracy, auc, precision, recall, f1)``. Callers rely
        on this exact order.
    """
    # The order of this tuple defines the order of the returned metrics.
    scorers = (accuracy_score, roc_auc_score, precision_score, recall_score, f1_score)
    return tuple(scorer(Y_true, Y_pred) for scorer in scorers)

In [9]:
# Sanity check: (rows, features) of the training split.
X_train.shape

(350, 36807)

In [10]:
# Number of rows in the training split (first entry of the shape).
np.shape(X_train)[0]

350

In [11]:
# Same row count as the previous cell, printed rather than displayed.
print(np.shape(X_train)[0])

350


In [13]:
# 10-fold splitter with shuffling; seeded so every model below is evaluated
# on the same folds.
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle = True,random_state=RANDOM_STATE)

In [None]:
############### Logistic Regression #############

In [14]:
def cv_logistic(X,Y,kfold):
    """Cross-validate the logistic-regression model and average two scores.

    For each (train, validation) index pair yielded by ``kfold.split(X)``,
    a model is fitted on the training rows via ``logistic_regression_pred``
    and its predictions for the validation rows are scored with
    ``classification_metrics``.

    Args:
        X: feature matrix; must support indexing by an array of row indices.
        Y: labels, indexable the same way as ``X``.
        kfold: splitter whose ``split(X)`` yields pairs of index arrays.

    Returns:
        ``(mean_accuracy, mean_auc)`` averaged over all folds.
    """
    fold_accuracy = []
    fold_auc = []

    for fit_rows, held_rows in kfold.split(X):
        held_pred = logistic_regression_pred(X[fit_rows], Y[fit_rows], X[held_rows])

        # Only the first two metrics (accuracy, AUC) are tracked across folds.
        accuracy, auc, *_ = classification_metrics(held_pred, Y[held_rows])
        fold_accuracy.append(accuracy)
        fold_auc.append(auc)

    return np.mean(fold_accuracy), np.mean(fold_auc)

In [15]:
# Cross-validated (mean accuracy, mean AUC) on the training split; X_test is not used here.
print(cv_logistic(X_train,Y_train,kfold))

(0.8285714285714286, 0.7908113352026396)


In [16]:
################## SVM ################

In [17]:
def cv_svm(X,Y,kfold):
    """Cross-validate the linear SVM model and average two scores.

    For each (train, validation) index pair yielded by ``kfold.split(X)``,
    a model is fitted on the training rows via ``svm_pred`` and its
    predictions for the validation rows are scored with
    ``classification_metrics``.

    Args:
        X: feature matrix; must support indexing by an array of row indices.
        Y: labels, indexable the same way as ``X``.
        kfold: splitter whose ``split(X)`` yields pairs of index arrays.

    Returns:
        ``(mean_accuracy, mean_auc)`` averaged over all folds.
    """
    fold_accuracy = []
    fold_auc = []

    for fit_rows, held_rows in kfold.split(X):
        held_pred = svm_pred(X[fit_rows], Y[fit_rows], X[held_rows])

        # Only the first two metrics (accuracy, AUC) are tracked across folds.
        accuracy, auc, *_ = classification_metrics(held_pred, Y[held_rows])
        fold_accuracy.append(accuracy)
        fold_auc.append(auc)

    return np.mean(fold_accuracy), np.mean(fold_auc)

In [18]:
# Cross-validated (mean accuracy, mean AUC) on the training split; X_test is not used here.
print(cv_svm(X_train,Y_train,kfold))

(0.8371428571428572, 0.8191303644021035)


In [19]:
####################### Decision Tree ##############

In [20]:
def cv_decision_tree(X,Y,kfold):
    """Cross-validate the decision-tree model and average two scores.

    For each (train, validation) index pair yielded by ``kfold.split(X)``,
    a model is fitted on the training rows via ``decisionTree_pred`` and its
    predictions for the validation rows are scored with
    ``classification_metrics``.

    Args:
        X: feature matrix; must support indexing by an array of row indices.
        Y: labels, indexable the same way as ``X``.
        kfold: splitter whose ``split(X)`` yields pairs of index arrays.

    Returns:
        ``(mean_accuracy, mean_auc)`` averaged over all folds.
    """
    fold_accuracy = []
    fold_auc = []

    for fit_rows, held_rows in kfold.split(X):
        held_pred = decisionTree_pred(X[fit_rows], Y[fit_rows], X[held_rows])

        # Only the first two metrics (accuracy, AUC) are tracked across folds.
        accuracy, auc, *_ = classification_metrics(held_pred, Y[held_rows])
        fold_accuracy.append(accuracy)
        fold_auc.append(auc)

    return np.mean(fold_accuracy), np.mean(fold_auc)

In [21]:
# Cross-validated (mean accuracy, mean AUC) on the training split; X_test is not used here.
print(cv_decision_tree(X_train,Y_train,kfold))

(0.8371428571428572, 0.790300775311645)


In [22]:
################### Random Forest ###################

In [23]:
def cv_random_forest(X,Y,kfold):
    """Cross-validate the random-forest model and average two scores.

    For each (train, validation) index pair yielded by ``kfold.split(X)``,
    a model is fitted on the training rows via ``randomForest_pred`` and its
    predictions for the validation rows are scored with
    ``classification_metrics``.

    Args:
        X: feature matrix; must support indexing by an array of row indices.
        Y: labels, indexable the same way as ``X``.
        kfold: splitter whose ``split(X)`` yields pairs of index arrays.

    Returns:
        ``(mean_accuracy, mean_auc)`` averaged over all folds.
    """
    fold_accuracy = []
    fold_auc = []

    for fit_rows, held_rows in kfold.split(X):
        held_pred = randomForest_pred(X[fit_rows], Y[fit_rows], X[held_rows])

        # Only the first two metrics (accuracy, AUC) are tracked across folds.
        accuracy, auc, *_ = classification_metrics(held_pred, Y[held_rows])
        fold_accuracy.append(accuracy)
        fold_auc.append(auc)

    return np.mean(fold_accuracy), np.mean(fold_auc)

In [24]:
# Cross-validated (mean accuracy, mean AUC) on the training split; X_test is not used here.
print(cv_random_forest(X_train,Y_train,kfold))

(0.7942857142857143, 0.6988470603470605)


In [25]:
######################## Run the test data on the models ###########

In [27]:
#input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName,Y_pred,Y_true):
    """Print a framed report of the five classification metrics.

    The header is printed before the metrics are computed, so the classifier
    name is already visible if scoring fails.

    Args:
        classifierName: label printed in the report header (a string).
        Y_pred: predicted labels.
        Y_true: ground-truth labels.

    Returns:
        None; the report is written to standard output.
    """
    rule = "______________________________________________"

    print(rule)
    print("Classifier: " + classifierName)

    acc, auc_, precision, recall, f1score = classification_metrics(Y_pred, Y_true)
    report_rows = (
        ("Accuracy: ", acc),
        ("AUC: ", auc_),
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("F1-score: ", f1score),
    )
    for label, value in report_rows:
        print(label + str(value))

    print(rule)

In [28]:
# Fit each model on the full training split and report its metrics on the
# held-out test split, in a fixed order.
for clf_name, predict_fn in (
    ("Logistic Regression", logistic_regression_pred),
    ("SVM", svm_pred),
    ("Decision Tree", decisionTree_pred),
    ("Random Forest", randomForest_pred),
):
    display_metrics(clf_name, predict_fn(X_train, Y_train, X_test), Y_test)

______________________________________________
Classifier: Logistic Regression
Accuracy: 0.8977272727272727
AUC: 0.8861607142857143
Precision: 0.8709677419354839
Recall: 0.84375
F1-score: 0.8571428571428571
______________________________________________
______________________________________________
Classifier: SVM
Accuracy: 0.875
AUC: 0.8683035714285715
Precision: 0.8181818181818182
Recall: 0.84375
F1-score: 0.8307692307692308
______________________________________________
______________________________________________
Classifier: Decision Tree
Accuracy: 0.9090909090909091
AUC: 0.8883928571428572
Precision: 0.9285714285714286
Recall: 0.8125
F1-score: 0.8666666666666666
______________________________________________
______________________________________________
Classifier: Random Forest
Accuracy: 0.8295454545454546
AUC: 0.7790178571428572
Precision: 0.9047619047619048
Recall: 0.59375
F1-score: 0.7169811320754718
______________________________________________
