In [1]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.cross_validation import KFold, train_test_split

from sklearn.datasets import load_svmlight_file



### Train Baseline models based on Word2Vec

In [2]:
# Toggle between the two Word2Vec feature files: the "_wt" file is loaded only
# when without_time is False (presumably the variant that includes time
# features -- confirm against the feature-generation step).
without_time = True

svmlight_file = (
    "../cleaned_data/features_svmlight_w2v.train"
    if without_time
    else "../cleaned_data/features_svmlight_w2v_wt.train"
)

In [3]:
def get_data_from_svmlight(svmlight_file, n_features=4908):
    """Load features and labels from a file in svmlight / libsvm format.

    Input:
        svmlight_file -- path of the svmlight-format file to read.
        n_features    -- number of feature columns to request from the loader.
                         Defaults to 4908, the value that used to be hard-coded
                         here, so existing calls behave exactly as before; pass
                         another value to load a feature file of different width.
    Output:
        (X_train, Y_train) -- the first two items returned by
        load_svmlight_file (the feature matrix, then the label vector).
    """
    data_train = load_svmlight_file(svmlight_file, n_features=n_features)
    # Index instead of unpacking so any additional trailing items are ignored.
    return data_train[0], data_train[1]

In [4]:
# Single seed passed to the train/test split, the K-fold splitter and every
# classifier in this notebook so that reruns use the same random state.
RANDOM_STATE = 6250

In [5]:
# Load the complete feature matrix (X) and label vector (Y) from the selected
# svmlight file; the train/test split happens in the next cell.
X, Y = get_data_from_svmlight(svmlight_file)


In [6]:
# Hold out 20% of the rows as the final test set. Cross-validation below only
# ever sees the remaining 80% (X_train / Y_train).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

### Models

Baselines compared below: Logistic Regression, linear SVM (`LinearSVC`), Decision Tree (max depth 5) and Random Forest (100 trees).

In [7]:
def logistic_regression_pred(X_train, Y_train, X_test):
    """Fit a logistic regression on the training data and label X_test.

    Input:  X_train, Y_train -- training features and labels
            X_test           -- features of the samples to label
    Output: Y_pred           -- labels predicted for X_test
    """
    # Classifier parameters stay at their defaults; only the seed is pinned.
    classifier = LogisticRegression(random_state=RANDOM_STATE)
    classifier.fit(X_train, Y_train)
    return classifier.predict(X_test)


def svm_pred(X_train, Y_train, X_test):
    """Fit a linear SVM (LinearSVC) on the training data and label X_test.

    Input:  X_train, Y_train -- training features and labels
            X_test           -- features of the samples to label
    Output: Y_pred           -- labels predicted for X_test
    """
    # Classifier parameters stay at their defaults; only the seed is pinned.
    classifier = LinearSVC(random_state=RANDOM_STATE)
    classifier.fit(X_train, Y_train)
    return classifier.predict(X_test)


def decisionTree_pred(X_train, Y_train, X_test):
    """Fit a decision tree on the training data and label X_test.

    Input:  X_train, Y_train -- training features and labels
            X_test           -- features of the samples to label
    Output: Y_pred           -- labels predicted for X_test
    """
    # The tree is capped at max_depth=5; the seed is pinned via RANDOM_STATE.
    classifier = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
    classifier.fit(X_train, Y_train)
    return classifier.predict(X_test)


def randomForest_pred(X_train, Y_train, X_test):
    """Fit a random forest on the training data and label X_test.

    Input:  X_train, Y_train -- training features and labels
            X_test           -- features of the samples to label
    Output: Y_pred           -- labels predicted for X_test
    """
    # 100 trees with a pinned seed. No max_depth is passed, so tree depth is
    # whatever the library default allows.
    classifier = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100)
    classifier.fit(X_train, Y_train)
    return classifier.predict(X_test)

In [8]:
def classification_metrics(Y_pred, Y_true):
    """Score predicted labels against the true labels.

    Input:  Y_pred -- predicted labels
            Y_true -- actual labels
    Output: the tuple (accuracy, auc, precision, recall, fscore).
            The order matters: callers index and unpack it positionally.

    NOTE(review): roc_auc_score is given the same hard label predictions as
    the other scorers, not decision scores or probabilities -- confirm that
    this is the intended definition of AUC for this comparison.
    """
    # Each scorer is called as scorer(Y_true, Y_pred), i.e. in the reverse
    # order of this function's own parameters.
    scorers = (accuracy_score, roc_auc_score, precision_score, recall_score, f1_score)
    return tuple(scorer(Y_true, Y_pred) for scorer in scorers)

In [9]:
# Splitter over the rows of the training split, built with the KFold imported
# from the legacy sklearn.cross_validation module: the first argument is the
# number of samples, the second (10) is presumably the number of folds.
# NOTE(review): shuffling is not enabled here, so random_state probably has no
# effect on the folds -- confirm against the KFold documentation.
kfold = KFold(np.shape(X_train)[0], 10, random_state=RANDOM_STATE)

### Logistic Regression

In [10]:
def cv_logistic(X,Y,kfold):
    """Cross-validate the logistic regression baseline.

    Input:  X, Y  -- features and labels, indexable by the fold indices
            kfold -- iterable yielding (train indices, test indices) pairs
    Output: (mean accuracy, mean AUC) over all folds
    """
    accuracies, aucs = [], []

    for train_idx, test_idx in kfold:
        # Fit on this fold's training rows, predict its held-out rows.
        fold_pred = logistic_regression_pred(X[train_idx], Y[train_idx], X[test_idx])

        # Only the first two metrics (accuracy, AUC) are kept per fold.
        fold_acc, fold_auc = classification_metrics(fold_pred, Y[test_idx])[:2]
        accuracies.append(fold_acc)
        aucs.append(fold_auc)

    return np.mean(accuracies), np.mean(aucs)

In [11]:
# Mean cross-validated (accuracy, AUC) for logistic regression on the training
# split. Single-argument print(...) prints the same tuple under Python 2 and
# is also valid Python 3.
print(cv_logistic(X_train, Y_train, kfold))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0.66356984478935699, 0.5)


### SVM

In [12]:
def cv_svm(X,Y,kfold):
    """Cross-validate the linear SVM baseline.

    Input:  X, Y  -- features and labels, indexable by the fold indices
            kfold -- iterable yielding (train indices, test indices) pairs
    Output: (mean accuracy, mean AUC) over all folds
    """
    accuracies, aucs = [], []

    for train_idx, test_idx in kfold:
        # Fit on this fold's training rows, predict its held-out rows.
        fold_pred = svm_pred(X[train_idx], Y[train_idx], X[test_idx])

        # Only the first two metrics (accuracy, AUC) are kept per fold.
        fold_acc, fold_auc = classification_metrics(fold_pred, Y[test_idx])[:2]
        accuracies.append(fold_acc)
        aucs.append(fold_auc)

    return np.mean(accuracies), np.mean(aucs)

In [13]:
# Mean cross-validated (accuracy, AUC) for the linear SVM on the training
# split. Single-argument print(...) prints the same tuple under Python 2 and
# is also valid Python 3.
print(cv_svm(X_train, Y_train, kfold))

(0.73597560975609766, 0.63102986220404123)


### Decision Tree

In [14]:
def cv_decision_tree(X,Y,kfold):
    """Cross-validate the decision tree baseline.

    Input:  X, Y  -- features and labels, indexable by the fold indices
            kfold -- iterable yielding (train indices, test indices) pairs
    Output: (mean accuracy, mean AUC) over all folds
    """
    accuracies, aucs = [], []

    for train_idx, test_idx in kfold:
        # Fit on this fold's training rows, predict its held-out rows.
        fold_pred = decisionTree_pred(X[train_idx], Y[train_idx], X[test_idx])

        # Only the first two metrics (accuracy, AUC) are kept per fold.
        fold_acc, fold_auc = classification_metrics(fold_pred, Y[test_idx])[:2]
        accuracies.append(fold_acc)
        aucs.append(fold_auc)

    return np.mean(accuracies), np.mean(aucs)

In [15]:
# Mean cross-validated (accuracy, AUC) for the decision tree on the training
# split. Single-argument print(...) prints the same tuple under Python 2 and
# is also valid Python 3.
print(cv_decision_tree(X_train, Y_train, kfold))

(0.72990022172948998, 0.64917739872839508)


### Random Forest

In [16]:
def cv_random_forest(X,Y,kfold):
    """Cross-validate the random forest baseline.

    Input:  X, Y  -- features and labels, indexable by the fold indices
            kfold -- iterable yielding (train indices, test indices) pairs
    Output: (mean accuracy, mean AUC) over all folds
    """
    accuracies, aucs = [], []

    for train_idx, test_idx in kfold:
        # Fit on this fold's training rows, predict its held-out rows.
        fold_pred = randomForest_pred(X[train_idx], Y[train_idx], X[test_idx])

        # Only the first two metrics (accuracy, AUC) are kept per fold.
        fold_acc, fold_auc = classification_metrics(fold_pred, Y[test_idx])[:2]
        accuracies.append(fold_acc)
        aucs.append(fold_auc)

    return np.mean(accuracies), np.mean(aucs)

In [17]:
# Mean cross-validated (accuracy, AUC) for the random forest on the training
# split. Single-argument print(...) prints the same tuple under Python 2 and
# is also valid Python 3.
print(cv_random_forest(X_train, Y_train, kfold))

(0.76759793052475978, 0.67500876683684763)


### Run the test data on the models

In [18]:
def display_metrics(classifierName,Y_pred,Y_true):
    """Print a framed report of the classification metrics for one classifier.

    Input:  classifierName -- name of the classifier (a string, used as a label)
            Y_pred         -- predicted labels
            Y_true         -- actual labels
    Output: nothing is returned; the report is printed.
    """
    # Every print below takes exactly one argument, so the text produced is the
    # same whether print is the Python 2 statement or the Python 3 function.
    rule = "______________________________________________"
    print(rule)
    print("Classifier: "+classifierName)
    # Metrics are computed only after the header is printed, as before.
    acc, auc_, precision, recall, f1score = classification_metrics(Y_pred,Y_true)
    print("Accuracy: "+str(acc))
    print("AUC: "+str(auc_))
    print("Precision: "+str(precision))
    print("Recall: "+str(recall))
    print("F1-score: "+str(f1score))
    print(rule)
    print("")

In [19]:
# Refit every baseline on the full training split and report its metrics on
# the held-out test split, in the same order as the sections above.
for classifier_name, predict_fn in (
    ("Logistic Regression", logistic_regression_pred),
    ("SVM", svm_pred),
    ("Decision Tree", decisionTree_pred),
    ("Random Forest", randomForest_pred),
):
    display_metrics(classifier_name, predict_fn(X_train, Y_train, X_test), Y_test)

______________________________________________
Classifier: Logistic Regression
Accuracy: 0.678832116788
AUC: 0.5
Precision: 0.0
Recall: 0.0
F1-score: 0.0
______________________________________________

______________________________________________
Classifier: SVM
Accuracy: 0.725060827251
AUC: 0.605897686543
Precision: 0.679245283019
Recall: 0.272727272727
F1-score: 0.389189189189
______________________________________________

______________________________________________
Classifier: Decision Tree
Accuracy: 0.720194647202
AUC: 0.596326164875
Precision: 0.673469387755
Recall: 0.25
F1-score: 0.364640883978
______________________________________________

______________________________________________
Classifier: Random Forest
Accuracy: 0.749391727494
AUC: 0.639784946237
Precision: 0.745762711864
Recall: 0.333333333333
F1-score: 0.460732984293
______________________________________________

