In [1]:
%matplotlib inline
from sklearn import svm
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Input files are read from the current working directory.
FEATURES_PATH = "xtrain.txt"  # comma-separated values
LABELS_PATH = "ytrain.txt"    # space-separated values

x = np.loadtxt(FEATURES_PATH, delimiter=",")
y = np.loadtxt(LABELS_PATH, delimiter=" ")

In [8]:
# The first 70% of the rows are used for fitting and the remaining rows are
# held out for evaluation. The split is positional (no shuffling), so it
# depends on the row order of the input files.
train_size = int(len(x) * 0.7)

xtrain, xtest = x[:train_size], x[train_size:]
ytrain, ytest = y[:train_size], y[train_size:]

def print_stats(ypre, ytest, scores=None):
    """Print accuracy, PPV, sensitivity and (optionally) AUROC.

    The label ``1`` is treated as the positive class.

    Parameters
    ----------
    ypre : sequence
        Predicted labels, aligned element-wise with ``ytest``.
    ytest : sequence
        Ground-truth labels.
    scores : sequence, optional
        Continuous per-sample scores handed to ``roc_auc_score``. When
        omitted, the AUROC line is not printed, so callers that only have
        hard predictions can still use this function.

    Returns
    -------
    None
        The metrics are written to stdout.
    """
    ypre = np.asarray(ypre)
    ytest = np.asarray(ytest)

    def ratio(num, den):
        # A zero denominator means the metric is undefined (e.g. the model
        # predicted no positives at all); report nan instead of crashing.
        return num * 1.0 / den if den else float('nan')

    hits = ypre == ytest
    n_correct = int(np.count_nonzero(hits))
    n_true_pos = int(np.count_nonzero(hits & (ypre == 1)))
    n_pred_pos = int(np.count_nonzero(ypre == 1))
    n_actual_pos = int(np.count_nonzero(ytest == 1))

    print('Accuracy: %s' % ratio(n_correct, len(ytest)))
    print('PPV: %s' % ratio(n_true_pos, n_pred_pos))  # TP / (TP+FP)
    print('Sensitivity or TPR : %s' % ratio(n_true_pos, n_actual_pos))  # TP / P

    if scores is not None:
        print('AUROC: %s' % roc_auc_score(ytest, scores))

In [3]:
from itertools import izip

def plotRocStats(ytest, scores):
    """Plot the ROC curve and the TPR/FPR values against the thresholds.

    Two pyplot figures are (re)drawn: figure 0 holds the ROC curve together
    with a diagonal reference line, figure 1 holds TPR and FPR as functions
    of the thresholds returned by ``roc_curve``. Both are displayed by the
    final ``plt.show()``.

    Parameters
    ----------
    ytest : sequence
        Ground-truth labels, forwarded to ``roc_curve``.
    scores : sequence
        Continuous per-sample scores, forwarded to ``roc_curve``.
    """
    fpr, tpr, thresholds = roc_curve(ytest, scores)

    # ROC
    plt.figure(0)
    plt.clf()
    diagonal = np.arange(0, 1, 0.001)
    plt.plot(diagonal, diagonal, label="Random")
    plt.title('ROC')
    plt.plot(fpr, tpr, label="Model")
    # Draw the legend from the labels above; without it the two curves
    # cannot be told apart.
    plt.legend(loc=4)
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")

    # Metrics vs Threshold
    plt.figure(1)
    plt.clf()
    plt.title('Metrics V.S. Threshold ')
    plt.plot(thresholds, tpr, label='TPR')
    plt.plot(thresholds, fpr, label='FPR')
    plt.legend(loc=5)
    plt.ylabel("Percentage")
    plt.xlabel("Thresholds")
    plt.show()

In [4]:
def plotAcc(ytest, scores, th):
    """Plot classification accuracy as a function of the decision threshold.

    For every threshold ``t`` in ``th`` a sample is predicted as ``1`` when
    its score is strictly greater than ``t`` and as ``0`` otherwise; the
    accuracy is the fraction of predictions equal to the entry in ``ytest``.

    Parameters
    ----------
    ytest : sequence
        Ground-truth labels.
        NOTE(review): predictions are encoded as 0/1 here, so this assumes
        ``ytest`` uses the same encoding -- confirm against the label file.
    scores : sequence
        Continuous per-sample scores, aligned with ``ytest``.
    th : sequence
        Thresholds to evaluate; also used as the x-axis values.
    """
    labels = np.asarray(ytest)
    score_arr = np.asarray(scores)
    data_length = len(labels)

    # Accuracy: one vectorised comparison per threshold instead of a
    # per-sample Python loop.
    accs = []
    for t in th:
        predicted = np.where(score_arr <= t, 0, 1)
        accs.append(np.count_nonzero(predicted == labels) * 1.0 / data_length)

    plt.plot(th, accs, label='Accuracy')
    plt.legend(loc=5)
    plt.ylabel("Percentage")
    plt.xlabel("Thresholds")
    plt.show()

# Linear SVM

In [7]:
# Fit a linear SVM for several values of the regularisation parameter C and
# report the hold-out metrics for each one.
for c in [0.01, 0.1, 0.3, 0.5]:
    print("------------ C =  %s ------------" % c)
    clf = svm.LinearSVC(dual=False, C=c)
    clf.fit(xtrain, ytrain)

    # Hard label predictions feed accuracy/PPV/TPR; the decision-function
    # values are the continuous scores used for the AUROC.
    ypre = clf.predict(xtest)
    print_stats(ypre, ytest, clf.decision_function(xtest))

------------ C =  0.01 ------------
Accuracy: 0.859574668003
PPV: 0.832635983264
Sensitivity or TPR : 0.0218058294981
AUROC: 0.688015584165
FPR: [  0.00000000e+00   0.00000000e+00   1.82715147e-05 ...,   9.99744199e-01
   9.99780742e-01   1.00000000e+00]
TPR: [  1.09577033e-04   2.73942582e-03   2.73942582e-03 ...,   1.00000000e+00
   1.00000000e+00   1.00000000e+00]
Thresholds: [ 0.84720783  0.4217997   0.41993698 ..., -1.66054504 -1.66456957
 -1.73388968]
------------ C =  0.1 ------------
Accuracy: 0.859731270358
PPV: 0.802867383513
Sensitivity or TPR : 0.0245452553145
AUROC: 0.689884961735
FPR: [  0.00000000e+00   0.00000000e+00   1.82715147e-05 ...,   9.99725927e-01
   9.99762470e-01   1.00000000e+00]
TPR: [  1.09577033e-04   3.94477318e-03   3.94477318e-03 ...,   1.00000000e+00
   1.00000000e+00   1.00000000e+00]
Thresholds: [ 1.51420336  0.82558236  0.81404019 ..., -2.39832364 -2.40084969
 -2.5172752 ]
------------ C =  0.3 ------------
Accuracy: 0.859684289652
PPV: 0.7804054054

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random-forest baseline; the seed is fixed so that re-running the
# cell reproduces the same numbers.
clf = RandomForestClassifier(n_estimators=5, random_state=0)
clf = clf.fit(xtrain, ytrain)

# Predict
ypre = clf.predict(xtest)
# print_stats needs continuous scores for the AUROC. Column 1 of
# predict_proba belongs to clf.classes_[1], i.e. the larger label, which is
# taken to be the positive class 1.
print_stats(ypre, ytest, clf.predict_proba(xtest)[:, 1])

# Adaboost

In [10]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
# DecisionTreeClassifier lives in sklearn.tree, not sklearn.ensemble.
from sklearn.tree import DecisionTreeClassifier

# Fit AdaBoost with a random forest as the base learner for several ensemble
# sizes. Note that `c` is the number of boosting rounds (n_estimators) here,
# even though the banner prints it as "C".
for c in [1, 3, 5, 7, 10]:
    print("------------ C =  %s ------------" % c)
    bdt = AdaBoostClassifier(RandomForestClassifier(), algorithm="SAMME", n_estimators=c)
    bdt.fit(xtrain, ytrain)

    ypre = bdt.predict(xtest)
    scores = bdt.decision_function(xtest)
    print_stats(ypre, ytest, scores)

------------ C =  1 ------------
Accuracy: 0.839012778752
PPV: 0.31589023612
Sensitivity or TPR : 0.108481262327
AUROC: 0.534653567396
------------ C =  3 ------------
Accuracy: 0.840954647958
PPV: 0.316071428571
Sensitivity or TPR : 0.0969756738988
AUROC: 0.552016313503
------------ C =  5 ------------
Accuracy: 0.844681784014
PPV: 0.340579710145
Sensitivity or TPR : 0.0927021696252
AUROC: 0.557288037916
------------ C =  7 ------------
Accuracy: 0.844901027311
PPV: 0.338990066225
Sensitivity or TPR : 0.0897435897436
AUROC: 0.564608651624
------------ C =  10 ------------
Accuracy: 0.845871961914
PPV: 0.345155709343
Sensitivity or TPR : 0.0874424720579
AUROC: 0.569517391355


### Linear SVC as Base

In [None]:
# AdaBoost over a linear-kernel SVC base learner.
bdt = AdaBoostClassifier(svm.SVC(probability=True,kernel='linear'),n_estimators=5, learning_rate=1.0, algorithm='SAMME')
# The ensemble has to be fitted before predict()/decision_function() can be
# called on it.
bdt.fit(xtrain, ytrain)
ypre = bdt.predict(xtest)
scores = bdt.decision_function(xtest)
print_stats(ypre, ytest, scores)

### Decision Tree Classifier as Base

In [None]:
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-limited decision trees; `c` is the maximum tree depth
# of the base learner (the banner prints it as "C").
for c in [ 5]:
    print("------------ C =  %s ------------" % c)
    bdt = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=c),
        n_estimators=15)
    bdt.fit(xtrain, ytrain)
    ypre = bdt.predict(xtest)
    scores = bdt.decision_function(xtest)
    print_stats(ypre, ytest, scores)

------------ C =  5 ------------
Accuracy: 0.856286018542
PPV: 0.480459770115
Sensitivity or TPR : 0.068704799474
AUROC: 0.702591358675


# GBRT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted decision stumps (max_depth=1) with a fixed seed.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
   max_depth=1, random_state=0).fit(xtrain, ytrain)
ypre = clf.predict(xtest)

# print_stats needs continuous scores for the AUROC. Column 1 of
# predict_proba belongs to clf.classes_[1], i.e. the larger label, which is
# taken to be the positive class 1.
print_stats(ypre, ytest, clf.predict_proba(xtest)[:, 1])

# XGBoost

In [None]:
import xgboost as xgb
xg_train = xgb.DMatrix(xtrain, label=ytrain)
xg_test = xgb.DMatrix(xtest, label=ytest)

## setup parameters for xgboost
param = {}
param['objective'] = 'binary:logistic'
param['eta'] = 0.1
param['max_depth'] = 15
param['silent'] = 1
# param['nthread'] = 4 # comment to use max num of threads
param['eval_metric'] ='auc'
# NOTE(review): hard-coded class-balance weight. XGBoost's suggested value is
# (#negative samples / #positive samples) -- confirm that 47747 and 7836 are
# the training-set counts in that order.
param['scale_pos_weight'] = 47747*1.0/7836
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 50
bst = xgb.train(param, xg_train, num_round, watchlist)


### get prediction
# With the binary:logistic objective predict() returns probabilities rather
# than labels, so threshold them at 0.5 to obtain 0/1 predictions and keep
# the raw probabilities as the AUROC scores. print_stats expects
# (predictions, truth, scores) in that order.
pred = bst.predict( xg_test )
ypre = np.where(pred > 0.5, 1, 0)
print_stats(ypre, ytest, pred)

# Percentage of no shows

In [None]:
# Fraction of held-out labels that are equal to -1.
# NOTE(review): print_stats treats the label 1 as the positive class, while
# this cell counts -1 -- confirm which encoding the label file uses.
score = np.count_nonzero(np.asarray(ytest) == -1)

print(score*1.0/len(ytest))