In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler

from sklearn import svm

%matplotlib inline



In [2]:
#use project clean data - OATH Violations

df = pd.read_csv('processed_data/2016_FINAL_DATA.csv')

In [3]:
df = df[~(df['GEOID'].isnull()) & ~(df['MEDIAN_PERSON_AGE'].isnull())]

In [4]:
df['YEARS_OLD'] = df['YEARS_OLD'].apply(lambda x: 0 if x == 2017. else x)

In [5]:
df.fillna(0, inplace=True)
df.replace([np.inf, -np.inf], 0, inplace=True)

In [6]:
#Determine X and Y and test/train values
df_y = df.ix[:,0]
df_x = df.ix[:, 5:27]
X = df_x.as_matrix()
Y = df_y.as_matrix()

# --------------------
# K-fold CV
# --------------------
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7, random_state=3)
#ts = Y_test.shape[0]

In [7]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
#X_2d = scaler.fit_transform(X_2d)

In [8]:
# Decision Tree (CART) Function


def DT(maxdepth, ts):
    # Train
    dtc = DecisionTreeClassifier(max_depth = maxdepth)
    dtc.fit(X_train, Y_train)

    # Predicting
    dtc_pred = dtc.predict(X_test)

    # Finding mispredicted samples
    dtc_verror = np.asarray([int(dtc_pred[i] != Y_test[i]) for i in range(0,ts)])
    dtc_error = np.sum(dtc_verror)
    dtc_ccidx = np.where(dtc_verror == 0)
    dtc_mcidx = np.where(dtc_verror == 1)

    perrordtc = float(dtc_error)/ts
    return perrordtc


In [9]:
# Decision Tree (CART ) with Bagging/Boosting
def bagb(maxdepth, nestimators, ts):
    #Train
    bagb = BaggingClassifier(DecisionTreeClassifier(max_depth=maxdepth), n_estimators=nestimators)
    bagb.fit(X_train,Y_train)

    # Predicting
    bagb_pred = bagb.predict(X_test)

    # Finding mispredicted samples
    bagb_verror = np.asarray([int(bagb_pred[i] != Y_test[i]) for i in range(0,ts)])
    bagb_error = np.sum(bagb_verror)
    bagb_ccidx = np.where(bagb_verror == 0)
    bagb_mcidx = np.where(bagb_verror == 1)

    perrordtb = float(bagb_error)/ts
    return perrordtb


In [10]:
# SVM Function (choose optimal gamma first)

def gammachoice(X_train, Y_train, X_test, Y_test, ts, step):  
    l=[]
    for g in np.logspace(-2, 5, step):
#     for g in range(1,250, step):

        svm_rbf = svm.SVC(kernel='rbf', gamma=float(g))
        svm_rbf.fit(X_train,Y_train)
        ypred_svm_rbf = svm_rbf.predict(X_test)
        e_svm_rbf = float(np.sum((ypred_svm_rbf[i] != Y_test[i]) for i in range(0,ts)))
        l.append((e_svm_rbf/ts))
    plt.plot(range(0,250, step), l)
    plt.xlabel('gamma')
    plt.ylabel('classification percent error')
    minindex = l.index(min(l)) + 1
    #tx = plt.xticks(range(0,250, step))
    print "optimal gamma is:", step*minindex
    return step*minindex

def svmrbf(X_train, Y_train, X_test, Y_test, g, ts): 
    svm_rbf = svm.SVC(kernel='rbf', gamma=g)
    svm_rbf.fit(X_train,Y_train)
    
    ypred_svm_rbf = svm_rbf.predict(X_test)
    
    e_svm_rbf = np.sum((ypred_svm_rbf[i] != Y_test[i]) for i in range(0,ts))
    perrorsvmrbf = float(e_svm_rbf)/ts
    return perrorsvmrbf


In [11]:
np.logspace(-2, 5, 10)[1]

0.059948425031894091

In [12]:
# Random Forest (No Boosting Function)

def RFreg(maxdepth, nestimators):
    #Train
    rdf = RandomForestClassifier(max_depth=maxdepth, n_estimators=nestimators, bootstrap = True)
    rdf.fit(X_train,Y_train)

    # Predicting
    rdf_pred = rdf.predict(X_test)

    # Finding mispredicted samples
    rdf_verror = np.asarray([int(rdf_pred[i] != Y_test[i]) for i in range(0,ts)])
    rdf_error = np.sum(rdf_verror)
    rdf_ccidx = np.where(rdf_verror == 0)
    rdf_mcidx = np.where(rdf_verror == 1)

    perrorrf = float(rdf_error)/ts
    return perrorrf

In [13]:
# Random Forest (Boosting Function)

def RFboost(maxdepth, nestimators):
    #Train
    rdfb = AdaBoostClassifier(RandomForestClassifier(max_depth=maxdepth, n_estimators=nestimators, bootstrap = True))
    rdfb.fit(X_train,Y_train)

    # Predicting
    rdf_pred = rdfb.predict(X_test)

    # Finding mispredicted samples
    rdf_verror = np.asarray([int(rdf_pred[i] != Y_test[i]) for i in range(0,ts)])
    rdf_error = np.sum(rdf_verror)
    rdf_ccidx = np.where(rdf_verror == 0)
    rdf_mcidx = np.where(rdf_verror == 1)

    perrorrfb = float(rdf_error)/ts
    return perrorrfb

In [14]:
def svml(X_train, Y_train, X_test, Y_test, ts):  
    svm_linear = svm.SVC(kernel='linear')
    svm_linear.fit(X_train,Y_train)

    ypred_svm_linear = svm_linear.predict(X_test)

    e_svm_linear = np.sum((ypred_svm_linear[i] != Y_test[i]) for i in range(0,ts))
    pererrorsvml = float(e_svm_linear)/ts
    return perrorsvml

In [15]:
#For Loop to determine the best parameters - training size, maxdepth and nestimators


splits = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
maxdepthoption = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
nestimatoroptions = [5, 10, 15, 20, 25, 30, 35, 40]
bestDTerror = 1
bestDTsplit = 0
bestDTdepth = 0
bestbagerror = 1
bestbagsplit = 0
bestbagdepth = 0
bestbagestimator = 0
bestRFerror = 1
bestRFsplit = 0
bestRFdepth = 0
bestRFestimator = 0
bestRFBerror = 1
bestRFBsplit = 0
bestRFBdepth = 0
bestRFBestimator = 0
bestSVMerror = 1
bestSVMsplit = 0

In [16]:
DTES = []
BAGES = []
RFES = []
RFBES = []
for i in splits:
    bestDTerror = 1
    bestbagerror = 1
    bestRFerror = 1
    bestRFBerror = 1
    print "percent of data in the training set is:", i*100
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=i, random_state=123)
    ts = Y_test.shape[0]
    

    
    for j in maxdepthoption:
        DTE = DT(j, ts)
        if DTE < bestDTerror:
            bestDTerror = DTE
            bestDTsplit = i
            bestDTdepth = j
        else:
            continue
        print "best DT error is", bestDTerror
        
        for k in nestimatoroptions:
            BAGE = bagb(j, k, ts)
            if BAGE < bestbagerror:
                bestbagerror = BAGE
                bestbagsplit = i
                bestbagdepth = j
                bestbagestimator = k
            else:
                continue
            print "best BAGDT error is", bestbagerror
            
            RFE = RFreg(j, k)
            if RFE < bestRFerror:
                bestRFerror = RFE
                bestRFsplit = i
                bestRFdepth = j
                bestRFestimator = k
            else:
                continue
            print "best RF error is", bestRFerror
            
            RFBE = RFboost(j, k)
            if RFBE < bestRFBerror:
                bestRFBerror = RFBE
                bestRFBsplit = i
                bestRFBdepth = j
                bestRFBestimator = k
            else:
                continue
            print "best Boosted RF error is", bestRFBerror
    DTES.append(bestDTerror)
    BAGES.append(bestbagerror)
    RFES.append(bestRFerror)    
    RFBES.append(bestRFBerror)


percent of data in the training set is: 10.0
best DT error is 0.330386213091
best BAGDT error is 0.330386213091
best RF error is 0.387223532053
best Boosted RF error is 0.363786909173
best DT error is 0.329431907488
best BAGDT error is 0.323369260132
best RF error is 0.337936454474
best Boosted RF error is 0.343634220276
best BAGDT error is 0.322499157966
best RF error is 0.326568990681
best BAGDT error is 0.321853598293
best BAGDT error is 0.319018749298
best DT error is 0.327102279106
best BAGDT error is 0.316857527787
best RF error is 0.32654092287
best Boosted RF error is 0.340462557539
best BAGDT error is 0.313882339733
best RF error is 0.30894240485
percent of data in the training set is: 20.0
best DT error is 0.331175597588
best BAGDT error is 0.331175597588
best RF error is 0.346932331302
best Boosted RF error is 0.335564747861
best DT error is 0.322870946351
best BAGDT error is 0.313713726357
best RF error is 0.31387160946
best BAGDT error is 0.312040165461
best RF error is 0.

In [17]:
#SVM - SVM will use its own loop because it has different parameters than the other models 

svmrbf = []
SVMRBFES = []

for i in splits:
    e = []
    print "percent of data in the training set is:", i*100
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=i, random_state=123)
    ts = Y_test.shape[0]
    
    
    #choose gamma
    step = 10
    # g = gammachoice(X_train, Y_train, X_test, Y_test, ts, step)
    # print g
    # plt.legend(splits, bbox_to_anchor=(1.25, 1.0))
    
    #run svm rbf with that split and optimal gamma and append percent error to the svmrbf array
    for g in np.logspace(-4, 1, 10):
        svm_rbf = svm.SVC(kernel='rbf', gamma=g)
        svm_rbf.fit(X_train,Y_train)

        ypred_svm_rbf = svm_rbf.predict(X_test)

        e_svm_rbf = np.sum((ypred_svm_rbf[i] != Y_test[i]) for i in range(0,ts))
        perbf = float(e_svm_rbf)/ts
        e.append(perbf)

    svmrbf.append(e)
    SVMRBFES.append(min(e))

percent of data in the training set is: 10.0
percent of data in the training set is: 20.0
percent of data in the training set is: 30.0
percent of data in the training set is: 40.0
percent of data in the training set is: 50.0
percent of data in the training set is: 60.0
percent of data in the training set is: 70.0
percent of data in the training set is: 80.0
percent of data in the training set is: 90.0


In [18]:
# Printing out lists for access later. 10% -> 90% for each
print DTES
print BAGES
print RFES
print RFBES
print SVMRBFES

[0.3271022791063209, 0.32287094635132146, 0.32081123019739455, 0.3203519703603907, 0.3136462385691911, 0.3019892642879697, 0.307005725833614, 0.3122000505178075, 0.29754988633493307]
[0.31388233973279445, 0.3080930878777353, 0.2980404893363646, 0.2827130346918154, 0.3007628959733239, 0.27849700031575625, 0.27761872684405525, 0.27721646880525386, 0.26749179085627683]
[0.30894240485011787, 0.30714578925763364, 0.2917974811446718, 0.2800606264735601, 0.2931844591522255, 0.2711083043890117, 0.27214550353654426, 0.27178580449608486, 0.26648143470573377]
[0.3404625575390143, 0.3173450377340617, 0.27454801342427193, 0.2548837992590098, 0.27454150457232357, 0.22993369119040102, 0.22246547659144494, 0.21811063399848446, 0.20459712048497095]
[0.33313685865049963, 0.3232182891786921, 0.3133051856663419, 0.3056584708656113, 0.29692315465063407, 0.2906220397852858, 0.2858706635230717, 0.2720383935337206, 0.2649659004799192]


In [19]:
print svmrbf


[[0.4422364432468845, 0.39132143258111596, 0.3800943078477602, 0.3615414842258898, 0.3350454698551701, 0.33313685865049963, 0.34298866060401934, 0.37080386213090827, 0.3989839452116313, 0.4082743909284832], [0.39262370141147496, 0.38630837727746375, 0.3709305630111466, 0.33673308282547604, 0.32697590703842877, 0.3244813540054943, 0.3232182891786921, 0.33793299441093816, 0.35978401591461684, 0.3695411917016641], [0.3892317130381437, 0.37598787485114216, 0.353938869041175, 0.32856988199631915, 0.32405903792717694, 0.3192594998376096, 0.3133051856663419, 0.3193316733427159, 0.3308072606546137, 0.34120024538991733], [0.38813573593802625, 0.3736106433142472, 0.3494442573256989, 0.32607780397440217, 0.3225833614011452, 0.3172364432468845, 0.3107527787133715, 0.3056584708656113, 0.3163523071741327, 0.32582519366790164], [0.38392360935684333, 0.36917091901177185, 0.3439599858535846, 0.3248623250644167, 0.320416308796039, 0.3136462385691911, 0.30172283130399635, 0.29692315465063407, 0.303844793

In [21]:
# lowest percent error.
print 'DT', min(DTES)
print 'BAG', min(BAGES)
print 'RF', min(RFES)
print 'RFB', min(RFBES)
print 'SVMRBF', min(SVMRBFES)


DT 0.297549886335
BAG 0.267491790856
RF 0.266481434706
RFB 0.204597120485
SVMRBF 0.26496590048
