In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import svm

In [2]:
#use project clean data - OATH Violations

df = pd.read_csv('processed_data/2016_FINAL_DATA.csv')

In [3]:
df = df[~(df['GEOID'].isnull()) & ~(df['MEDIAN_PERSON_AGE'].isnull())]

In [4]:
df['YEARS_OLD'] = df['YEARS_OLD'].apply(lambda x: 0 if x == 2017. else x)

In [5]:
df.fillna(0, inplace=True)
df.replace([np.inf, -np.inf], 0, inplace=True)

In [6]:
#Determine X and Y and test/train values
df_y = df.ix[:,0]
df_x = df.ix[:, 5:27]
X = df_x.as_matrix()
Y = df_y.as_matrix()

# --------------------
# K-fold CV
# --------------------
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7, random_state=3)
#ts = Y_test.shape[0]

In [7]:
# Decision Tree (CART) Function


def DT(maxdepth, ts):
    # Train
    dtc = DecisionTreeClassifier(max_depth = maxdepth)
    dtc.fit(X_train, Y_train)

    # Predicting
    dtc_pred = dtc.predict(X_test)

    # Finding mispredicted samples
    dtc_verror = np.asarray([int(dtc_pred[i] != Y_test[i]) for i in range(0,ts)])
    dtc_error = np.sum(dtc_verror)
    dtc_ccidx = np.where(dtc_verror == 0)
    dtc_mcidx = np.where(dtc_verror == 1)

    perrordtc = float(dtc_error)/ts
    return perrordtc


In [8]:
# Decision Tree (CART ) with Bagging/Boosting
def bagb(maxdepth, nestimators, ts):
    #Train
    bagb = BaggingClassifier(DecisionTreeClassifier(max_depth=maxdepth), n_estimators=nestimators)
    bagb.fit(X_train,Y_train)

    # Predicting
    bagb_pred = bagb.predict(X_test)

    # Finding mispredicted samples
    bagb_verror = np.asarray([int(bagb_pred[i] != Y_test[i]) for i in range(0,ts)])
    bagb_error = np.sum(bagb_verror)
    bagb_ccidx = np.where(bagb_verror == 0)
    bagb_mcidx = np.where(bagb_verror == 1)

    perrordtb = float(bagb_error)/ts
    return perrordtb


In [9]:
# SVM Function (choose optimal gamma first)

def gammachoice(X_train, Y_train, X_test, Y_test, ts, step):  
    l=[]
    for g in range(1,250, step):
        svm_rbf = svm.SVC(kernel='rbf', gamma=float(g))
        svm_rbf.fit(X_train,Y_train)
        ypred_svm_rbf = svm_rbf.predict(X_test)
        e_svm_rbf = float(np.sum((ypred_svm_rbf[i] != Y_test[i]) for i in range(0,ts)))
        l.append((e_svm_rbf/ts))
    plt.plot(range(0,250, step), l)
    plt.xlabel('gamma')
    plt.ylabel('classification percent error')
    minindex = l.index(min(l))
    #tx = plt.xticks(range(0,250, step))
    print "optimal gamma is:", step*minindex
    return step*minindex

def svmrbf(X_train, Y_train, X_test, Y_test, g, ts): 
    svm_rbf = svm.SVC(kernel='rbf', gamma=g)
    svm_rbf.fit(X_train,Y_train)
    
    ypred_svm_rbf = svm_rbf.predict(X_test)
    
    e_svm_rbf = np.sum((ypred_svm_rbf[i] != Y_test[i]) for i in range(0,ts))
    perrorsvmrbf = float(e_svm_rbf)/ts
    return perrorsvmrbf


In [10]:
# Random Forest (No Boosting Function)

def RFreg(maxdepth, nestimators):
    #Train
    rdf = RandomForestClassifier(max_depth=maxdepth, n_estimators=nestimators, bootstrap = True)
    rdf.fit(X_train,Y_train)

    # Predicting
    rdf_pred = rdf.predict(X_test)

    # Finding mispredicted samples
    rdf_verror = np.asarray([int(rdf_pred[i] != Y_test[i]) for i in range(0,ts)])
    rdf_error = np.sum(rdf_verror)
    rdf_ccidx = np.where(rdf_verror == 0)
    rdf_mcidx = np.where(rdf_verror == 1)

    perrorrf = float(rdf_error)/ts
    return perrorrf

In [11]:
# Random Forest (Boosting Function)

def RFboost(maxdepth, nestimators):
    #Train
    rdfb = AdaBoostClassifier(RandomForestClassifier(max_depth=maxdepth, n_estimators=nestimators, bootstrap = True))
    rdfb.fit(X_train,Y_train)

    # Predicting
    rdf_pred = rdfb.predict(X_test)

    # Finding mispredicted samples
    rdf_verror = np.asarray([int(rdf_pred[i] != Y_test[i]) for i in range(0,ts)])
    rdf_error = np.sum(rdf_verror)
    rdf_ccidx = np.where(rdf_verror == 0)
    rdf_mcidx = np.where(rdf_verror == 1)

    perrorrfb = float(rdf_error)/ts
    return perrorrfb

In [12]:
#For Loop to determine the best parameters - training size, maxdepth and nestimators


splits = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
maxdepthoption = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
nestimatoroptions = [5, 10, 15, 20, 25, 30, 35, 40]
bestDTerror = 1
bestDTsplit = 0
bestDTdepth = 0
bestbagerror = 1
bestbagsplit = 0
bestbagdepth = 0
bestbagestimator = 0
bestRFerror = 1
bestRFsplit = 0
bestRFdepth = 0
bestRFestimator = 0
bestRFBerror = 1
bestRFBsplit = 0
bestRFBdepth = 0
bestRFBestimator = 0


for i in splits:
    print "percent of data in the training set is:", i*100
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=i, random_state=123)
    ts = Y_test.shape[0]
    for j in maxdepthoption:
        DTE = DT(j, ts)
        if DTE < bestDTerror:
            bestDTerror = DTE
            bestDTsplit = i
            bestDTdepth = j
        else:
            continue
        for k in nestimatoroptions:
            BAGE = bagb(j, k, ts)
            if BAGE < bestbagerror:
                bestbagerror = BAGE
                bestbagsplit = i
                bestbagdepth = j
                bestbagestimator = k
            else:
                continue
            RFE = RFreg(j, k)
            if RFE < bestRFerror:
                bestRFerror = RFE
                bestRFsplit = i
                bestRFdepth = j
                bestRFestimator = k
            else:
                continue
            RFBE = RFboost(j, k)
            if RFBE < bestRFBerror:
                bestRFBerror = RFBE
                bestRFBsplit = i
                bestRFBdepth = j
                bestRFBestimator = k
            else:
                continue
        
        

percent of data in the training set is: 10.0
percent of data in the training set is: 20.0
percent of data in the training set is: 30.0
percent of data in the training set is: 40.0
percent of data in the training set is: 50.0
percent of data in the training set is: 60.0
percent of data in the training set is: 70.0
percent of data in the training set is: 80.0
percent of data in the training set is: 90.0


In [13]:
print bestDTerror

0.297297297297


In [14]:
print bestbagerror

0.269765092195


In [15]:
print bestRFerror

0.263197777216


In [16]:
print bestRFBerror

0.212679969689


In [21]:
#SVM - SVM will use its own loop because it has different parameters than the other models 

svmrbf = []

for i in splits:
    print "percent of data in the training set is:", i*100
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=i, random_state=123)
    ts = Y_test.shape[0]
    
    
    #choose gamma
    step = 10
    g = gammachoice(X_train, Y_train, X_test, Y_test, ts, step)
    print g
    plt.legend(splits, bbox_to_anchor=(1.25, 1.0))
    
    #run svm rbf with that split and optimal gamma and append percent error to the svmrbf array
    svm_rbf = svm.SVC(kernel='rbf', gamma=g)
    svm_rbf.fit(X_train,Y_train)
    
    ypred_svm_rbf = svm_rbf.predict(X_test)
    
    e_svm_rbf = np.sum((ypred_svm_rbf[i] != Y_test[i]) for i in range(0,ts))
    perbf = float(e_svm_rbf)/ts
    svmrbf.append(perbf)

percent of data in the training set is: 10.0
optimal gamma is: 0
0


ValueError: The gamma value of 0.0 is invalid. Use 'auto' to set gamma to a value of 1 / n_features.