In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from IPython.display import display, HTML
import time

In [4]:
income = pd.read_csv("./income.csv")
HR = pd.read_csv("./HR_comma_sep.csv")  

In [5]:
def processData(data, feature):
    Y = data[feature]
    X = data.drop(feature, axis = 1)
    X = pd.get_dummies(X)
    X = preprocessing.scale(X)
    X = pd.DataFrame(X)
    return X, Y

In [63]:
def write_scores(filename, x, y):
    file = open(filename, "w")
    if x is not None:
        for i in range(0, len(x)):
            file.write(str(x[i]) + ", ")
            file.write(str(y[i]) + "\n")
        file.write("\n")
    file.close()

In [64]:
def write_scores(filename, x, y, a, b):
    file = open(filename, "w")
    if x is not None:
        for i in range(0, len(x)):
            file.write(str(x[i]) + ", ")
            file.write(str(y[i]) + "\n")
        file.write("\n")
    file.write("\n")
    if a is not None:
        for i in range(0, len(a)):
            file.write(str(a[i]) + ", ")
            file.write(str(b[i]) + "\n")
        file.write("\n")
    file.close()

## Income Set

### Decision Trees

In [69]:
# data size vs accuracy
income_dt = DecisionTreeClassifier(max_leaf_nodes=10)
income_means = []
income_std = []
income_mean_training = []
training_time = []
print "start"
for i in range(1, 51):
    income_sample = income.sample(income["income"].count() / 50 * i)
    income_X, income_Y = processData(income_sample, "income")
    income_scores = cross_val_score(income_dt, income_X, income_Y, cv = 10)
    start = time.time()
    income_dt.fit(income_X, income_Y)
    end = time.time()
    training_time.append(end-start)
    income_means.append(income_scores.mean())
    income_std.append(income_scores.std())
    income_mean_training.append(income_dt.score(income_X, income_Y))
print "done"


start


done


In [5]:
#max depth
income_means_leaf = []
income_std_leaf = []
income_mean_training_leaf = []
print "start"
for i in range(1, 11):
    income_dt = DecisionTreeClassifier(max_depth=i*5)
    # income_sample = income.sample(income["income"].count() / 51 * (i - 1))
    income_X, income_Y = processData(income, "income")
    income_scores = cross_val_score(income_dt, income_X, income_Y, cv = 10)
    income_means_leaf.append(income_scores.mean())
    income_std_leaf.append(income_scores.std())
    income_dt.fit(income_X, income_Y)
    income_mean_training_leaf.append(income_dt.score(income_X, income_Y))
    print i
print "done"

start


1


2


3


4


5


6


7


8


9


10
done


In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
print "start"
for i in list(range(1,51)) :
    x.append(i*2)
income_graph = sns.pointplot(x, income_means, ax = ax, color = 'r')
# income_graph = plt.errorbar(x, income_mean_training, income_std, linestyle='None', marker='^')
# sns.pointplot(x, income_mean_training, ax = ax, color = 'r')

income_graph.set_xlabel("% of training data")
income_graph.set_ylabel("% accuracy")
income_graph.set_title("Income Set, Decision Tree, % of training data vs % accuracy")
plt.savefig("Income_DT_Training_Data_Accuracy")
write_scores("Income_DT_Training_Data_Accuracy.csv", x, income_means, None, None)
write_scores("income_DT_training_time.csv", x, training_time, None, None)
plt.show()


start


In [None]:

fig2, ax2 = plt.subplots()
fig2.set_size_inches(15, 8)
x = []
for i in list(range(1,11)) :
    x.append(i*5)
print len(x)
print len(income_mean_training_leaf)
income_graph_leaf = sns.pointplot(x, income_means_leaf, ax = ax2)
sns.pointplot(x, income_mean_training_leaf, ax = ax2, color = 'r')
income_graph_leaf.set_xlabel("# of leaf nodes")
income_graph_leaf.set_ylabel("% accuracy")
income_graph_leaf.set_title("Income Set, Decision Tree, # of leaf nodes vs % accuracy")
plt.savefig("Income_DT_LeafNodes_Accuracy")
write_scores("Income_DT_LeafNodes_Accuracy.csv", x, income_means_leaf)
write_scores("Income_DT_LeafNodes_Accuracy_training.csv", x, income_mean_training_leaf)
plt.show()
print "done"

10
10
done


### KNN

In [62]:
income_knn = KNeighborsClassifier(n_neighbors = 5)
income_means_KNN = []
income_std_KNN = []
income_mean_training_KNN = []
training_time = []
print "start"
for i in range(1, 11):
    income_sample = income.sample(income["income"].count() / 10 * i)
    income_X, income_Y = processData(income_sample, "income")
    income_scores = cross_val_score(income_knn, income_X, income_Y, cv = 10)
    start = time.time()
    income_knn.fit(income_X, income_Y)
    end = time.time()
    training_time.append(end-start)
    income_means_KNN.append(income_scores.mean())
    income_std_KNN.append(income_scores.std())
    income_mean_training_KNN.append(income_knn.score(income_X, income_Y))
    print i
print "done"

start


1


2


3


4


5


6


7


8


9


10
done


In [10]:
income_means_KNN_neighbors = []
income_std_KNN_neighbors = []
income_mean_training_KNN_neighbors = []
print "start"
for i in range(1, 51, 5):
    income_knn = KNeighborsClassifier(n_neighbors = i)
    income_sample = income.sample(income["income"].count() / 10)
    income_X, income_Y = processData(income, "income")
    income_scores = cross_val_score(income_knn, income_X, income_Y, cv = 10)
    income_knn.fit(income_X, income_Y)
    income_means_KNN_neighbors.append(income_scores.mean())
    income_std_KNN_neighbors.append(income_scores.std())
    print "start training set"
    income_mean_training_KNN_neighbors.append(income_knn.score(income_X, income_Y))
    print i
print "done"

start


start training set


1


start training set


6


start training set


11


start training set


16


start training set


21


start training set


26


start training set


31


start training set


36


start training set


41


start training set


46
done


In [67]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
# for i in list(range(1,51)) :
#     x.append(i*2)
# income_graph = sns.pointplot(x, income_means, ax = ax)
# sns.pointplot(x, income_mean_training, ax = ax, color = 'r')
# income_graph.set_xlabel("% of training data")
# income_graph.set_ylabel("% accuracy")
# income_graph.set_title("Income Set, Decision Tree, % of training data vs % accuracy")
# plt.show()
for i in list(range(1,11)) :
    x.append(i * 10)
income_graph = sns.pointplot(x, income_means_KNN, ax = ax)
sns.pointplot(x, income_mean_training_KNN, ax = ax, color = 'r')
income_graph.set_xlabel("% training data")
income_graph.set_ylabel("% accuracy")
income_graph.set_title("Income Set, KNN, % training data vs % accuracy")
plt.savefig('Income Set_KNN_Training_Data_Vs_Accuracy.png', bbox_inches='tight')
write_scores("Income Set_KNN_Training_Data_Vs_Accuracy.csv", x, income_means_KNN, x, income_mean_training_KNN)
write_scores("Income_KNN_TrainingSpeed.csv", x, training_time, None, None)
plt.show()
print "done"

done


In [11]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
for i in list(range(1,11)) :
    x.append(i * 3)
income_graph = sns.pointplot(x, income_means_KNN_neighbors, ax = ax)
sns.pointplot(x, income_mean_training_KNN_neighbors, ax = ax, color = 'r')
income_graph.set_xlabel("Number of neighbors")
income_graph.set_ylabel("% accuracy")
income_graph.set_title("Income Set, KNN, neighbors vs % accuracy")
plt.savefig("Income_KNN_Neighbors_Accuracy.png")
write_scores("Income_KNN_Neighbors_Accuracy.csv", x, income_means_KNN_neighbors, x, income_mean_training_KNN_neighbors)
plt.show()
print "done"

done


##Neural Networks

In [20]:
income_X, income_Y = processData(income, "income")
income_X_train = income_X.sample(frac=0.2, random_state=1)
income_X_test = income_X.drop(income_X_train.index)
# print income_X_train.index
# print(income_X_train)

# income_test = income.drop(income_train.index)
income_X_train, income_Y_train = processData(income_X_train, "income")
income_X_test, income_Y_test = processData(income_test, "income")

# print income_train.shape, income_test.shape
# income_X_train.to_csv;
# print (income_X_train.shape, income_X_test.shape, income_Y_train.shape, income_Y_test.shape)
# display(income_X_train)
# display(income_X_test)

KeyError: 'income'

In [23]:
income_MLP = MLPClassifier()
income_means_NN = []
income_means_NN_training = []
fit_time = []
score_time = []
for i in range(1, 5) :
    print "start"
    income_X_train = income_X.sample(frac=0.2 * i, random_state=1)
    income_X_test = income_X.drop(income_X_train.index)
    income_Y_train = income_Y[income_X_train.index]
    income_Y_test = income_Y.drop(income_Y_train.index)
    start = time.time()
    income_MLP.fit(income_X_train, income_Y_train)
    end = time.time()
    print(end - start)
    fit_time.append(end-start)
    score = income_MLP.score(income_X_test, income_Y_test)
    end2 = time.time()
    print(end2 - end)
    score_time.append(end2-end)
    print score
    income_means_NN.append(score)
    score = income_MLP.score(income_X_train, income_Y_train)
    income_means_NN_training.append(score)
    print score
# print "start"
# for i in range(1, 5):
#     income_sample = income.sample(income["income"].count() / (20 * 5) * i)
#     income_X, income_Y = processData(income_sample, "income")
#     income_scores = cross_val_score(income_dt, income_X, income_Y, cv = 10)
#     income_dt.fit(income_X, income_Y)
#     income_means.append(income_scores.mean())
#     income_std.append(income_scores.std())
#     income_mean_training.append(income_dt.score(income_X, income_Y))
# print "done"

start


15.1749999523
0.154000043869
0.8119335347432024
0.9589123867069487
start


17.2829999924
0.112999916077
0.8225579053373615
0.9264350453172205
start


22.8420000076
0.0759999752045
0.8280966767371601
0.9159113796576033
start


22.3280000687
0.0380001068115
0.8311178247734139
0.9010574018126888
start


32.0999999046


ValueError: Found array with 0 sample(s) (shape=(0, 107)) while a minimum of 1 is required.

In [34]:
income_MLP = MLPClassifier()
income_means_NN_layers = []
income_means_NN_training_layers = []
income_X_train = income_X.sample(frac=0.8, random_state=1)
income_X_test = income_X.drop(income_X_train.index)
income_Y_train = income_Y[income_X_train.index]
income_Y_test = income_Y.drop(income_Y_train.index)
for i in range(1, 11) :
    test_scores = []
    train_scores = []
    layers = [i * 20]
    income_MLP = MLPClassifier(learning_rate_init = float(i) / 50)
    for j in range(1, 5):
        print "start"
        start = time.time()
        income_MLP.fit(income_X_train, income_Y_train)
        end = time.time()
        print(end - start)
        test_score = income_MLP.score(income_X_test, income_Y_test)
        train_score = income_MLP.score(income_X_train, income_Y_train)
        print(train_score)
        print(test_score)
        test_scores.append(test_score)
        end2 = time.time()
        print(end2 - end)
        train_scores.append(train_score)
        print ("set " + str(i))
        
    income_means_NN_layers.append(np.mean(test_score))
    income_means_NN_training_layers.append(np.mean(train_score))
    print "done"

start


5.15199995041
0.8826283987915408
0.8347432024169185
0.197000026703
set 1
start


7.34599995613
0.8950906344410876
0.83595166163142
0.18700003624
set 1
start


8.18499994278
0.8916918429003021
0.8283987915407856
0.18499994278
set 1
start


12.0329999924
0.909214501510574
0.8287009063444108
0.194000005722
set 1
done
start


7.92199993134
0.877416918429003
0.8323262839879154
0.199000120163
set 2


start
8.12800002098
0.8740936555891239
0.8362537764350453
0.18799996376
set 2
start


7.08200001717
0.883761329305136
0.8347432024169185
0.186000108719
set 2
start


8.94999980927
0.8832326283987916
0.8202416918429003
0.18700003624
set 2
done
start


1.4889998436
0.8566465256797583
0.8398791540785498
0.1890001297
set 3
start


3.51099991798
0.8488670694864048
0.8181268882175227
0.18499994278
set 3
start


7.32400012016
0.8659365558912386
0.8356495468277946
0.1859998703
set 3
start


6.15699982643
0.8657099697885197
0.834441087613293
0.186000108719
set 3
done
start


1.80200004578
0.8257552870090634
0.8081570996978852
0.18799996376
set 4
start


6.68400001526
0.854607250755287
0.8283987915407856
0.186000108719
set 4
start


6.10800004005
0.8556646525679759
0.8262839879154078
0.18399977684
set 4
start


5.08699989319


0.8560422960725076
0.8280966767371601
0.223000049591
set 4
done
start


6.14800000191
0.8495468277945619
0.8223564954682779
0.18499994278
set 5
start


2.45399999619
0.8483383685800604
0.8323262839879154
0.186999797821
set 5
start


5.00400018692
0.8601963746223565
0.8311178247734139
0.190999984741
set 5
start


6.79500007629
0.8549848942598187
0.8241691842900302
0.186999797821
set 5
done
start


5.5529999733
0.8036253776435045
0.7990936555891238
0.186000108719
set 6
start


4.82799983025
0.8388972809667674
0.817522658610272
0.199000120163
set 6
start


5.26399993896
0.8592900302114803
0.8277945619335347
0.19000005722
set 6
start


5.95000004768
0.8525679758308157
0.8302114803625378
0.184000015259
set 6
done
start


7.53100013733
0.8521148036253776
0.8217522658610272
0.1859998703
set 7
start


4.9350001812
0.8386706948640483
0.817522658610272
0.188999891281
set 7
start


6.37600016594
0.8430513595166164
0.816012084592145
0.184000015259
set 7
start


4.36999988556
0.8222054380664653
0.8045317220543806
0.185000181198
set 7
done
start


2.91499996185
0.8307401812688822
0.8120845921450152
0.18499994278
set 8
start


4.91899991035
0.818655589123867
0.7972809667673716
0.185000181198
set 8
start


7.25499987602
0.8564199395770393
0.8305135951661632
0.184000015259
set 8
start


5.26399993896
0.8432779456193353
0.8181268882175227
0.185000181198
set 8
done
start


5.80999994278
0.8322507552870091
0.8021148036253777
0.186000108719
set 9
start


5.5720000267
0.8112537764350454
0.7933534743202417
0.1859998703
set 9
start


6.08899998665
0.8392749244712991
0.8072507552870091
0.185000181198
set 9
start


2.36299991608
0.7962235649546828
0.7882175226586102
0.18700003624
set 9
done
start


3.78600001335
0.7979607250755287
0.7927492447129909
0.182999849319
set 10
start


5.25
0.8312688821752265
0.8151057401812689
0.186999797821
set 10
start


6.96300005913
0.8512839879154078
0.8299093655589124
0.184000015259
set 10
start


5.21100020409
0.78904833836858
0.7737160120845922
0.1859998703
set 10
done


In [25]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
for i in list(range(1,5)) :
    x.append(i * .2)
income_graph = sns.pointplot(x, income_means_NN, ax = ax)
sns.pointplot(x, income_means_NN_training, ax = ax, color = 'r')
income_graph.set_xlabel("% training data")
income_graph.set_ylabel("% accuracy")
income_graph.set_title("Income Set, NN, % training data vs % accuracy")
write_scores("Income_NN_training_data_accuracy.csv", x, income_means_NN, x, income_means_NN_training)
write_scores("Income_NN_training_accuracy_time.csv", x, fit_time, x, score_time)
plt.savefig("income_NN_training_accuracy_accuracy.png")
plt.show()
print "done"

done


In [36]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
for i in list(range(1,11)) :
    x.append(float(i)/20)
income_graph = sns.pointplot(x, income_means_NN_layers, ax = ax)
sns.pointplot(x, income_means_NN_training_layers, ax = ax, color = 'r')
income_graph.set_xlabel("learning rate")
income_graph.set_ylabel("% accuracy")
income_graph.set_title("Income Set, NN, learning rate vs % accuracy")
plt.savefig("Income_NN_Learning_Accuracy.png")
write_scores("Income_NN_Learning_Accuracy.csv", x, income_means_NN_layers, x, income_means_NN_training_layers)
plt.show()
print "done"

done


## Boosting

In [71]:
file = open("income_adaboost_max_depth_results.csv", "w")
print("Beginning model complexity analysis for AdaBoost... max_depth")
file.write("max_depth" + ", " + "cross_val_score" + ", " + "training_score" + ", " + "testing_score" + ", training_time\n")
for i in range(1, 11):
    classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=i * 5), n_estimators=50)
    result = ""
    input_list, output_list = processData(income, "income")
    result += (str(i + 1) + "," + str(cross_val_score(
    classifier, input_list, output_list).mean()) + ", ")
    start = time.time()
    classifier.fit(input_list, output_list)
    end = time.time()
    result += str(classifier.score(input_list, output_list)) + ", " + str(end-start) + "\n"
    print(result)
    file.write(result)

Beginning model complexity analysis for AdaBoost... max_depth


2,0.8306946656496593, 0.9209667673716012, 3.72200012207



3,0.822598521921979, 1.0, 5.14399981499



4,0.8245316988661281, 1.0, 6.83899998665



5,0.8335352166369631, 1.0, 7.0600001812



6,0.8312991435398503, 1.0, 7.17599987984



7,0.8350456227129989, 1.0, 7.71399998665



8,0.8398792880020264, 1.0, 7.46799993515



9,0.8198199127762481, 1.0, 7.62300014496



10,0.8088822432029262, 1.0, 7.25



11,0.8082176307927998, 1.0, 0.144999980927



In [72]:
file.close()

## SVM

In [135]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
for i in list(range(1,21)) :
    x.append(i * .05)
print (income_means_boost)
print len(x)
income_graph = sns.pointplot(x, income_means_boost, ax = ax, label='test_set_accuracy')
sns.pointplot(x, income_mean_training_boost, ax = ax, color = 'r', label='train_set_accuracy')
income_graph.set_xlabel("% training data")
income_graph.set_ylabel("% accuracy")
income_graph.set_title("Income Set, Boosting, % training data vs % accuracy")
# income_graph.legend(loc='upper left')
# plt.legend()
plt.show()
print "done1"   

[0.8532906821746244, 0.8704587888192432, 0.8704587888192432, 0.8704587888192432, 0.8704587888192432, 0.8704587888192432, 0.8704587888192432, 0.8704587888192432, 0.8704587888192432, 0.8704587888192432]
10
done1


## SVM

In [46]:
def runSVM(kernel, runs, data, feature):
    score = []
    score_train = []
    data_X, data_Y = processData(data, feature)
    data_X_train = data_X.sample(frac=0.8, random_state=1)
    # print(data_X_train.index)
    data_X_test = data_X.drop(data_X_train.index)
    data_Y_train = data_Y[data_X_train.index]
    data_Y_test = data_Y.drop(data_Y_train.index)
    for i in range(1, runs): 
        print i
        SVM = SVC(kernel=kernel)
        SVM.fit(data_X_train, data_Y_train)
        score.append(SVM.score(data_X_test, data_Y_test))
        score_train.append(SVM.score(data_X_train, data_Y_train))
    return np.mean(score), np.mean(score_train), 

In [37]:
income_SVM = SVC()
income_means_SVM = []
income_std_SVM = []
income_mean_SVM_training = []
time_run = []
print "start"
for i in range(1, 11):
    income_sample = income.sample(income["income"].count() / 10 * i)
    income_X, income_Y = processData(income_sample, "income")
    income_scores = cross_val_score(income_SVM, income_X, income_Y, cv = 10)
    start = time.time()
    income_SVM.fit(income_X, income_Y)
    end = time.time()
    time_run.append(end - start)
    income_means_SVM.append(income_scores.mean())
    income_std_SVM.append(income_scores.std())
    income_mean_SVM_training.append(income_SVM.score(income_X, income_Y))
    print i
print "done"

start


1


2


3


4


5


6


7


8


9


10
done


In [47]:
income_means_SVM = []
income_std_SVM = []
income_mean_SVM_training = []
linear_score, linear_score_train = runSVM('linear', 2, income, "income")
poly_score, poly_score_train = runSVM('poly', 2, income, "income")
rbf_score, rbf_score_train = runSVM('rbf', 2, income, "income")
sigmoid_score, sigmoid_score_train = runSVM('sigmoid', 2, income, "income")


1


1


1


1


In [51]:
SVM_df = pd.DataFrame({'linear': [linear_score, linear_score_train], 'poly': [poly_score, poly_score_train], 'rbf': [rbf_score, rbf_score_train], 'sigmoid': [sigmoid_score, sigmoid_score_train]})
display(SVM_df)
file = open("income_SVM_kernel.csv", "w")
file.write("linear_score, linear_train, poly_score, poly_train, rbf_score, rbf_train, sigmoid_score, sigmoid_train\n")
file.write(str(linear_score) + ", " + str(linear_score_train) + ", " + str(poly_score) + ", " 
           + str(poly_score_train) + ", " + str(rbf_score) + ", " + str(rbf_score_train) + ',' + str(sigmoid_score) + "," + str(sigmoid_score_train))
file.close()

Unnamed: 0,linear,poly,rbf,sigmoid
0,0.840181,0.816918,0.838973,0.833233
1,0.853776,0.854909,0.864502,0.827719


In [44]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
for i in list(range(1,11)) :
    x.append(format(i * .1, '.2f'))
print x
print income_means_SVM
income_graph = sns.pointplot(x, income_means_SVM, ax = ax)
sns.pointplot(x, income_mean_SVM_training, ax = ax, color = 'r')
income_graph.set_xlabel("% training data")
income_graph.set_ylabel("% accuracy")
income_graph.set_title("Income Set, SVM, % training data vs % accuracy")
plt.savefig("Income_SVM_training_accuracy.png")
write_scores("Income_SVM_training_accuracy.csv", x, income_means_SVM, x, income_mean_SVM_training)
write_scores("Income_SVM_time.csv", x, time_run, None, None)
plt.show()
print "done"

['0.10', '0.20', '0.30', '0.40', '0.50', '0.60', '0.70', '0.80', '0.90', '1.00']
[0.8423282041692268, 0.8392643300474626, 0.8434991751910609, 0.8404906969455543, 0.8404838165840178, 0.8453242235876358, 0.8454034509834389, 0.8419920960435452, 0.8451166437341687, 0.8441089468100653]
done


## HR Set

In [8]:
HR_dt = DecisionTreeClassifier(max_leaf_nodes=7)
HR_means = []
HR_std = []
HR_mean_training = []
for i in range(1, 51):
    HR_sample = HR.sample(HR["left"].count() / 50 * i)
    HR_Y = HR_sample["left"]
    HR_X = HR_sample.drop("left", axis = 1)
    HR_X = pd.get_dummies(HR_X)
    HR_X = preprocessing.scale(HR_X)
    HR_scores = cross_val_score(HR_dt, HR_X, HR_Y, cv = 10)
    HR_dt.fit(HR_X, HR_Y)
    HR_means.append(HR_scores.mean())
    HR_std.append(HR_scores.std())
    HR_mean_training.append(HR_dt.score(HR_X, HR_Y))
print "done"

done


In [9]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
x = []
for i in list(range(1,51)) :
    x.append(i*2)
HR_graph = sns.pointplot(x, HR_means, ax = ax)
sns.pointplot(x, HR_mean_training, ax = ax, color = 'r')
HR_graph.set_xlabel("% of training data")
HR_graph.set_ylabel("% accuracy")
HR_graph.set_title("HR Set, Decision Tree, % of training data vs % accuracy")
plt.show()
print "done"

done
