## Data Processing

In [None]:
import sys
# the mock-0.3.1 dir contains testcase.py, testutils.py & mock.py
import numpy as np
from data_loader_df import *
import pandas as pd
from sklearn import svm, linear_model, metrics, neural_network, ensemble
import matplotlib.pyplot as plt
# Raise the pandas display limits (up to 500 columns, 500 rows and a
# 1000-character line width) so wide frames are shown in full in cell output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)

In [None]:
# Locations of the raw training and test files.
train_file_path = "../data/adult.data"
test_file_path = "../data/adult.test"

# Options shared by both loads below; only the return format (is_df) differs.
_load_opts = dict(valid_rate=0.1, norm=True, one_hot=True)

# Frame form of the three splits (used further down for column-wise statistics).
# NOTE(review): the loader is called twice; if it draws the validation split at
# random, the frame splits and the array splits may not contain the same rows
# -- confirm against data_loader_df.
train, val, test = load_all_data(
    train_file_path, test_file_path, is_df=True, **_load_opts
)

# Feature / label form of the same splits, used to fit and score the models.
train_X, train_Y, val_X, val_Y, test_X, test_Y = load_all_data(
    train_file_path, test_file_path, is_df=False, **_load_opts
)

In [None]:
# Display the training split returned by the is_df=True load.
train

In [None]:
# All column names, including the "income" column.
# NOTE(review): presumably listed in the order they appear in the raw files -- confirm.
COLUMNS = [
    "age",
    "work",
    "fnlwgt",
    "edu",
    "edunum",
    "mstatus",
    "occ",
    "relation",
    "race",
    "sex",
    "cgain",
    "closs",
    "hpw",
    "nation",
    "income",
]

# Columns whose summary statistics are inspected per split.
COLS_TO_NORM = ["age", "fnlwgt", "edunum", "cgain", "closs", "hpw"]

# Remaining feature columns (everything except COLS_TO_NORM and "income").
CATEGORICAL_COLS = [
    "work",
    "edu",
    "mstatus",
    "occ",
    "relation",
    "race",
    "sex",
    "nation",
]

In [None]:
# Per-feature mean and (population, ddof=0) variance of the COLS_TO_NORM
# columns for every split.
#
# The DataFrame methods are called with an explicit axis=0 instead of going
# through np.mean / np.var: NumPy forwards axis=None to pandas, and pandas >= 2.0
# interprets that as "reduce over both axes", which collapses the per-column
# report into a single scalar. The headers also now say "variance", which is
# what has always been printed here (the old text said "std").
for split_name, split_df in (("training", train), ("validation", val), ("testing", test)):
    numeric_part = split_df[COLS_TO_NORM]
    print("----mean & variance of features of " + split_name + " data----")
    print(numeric_part.mean(axis=0))
    print(numeric_part.var(axis=0, ddof=0))

### Helper Functions

In [None]:
def auc(X, Y, reg):
    """Return the area under the ROC curve of the fitted model `reg` on (X, Y).

    The ROC curve is built from continuous scores whenever the model exposes
    them, because a curve built from hard 0/1 predictions has a single
    operating point and understates the ranking quality of the model:

    * `decision_function(X)` is used if available;
    * otherwise the second column of `predict_proba(X)` is used;
    * otherwise (e.g. a hard-voting ensemble) the function falls back to
      `predict(X)`, which is what was always used before.

    Parameters
    ----------
    X : features passed unchanged to the model.
    Y : true binary labels passed to `metrics.roc_curve`.
    reg : fitted estimator offering at least `predict`.

    Returns
    -------
    The value of `metrics.auc` over the resulting (fpr, tpr) curve.
    """
    if hasattr(reg, "decision_function"):
        Y_score = reg.decision_function(X)
    elif hasattr(reg, "predict_proba"):
        # Column 1 is the probability of the second entry of reg.classes_.
        Y_score = reg.predict_proba(X)[:, 1]
    else:
        Y_score = reg.predict(X)
    fpr, tpr, _ = metrics.roc_curve(Y, Y_score)
    return metrics.auc(fpr, tpr)

In [None]:
def _matshow_single(X, Y, X_label, Y_label, scores, title):
    """Draw one heat map of `scores` with X labelling columns and Y labelling rows."""
    f, ax = plt.subplots(1, 1, figsize=(8, 5))
    cax = ax.matshow(scores, interpolation='nearest')
    f.colorbar(cax, fraction=0.01, pad=0.04)
    # Pin the tick positions on both axes before labelling them, so that each
    # label is attached to its own row / column regardless of which default
    # tick locations matplotlib would otherwise pick.
    ax.set_xticks(np.arange(len(X)))
    ax.set_xticklabels(list(X))
    ax.set_yticks(np.arange(len(Y)))
    ax.set_yticklabels(list(Y))
    ax.set_title(title, pad=20)
    ax.set_xlabel(X_label)
    ax.set_ylabel(Y_label)
    plt.show()


def matshow(X, Y, X_label, Y_label, train_auc, val_auc, test_auc):
    """Show the train, validation and test AUC grids as three heat maps.

    Parameters
    ----------
    X : sequence of values labelling the columns of each grid.
    Y : sequence of values labelling the rows of each grid.
    X_label, Y_label : axis captions.
    train_auc, val_auc, test_auc : 2-D grids indexed as [row (Y), column (X)].

    One figure is produced per grid, in the order train, val, test.
    """
    for split, scores in (("train", train_auc), ("val", val_auc), ("test", test_auc)):
        _matshow_single(X, Y, X_label, Y_label, scores, "matshow of " + split + " auc")

## Linear Classification

In [None]:
# Grid over the regularisation strength C and the penalty type for logistic
# regression; one AUC value per (penalty, C) pair and per data split.
Cs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1, 2, 5, 10, 20, 50]
penalties = ['l1', 'l2']

auc_train = np.zeros((len(penalties), len(Cs)))
auc_val = np.zeros((len(penalties), len(Cs)))
auc_test = np.zeros((len(penalties), len(Cs)))

for i, penalty in enumerate(penalties):
    for j, C in enumerate(Cs):
        print("processing C =", C)
        # The solver is named explicitly: 'liblinear' handles both 'l1' and
        # 'l2', whereas the default solver of recent scikit-learn releases
        # ('lbfgs') rejects penalty='l1' with a ValueError.
        reg = linear_model.LogisticRegression(penalty=penalty, C=C, solver='liblinear')
        reg.fit(train_X, train_Y)

        auc_train[i][j] = auc(train_X, train_Y, reg)
        auc_val[i][j] = auc(val_X, val_Y, reg)
        auc_test[i][j] = auc(test_X, test_Y, reg)

In [None]:
# Heat maps of the logistic-regression AUC grids: columns are C values, rows are penalties.
matshow(Cs, penalties, 'C', 'penalty', auc_train, auc_val, auc_test)

## SVM

In [None]:
# Grid over the SVM kernel and the regularisation strength C; one AUC value per
# (kernel, C) pair and per data split.
Cs = [0.1, 0.5, 1, 2, 5, 10, 20]
kernels = ['rbf', 'linear', 'poly']

_svm_grid_shape = (len(kernels), len(Cs))
auc_train = np.zeros(_svm_grid_shape)
auc_val = np.zeros(_svm_grid_shape)
auc_test = np.zeros(_svm_grid_shape)

for row, kernel in enumerate(kernels):
    for col, c_value in enumerate(Cs):
        print("processing k=", kernel, ", C=", c_value)
        # degree is passed for every kernel, as before.
        learner = svm.SVC(C=c_value, kernel=kernel, degree=2, gamma='scale')
        learner.fit(train_X, train_Y)
        auc_train[row, col] = auc(train_X, train_Y, learner)
        auc_val[row, col] = auc(val_X, val_Y, learner)
        auc_test[row, col] = auc(test_X, test_Y, learner)

In [None]:
# Heat maps of the SVM AUC grids: columns are C values, rows are kernels.
matshow(Cs, kernels, 'C', 'kernel', auc_train, auc_val, auc_test)

## Neural Network

In [None]:
# Grid over the MLP architecture: number of hidden layers (rows) and number of
# units per hidden layer (columns); one AUC value per pair and per data split.
layers = range(1, 7, 1)
nodes = range(4, 22, 2)
auc_train = np.zeros((len(layers), len(nodes)))
auc_val = np.zeros((len(layers), len(nodes)))
auc_test = np.zeros((len(layers), len(nodes)))

for i, layer in enumerate(layers):
    for j, node in enumerate(nodes):
        print("processing layer=",layer,", node=",node)
        # `layer` hidden layers, each with `node` units.
        hidden_layer = (node,) * layer
        learner = neural_network.MLPClassifier(hidden_layer_sizes=hidden_layer, activation='relu', solver='adam')
        learner.fit(train_X, train_Y)

        auc_train[i][j] = auc(train_X, train_Y, learner)
        auc_val[i][j] = auc(val_X, val_Y, learner)
        # Score the test split once and reuse the value for both the progress
        # line and the grid, instead of running the model over it twice.
        test_score = auc(test_X, test_Y, learner)
        print("test auc: ", test_score)
        auc_test[i][j] = test_score


In [None]:
# Heat maps of the MLP AUC grids: columns are units per layer, rows are layer counts.
matshow(nodes, layers, '#nodes', '#layers', auc_train, auc_val, auc_test)

In [None]:
# Show the raw test-AUC grid of the most recently run grid search.
auc_test

### Ensemble the models

In [None]:
# The three base models that are combined in the ensemble further down.
svml = svm.SVC(C=5, kernel='rbf', gamma='scale')
nn = neural_network.MLPClassifier(hidden_layer_sizes=(8, 8, 8), activation='relu', solver='adam')
# rf = ensemble.RandomForestClassifier(n_estimators=110,max_features=8,min_samples_leaf=10)
# The solver is named explicitly: 'liblinear' supports the 'l1' penalty, while
# the default solver of recent scikit-learn releases ('lbfgs') rejects it.
lc = linear_model.LogisticRegression(penalty='l1', C=2, solver='liblinear')

In [None]:
# Majority-vote ("hard" voting) ensemble over the three base models.
_ensemble_members = [
    ("neural network", nn),
    ("svm", svml),
    ("linear classification", lc),
]
esb = ensemble.VotingClassifier(estimators=_ensemble_members, voting="hard")

In [None]:
# Fit the voting ensemble on the training arrays.
# NOTE(review): scikit-learn's VotingClassifier is documented to fit clones of
# the estimators it is given, so nn / svml / lc themselves would still be
# unfitted after this call -- confirm for the installed version.
esb.fit(train_X, train_Y)

In [None]:
# Fit each base model on its own as well, so that the individual models can be
# evaluated next to the ensemble.
lc.fit(train_X, train_Y)
svml.fit(train_X, train_Y)
nn.fit(train_X, train_Y)

In [None]:
# Test-set report for every base model and for the ensemble: a confusion matrix
# followed by the ROC AUC of the hard predictions.
# Each model predicts the test set exactly once and the result is reused for
# both outputs (previously every model was run over the test set twice).
for model_label, model in (
    ("linear classifer", lc),
    ("svm", svml),
    ("neural network", nn),
    ("ensemble", esb),
):
    print(model_label + ":")
    test_pred = model.predict(test_X)
    print(visualize_confusion_matrix(test_Y, test_pred))
    print("auc = ", metrics.roc_auc_score(test_Y, test_pred))

In [None]:
# Inspect the fitted coefficients of the logistic-regression model.
lc.coef_

In [None]:
# AUC of the ensemble on the training and validation splits.
for split_name, split_X, split_Y in (("train", train_X, train_Y), ("val", val_X, val_Y)):
    print(split_name + " auc of ensemble: ", auc(split_X, split_Y, esb))

## Random Forest & Decision Tree

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score,KFold,GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Reload the data for the tree models with no validation split (valid_rate=0),
# no normalisation and no one-hot encoding.
# This rebinds train_X / train_Y / test_X / test_Y, replacing the arrays that
# the models above were trained on; re-running earlier cells after this one
# would therefore use these arrays instead.
train_file_path = "../data/adult.data"
test_file_path = "../data/adult.test"
train_X, train_Y, _, _, test_X, test_Y = load_all_data(train_file_path, test_file_path, valid_rate=0, is_df=False, norm=False, one_hot=False)

In [None]:
# For trees, categorical features are label-encoded rather than one-hot encoded.
# Train and test rows are stacked first so that each encoder is fitted on the
# values of both sets.
dataset = np.concatenate((train_X, test_X), axis=0)
# Encode every column that holds strings (decided from the first row); other
# columns are left untouched. The column count is taken from the array itself
# rather than hard-coded, and isinstance also accepts str subclasses such as
# numpy string scalars.
for j in range(dataset.shape[1]):
    if isinstance(dataset[0][j], str):
        labelencoder = LabelEncoder()
        dataset[:, j] = labelencoder.fit_transform(dataset[:, j])

In [None]:
# Split the stacked, encoded array back into its train and test parts.
# The boundary is the number of training labels rather than a hard-coded row
# count, so the split stays correct if the loader returns a different number of
# rows; train_Y is not rebound here, so re-running this cell gives the same result.
n_train = len(train_Y)
train_X, test_X = dataset[:n_train], dataset[n_train:]
print(train_X[0])
print(test_X[0])
print(train_X.shape, test_X.shape)

In [None]:
# Random forest on the label-encoded features.
_rf_params = {"n_estimators": 110, "max_features": 8, "min_samples_leaf": 10}
rf = RandomForestClassifier(**_rf_params)
rf.fit(train_X, train_Y)

In [None]:
# Test-set report for the random forest: confusion matrix and ROC AUC of the
# hard predictions.
print("random forest:")
rf_test_pred = rf.predict(test_X)
print(visualize_confusion_matrix(test_Y, rf_test_pred))
print("auc = ", metrics.roc_auc_score(test_Y, rf_test_pred))

In [None]:
# Single decision tree on the label-encoded features.
_dt_params = {"max_depth": 44, "max_features": 8, "min_samples_leaf": 50}
dt = DecisionTreeClassifier(**_dt_params)
dt.fit(train_X, train_Y)

In [None]:
# Test-set report for the decision tree: confusion matrix and ROC AUC of the
# hard predictions.
print("decision tree:")
dt_test_pred = dt.predict(test_X)
print(visualize_confusion_matrix(test_Y, dt_test_pred))
print("auc = ", metrics.roc_auc_score(test_Y, dt_test_pred))

In [None]:
# Sum of the training labels (the number of positives if labels are 0/1 -- TODO confirm).
sum(train_Y)

In [None]:
# (count - sum) / sum over the test labels; if labels are 0/1 this is the
# number of negatives per positive -- TODO confirm the label encoding.
(len(test_Y) - sum(test_Y)) / sum(test_Y)

In [None]:
# Table of the logistic-regression coefficients, labelled with the feature
# column names of the training frame (every column except 'income').
# NOTE(review): assumes the frame's column order matches the column order of
# the array lc was fitted on -- confirm against data_loader_df.
_feature_names = train.columns.drop('income')
coeffs = pd.DataFrame(lc.coef_, columns=_feature_names)

In [None]:
# Display the coefficient table.
coeffs