In [6]:
# imports

# data manipulation
import pandas as pd
import numpy as np

# visualiation
#import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# feature engineering
from sklearn.preprocessing import StandardScaler

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.ensemble import RandomForestClassifier # random forest
from sklearn.ensemble import GradientBoostingClassifier # gradient boosting

In [4]:
# load data
train_file = './data/train.csv'
train_data = pd.read_csv(train_file)
train_data = pd.DataFrame(data=train_data)

test_file = './data/test.csv'
test_data = pd.read_csv(test_file)
test_data = pd.DataFrame(data=test_data)

In [7]:
# categorical
def preprocess(data):
    preprocessed_data = data.copy()
    
    # drop - too many categories and not very relevant intuitively
    preprocessed_data = preprocessed_data.drop(['native-country'], axis=1)
    # drop - duplicate of education-num
    preprocessed_data = preprocessed_data.drop(['education'], axis=1)
    preprocessed_data = pd.get_dummies(preprocessed_data)
    preprocessed_data = preprocessed_data.drop(['occupation_ ?'], axis=1)
    preprocessed_data = preprocessed_data.drop(['workclass_ ?'], axis=1)
    
    return preprocessed_data

preprocessed_train_data = preprocess(train_data)
preprocessed_test_data = preprocess(test_data)

x_train = preprocessed_train_data.drop('exceeds50K', axis=1)
y_train = preprocessed_train_data['exceeds50K']
#normalization
x_train_normalized = StandardScaler().fit_transform(x_train)
x_train = pd.DataFrame(x_train_normalized, columns=x_train.columns)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.25, random_state=42, shuffle=True)
x_train.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Female,sex_ Male
4441,-0.627855,-0.596586,-0.425925,-0.145954,-0.216863,0.362925,-0.176156,-0.265404,-0.016933,0.669478,...,-0.175909,-0.222605,1.214916,-0.584774,-0.182122,-0.430146,-0.340634,-0.227227,-0.703375,0.703375
16596,-1.139672,0.804556,-2.368098,-0.145954,-0.216863,-1.658539,-0.176156,-0.265404,-0.016933,0.669478,...,-0.175909,-0.222605,1.214916,-0.584774,-0.182122,-0.430146,-0.340634,-0.227227,-0.703375,0.703375
6876,1.346294,-0.521784,-0.425925,-0.145954,-0.216863,-0.041368,-0.176156,-0.265404,-0.016933,0.669478,...,-0.175909,-0.222605,1.214916,-0.584774,-0.182122,-0.430146,-0.340634,-0.227227,-0.703375,0.703375
15486,1.273178,-1.099788,-2.368098,-0.145954,-0.216863,-0.041368,-0.176156,-0.265404,-0.016933,0.669478,...,-0.175909,-0.222605,1.214916,-0.584774,-0.182122,-0.430146,-0.340634,-0.227227,-0.703375,0.703375
15141,-1.212788,0.001238,-0.03749,-0.145954,-0.216863,-0.041368,-0.176156,-0.265404,-0.016933,0.669478,...,5.684761,-0.222605,-0.823102,-0.584774,-0.182122,2.324794,-0.340634,-0.227227,-0.703375,0.703375


# Try all classification models

In [9]:
# RUN MODELS

classifiers = [KNeighborsClassifier, LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier, LinearDiscriminantAnalysis, GaussianNB, SVC]
model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM', 'LDA', 'GNB', 'SVC'] 

acc = []
f1 = []
for classifier in classifiers:
    model = classifier().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    f1.append(round(f1_score(y_test, y_pred, average='weighted') * 100, 2))
    acc.append(round(accuracy_score(y_test, y_pred) * 100, 2))

F1_record = pd.DataFrame({'Model': model_names, 'f1': f1, 'acc': acc})
#F1_record['F1_mean'] = F1_record.mean(axis=1).round(2)
F1_record.set_index('Model', inplace=True)
#F1_record.loc['avg'] = F1_record.mean()

print('\n')
print(F1_record)





          f1    acc
Model              
KNN    82.40  82.82
LR     85.08  85.59
DT     80.87  80.71
RF     84.31  84.93
GBM    86.64  87.31
LDA    83.45  84.08
GNB    62.18  59.70
SVC    84.73  85.29


# Tune Hyperparameters
> GBM seems to be the 

In [11]:
# TUNE HYPERPARAMETERS
classifier = GradientBoostingClassifier()
#learning_rate = np.arange(0.3, 2.1, 0.3)
max_depth = list(range(3,11))
param_grid = dict(max_depth=max_depth)
grid = GridSearchCV(classifier, param_grid=param_grid, cv=10, scoring='f1', verbose=True, n_jobs=2)
grid.fit(x_train, y_train)


Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.9min
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:  6.4min finished


{'max_depth': 6}
0.7095822769615525


In [40]:
# TEST TUNED HYPERPARAMETERS
classifier = GradientBoostingClassifier(learning_rate=0.1, n_estimators=700, max_depth=3)
model = classifier.fit(x_train, y_train)
y_pred = model.predict(x_train)
print(round(f1_score(y_train, y_pred, average='weighted') * 100, 2))
y_pred = model.predict(x_test)
print(round(f1_score(y_test, y_pred, average='weighted') * 100, 2))

89.73
87.15


In [29]:
# SAVE Y_PRED TO CSV FILE
print(y_pred)
y_pred_file_name = './predictions.csv'
y_pred_file = open(y_pred_file_name, 'w')
output = 'id,prediction\n'
index = 1
for i in y_pred:
    output += str(index)
    output += ','
    output += str(i)
    output += '\n'
    index += 1
y_pred_file.write(output)
y_pred_file.close()
print(index)

[0 0 0 ... 0 0 0]
24422
