In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import mltools as ml
import math
import csv

In [2]:
from numpy import asarray as arr
from numpy import asmatrix as mat
from numpy import atleast_2d as twod
from itertools import islice

### Load data

In [3]:
data_train = pd.read_csv('training_preprocessed.csv',delimiter=',',skipinitialspace=True)
data_test = pd.read_csv('test_preprocessed.csv',delimiter=',',skipinitialspace=True)

In [4]:
categorical_columns_all = ['salary', 'sex', 'education', 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']

In [5]:
data_train = data_train.drop(columns=categorical_columns_all)
data_test = data_test.drop(columns=categorical_columns_all)

In [6]:
keys = data_train.keys()
data_cols = keys.drop('salary_binary')
label_col = 'salary_binary'

In [7]:
X_test = data_test[data_cols]
y_test = data_test[label_col]

### Training and validation split

In [8]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_val, y_train, y_val = train_test_split(data_train[data_cols], data_train[label_col], test_size=0.2, random_state=42)

### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [12]:
rf_clf = RandomForestClassifier(max_depth=20, n_estimators=20)

In [13]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
rf_clf.score(X_train, y_train)

0.9262712918065399

In [15]:
rf_clf.score(X_test, y_test)

0.8456839309428951

In [18]:
y_pred = rf_clf.predict(data_test[data_cols])

In [20]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(data_test[label_col],y_pred))
print(classification_report(data_test[label_col],y_pred))

[[10667   693]
 [ 1631  2069]]
              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     11360
         1.0       0.75      0.56      0.64      3700

   micro avg       0.85      0.85      0.85     15060
   macro avg       0.81      0.75      0.77     15060
weighted avg       0.84      0.85      0.84     15060



In [21]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf_clf, X_train, y_train, cv=10)

In [22]:
np.mean(scores)

0.8555680245511608

### SVM

In [None]:
# from sklearn import svm

In [None]:
# svm_clf = svm.SVC(tol=0.1, degree=2)

In [None]:
# svm_clf.fit(data_train[train_cols], data_train[train_keys])

In [None]:
# svm_clf.score(data_train[train_cols], data_train[train_keys])

In [None]:
# svm_clf.score(data_test[train_cols], data_test[train_keys])

### Neural Network

In [23]:
from sklearn import neural_network

In [43]:
mlp_clf = neural_network.MLPClassifier(hidden_layer_sizes=(100,200))

In [44]:
mlp_clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 200), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [45]:
mlp_clf.score(X_train, y_train)

0.7954328815947614

In [46]:
mlp_clf.score(X_val, y_val)

0.79678435272667

In [70]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, X_train, y_train, cv=5)

In [71]:
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=1000, random_state=None)

In [72]:
clf.score(X_train, y_train)

0.8720212192797049

In [73]:
clf.score(X_test, y_test)

0.8161354581673307