In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, MaxPool1D, Flatten, Conv1D
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.


In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
names = ['age', 'workclass', 'fnlwgt', 'education', 'educationnum', 'maritalstatus', 'occupation', 'relationship', 'race',
        'sex', 'capitalgain', 'capitalloss', 'hoursperweek', 'nativecountry', 'label']
train_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                      header=None, names=names)
test_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                      header=None, names=names, skiprows=[0])
all_df = pd.concat([train_df, test_df])

In [4]:
all_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationnum,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
all_df.shape

(48842, 15)

In [6]:
drop_columns = ['fnlwgt', 'education']
continuous_features = ['age', 'capitalgain', 'capitalloss', 'hoursperweek']
cat_features =['educationnum', 'workclass', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'nativecountry']

In [7]:
all_df_dummies = pd.get_dummies(all_df, columns=cat_features)

In [8]:
all_df_dummies.drop(drop_columns, 1, inplace=True)

In [9]:
y = all_df_dummies['label'].apply(lambda x: 0 if '<' in x else 1)
X = all_df_dummies.drop(['label'], 1)

In [10]:
y.value_counts(normalize=True)

0    0.760718
1    0.239282
Name: label, dtype: float64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
X_train.shape

(32724, 106)

## Cleaning Pipeline

In [13]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])

In [14]:
X_train_clean = clean_pipeline.fit_transform(X_train)

In [15]:
X_test_clean = clean_pipeline.transform(X_test)

## Metrics

In [16]:
def evaluate(true, pred):
    f1 = metrics.f1_score(true, pred)
    roc_auc = metrics.roc_auc_score(true, pred)
    accuracy = metrics.accuracy_score(true, pred)
    print("F1: {0}\nROC_AUC: {1}\nACCURACY: {2}".format(f1, roc_auc, accuracy))
    return f1, roc_auc, accuracy

## Logistic Regression

In [17]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
lr_predictions = clf.predict(X_test)

In [19]:
lr_f1, lr_roc_auc, lr_acc = evaluate(y_test, lr_predictions)

F1: 0.6507094739859539
ROC_AUC: 0.7574953226590644
ACCURACY: 0.8488025809653803


## Tuned Logistic Regression

In [20]:
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
tuned_lr = GridSearchCV(LogisticRegression(), lr_grid, scoring='f1', n_jobs=10)
tuned_lr.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [21]:
print("Best F1 Validation Score: {}".format(tuned_lr.best_score_))

Best F1 Validation Score: 0.6595912825591275


In [22]:
tuned_lr.best_params_

{'C': 1, 'penalty': 'l1'}

In [23]:
tuned_lr_predictions = tuned_lr.predict(X_test)
tuned_lr_f1, tuned_lr_roc_auc, tuned_lr_acc = evaluate(y_test, tuned_lr_predictions)

F1: 0.6509028374892518
ROC_AUC: 0.7576230692099075
ACCURACY: 0.8488646234024072


## Gradient Boosted Trees

In [24]:
gbt = GradientBoostingClassifier()
gbt.fit(X_train, y_train)
gbt_predictions = clf.predict(X_test)
gbt_f1, gbt_roc_auc, gbt_acc = evaluate(y_test, gbt_predictions)

F1: 0.6507094739859539
ROC_AUC: 0.7574953226590644
ACCURACY: 0.8488025809653803


## GBT Tuned

In [25]:
#gbt_grid = {'learning_rate': [.01], 'n_estimators': [250, 500, 1000], 'max_depth': [3, 4, 5]}
gbt_tuned = GradientBoostingClassifier(learning_rate=.01, n_estimators=1000, max_depth=5)
gbt_tuned.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=1000, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [26]:
gbt_tuned_predictions = gbt_tuned.predict(X_test)
gbt_tuned_f1, gbt_tunded_roc_auc, gbt_tuned_acc = evaluate(y_test, gbt_tuned_predictions)

F1: 0.7042577675489067
ROC_AUC: 0.7885511539729889
ACCURACY: 0.8724407494726393


## Deep Learning Simple

In [27]:
model_simple = Sequential()
model_simple.add(Dense(1024, activation='relu' , input_dim = X_train.shape[1]))
model_simple.add(Dropout(0.5))
model_simple.add(Dense(2, activation='softmax', name='softmax'))

In [28]:
y_train_cat = to_categorical(y_train.values, 2)

In [29]:
model_simple.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [30]:
model_simple.fit(X_train.values, y_train_cat, batch_size=32, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f97bf3b0b70>

In [31]:
deep_predictions_simple = model_simple.predict(X_test.values)
deep_simple_f1, deep_simple_roc_auc, deep_simple_acc = evaluate(np.argmax(deep_predictions_simple, 1), y_test)

F1: 0.4804451510333863
ROC_AUC: 0.7305386923619349
ACCURACY: 0.7972453157960044


## Deep Learning Tuned A Bit

In [33]:
model = Sequential()
model.add(Dense(1024, activation='elu', kernel_initializer='glorot_normal', input_dim = X_train.shape[1]))
model.add(BatchNormalization())
model.add(Dense(128, activation='elu', kernel_initializer='glorot_normal'))
model.add(BatchNormalization())
model.add(Dense(64, activation='elu', kernel_initializer='glorot_normal'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax', name='softmax'))

In [34]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [35]:
model.fit(X_train.values, y_train_cat, batch_size=512, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f9748718a20>

In [36]:
deep_predictions = model.predict(X_test.values)

In [37]:
deep_f1, deep_roc_auc, deep_acc = evaluate(np.argmax(deep_predictions, 1), y_test)

F1: 0.5969761014469193
ROC_AUC: 0.835520517540418
ACCURACY: 0.8461967986102494


## Final Results

In [39]:
model_names = ["LR", "Tuned LR", "GBT", "Tuned GBT", "Deep", "Deep Tuned"]
metrics_of_interest = ["F1", "ROC_AUC", "ACCURACY"]
f1s = [lr_f1, tuned_lr_f1, gbt_f1, gbt_tuned_f1, deep_simple_f1, deep_f1]
roc_aucs = [lr_roc_auc, tuned_lr_roc_auc, gbt_roc_auc, gbt_tunded_roc_auc, deep_simple_roc_auc, deep_roc_auc]
accuracy = [lr_acc, tuned_lr_acc, gbt_acc, gbt_tuned_acc, deep_simple_acc, deep_acc]

In [40]:
results_df = pd.DataFrame(columns=metrics_of_interest, index=model_names, data=np.array([f1s, roc_aucs, accuracy]).T)

In [41]:
results_df

Unnamed: 0,F1,ROC_AUC,ACCURACY
LR,0.650709,0.757495,0.848803
Tuned LR,0.650903,0.757623,0.848865
GBT,0.650709,0.757495,0.848803
Tuned GBT,0.704258,0.788551,0.872441
Deep,0.480445,0.730539,0.797245
Deep Tuned,0.596976,0.835521,0.846197
