## Get data

In [83]:
import pandas as pd
import numpy as np

In [11]:
!curl -X GET http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names

| This data was extracted from the census bureau database found at
| http://www.census.gov/ftp/pub/DES/www/welcome.html
| Donor: Ronny Kohavi and Barry Becker,
|        Data Mining and Visualization
|        Silicon Graphics.
|        e-mail: ronnyk@sgi.com for questions.
| Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
| 48842 instances, mix of continuous and discrete    (train=32561, test=16281)
| 45222 if instances with unknown values are removed (train=30162, test=15060)
| Duplicate or conflicting instances : 6
| Class probabilities for adult.all file
| Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)
| Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
|
| Extraction was done by Barry Becker from the 1994 Census database.  A set of
|   reasonably clean records was extracted using the following conditions:
|   ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
|
| Prediction task is to determine whether a

In [270]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','target'] 

In [271]:
df = pd.read_csv('adult.data', names=features, header=None,index_col=False)

In [272]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


We can drop `relationship` which is somewhat of a confounding variable and tied up in `marital-status` as well. `native-country` just widens our data too much, so let's shift it to be either `US` or `foreign`. Also let's group together `workclass` for the `govt` and `self` employed individuals.

In [273]:
df.drop(['fnlwgt', 'relationship','education'],axis=1,inplace=True)

In [274]:
# convert native-country
df['native-country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [275]:
df['native-country'] = df['native-country'].apply(lambda s: 'US' if s == 'United-States' else 'Foreign')

In [276]:
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,13,Never-married,Adm-clerical,White,Male,2174,0,40,Foreign,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,White,Male,0,0,13,Foreign,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,White,Male,0,0,40,Foreign,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Black,Male,0,0,40,Foreign,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Black,Female,0,0,40,Foreign,<=50K


In [277]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [278]:
def clean_workclass(s):
    if s.endswith('gov'):
        return 'govt'
    if s.startswith(' Self'):
        return 'self'
    if s.endswith('Private'):
        return 'private'
    if s == ' Without-pay':
        return 'volunteer'
    if s == ' Never-worked':
        return 'never'
    else: # case of ?
        return 'unknown'
df['workclass'] = df['workclass'].apply(clean_workclass)

In [279]:
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,govt,13,Never-married,Adm-clerical,White,Male,2174,0,40,Foreign,<=50K
1,50,self,13,Married-civ-spouse,Exec-managerial,White,Male,0,0,13,Foreign,<=50K
2,38,private,9,Divorced,Handlers-cleaners,White,Male,0,0,40,Foreign,<=50K
3,53,private,7,Married-civ-spouse,Handlers-cleaners,Black,Male,0,0,40,Foreign,<=50K
4,28,private,13,Married-civ-spouse,Prof-specialty,Black,Female,0,0,40,Foreign,<=50K


## To features

In [193]:
target = df.target
X = pd.get_dummies(df.drop(['target'], axis=1)).values

In [194]:
X.shape

(32561, 41)

In [195]:
y = np.array([int(i) for i in target.values == ' <=50K'])
y.shape

(32561,)

## Modeling

In [196]:
from sklearn.model_selection import train_test_split, KFold

In [197]:
import tensorflow as tf
tf.__version__

'2.1.0'

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

### Neural Network
#### Base Model

In [252]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
metrics = [
    tf.keras.metrics.AUC(),
    tf.keras.metrics.Accuracy(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall()
]
model.compile(optimizer='adam', loss='binary_crossentropy', metrics = metrics)

In [253]:
model.fit(X_train, y_train, batch_size=16, epochs=10)

Train on 24420 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f48e582b250>

In [221]:
# plot stuff
# just remember that | Class probabilities for adult.all file
# | Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)
# | Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)

#### K-Fold Cross Validation

In [255]:
import random

activation_functions = ['sigmoid', '']
weight_inits = ['']
num_layers = [i for i in range(2,5)]
hidden_layers = [2 ** i for i in range(5,7)]

models = []
for _ in range(5):
    model = tf.keras.models.Sequential()
    layers = [ tf.keras.layers.Dense(random.choice(hidden_layers), activation='relu') for _ in range(random.choice(num_layers)) ]
    for l in layers:
        model.add(l)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
    model.add(output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics = metrics)
    models.append(model)

In [256]:
# let's first ensure our random search worked
for model in models:
    model.fit(X_train, y_train, batch_size=16, epochs=10, verbose=0)
    model.evaluate(X_test, y_test, batch_size=16)



In [257]:
for model in models:
    model.summary()

Model: "sequential_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_98 (Dense)             multiple                  1344      
_________________________________________________________________
dense_99 (Dense)             multiple                  1056      
_________________________________________________________________
dense_100 (Dense)            multiple                  2112      
_________________________________________________________________
dense_101 (Dense)            multiple                  65        
Total params: 4,577
Trainable params: 4,577
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_102 (Dense)            multiple                  2688      
____________________________

In [258]:
results = []
for model in models:
    results.append(model.evaluate(X_test, y_test, batch_size=32))



In [None]:
# <TODO> visualize results

In [259]:
# now let's do cross-validation
kf = KFold(n_splits=5)
for i, (train_idx, test_idx) in enumerate(kf.split(X_train)):
    X_trn, X_tst = X_train[train_idx], X_train[test_idx]
    y_trn, y_tst = y_train[train_idx], y_train[test_idx]

    for j, model in enumerate(models):
        model.fit(X_trn, y_trn, batch_size=16, epochs=20, verbose=0)
        print(f"Evaluating model {j} on k-fold {i}")
        model.evaluate(X_tst, y_tst, batch_size=16)

Evaluating model 0 on k-fold 0
Evaluating model 1 on k-fold 0
Evaluating model 2 on k-fold 0
Evaluating model 3 on k-fold 0
Evaluating model 4 on k-fold 0
Evaluating model 0 on k-fold 1
Evaluating model 1 on k-fold 1
Evaluating model 2 on k-fold 1
Evaluating model 3 on k-fold 1
Evaluating model 4 on k-fold 1
Evaluating model 0 on k-fold 2
Evaluating model 1 on k-fold 2
Evaluating model 2 on k-fold 2
Evaluating model 3 on k-fold 2
Evaluating model 4 on k-fold 2
Evaluating model 0 on k-fold 3
Evaluating model 1 on k-fold 3
Evaluating model 2 on k-fold 3
Evaluating model 3 on k-fold 3
Evaluating model 4 on k-fold 3
Evaluating model 0 on k-fold 4
Evaluating model 1 on k-fold 4
Evaluating model 2 on k-fold 4
Evaluating model 3 on k-fold 4
Evaluating model 4 on k-fold 4


In [260]:
# now let's see our performance on the validation set...
for model in models:
    model.evaluate(X_test, y_test, batch_size=16)



It looks like our final model in particular has a nice recall. This we should probably consider our best candidate recognizing that there is an imbalance in our classes.

In [264]:
models[-1].summary()

Model: "sequential_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_114 (Dense)            multiple                  1344      
_________________________________________________________________
dense_115 (Dense)            multiple                  2112      
_________________________________________________________________
dense_116 (Dense)            multiple                  4160      
_________________________________________________________________
dense_117 (Dense)            multiple                  2080      
_________________________________________________________________
dense_118 (Dense)            multiple                  33        
_________________________________________________________________
dense_119 (Dense)            multiple                  64        
_________________________________________________________________
dropout (Dropout)            multiple                

#### Regularization

In [262]:
# now let's do the same procedure but with regularization included; a dropout layer after each
regularized_models = []
for _ in range(5):
    model = tf.keras.models.Sequential()
    layers = [ tf.keras.layers.Dense(random.choice(hidden_layers), activation='relu') for _ in range(random.choice(num_layers)) ]
    for l in layers:
        model.add(l)
        dropout_layer = tf.keras.layers.Dropout(.15)
        model.add(dropout_layer)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
    model.add(output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics = metrics)
    regularized_models.append(model)

In [263]:
# now let's do cross-validation
kf = KFold(n_splits=5)
for i, (train_idx, test_idx) in enumerate(kf.split(X_train)):
    X_trn, X_tst = X_train[train_idx], X_train[test_idx]
    y_trn, y_tst = y_train[train_idx], y_train[test_idx]

    for j, model in enumerate(regularized_models):
        model.fit(X_trn, y_trn, batch_size=16, epochs=20, verbose=0)
        print(f"Evaluating model {j} on k-fold {i}")
        model.evaluate(X_tst, y_tst, batch_size=16)

Evaluating model 0 on k-fold 0
Evaluating model 1 on k-fold 0
Evaluating model 2 on k-fold 0
Evaluating model 3 on k-fold 0
Evaluating model 4 on k-fold 0
Evaluating model 0 on k-fold 1
Evaluating model 1 on k-fold 1
Evaluating model 2 on k-fold 1
Evaluating model 3 on k-fold 1
Evaluating model 4 on k-fold 1
Evaluating model 0 on k-fold 2
Evaluating model 1 on k-fold 2
Evaluating model 2 on k-fold 2
Evaluating model 3 on k-fold 2
Evaluating model 4 on k-fold 2
Evaluating model 0 on k-fold 3
Evaluating model 1 on k-fold 3
Evaluating model 2 on k-fold 3
Evaluating model 3 on k-fold 3
Evaluating model 4 on k-fold 3
Evaluating model 0 on k-fold 4
Evaluating model 1 on k-fold 4
Evaluating model 2 on k-fold 4
Evaluating model 3 on k-fold 4
Evaluating model 4 on k-fold 4


In [266]:
# now let's see our performance on the validation set...
for model in regularized_models:
    model.evaluate(X_test, y_test, batch_size=16)



#### Feature normalization

In [267]:
list(df)

['age',
 'workclass',
 'education-num',
 'marital-status',
 'occupation',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'target']

In [295]:
min_max_norm_cols = ['age', 'education-num','hours-per-week']
norm_cols = ['capital-gain', 'capital-loss']

In [296]:
# age & education we'll use MinMax, the others we'll do normalization
for col in min_max_norm_cols:
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

for col in norm_cols:
    df[col] = (df[col] - df[col].mean()) / df[col].std()

df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,0.30137,govt,0.8,Never-married,Adm-clerical,White,Male,0.148451,-0.216656,0.397959,Foreign,<=50K
1,0.452055,self,0.8,Married-civ-spouse,Exec-managerial,White,Male,-0.145918,-0.216656,0.122449,Foreign,<=50K
2,0.287671,private,0.533333,Divorced,Handlers-cleaners,White,Male,-0.145918,-0.216656,0.397959,Foreign,<=50K
3,0.493151,private,0.4,Married-civ-spouse,Handlers-cleaners,Black,Male,-0.145918,-0.216656,0.397959,Foreign,<=50K
4,0.150685,private,0.8,Married-civ-spouse,Prof-specialty,Black,Female,-0.145918,-0.216656,0.397959,Foreign,<=50K


In [297]:
target = df.target
X_normed = pd.get_dummies(df.drop(['target'], axis=1)).values

In [298]:
X_norm_train, X_norm_test, y_norm_train, y_norm_test = train_test_split(X_normed, y, test_size=.25)

In [300]:
normed_models = []
for _ in range(5):
    m = tf.keras.models.Sequential()
    layers = [ tf.keras.layers.Dense(random.choice(hidden_layers), activation='relu') for _ in range(random.choice(num_layers)) ]
    for l in layers:
        m.add(l)
        dropout_layer = tf.keras.layers.Dropout(.15)
        m.add(dropout_layer)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
    m.add(output_layer)
    m.compile(optimizer='adam', loss='binary_crossentropy', metrics = metrics)
    normed_models.append(m)

In [301]:
# now let's do cross-validation
kf = KFold(n_splits=5)
for i, (train_idx, test_idx) in enumerate(kf.split(X_norm_train)):
    X_trn, X_tst = X_norm_train[train_idx], X_norm_train[test_idx]
    y_trn, y_tst = y_norm_train[train_idx], y_norm_train[test_idx]

    for j, model in enumerate(normed_models):
        model.fit(X_trn, y_trn, batch_size=16, epochs=20, verbose=0)
        print(f"Evaluating model {j} on k-fold {i}")
        model.evaluate(X_tst, y_tst, batch_size=16)

Evaluating model 0 on k-fold 0
Evaluating model 1 on k-fold 0
Evaluating model 2 on k-fold 0
Evaluating model 3 on k-fold 0
Evaluating model 4 on k-fold 0
Evaluating model 0 on k-fold 1
Evaluating model 1 on k-fold 1
Evaluating model 2 on k-fold 1
Evaluating model 3 on k-fold 1
Evaluating model 4 on k-fold 1
Evaluating model 0 on k-fold 2
Evaluating model 1 on k-fold 2
Evaluating model 2 on k-fold 2
Evaluating model 3 on k-fold 2
Evaluating model 4 on k-fold 2
Evaluating model 0 on k-fold 3
Evaluating model 1 on k-fold 3
Evaluating model 2 on k-fold 3
Evaluating model 3 on k-fold 3
Evaluating model 4 on k-fold 3
Evaluating model 0 on k-fold 4
Evaluating model 1 on k-fold 4
Evaluating model 2 on k-fold 4
Evaluating model 3 on k-fold 4
Evaluating model 4 on k-fold 4


In [302]:
# now let's see our performance on the validation set...
for m in normed_models:
    m.evaluate(X_norm_test, y_norm_test, batch_size=16)

