In [1]:
# imports

# data manipulation
import pandas as pd
import numpy as np

# visualiation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.ensemble import RandomForestClassifier # random forest
from sklearn.ensemble import GradientBoostingClassifier # gradient boosting

In [6]:
# load data
train_file = './data/train.csv'
train_data = pd.read_csv(train_file)
train_data = pd.DataFrame(data=train_data)

test_file = './data/test.csv'
test_data = pd.read_csv(test_file)
test_data = pd.DataFrame(data=test_data)

test_ground_truths_file = './data/test_ground_truths.csv'
test_ground_truths = pd.read_csv(test_ground_truths_file)
test_ground_truths = pd.DataFrame(data=test_ground_truths)

test_data['exceeds50K'] = test_ground_truths

In [60]:
# remove unwanted columns
def chooseColumns(data):
    data_copy = data.copy()
    
    # too many categories, drop for now
    data_copy = data_copy.drop(['native-country', 'education', 'fnlwgt'], axis=1)
    
    return data_copy

#preprocessed_data = pd.get_dummies(preprocessed_data)
processed_train_data = chooseColumns(train_data)
processed_test_data = chooseColumns(test_data)

processed_train_data.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,exceeds50K
0,30,?,10,Never-married,?,Own-child,Female,0,0,30,0
1,60,Private,13,Divorced,Prof-specialty,Not-in-family,Female,0,0,42,0
2,52,?,10,Married-civ-spouse,?,Husband,Male,0,0,12,0
3,37,Private,13,Married-civ-spouse,Sales,Husband,Male,0,0,60,0
4,63,Private,10,Married-civ-spouse,Sales,Husband,Male,7298,0,48,1


In [131]:
categorical_columns = ['workclass', 'marital-status', 'occupation', 'relationship', 'sex']
onehot_train_data = pd.get_dummies(processed_train_data, columns=categorical_columns)
onehot_train_labels = processed_train_data['exceeds50K']
onehot_test_data = pd.get_dummies(processed_test_data, columns=categorical_columns)
onehot_test_labels = processed_train_data['exceeds50K']

X_train = onehot_train_data.drop(['exceeds50K'], axis=1).to_numpy()
y_train = onehot_train_labels.to_numpy()
X_test = onehot_test_data.drop(['exceeds50K'], axis=1).to_numpy()
y_test = onehot_test_labels.to_numpy()

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(24421, 44) (24421,)
(24421, 44) (24421,)


In [90]:
# mlp model definition

## Importing required libraries
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score
#s = tf.InteractiveSession()

num_classes = y_train.shape[1]
num_features = X_train.shape[1]
num_output = y_train.shape[1]
num_layers_0 = 1024
num_layers_1 = 512
starter_learning_rate = 0.001
regularizer_rate = 0.1

# Placeholders for the input data
input_X = tf.placeholder('float32',shape =(None,num_features),name="input_X")
input_y = tf.placeholder('float32',shape = (None,num_classes),name='input_Y')
## for dropout layer
keep_prob = tf.placeholder(tf.float32)

## Weights initialized by random normal function with std_dev = 1/sqrt(number of input features)
weights_0 = tf.Variable(tf.random_normal([num_features,num_layers_0], stddev=(1/tf.sqrt(float(num_features)))))
bias_0 = tf.Variable(tf.random_normal([num_layers_0]))
weights_1 = tf.Variable(tf.random_normal([num_layers_0,num_layers_1], stddev=(1/tf.sqrt(float(num_layers_0)))))
bias_1 = tf.Variable(tf.random_normal([num_layers_1]))
weights_2 = tf.Variable(tf.random_normal([num_layers_1,num_output], stddev=(1/tf.sqrt(float(num_layers_1)))))
bias_2 = tf.Variable(tf.random_normal([num_output]))

## Initializing weigths and biases
hidden_output_0 = tf.nn.relu(tf.matmul(input_X,weights_0)+bias_0)
hidden_output_0_0 = tf.nn.dropout(hidden_output_0, rate=1-keep_prob)
hidden_output_1 = tf.nn.relu(tf.matmul(hidden_output_0_0,weights_1)+bias_1)
hidden_output_1_1 = tf.nn.dropout(hidden_output_1, rate=1-keep_prob)
predicted_y = tf.sigmoid(tf.matmul(hidden_output_1_1,weights_2) + bias_2)

## Defining the loss function
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=predicted_y,labels=input_y)) \
        + regularizer_rate*(tf.reduce_sum(tf.square(bias_0)) + tf.reduce_sum(tf.square(bias_1)))

## Variable learning rate
learning_rate = tf.train.exponential_decay(starter_learning_rate, 0, 5, 0.85, staircase=True)
## Adam optimzer for finding the right weight
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss,var_list=[weights_0,weights_1,weights_2,
                                                                         bias_0,bias_1,bias_2])

## Metrics definition
correct_prediction = tf.equal(tf.argmax(y_train,1), tf.argmax(predicted_y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [133]:
## mlp training
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
print(tf.__version__)

from keras.callbacks import LearningRateScheduler

# This is a sample of a scheduler I used in the past
def lr_scheduler(epoch, lr):
    decay_rate = 0.85
    decay_step = 1
    if epoch % 5 == 0 and epoch:
        return lr * pow(decay_rate, np.floor(epoch / decay_step))
    return lr

callbacks = [LearningRateScheduler(lr_scheduler, verbose=1)]

class_names = ['<50K', '>50K']
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(2)
])
optimizer = keras.optimizers.Adam(lr=0.01)
epochs = 20
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(X_train.astype(float), y_train.astype(float), callbacks=callbacks, epochs=epochs)
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)

print('\nTest accuracy:', test_acc)

1.14.0

Epoch 00001: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 1/20

Epoch 00002: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 2/20

Epoch 00003: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 3/20

Epoch 00004: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 4/20

Epoch 00005: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 5/20

Epoch 00006: LearningRateScheduler setting learning rate to 0.0044370530258241335.
Epoch 6/20

Epoch 00007: LearningRateScheduler setting learning rate to 0.004437053110450506.
Epoch 7/20

Epoch 00008: LearningRateScheduler setting learning rate to 0.004437053110450506.
Epoch 8/20

Epoch 00009: LearningRateScheduler setting learning rate to 0.004437053110450506.
Epoch 9/20

Epoch 00010: LearningRateScheduler setting learning rate to 0.004437053110450506.
Epoch 10/20

Epoch 00011: LearningRateScheduler setting learnin

In [134]:
probability_model = tf.keras.Sequential([model, tf.keras.layers.Softmax()])
predictions = probability_model.predict(X_test)
y_pred = []
for i in predictions:
    y_pred.append(np.argmax(i))
print(y_pred[0:5])
print(round(f1_score(y_test, y_pred, average='weighted') * 100, 2))

[0, 1, 0, 0, 0]
63.5
