In this notebook we are using the Adult Census Income dataset from UCI Machine Learning. With this dataset, we will build a neural network that will predict whether income exceeds $50k/year based on census data. 

# Read Data

In [63]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [64]:
df_census = pd.read_csv('../data/adult.csv')
df_census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [65]:
df_census.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32560,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


Now that the data has been loaded in, I'm going to choose the input attributes that will be best serve as inputs for the neural network.

# Feature Engineering

In [66]:
df_census = df_census.drop(['workclass', 'fnlwgt', 'capital.gain', 'capital.loss', 'native.country'], axis = 1)
df_census.head()

Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
0,90,HS-grad,9,Widowed,?,Not-in-family,White,Female,40,<=50K
1,82,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K
2,66,Some-college,10,Widowed,?,Unmarried,Black,Female,40,<=50K
3,54,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,40,<=50K
4,41,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,40,<=50K


Now that we have selected the desired features for the neural network, I am going to use a label encoder to create numeric outputs which we can later normalize to work in a neural network

In [67]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(df_census['education'])
df_census['education'] = le.transform(df_census['education'])

le.fit(df_census['marital.status'])
df_census['marital.status'] = le.transform(df_census['marital.status'])

le.fit(df_census['occupation'])
df_census['occupation'] = le.transform(df_census['occupation'])

le.fit(df_census['relationship'])
df_census['relationship'] = le.transform(df_census['relationship'])

le.fit(df_census['race'])
df_census['race'] = le.transform(df_census['race'])

le.fit(df_census['sex'])
df_census['sex'] = le.transform(df_census['sex'])

le.fit(df_census['income'])
df_census['income'] = le.transform(df_census['income'])

df_census.info()
df_census.describe()
df_census.head()
df_census.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 10 columns):
age               32561 non-null int64
education         32561 non-null int64
education.num     32561 non-null int64
marital.status    32561 non-null int64
occupation        32561 non-null int64
relationship      32561 non-null int64
race              32561 non-null int64
sex               32561 non-null int64
hours.per.week    32561 non-null int64
income            32561 non-null int64
dtypes: int64(10)
memory usage: 2.5 MB


Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
32556,22,15,10,4,11,1,4,1,40,0
32557,27,7,12,2,13,5,4,0,38,0
32558,40,11,9,2,7,0,4,1,40,1
32559,58,11,9,6,1,4,4,0,40,0
32560,22,11,9,4,1,3,4,1,20,0


Next we need to normalize the data to improve the performance of our model

In [68]:
df_censusNorm = df_census[['age', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'hours.per.week']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [69]:
df_censusNorm.describe()

Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.686547,0.605379,0.435306,0.469481,0.289272,0.916464,0.669205,0.402423
std,0.186855,0.258018,0.171515,0.251037,0.302061,0.321354,0.212201,0.470506,0.125994
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.6,0.533333,0.333333,0.214286,0.0,1.0,0.0,0.397959
50%,0.273973,0.733333,0.6,0.333333,0.5,0.2,1.0,1.0,0.397959
75%,0.424658,0.8,0.733333,0.666667,0.714286,0.6,1.0,1.0,0.44898
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [72]:
df_target = df_census['income']
dfCensus = pd.concat([df_censusNorm, df_target], axis = 1)
dfCensus.describe()

Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.686547,0.605379,0.435306,0.469481,0.289272,0.916464,0.669205,0.402423,0.24081
std,0.186855,0.258018,0.171515,0.251037,0.302061,0.321354,0.212201,0.470506,0.125994,0.427581
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.6,0.533333,0.333333,0.214286,0.0,1.0,0.0,0.397959,0.0
50%,0.273973,0.733333,0.6,0.333333,0.5,0.2,1.0,1.0,0.397959,0.0
75%,0.424658,0.8,0.733333,0.666667,0.714286,0.6,1.0,1.0,0.44898,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [75]:
#seperate data to train and test
#split data with .8 train .2 test
np.random.seed(0)
mask = np.random.rand(len(dfCensus)) < 0.8

df_train = dfCensus[mask]
df_test = dfCensus[~mask]

In [78]:
df_train.sample(n=10)

Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
13768,0.287671,0.6,0.8,0.333333,0.714286,1.0,1.0,0.0,0.397959,1
27260,0.589041,0.0,0.333333,0.333333,0.857143,0.0,1.0,1.0,0.397959,0
26592,0.123288,0.733333,0.533333,0.666667,0.071429,0.2,1.0,0.0,0.44898,0
29051,0.342466,1.0,0.6,0.333333,0.714286,0.0,1.0,1.0,0.397959,1
16200,0.60274,0.0,0.333333,0.333333,0.357143,0.0,1.0,1.0,0.55102,0
5568,0.082192,0.733333,0.533333,0.666667,0.071429,0.6,1.0,0.0,0.397959,0
23170,0.123288,1.0,0.6,0.333333,0.071429,0.0,1.0,1.0,0.397959,0
1577,0.452055,0.733333,0.533333,0.0,0.285714,0.2,1.0,0.0,0.346939,1
19477,0.39726,0.733333,0.533333,0.333333,0.071429,0.0,1.0,1.0,0.397959,1
22642,0.260274,0.733333,0.533333,0.333333,0.214286,0.0,1.0,1.0,0.44898,0


In [79]:
df_test.sample(n=10)

Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
4823,0.232877,0.733333,0.533333,0.333333,0.214286,0.0,1.0,1.0,0.397959,1
19030,0.643836,0.733333,0.533333,1.0,0.071429,0.2,1.0,0.0,0.295918,0
4045,0.616438,0.733333,0.533333,0.0,0.5,0.2,1.0,0.0,0.397959,0
9316,0.767123,0.733333,0.533333,0.333333,0.357143,0.0,1.0,1.0,0.397959,0
1253,0.0,0.066667,0.4,0.666667,0.071429,0.2,0.5,0.0,0.397959,0
14093,0.150685,0.6,0.8,0.333333,0.928571,0.0,1.0,1.0,0.397959,1
13068,0.356164,0.733333,0.533333,0.333333,0.285714,0.0,1.0,1.0,0.397959,0
4648,0.150685,0.6,0.8,0.666667,0.714286,0.2,1.0,1.0,0.193878,0
8038,0.0,0.133333,0.466667,0.666667,0.857143,0.2,1.0,0.0,0.091837,0
9142,0.191781,0.4,0.266667,0.333333,0.285714,1.0,1.0,0.0,0.397959,0


To train our model we will get the input and output from the dataset

In [80]:
Input = df_train.values[:,:9]
print(Input[:10])

[[ 1.          0.73333333  0.53333333  1.          0.          0.2         1.
   0.          0.39795918]
 [ 0.89041096  0.73333333  0.53333333  1.          0.28571429  0.2         1.
   0.          0.17346939]
 [ 0.67123288  1.          0.6         1.          0.          0.8         0.5
   0.          0.39795918]
 [ 0.50684932  0.33333333  0.2         0.          0.5         0.8         1.
   0.          0.39795918]
 [ 0.32876712  1.          0.6         0.83333333  0.71428571  0.6         1.
   0.          0.39795918]
 [ 0.23287671  0.73333333  0.53333333  0.          0.57142857  0.8         1.
   0.          0.44897959]
 [ 0.28767123  0.          0.33333333  0.83333333  0.07142857  0.8         1.
   1.          0.39795918]
 [ 0.32876712  1.          0.6         0.66666667  0.21428571  0.8         1.
   1.          0.60204082]
 [ 0.38356164  0.66666667  1.          0.          0.71428571  0.8         0.5
   0.          0.34693878]
 [ 0.28767123  0.93333333  0.93333333  0.66666667  0.

In [81]:
targets = [[1,0], [0,1]]
output = np.array([targets[int(x)] for x in df_train.values[:,9]])
print(output[:10])

[[1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [0 1]
 [0 1]]


# Backpropogation
This neural network has three layers: Input Layer, Hidden Layer, Output Layer
The input layer will consist of nine nodes, one for each feature used to predict in the data frame above.
The output layer will consist of 2 nodes - one for each classification for the output (salary above 50k, salary below 50k) A value of [1,0] predicts a salary above 50k, whereas a value of [0,1] would predict a salary below 50k

In [82]:
hiddenNodes = 10
inputNodes = len(Input[0])

weight1 = 2 * np.random.random((inputNodes, hiddenNodes)) - 1
print(weight1)

[[ 0.04010323 -0.09270235  0.99517426 -0.2040091  -0.55490559 -0.48787704
  -0.45239811 -0.91612834  0.99788868  0.89764372]
 [ 0.66291785  0.52039516 -0.11930381 -0.74002766 -0.90025756  0.86432301
  -0.50483226  0.39073817  0.0309265   0.47699372]
 [ 0.39479474 -0.12155583  0.86822178 -0.4158018  -0.79996935  0.09797307
  -0.73342306 -0.29333405  0.57216164  0.55684375]
 [-0.23480108 -0.44688785  0.62274519  0.4421446  -0.39099517  0.65612231
   0.86189545  0.16736554 -0.12262316  0.57457765]
 [ 0.60576978  0.88513773  0.2964565   0.78421974  0.77363519  0.83745214
  -0.19831043  0.07157144 -0.12750022 -0.27094457]
 [-0.97059542  0.26694357 -0.06172869  0.50999602  0.39994045 -0.71310286
  -0.787007   -0.87008287  0.01369074  0.12734943]
 [-0.38645786  0.09558038  0.74480251 -0.3394966   0.83594361  0.43235747
   0.61789853 -0.53600331 -0.88410814  0.22961504]
 [ 0.79126975  0.4206606  -0.69685698 -0.55254789 -0.79468988  0.53319857
   0.80108217  0.16020193  0.67192794 -0.01727241]


weight1 indicates the weight of the connection between the input node and hidden node.
weight2 will be used to represent the connections between the hidden layer and the output layer. 
Both weights will hold values that range between -1 to 1

In [85]:
outputNodes = len(output[0])

weight2 = 2 * np.random.random((hiddenNodes, outputNodes)) - 1
print(weight2)

[[ 0.03687861  0.43946294]
 [-0.45163411  0.90602924]
 [ 0.105625   -0.75952399]
 [ 0.50107135 -0.21519067]
 [-0.75998764  0.45171376]
 [ 0.78345244 -0.12704866]
 [ 0.7226618   0.3466171 ]
 [ 0.27031376 -0.64172432]
 [ 0.416202   -0.39273099]
 [ 0.5442212   0.01377087]]


# Activation Function
This neural network will use a Sigmoid Activation function. 

In [101]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def sigmoidDerivative(x):
    return x * (1.0 - x)
rate = 0.01

for i in range(10000):
    sig1 = sigmoid(np.dot(Input, weight1))
    
    sig2 = sigmoid(np.dot(sig1, weight2))
    
    totalErr = (abs(output - sig2)).mean()
    
    deltaSig2 = (output - sig2) * sigmoidDerivative(sig2)
    deltaSig1 = (np.dot(deltaSig2, weight2.T) * sigmoidDerivative(sig1))
    
    weight2 += (np.dot(sig1.T, deltaSig2) * rate)
    weight1 += (np.dot(Input.T, deltaSig1) * rate)

print('Error Rate: ', totalErr)

('Error Rate: ', 0.24087143295489413)


# Messuring accuracy of the Model

In [123]:
Input = df_test.values[:,:9]
output = np.array([targets[int(x)] for x in df_test.values[:,9:10]])

sig1 = sigmoid(np.dot(Input, weight1))
sig2 = sigmoid(np.dot(sig1, weight2))

In [146]:
PredictedY = np.argmax(sig2, axis = 1)
ActualY = np.argmax(output, axis = 1)
residual = (PredictedY == ActualY)

counter = 0
for i in residual:
    if i == True:
        counter = counter + 1
floatCounter = float(counter)
length = float(len(residual))
accuracy = floatCounter/length
print("Accuracy: ", accuracy)

('Accuracy: ', 0.7594390507011867)


The neural network we built predicts with an accuracy of 76 percent. I would consider this high given the number of input nodes we are feeding our model and given the two node output layer classifying whether a salary is below or above $50,000