# Project Overview

This project uses a neural network on two datasets to predict binary outcomes. The first dataset comes from the medical community and looks at breast cancer diagnoses. The second dataset is from the census bureau, and looks at whether people have incomes above or below 50,000 USD. 

In [63]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Data

The first dataset includes individual breast cancer diagnoses with many different variables about the type of cancerous tissue. 

In [25]:
df_cancer = pd.read_csv("data.csv")
df_cancer.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [27]:
df_cancer.tail()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
564,926424,M,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,
565,926682,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,
567,927241,M,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,
568,92751,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,


# Feature Engineering

In [38]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(df_cancer['radius_mean'])
df_cancer['radius_mean'] = le.transform(df_cancer['radius_mean'])

le.fit(df_cancer['texture_mean'])
df_cancer['texture_mean'] = le.transform(df_cancer['texture_mean'])

le.fit(df_cancer['perimeter_mean'])
df_cancer['perimeter_mean'] = le.transform(df_cancer['perimeter_mean'])

le.fit(df_cancer['area_mean'])
df_cancer['area_mean'] = le.transform(df_cancer['area_mean'])

le.fit(df_cancer['smoothness_mean'])
df_cancer['smoothness_mean'] = le.transform(df_cancer['smoothness_mean'])

le.fit(df_cancer['compactness_mean'])
df_cancer['compactness_mean'] = le.transform(df_cancer['compactness_mean'])

le.fit(df_cancer['concavity_mean'])
df_cancer['concavity_mean'] = le.transform(df_cancer['concavity_mean'])

le.fit(df_cancer['diagnosis'])
df_cancer['diagnosis'] = le.transform(df_cancer['diagnosis'])


df_cancer.info()
df_cancer.describe()
df_cancer.head()
df_cancer.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null int64
radius_mean                569 non-null int64
texture_mean               569 non-null int64
perimeter_mean             569 non-null int64
area_mean                  569 non-null int64
smoothness_mean            569 non-null int64
compactness_mean           569 non-null int64
concavity_mean             569 non-null int64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
c

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
564,926424,1,438,370,503,522,410,364,509,0.1389,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,
565,926682,1,413,463,483,499,295,312,415,0.09791,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,
566,926954,1,340,458,410,421,115,308,329,0.05302,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,
567,927241,1,429,467,500,501,442,530,530,0.152,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,
568,92751,0,3,414,1,3,0,37,0,0.0,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,


In [39]:
df_cancerNorm = df_cancer[['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'diagnosis']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [40]:
df_cancerNorm.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,diagnosis
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.482609,0.495514,0.50775,0.499611,0.535159,0.506692,0.491563,0.372583
std,0.276419,0.27905,0.289838,0.287396,0.288056,0.287547,0.295275,0.483918
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.259341,0.261506,0.257198,0.252788,0.287526,0.257463,0.235075,0.0
50%,0.459341,0.491632,0.510557,0.498141,0.560254,0.516791,0.494403,0.0
75%,0.707692,0.725941,0.760077,0.745353,0.788584,0.751866,0.742537,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [49]:
df_target = df_cancer['diagnosis']
dfCancer = pd.concat([df_cancerNorm, df_target], axis = 1)
dfCancer.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,diagnosis,diagnosis.1
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.482609,0.495514,0.50775,0.499611,0.535159,0.506692,0.491563,0.372583,0.372583
std,0.276419,0.27905,0.289838,0.287396,0.288056,0.287547,0.295275,0.483918,0.483918
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.259341,0.261506,0.257198,0.252788,0.287526,0.257463,0.235075,0.0,0.0
50%,0.459341,0.491632,0.510557,0.498141,0.560254,0.516791,0.494403,0.0,0.0
75%,0.707692,0.725941,0.760077,0.745353,0.788584,0.751866,0.742537,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Split Data

We need a training dataset and a testing dataset. We will use an 80/20 split. 

In [50]:
np.random.seed(0)
mask = np.random.rand(len(dfCancer)) < 0.8

df_train = dfCancer[mask]
df_test = dfCancer[~mask]

In [51]:
df_train.sample(n=10)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,diagnosis,diagnosis.1
207,0.76044,0.615063,0.796545,0.79368,0.331924,0.341418,0.537313,1.0,1
54,0.661538,0.748954,0.692898,0.70632,0.412262,0.317164,0.438433,1.0,1
285,0.362637,0.449791,0.37428,0.388476,0.224101,0.057836,0.016791,0.0,0
433,0.850549,0.74477,0.880998,0.873606,0.727273,0.79291,0.822761,1.0,1
154,0.437363,0.192469,0.489443,0.483271,0.501057,0.45709,0.623134,0.0,0
305,0.23956,0.861925,0.236084,0.247212,0.057082,0.175373,0.128731,0.0,0
528,0.542857,0.066946,0.59501,0.574349,0.970402,0.544776,0.660448,0.0,0
476,0.564835,0.627615,0.616123,0.604089,0.378436,0.630597,0.427239,0.0,0
61,0.017582,0.661088,0.019194,0.01487,0.968288,0.498134,0.246269,0.0,0
525,0.013187,0.058577,0.017274,0.013011,0.758985,0.371269,0.195896,0.0,0


In [52]:
df_test.sample(n=10)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,diagnosis,diagnosis.1
66,0.059341,0.667364,0.047985,0.04461,0.775899,0.384328,0.149254,0.0,0
111,0.367033,0.648536,0.416507,0.379182,0.672304,0.701493,0.677239,0.0,0
441,0.778022,0.905858,0.81382,0.806691,0.20296,0.632463,0.727612,1.0,1
449,0.956044,0.625523,0.955854,0.960967,0.581395,0.684701,0.81903,1.0,1
27,0.837363,0.612971,0.871401,0.86803,0.522199,0.602612,0.79291,1.0,1
384,0.452747,0.09205,0.49904,0.486989,0.209302,0.473881,0.429104,0.0,0
385,0.606593,0.820084,0.633397,0.652416,0.306554,0.270522,0.585821,1.0,1
459,0.085714,0.962343,0.067179,0.074349,0.124736,0.083955,0.095149,0.0,0
164,0.98022,0.751046,0.978887,0.98513,0.236786,0.666045,0.75,1.0,1
471,0.298901,0.960251,0.301344,0.315985,0.323467,0.214552,0.171642,0.0,0


In [53]:
Input = df_train.values[:,:9]
print(Input[:10])

[[0.81318681 0.00209205 0.87332054 0.8401487  0.93868922 0.99067164
  0.97201493 1.         1.        ]
 [0.93626374 0.38912134 0.9328215  0.9535316  0.25369979 0.4011194
  0.5988806  1.         1.        ]
 [0.89230769 0.67782427 0.91746641 0.90520446 0.85200846 0.85634328
  0.89179104 1.         1.        ]
 [0.21538462 0.62133891 0.31477927 0.17657993 0.99577167 0.99440299
  0.94589552 1.         1.        ]
 [0.91868132 0.12552301 0.9462572  0.94052045 0.70190275 0.77238806
  0.89552239 1.         1.        ]
 [0.34945055 0.22384937 0.42802303 0.37174721 0.97885835 0.88992537
  0.82089552 1.         1.        ]
 [0.82637363 0.58995816 0.85412668 0.85130112 0.53065539 0.62126866
  0.69776119 1.         1.        ]
 [0.35164835 0.85146444 0.45681382 0.36431227 0.94080338 0.97761194
  0.93470149 1.         1.        ]
 [0.71208791 0.81799163 0.74472169 0.75092937 0.17336152 0.27238806
  0.26865672 1.         1.        ]
 [0.70769231 0.39539749 0.75431862 0.74163569 0.59196617 0.738805

In [54]:
targets = [[1,0], [0,1]]
output = np.array([targets[int(x)] for x in df_train.values[:,7]])
print(output[:10])

[[0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]]


# Backpropogation

In [55]:
hiddenNodes = 10
inputNodes = len(Input[0])

weight1 = 2 * np.random.random((inputNodes, hiddenNodes)) - 1
print(weight1)

[[ 0.11834756  0.84459701 -0.01527719  0.74766436  0.66796329 -0.57232931
   0.54245093 -0.97565769 -0.35434092 -0.54086511]
 [ 0.01372592  0.47370632 -0.80464727  0.0298444   0.87682404 -0.5427069
   0.35428229  0.18576054 -0.97987261 -0.04834761]
 [ 0.41754078 -0.91204914  0.75904297  0.04016283 -0.9386779  -0.55117278
   0.90735139  0.16463947 -0.78505486 -0.424911  ]
 [-0.08659275 -0.95809986 -0.17676897 -0.02108273 -0.51264425  0.177278
   0.50648024 -0.52833155  0.2409998   0.27924449]
 [ 0.8970806   0.55655233  0.69669054 -0.01916018 -0.62930283  0.99163059
  -0.74128848 -0.05708536 -0.8638138   0.88770171]
 [ 0.92984988  0.43877812 -0.30001431 -0.4912352  -0.46939335 -0.74541195
   0.05161791 -0.71636545 -0.36653867  0.25341295]
 [ 0.45508722 -0.95145459 -0.13976803  0.30424919  0.70649195 -0.04935044
   0.93841174 -0.4687349  -0.97298259 -0.03249427]
 [-0.48777241  0.64743534 -0.53445466 -0.37874156  0.58245486  0.4302865
   0.11610247  0.40989612 -0.16272627 -0.9893799 ]
 [-0

In [56]:
outputNodes = len(output[0])

weight2 = 2 * np.random.random((hiddenNodes, outputNodes)) - 1
print(weight2)

[[-0.62820479  0.38301622]
 [-0.78219252 -0.4707008 ]
 [ 0.95018936  0.27892555]
 [ 0.04135558 -0.20416277]
 [ 0.54900191 -0.71808505]
 [ 0.9346756   0.72224602]
 [ 0.23531397 -0.91418762]
 [ 0.4017113   0.82656868]
 [ 0.04915413 -0.29155036]
 [-0.75944531  0.50980221]]


In [57]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def sigmoidDerivative(x):
    return x * (1.0 - x)
rate = 0.01

for i in range(10000):
    sig1 = sigmoid(np.dot(Input, weight1))
    
    sig2 = sigmoid(np.dot(sig1, weight2))
    
    totalErr = (abs(output - sig2)).mean()
    
    deltaSig2 = (output - sig2) * sigmoidDerivative(sig2)
    deltaSig1 = (np.dot(deltaSig2, weight2.T) * sigmoidDerivative(sig1))
    
    weight2 += (np.dot(sig1.T, deltaSig2) * rate)
    weight1 += (np.dot(Input.T, deltaSig1) * rate)

print('Error Rate: ', totalErr)

Error Rate:  0.0036639986077884124


# Accuracy of Model

In [61]:
Input = df_test.values[:,:9]
output = np.array([targets[int(x)] for x in df_test.values[:,7]])

sig1 = sigmoid(np.dot(Input, weight1))
sig2 = sigmoid(np.dot(sig1, weight2))

In [62]:
PredictedY = np.argmax(sig2, axis = 1)
ActualY = np.argmax(output, axis = 1)
residual = (PredictedY == ActualY)

counter = 0
for i in residual:
    if i == True:
        counter = counter + 1
floatCounter = float(counter)
length = float(len(residual))
accuracy = floatCounter/length
print("Accuracy: ", accuracy)

Accuracy:  1.0


Possible overfitting here....

Now we look at the second dataset where individuals from the census either have incomes above or below 50,000 USD. 

In [5]:
df_census = pd.read_csv("adult.csv")
df_census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [6]:
df_census.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32560,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# Feature Engineering

In this dataset, we have too many variables so we want to drop some of them and also fit others to be prepared for the neural network. 

In [8]:
df_census = df_census.drop(['workclass', 'fnlwgt', 'capital.gain', 'capital.loss', 'native.country'], axis = 1)
df_census.head()

Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
0,90,HS-grad,9,Widowed,?,Not-in-family,White,Female,40,<=50K
1,82,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K
2,66,Some-college,10,Widowed,?,Unmarried,Black,Female,40,<=50K
3,54,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,40,<=50K
4,41,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,40,<=50K


In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(df_census['education'])
df_census['education'] = le.transform(df_census['education'])

le.fit(df_census['marital.status'])
df_census['marital.status'] = le.transform(df_census['marital.status'])

le.fit(df_census['occupation'])
df_census['occupation'] = le.transform(df_census['occupation'])

le.fit(df_census['relationship'])
df_census['relationship'] = le.transform(df_census['relationship'])

le.fit(df_census['race'])
df_census['race'] = le.transform(df_census['race'])

le.fit(df_census['sex'])
df_census['sex'] = le.transform(df_census['sex'])

le.fit(df_census['income'])
df_census['income'] = le.transform(df_census['income'])

df_census.info()
df_census.describe()
df_census.head()
df_census.tail()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 10 columns):
age               32561 non-null int64
education         32561 non-null int64
education.num     32561 non-null int64
marital.status    32561 non-null int64
occupation        32561 non-null int64
relationship      32561 non-null int64
race              32561 non-null int64
sex               32561 non-null int64
hours.per.week    32561 non-null int64
income            32561 non-null int64
dtypes: int64(10)
memory usage: 2.5 MB


Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
32556,22,15,10,4,11,1,4,1,40,0
32557,27,7,12,2,13,5,4,0,38,0
32558,40,11,9,2,7,0,4,1,40,1
32559,58,11,9,6,1,4,4,0,40,0
32560,22,11,9,4,1,3,4,1,20,0


In [10]:
df_censusNorm = df_census[['age', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'hours.per.week']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [11]:
df_censusNorm.describe()


Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.686547,0.605379,0.435306,0.469481,0.289272,0.916464,0.669205,0.402423
std,0.186855,0.258018,0.171515,0.251037,0.302061,0.321354,0.212201,0.470506,0.125994
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.6,0.533333,0.333333,0.214286,0.0,1.0,0.0,0.397959
50%,0.273973,0.733333,0.6,0.333333,0.5,0.2,1.0,1.0,0.397959
75%,0.424658,0.8,0.733333,0.666667,0.714286,0.6,1.0,1.0,0.44898
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
df_target = df_census['income']
dfCensus = pd.concat([df_censusNorm, df_target], axis = 1)
dfCensus.describe()

Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.686547,0.605379,0.435306,0.469481,0.289272,0.916464,0.669205,0.402423,0.24081
std,0.186855,0.258018,0.171515,0.251037,0.302061,0.321354,0.212201,0.470506,0.125994,0.427581
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.6,0.533333,0.333333,0.214286,0.0,1.0,0.0,0.397959,0.0
50%,0.273973,0.733333,0.6,0.333333,0.5,0.2,1.0,1.0,0.397959,0.0
75%,0.424658,0.8,0.733333,0.666667,0.714286,0.6,1.0,1.0,0.44898,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Split the Data
We will separate the data with an 80/20 split

In [13]:
np.random.seed(0)
mask = np.random.rand(len(dfCensus)) < 0.8

df_train = dfCensus[mask]
df_test = dfCensus[~mask]

In [14]:
df_train.sample(n=10)


Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
13768,0.287671,0.6,0.8,0.333333,0.714286,1.0,1.0,0.0,0.397959,1
27260,0.589041,0.0,0.333333,0.333333,0.857143,0.0,1.0,1.0,0.397959,0
26592,0.123288,0.733333,0.533333,0.666667,0.071429,0.2,1.0,0.0,0.44898,0
29051,0.342466,1.0,0.6,0.333333,0.714286,0.0,1.0,1.0,0.397959,1
16200,0.60274,0.0,0.333333,0.333333,0.357143,0.0,1.0,1.0,0.55102,0
5568,0.082192,0.733333,0.533333,0.666667,0.071429,0.6,1.0,0.0,0.397959,0
23170,0.123288,1.0,0.6,0.333333,0.071429,0.0,1.0,1.0,0.397959,0
1577,0.452055,0.733333,0.533333,0.0,0.285714,0.2,1.0,0.0,0.346939,1
19477,0.39726,0.733333,0.533333,0.333333,0.071429,0.0,1.0,1.0,0.397959,1
22642,0.260274,0.733333,0.533333,0.333333,0.214286,0.0,1.0,1.0,0.44898,0


In [15]:
df_test.sample(n=10)


Unnamed: 0,age,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income
4823,0.232877,0.733333,0.533333,0.333333,0.214286,0.0,1.0,1.0,0.397959,1
19030,0.643836,0.733333,0.533333,1.0,0.071429,0.2,1.0,0.0,0.295918,0
4045,0.616438,0.733333,0.533333,0.0,0.5,0.2,1.0,0.0,0.397959,0
9316,0.767123,0.733333,0.533333,0.333333,0.357143,0.0,1.0,1.0,0.397959,0
1253,0.0,0.066667,0.4,0.666667,0.071429,0.2,0.5,0.0,0.397959,0
14093,0.150685,0.6,0.8,0.333333,0.928571,0.0,1.0,1.0,0.397959,1
13068,0.356164,0.733333,0.533333,0.333333,0.285714,0.0,1.0,1.0,0.397959,0
4648,0.150685,0.6,0.8,0.666667,0.714286,0.2,1.0,1.0,0.193878,0
8038,0.0,0.133333,0.466667,0.666667,0.857143,0.2,1.0,0.0,0.091837,0
9142,0.191781,0.4,0.266667,0.333333,0.285714,1.0,1.0,0.0,0.397959,0


In [16]:
Input = df_train.values[:,:9]
print(Input[:10])


[[1.         0.73333333 0.53333333 1.         0.         0.2
  1.         0.         0.39795918]
 [0.89041096 0.73333333 0.53333333 1.         0.28571429 0.2
  1.         0.         0.17346939]
 [0.67123288 1.         0.6        1.         0.         0.8
  0.5        0.         0.39795918]
 [0.50684932 0.33333333 0.2        0.         0.5        0.8
  1.         0.         0.39795918]
 [0.32876712 1.         0.6        0.83333333 0.71428571 0.6
  1.         0.         0.39795918]
 [0.23287671 0.73333333 0.53333333 0.         0.57142857 0.8
  1.         0.         0.44897959]
 [0.28767123 0.         0.33333333 0.83333333 0.07142857 0.8
  1.         1.         0.39795918]
 [0.32876712 1.         0.6        0.66666667 0.21428571 0.8
  1.         1.         0.60204082]
 [0.38356164 0.66666667 1.         0.         0.71428571 0.8
  0.5        0.         0.34693878]
 [0.28767123 0.93333333 0.93333333 0.66666667 0.71428571 0.2
  1.         1.         0.44897959]]


In [17]:
targets = [[1,0], [0,1]]
output = np.array([targets[int(x)] for x in df_train.values[:,9]])
print(output[:10])


[[1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [0 1]
 [0 1]]


# Backpropogation

In [19]:
hiddenNodes = 10
inputNodes = len(Input[0])

weight1 = 2 * np.random.random((inputNodes, hiddenNodes)) - 1
print(weight1)


[[ 0.04010323 -0.09270235  0.99517426 -0.2040091  -0.55490559 -0.48787704
  -0.45239811 -0.91612834  0.99788868  0.89764372]
 [ 0.66291785  0.52039516 -0.11930381 -0.74002766 -0.90025756  0.86432301
  -0.50483226  0.39073817  0.0309265   0.47699372]
 [ 0.39479474 -0.12155583  0.86822178 -0.4158018  -0.79996935  0.09797307
  -0.73342306 -0.29333405  0.57216164  0.55684375]
 [-0.23480108 -0.44688785  0.62274519  0.4421446  -0.39099517  0.65612231
   0.86189545  0.16736554 -0.12262316  0.57457765]
 [ 0.60576978  0.88513773  0.2964565   0.78421974  0.77363519  0.83745214
  -0.19831043  0.07157144 -0.12750022 -0.27094457]
 [-0.97059542  0.26694357 -0.06172869  0.50999602  0.39994045 -0.71310286
  -0.787007   -0.87008287  0.01369074  0.12734943]
 [-0.38645786  0.09558038  0.74480251 -0.3394966   0.83594361  0.43235747
   0.61789853 -0.53600331 -0.88410814  0.22961504]
 [ 0.79126975  0.4206606  -0.69685698 -0.55254789 -0.79468988  0.53319857
   0.80108217  0.16020193  0.67192794 -0.01727241]


In [20]:
outputNodes = len(output[0])

weight2 = 2 * np.random.random((hiddenNodes, outputNodes)) - 1
print(weight2)


[[ 0.03687861  0.43946294]
 [-0.45163411  0.90602924]
 [ 0.105625   -0.75952399]
 [ 0.50107135 -0.21519067]
 [-0.75998764  0.45171376]
 [ 0.78345244 -0.12704866]
 [ 0.7226618   0.3466171 ]
 [ 0.27031376 -0.64172432]
 [ 0.416202   -0.39273099]
 [ 0.5442212   0.01377087]]


In [21]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def sigmoidDerivative(x):
    return x * (1.0 - x)
rate = 0.01

for i in range(10000):
    sig1 = sigmoid(np.dot(Input, weight1))
    
    sig2 = sigmoid(np.dot(sig1, weight2))
    
    totalErr = (abs(output - sig2)).mean()
    
    deltaSig2 = (output - sig2) * sigmoidDerivative(sig2)
    deltaSig1 = (np.dot(deltaSig2, weight2.T) * sigmoidDerivative(sig1))
    
    weight2 += (np.dot(sig1.T, deltaSig2) * rate)
    weight1 += (np.dot(Input.T, deltaSig1) * rate)

print('Error Rate: ', totalErr)


Error Rate:  0.24087143295489413


# Accuracy of Model

In [23]:
Input = df_test.values[:,:9]
output = np.array([targets[int(x)] for x in df_test.values[:,9:10]])

sig1 = sigmoid(np.dot(Input, weight1))
sig2 = sigmoid(np.dot(sig1, weight2))

In [24]:
PredictedY = np.argmax(sig2, axis = 1)
ActualY = np.argmax(output, axis = 1)
residual = (PredictedY == ActualY)

counter = 0
for i in residual:
    if i == True:
        counter = counter + 1
floatCounter = float(counter)
length = float(len(residual))
accuracy = floatCounter/length
print("Accuracy: ", accuracy)

Accuracy:  0.7594390507011867
