In [380]:
%run ./DeepLearning.ipynb

In [381]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [382]:
data = pd.read_csv("adult.data", delimiter=",", header= None)

# checking if there are any null values in the dataset
data.isnull().values.any()
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [383]:
# converting categorical data into numerical data.
# data of columns "1", "3", "5", "6", "7", "8", "9", "13" and "14"

data = pd.get_dummies(data, columns= [1, 3, 5, 6, 7, 8, 9, 13, 14], drop_first= True)
# in above code, I applied OneHotEncoding using get_dummies and dropped first column
data.head()

Unnamed: 0,0,2,4,10,11,12,1_ Federal-gov,1_ Local-gov,1_ Never-worked,1_ Private,...,13_ Puerto-Rico,13_ Scotland,13_ South,13_ Taiwan,13_ Thailand,13_ Trinadad&Tobago,13_ United-States,13_ Vietnam,13_ Yugoslavia,14_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [384]:
X = data[data.columns[:-1]]
y = data[data.columns[-1]].to_numpy()


In [385]:
# standardization of the data before applying PCA
scalar = StandardScaler()
scalar.fit(X)
X_scaled = scalar.transform(X) # standard scaled data

In [386]:
# applying PCA with 95% variance
pca = PCA(n_components= 0.95)
pca.fit(X_scaled)
X = pca.transform(X_scaled)
X.shape

(32561, 84)

In [387]:
# splitting the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

model1 = LogisticRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_train)

In [388]:
matrix = confusion_matrix(y_train, y_pred)
matrix

array([[16158,  1155],
       [ 2265,  3214]], dtype=int64)

In [389]:
score = accuracy_score(y_train, y_pred)
score

0.84994734994735

In [390]:
22784//256

89

In [391]:
model2 = DeepLearning( [20, 10, 5,  1], ['relu', 'relu', 'relu', 'sigmoid'], learning_rate= 0.1)

In [392]:
y = y.reshape(y.shape[0], 1)
y.shape

(32561, 1)

In [393]:
X_train[:-8][:256]

array([[ 1.26999102,  2.18030886, -1.07233153, ...,  0.55856105,
         0.27010916,  0.06941931],
       [-0.33678909, -0.76232604, -0.05842498, ...,  0.92608271,
         0.30105624, -1.2424065 ],
       [ 1.01631607,  0.72223691, -0.90832089, ...,  0.2876612 ,
        -0.01638459, -0.20368335],
       ...,
       [-1.72381001, -0.99595353, -0.91290435, ..., -0.26027972,
         0.09774744, -0.43683787],
       [-0.72669361,  0.17872424, -0.13980089, ...,  0.54837461,
        -0.41855873,  0.71647619],
       [-0.04545583, -0.97823042, -1.16454816, ...,  1.691742  ,
         0.01944094, -0.95742494]])

In [394]:
y_train = y_train.reshape(y_train.shape[0], 1)
y_train.shape

(22792, 1)

In [395]:
model2.train(X_train, y_train)

0.7050924055644563
0.6968286308062664
0.6890936459633912
0.6818447529010505
0.6750465888368987
0.6686627547213326
0.6626643353005606
0.6570218399610311
0.6517143729564383
0.6467183850251534
0.6420131356060392
0.6375783970702843
0.6333935493265119
0.629437896929889
0.6256952641842742
0.6221524134516642
0.6187955821945286
0.6156138684299697
0.6125963833272777
0.6097337015816643
0.6070168465739341
0.6044363818388239
0.6019861058593322
0.5996580716030654
0.5974454280490604
0.5953412446837409
0.5933396860962079
0.5914355994488141
0.5896238814445302
0.5878995694948931
0.5862582676088832
0.5846954961325812
0.5832072629632731
0.5817894537689737
0.5804386777032751
0.5791515632300341
0.5779247774470854
0.5767552345393862
0.5756402166396298
0.5745768872791671
0.5735626908123608
0.572595271467983
0.5716723827991991
0.5707917193725042
0.5699512691803742
0.5691489857477696
0.5683831599595367
0.5676519430081239
0.5669536122475409
0.5662866440817615
0.5656495287702238
0.5650407819964961
0.564459101979

0.5312658550034279
0.5309124549929045
0.5305520108196933
0.5301840054637543
0.529808096745428
0.5294243923653693
0.5290330642979055
0.528634160749259
0.5282277283805419
0.5278134346429278
0.5273910984791015
0.5269602142587243
0.5265208947962083
0.5260726758182366
0.5256156428759563
0.5251501537755008
0.5246760903328469
0.5241926396659214
0.5237000921901015
0.5231983483650863
0.5226870669076996
0.5221656906882842
0.5216339662177103
0.5210918434055993
0.5205389591228176
0.5199746884818076
0.5193990155901708
0.5188125173260293
0.5182143928322882
0.5176049498095385
0.5169837957506129
0.5163506402468767
0.5157051783056056
0.5150475844749488
0.5143780012415087
0.5136960466955037
0.5130011699650655
0.5122936086237588
0.5115735222945887
0.5108410646351852
0.5100953329428267
0.5093361561620382
0.5085629774643671
0.5077764526569405
0.506975523271092
0.5061601644527702
0.5053310113842095
0.5044876183298597
0.5036304672848066
0.5027601467126444
0.5018758889166985
0.5009773286800746
0.5000639646182

0.32625517848777447
0.32620431559148844
0.3261535428143689
0.32610302296188426
0.3260528773547346
0.32600297911668347
0.3259535209130673
0.3259040308373358
0.3258546793491062
0.3258057376001267
0.32575708849450463
0.3257086826729751
0.32566041433338794
0.3256126912615446
0.3255653586979861
0.3255180502941714
0.32547086136108017
0.3254236349652485
0.3253766358151145
0.32532988524010453
0.3252834098993458
0.32523724756074446
0.32519130048050143
0.32514551066704506
0.3250997455391409
0.32505449086549215
0.32500928008216
0.32496454740656877
0.32492008262847516
0.3248759898664696
0.3248321836117213
0.32478873355150845
0.32474539804437474
0.3247021313718377
0.3246587397726997
0.3246156022208127
0.3245725958429209
0.3245298783606137
0.3244873953017261
0.3244451646517491
0.3244030761251835
0.324361239865347
0.32431940592297914
0.3242777636476183
0.3242363374599145
0.32419494517955766
0.32415351346267823
0.3241121065940391
0.3240707896104501
0.3240297450157786
0.323988650429711
0.32394751473139

In [397]:
y_pred = model2.predict(X_train)
score = accuracy_score(y_train, y_pred)
score

0.8505177255177255

In [352]:
0.504261558459979-0.5055067460012455

-0.001245187541266457