In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Using TensorFlow backend.


In [2]:
df = pd.read_csv('adult.csv')
df.head(3), df.tail(3)

(   age          workclass  fnlwgt   education  education-num  \
 0   39          State-gov   77516   Bachelors             13   
 1   50   Self-emp-not-inc   83311   Bachelors             13   
 2   38            Private  215646     HS-grad              9   
 
         marital-status          occupation    relationship    race    sex  \
 0        Never-married        Adm-clerical   Not-in-family   White   Male   
 1   Married-civ-spouse     Exec-managerial         Husband   White   Male   
 2             Divorced   Handlers-cleaners   Not-in-family   White   Male   
 
    capital-gain  capital-loss  hours-per-week         country  salary  
 0          2174             0              40   United-States   <=50K  
 1             0             0              13   United-States   <=50K  
 2             0             0              40   United-States   <=50K  ,
        age      workclass  fnlwgt education  education-num  \
 32558   58        Private  151910   HS-grad              9   
 3255

In [3]:
col_names = df.columns
for c in col_names:
    df[c] = df[c].replace("?", np.NaN)

df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [4]:
df['marital-status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [5]:
df['salary'] = np.where(df['salary'] == ' >50K',1,0)
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,1


In [6]:
df.replace([' Divorced', ' Married-AF-spouse', 
              ' Married-civ-spouse', ' Married-spouse-absent', 
              ' Never-married',' Separated',' Widowed'],
             ['divorced','married','married','married',
              'not married','not married','not married'], inplace = True)
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,not married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,married,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,married,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,married,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
5,37,Private,284582,Masters,14,married,Exec-managerial,Wife,White,Female,0,0,40,United-States,0
6,49,Private,160187,9th,5,married,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,0
7,52,Self-emp-not-inc,209642,HS-grad,9,married,Exec-managerial,Husband,White,Male,0,0,45,United-States,1
8,31,Private,45781,Masters,14,not married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,1
9,42,Private,159449,Bachelors,13,married,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1


In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import preprocessing


In [8]:
labelEncoder = preprocessing.LabelEncoder()
category_col =['workclass', 'race','marital-status', 'sex', 'salary'] 

for col in category_col:
    df[col] = labelEncoder.fit_transform(df[col])

In [9]:
df.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,7,77516,Bachelors,13,2,Adm-clerical,Not-in-family,4,1,2174,0,40,United-States,0
1,50,6,83311,Bachelors,13,1,Exec-managerial,Husband,4,1,0,0,13,United-States,0
2,38,4,215646,HS-grad,9,0,Handlers-cleaners,Not-in-family,4,1,0,0,40,United-States,0
3,53,4,234721,11th,7,1,Handlers-cleaners,Husband,2,1,0,0,40,United-States,0
4,28,4,338409,Bachelors,13,1,Prof-specialty,Wife,2,0,0,0,40,Cuba,0
5,37,4,284582,Masters,14,1,Exec-managerial,Wife,4,0,0,0,40,United-States,0
6,49,4,160187,9th,5,1,Other-service,Not-in-family,2,0,0,0,16,Jamaica,0
7,52,6,209642,HS-grad,9,1,Exec-managerial,Husband,4,1,0,0,45,United-States,1
8,31,4,45781,Masters,14,2,Prof-specialty,Not-in-family,4,0,14084,0,50,United-States,1
9,42,4,159449,Bachelors,13,1,Exec-managerial,Husband,4,1,5178,0,40,United-States,1


In [10]:
category_col_1 =['workclass', 'education', 'occupation',
               'relationship','country'] 

df_2 = pd.get_dummies(df, columns=category_col_1, drop_first=True)
df_2.head(20)
df_2.columns

Index(['age', 'fnlwgt', 'education-num', 'marital-status', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'salary',
       'workclass_1', 'workclass_2', 'workclass_3', 'workclass_4',
       'workclass_5', 'workclass_6', 'workclass_7', 'workclass_8',
       'education_ 11th', 'education_ 12th', 'education_ 1st-4th',
       'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th',
       'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors',
       'education_ Doctorate', 'education_ HS-grad', 'education_ Masters',
       'education_ Preschool', 'education_ Prof-school',
       'education_ Some-college', 'occupation_ Adm-clerical',
       'occupation_ Armed-Forces', 'occupation_ Craft-repair',
       'occupation_ Exec-managerial', 'occupation_ Farming-fishing',
       'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct',
       'occupation_ Other-service', 'occupation_ Priv-house-serv',
       'occupation_ Prof-specialty', 'occup

In [11]:
dataframe=df_2.drop('fnlwgt',1)
dataframe =dataframe[[c for c in dataframe if c not in ['salary']] + ['salary']]
dataframe.head(20)

Unnamed: 0,age,education-num,marital-status,race,sex,capital-gain,capital-loss,hours-per-week,workclass_1,workclass_2,...,country_ Puerto-Rico,country_ Scotland,country_ South,country_ Taiwan,country_ Thailand,country_ Trinadad&Tobago,country_ United-States,country_ Vietnam,country_ Yugoslavia,salary
0,39,13,2,4,1,2174,0,40,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,13,1,4,1,0,0,13,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,9,0,4,1,0,0,40,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,7,1,2,1,0,0,40,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,13,1,2,0,0,0,40,0,0,...,0,0,0,0,0,0,0,0,0,0
5,37,14,1,4,0,0,0,40,0,0,...,0,0,0,0,0,0,1,0,0,0
6,49,5,1,2,0,0,0,16,0,0,...,0,0,0,0,0,0,0,0,0,0
7,52,9,1,4,1,0,0,45,0,0,...,0,0,0,0,0,0,1,0,0,1
8,31,14,2,4,0,14084,0,50,0,0,...,0,0,0,0,0,0,1,0,0,1
9,42,13,1,4,1,5178,0,40,0,0,...,0,0,0,0,0,0,1,0,0,1


In [12]:
## making a copy of the data frame
dataframe.to_csv('adult_new_keras.csv',index=False)

In [21]:
X = dataframe.iloc[:, 0:88].values
y = dataframe.iloc[:, 88].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

import keras
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()

model.add(Dense(activation="relu", input_dim=88, units=6, kernel_initializer="uniform"))
model.add(Dropout(0.2))
model.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))
model.add(Dropout(0.2))
model.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
                   
model.fit(X_train, y_train, batch_size = 100, epochs = 10, verbose=1, validation_data=(X_test, y_test))

y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

y_pred2 = classifier.predict(X_train)
y_pred2 = (y_pred2 > 0.5)

from sklearn.metrics import confusion_matrix
print("Test classification report")
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)) 
print("Train classification report")
print(classification_report(y_train, y_pred2), confusion_matrix(y_train, y_pred2))

score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

score = model.evaluate(X_train, y_train, verbose=0)
print('Train loss:', score[0])
print('Train accuracy:', score[1])

Train on 26048 samples, validate on 6513 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test classification report
              precision    recall  f1-score   support

           0       1.00      0.80      0.89       643
           1       0.98      1.00      0.99      5870

    accuracy                           0.98      6513
   macro avg       0.99      0.90      0.94      6513
weighted avg       0.98      0.98      0.98      6513
 [[ 515  128]
 [   2 5868]]
Train classification report
              precision    recall  f1-score   support

           0       1.00      0.81      0.89      2748
           1       0.98      1.00      0.99     23300

    accuracy                           0.98     26048
   macro avg       0.99      0.91      0.94     26048
weighted avg       0.98      0.98      0.98     26048
 [[ 2229   519]
 [    6 23294]]
Test loss: 0.09378426935942435
Test accuracy: 0.9762014150619507
Train lo