## Prepare a classification model using Naive Bayes for salary data

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing

In [40]:
df_train = pd.read_csv("SalaryData_Train.csv")
df_test = pd.read_csv("SalaryData_Test.csv")

In [41]:
df_train

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30156,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
30157,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
30158,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30159,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [42]:
df_test

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,Private,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
15056,39,Private,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
15057,38,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
15058,44,Private,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [43]:
df_train.columns

Index(['age', 'workclass', 'education', 'educationno', 'maritalstatus',
       'occupation', 'relationship', 'race', 'sex', 'capitalgain',
       'capitalloss', 'hoursperweek', 'native', 'Salary'],
      dtype='object')

In [44]:
columns = ['workclass', 'education', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'native']

label_encoder = preprocessing.LabelEncoder()

for i in columns:
    df_train[i] = label_encoder.fit_transform(df_train[i])
    df_test[i] = label_encoder.fit_transform(df_test[i])
    

In [45]:
df_train.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,9,13,4,0,1,4,1,2174,0,40,37,<=50K
1,50,4,9,13,2,3,0,4,1,0,0,13,37,<=50K
2,38,2,11,9,0,5,1,4,1,0,0,40,37,<=50K
3,53,2,1,7,2,5,0,2,1,0,0,40,37,<=50K
4,28,2,9,13,2,9,5,2,0,0,0,40,4,<=50K


In [46]:
# split train and test data into x features and y target

train_x = df_train.iloc[:, 0:13]
train_y = df_train.iloc[:, 13]
test_x = df_test.iloc[:, 0:13]
test_y = df_test.iloc[:, 13]

In [47]:
train_x.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,5,9,13,4,0,1,4,1,2174,0,40,37
1,50,4,9,13,2,3,0,4,1,0,0,13,37
2,38,2,11,9,0,5,1,4,1,0,0,40,37
3,53,2,1,7,2,5,0,2,1,0,0,40,37
4,28,2,9,13,2,9,5,2,0,0,0,40,4


In [48]:
train_y.head()

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name: Salary, dtype: object

In [49]:
print(train_x.shape)
print(train_y.shape)

(30161, 13)
(30161,)


### Applying naive bayes for classification

In [56]:
# Preparing a naive bayes model on training data set 

from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB
from sklearn.metrics import confusion_matrix

In [52]:
# Multinomial Naive Bayes

classifier_mb = MB()
classifier_mb.fit(train_x, train_y)
train_pred_m = classifier_mb.predict(train_x)
accuracy_train_m = np.mean(train_pred_m==train_y)

test_pred_m = classifier_mb.predict(test_x)
accuracy_test_m = np.mean(test_pred_m==test_y)

print('train acc ', accuracy_train_m)
print('test acc ', accuracy_test_m)

train acc  0.7729186698053778
test acc  0.7749667994687915


In [55]:
# Gaussian Naive Bayes

classifier_gb = GB()

# we need to convert tfidf into array format which is compatible for gaussian naive bayes

classifier_gb.fit(train_x.values, train_y.values) 
train_pred_g = classifier_gb.predict(train_x.values)
accuracy_train_g = np.mean(train_pred_g==train_y)

test_pred_g = classifier_gb.predict(test_x.values)
accuracy_test_g = np.mean(test_pred_g==test_y)

print('train acc ', accuracy_train_g)
print('test acc ', accuracy_test_g)

train acc  0.7953317197705646
test acc  0.7946879150066402


In [57]:
# Confusion Matrix

confusion_matrix = confusion_matrix(test_y, test_pred_g)
confusion_matrix

array([[10759,   601],
       [ 2491,  1209]], dtype=int64)