In [65]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.decomposition import PCA

## 1. Loading data

In [37]:
train_data = pd.read_csv('Datasets/SalaryData_Train.csv')
train_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [30]:
train_data.shape

(30161, 14)

In [38]:
test_data = pd.read_csv('Datasets/SalaryData_Test.csv')
test_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [31]:
test_data.shape

(15060, 14)

In [32]:
15060+30161

45221

## Data preprocesssing

### Concatinating the two datasets

In [39]:
data = pd.concat([train_data,test_data])
data.shape

(45221, 14)

In [34]:
data.dtypes

age               int64
workclass        object
education        object
educationno       int64
maritalstatus    object
occupation       object
relationship     object
race             object
sex              object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
native           object
Salary           object
dtype: object

In [35]:
data.isna().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [40]:
lbl = LabelEncoder()
data['Salary'] = lbl.fit_transform(data['Salary'])
data.Salary.unique()

array([0, 1])

In [25]:
data.education.unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' 7th-8th', ' Doctorate',
       ' Assoc-voc', ' Prof-school', ' 5th-6th', ' 10th', ' Preschool',
       ' 12th', ' 1st-4th'], dtype=object)

In [26]:
data.educationno.unique()

array([13,  9,  7, 14,  5, 10, 12,  4, 16, 11, 15,  3,  6,  1,  8,  2])

In [43]:
data.drop(labels=['education','educationno'],axis=1,inplace=True)

In [45]:
data.head()

Unnamed: 0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [46]:
data['workclass'] = lbl.fit_transform(data['workclass'])
data.head()

Unnamed: 0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,4,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,2,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,2,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,2,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [47]:
data['maritalstatus'] = lbl.fit_transform(data['maritalstatus'])
data['occupation'] = lbl.fit_transform(data['occupation'])
data['relationship'] = lbl.fit_transform(data['relationship'])
data['race'] = lbl.fit_transform(data['race'])
data['sex'] = lbl.fit_transform(data['sex'])
data['native'] = lbl.fit_transform(data['native'])
data.head()

Unnamed: 0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,4,0,1,4,1,2174,0,40,37,0
1,50,4,2,3,0,4,1,0,0,13,37,0
2,38,2,0,5,1,4,1,0,0,40,37,0
3,53,2,2,5,0,2,1,0,0,40,37,0
4,28,2,2,9,5,2,0,0,0,40,4,0


In [49]:
train_data = data.iloc[0:30161,:]
train_data.head()

Unnamed: 0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,4,0,1,4,1,2174,0,40,37,0
1,50,4,2,3,0,4,1,0,0,13,37,0
2,38,2,0,5,1,4,1,0,0,40,37,0
3,53,2,2,5,0,2,1,0,0,40,37,0
4,28,2,2,9,5,2,0,0,0,40,4,0


In [50]:
train_data.shape

(30161, 12)

In [51]:
test_data = data.iloc[30161:,:]

In [52]:
test_data.shape

(15060, 12)

In [53]:
X_train = train_data.drop(labels='Salary',axis=1)
X_train.head()

Unnamed: 0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,5,4,0,1,4,1,2174,0,40,37
1,50,4,2,3,0,4,1,0,0,13,37
2,38,2,0,5,1,4,1,0,0,40,37
3,53,2,2,5,0,2,1,0,0,40,37
4,28,2,2,9,5,2,0,0,0,40,4


In [54]:
y_train = train_data[['Salary']]
y_train

Unnamed: 0,Salary
0,0
1,0
2,0
3,0
4,0
...,...
30156,0
30157,1
30158,0
30159,0


In [56]:
X_test = train_data.drop(labels='Salary',axis=1)
X_test.head()

Unnamed: 0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,5,4,0,1,4,1,2174,0,40,37
1,50,4,2,3,0,4,1,0,0,13,37
2,38,2,0,5,1,4,1,0,0,40,37
3,53,2,2,5,0,2,1,0,0,40,37
4,28,2,2,9,5,2,0,0,0,40,4


In [57]:
y_test = train_data[['Salary']]
y_test

Unnamed: 0,Salary
0,0
1,0
2,0
3,0
4,0
...,...
30156,0
30157,1
30158,0
30159,0


In [58]:
std_sclr = StandardScaler()

X_train = std_sclr.fit_transform(X_train)
X_test = std_sclr.fit_transform(X_test)

### Model building andtraining

In [60]:
nb_classifier = GaussianNB()

nb_classifier.fit(X_train,y_train)

  return f(*args, **kwargs)


GaussianNB()

### Model prediction

In [61]:
y_pred = nb_classifier.predict(X_test)

In [63]:
accuracy_score(y_test,y_pred)

0.790491031464474

In [64]:
confusion_matrix(y_test,y_pred)

array([[21526,  1127],
       [ 5192,  2316]])

### Apllying PCA

In [66]:
pca_model = PCA(n_components=2)
X_train_transformed = pca_model.fit_transform(X_train)
X_test_transformed = pca_model.fit_transform(X_test)

X_train_transformed = pd.DataFrame(X_train_transformed,columns=['p1','p2'])
X_test_transformed = pd.DataFrame(X_test_transformed,columns=['p1','p2'])

X_train_transformed.head(),X_test_transformed.head()

(         p1        p2
 0  0.551924 -0.006015
 1  0.636353  0.569478
 2  1.014223  0.574208
 3  0.888299  0.402657
 4 -2.700992  0.325578,
          p1        p2
 0  0.551924 -0.006015
 1  0.636353  0.569478
 2  1.014223  0.574208
 3  0.888299  0.402657
 4 -2.700992  0.325578)

In [67]:
nb_classifier2 = GaussianNB()

nb_classifier2.fit(X_train_transformed,y_train)

  return f(*args, **kwargs)


GaussianNB()

In [69]:
y_pred2 = nb_classifier2.predict(X_test_transformed)

In [70]:
accuracy_score(y_test,y_pred2)

0.7621100096150658

In [71]:
confusion_matrix(y_test,y_pred2)

array([[21058,  1595],
       [ 5580,  1928]])