In [124]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [125]:
data = pd.read_csv('adult.csv')
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'Target']
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [126]:
data.shape

(32560, 15)

In [127]:
data.isna().sum().sum()

0

In [128]:
(data.values == '?').sum()

0

In [129]:
data = data.replace('?', np.nan)
data.dropna(inplace=True)

In [130]:
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Target            0
dtype: int64

In [131]:
data.shape

(32560, 15)

In [132]:
# data['Target'].unique()
data['marital-status'].unique()
# data['sex'].unique()

array([' Married-civ-spouse', ' Divorced', ' Married-spouse-absent',
       ' Never-married', ' Separated', ' Married-AF-spouse', ' Widowed'],
      dtype=object)

In [133]:
data = data.replace(' <=50K', 1)
data = data.replace(' >50K', 2)

data = data.replace(' Male', 1)
data = data.replace(' Female', 2)

In [134]:
data['Target'].unique()

array([1, 2])

In [135]:
data_t = data[['age', 'fnlwgt', 'sex', 'hours-per-week', 'Target']]
data_t.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,50,83311,1,13,1
1,38,215646,1,40,1
2,53,234721,1,40,1
3,28,338409,2,40,1
4,37,284582,2,40,1


## **Error correcting (Outlier Detection and Removal)**



In [136]:
data_t = data_t.astype(float)
data_t.shape

(32560, 5)

In [137]:
def remove_outliers_zscore(data_t, threshold=3):
  zscore = np.abs((data_t - data_t.mean()) / data_t.std())
  out = zscore > 3
  data_t = data_t[~out.any(axis=1)]

  return data_t

filtered_data = remove_outliers_zscore(data_t)

In [138]:
filtered_data.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,50.0,83311.0,1.0,13.0,1.0
1,38.0,215646.0,1.0,40.0,1.0
2,53.0,234721.0,1.0,40.0,1.0
3,28.0,338409.0,2.0,40.0,1.0
4,37.0,284582.0,2.0,40.0,1.0


In [139]:
filtered_data.shape

(31668, 5)

In [141]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [144]:
filtered_data.loc[filtered_data['Target'] == 1, 'Target'] = 'Y'
filtered_data.loc[filtered_data['Target'] == 2, 'Target'] = 'N'

le = LabelEncoder()
le.fit(filtered_data['Target'])
le.classes_

array(['N', 'Y'], dtype=object)

In [146]:
filtered_data['Target'] = le.transform(filtered_data['Target'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Target'] = le.transform(filtered_data['Target'])


In [147]:
filtered_data.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,50.0,83311.0,1.0,13.0,1
1,38.0,215646.0,1.0,40.0,1
2,53.0,234721.0,1.0,40.0,1
3,28.0,338409.0,2.0,40.0,1
4,37.0,284582.0,2.0,40.0,1


In [150]:
filtered_data['Target'].unique()
data1 = filtered_data

In [151]:
X = data1.drop('Target', axis=1)
y = data1['Target']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [152]:
reg = LogisticRegression()
reg.fit(x_train, y_train)

In [154]:
y_pred_reg = reg.predict(x_test)
print('Accuracy - Logistic Regression : ', accuracy_score(y_test, y_pred_reg))

Accuracy - Logistic Regression :  0.7565519419008525


In [155]:
navi = GaussianNB()
navi.fit(x_train, y_train)

In [157]:
y_pred_navi = navi.predict(x_test)
print('Accuracy - Naive Bayes : ', accuracy_score(y_test, y_pred_navi))

Accuracy - Naive Bayes :  0.7605515208925376


In [158]:

print('Accuracy - Logistic Regression : ', accuracy_score(y_test, y_pred_reg))
print('Accuracy - Naive Bayes : ', accuracy_score(y_test, y_pred_navi))

Accuracy - Logistic Regression :  0.7565519419008525
Accuracy - Naive Bayes :  0.7605515208925376
