In [1]:
# Required Python Machine learning Packages
import pandas as pd
import numpy as np
# For preprocessing the data
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
# To split the dataset into train and test datasets
from sklearn.cross_validation import train_test_split
# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score

In [None]:
# Load the raw dataset from 'adult.data'. The file is read without a header
# row (header=None), and the separator is a regular expression -- a comma with
# any number of spaces on either side -- which is why the python parsing
# engine is requested.
adult_df = pd.read_csv(
    'adult.data',
    sep=' *, *',
    header=None,
    engine='python',
)

In [None]:
# Name the 15 columns of the frame (it was loaded without a header row).
adult_df.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]

In [None]:
# Missing-data check: count the NaN entries in every column.
# NOTE(review): only real NaN values are counted here; entries holding the
# literal string '?' are not NaN and are counted separately.
pd.isnull(adult_df).sum()

In [None]:
# For each categorical column, report how many entries are the literal '?'
# placeholder.
for value in ['workclass', 'education', 'marital_status', 'occupation',
              'relationship', 'race', 'sex', 'native_country', 'income']:
    # The comparison yields a boolean Series; its vectorised .sum() is the
    # number of matches. Formatting into a single string prints the same
    # "name : count" line on Python 2 and Python 3 (the bare print statement
    # is a SyntaxError on Python 3).
    print("{} : {}".format(value, (adult_df[value] == '?').sum()))

In [None]:
# Data preprocessing
# Before imputing missing values we want summary statistics of the dataframe;
# describe() generates them while excluding NaN values. This cell only binds
# the name adult_df_rev that the following cells summarise and modify.
# NOTE(review): this is a plain assignment, not a copy -- adult_df_rev and
# adult_df refer to the same DataFrame object, so in-place changes made
# through one name are visible through the other. Switching to
# adult_df.copy() is only safe if every later cell that still reads adult_df
# is changed to read adult_df_rev.
adult_df_rev = adult_df

In [None]:
# Summary statistics for every attribute: include='all' asks describe() to
# report on all columns rather than a subset. Take some time to read through
# each statistic in the resulting table.
adult_df_rev.describe(include='all')

In [None]:
# Data Imputation Step
# Some categorical columns mark missing entries with the literal string '?'.
# Each '?' is replaced by the most frequent value of its column (the value
# describe() reports in its "top" row).
for column in ['workclass', 'education', 'marital_status', 'occupation',
               'relationship', 'race', 'sex', 'native_country', 'income']:
    # Rank only the known values, so that '?' can never be chosen as its own
    # replacement even if it happens to be the most common entry.
    known_values = adult_df_rev.loc[adult_df_rev[column] != '?', column]
    if known_values.empty:
        # Nothing to impute from: the column holds no value other than '?'.
        continue
    # value_counts() sorts by descending frequency, so the first index label
    # is the most frequent value.
    most_frequent = known_values.value_counts().index[0]
    # Assign the result back to the column instead of calling
    # replace(..., inplace=...) on a selected column: the original passed the
    # string 'True' rather than a bool and relied on mutating a selection.
    adult_df_rev[column] = adult_df_rev[column].replace('?', most_frequent)

# The imputation step is done. You can check the changes by displaying
# adult_df_rev.

In [None]:
# For naive Bayes all values must be in one numeric format, so every
# categorical string column is label-encoded to integer codes between 0 and
# n_classes-1. Note this is label encoding, not one-hot encoding.
#
# The columns are read through adult_df_rev -- the name the imputation cell
# writes to -- so the encoder sees the imputed values without depending on
# adult_df and adult_df_rev being the same object.
#
# The single encoder is re-fitted for every column, so after this cell `le`
# only holds the classes of the last column fitted (native_country).
le = preprocessing.LabelEncoder()
workclass_cat = le.fit_transform(adult_df_rev['workclass'])
education_cat = le.fit_transform(adult_df_rev['education'])
marital_cat = le.fit_transform(adult_df_rev['marital_status'])
occupation_cat = le.fit_transform(adult_df_rev['occupation'])
relationship_cat = le.fit_transform(adult_df_rev['relationship'])
race_cat = le.fit_transform(adult_df_rev['race'])
sex_cat = le.fit_transform(adult_df_rev['sex'])
native_country_cat = le.fit_transform(adult_df_rev['native_country'])

In [None]:
# Attach each encoded array to the frame as a new "<name>_cat" column, in the
# order listed.
encoded_columns = [
    ('workclass_cat', workclass_cat),
    ('education_cat', education_cat),
    ('marital_cat', marital_cat),
    ('occupation_cat', occupation_cat),
    ('relationship_cat', relationship_cat),
    ('race_cat', race_cat),
    ('sex_cat', sex_cat),
    ('native_country_cat', native_country_cat),
]
for column_name, codes in encoded_columns:
    adult_df_rev[column_name] = codes


In [None]:
# Remove the original string-valued categorical columns from the frame.
# drop() is not called in place, so adult_df_rev is rebound to the frame it
# returns.
dummy_fields = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]
adult_df_rev = adult_df_rev.drop(labels=dummy_fields, axis=1)

In [None]:
# Reorder the columns so that each encoded column sits where its string
# original used to be and 'income' comes last.
# reindex(columns=...) replaces DataFrame.reindex_axis, which is deprecated
# and no longer available in current pandas releases.
adult_df_rev = adult_df_rev.reindex(columns=[
    'age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num',
    'marital_cat', 'occupation_cat', 'relationship_cat', 'race_cat',
    'sex_cat', 'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country_cat', 'income',
])

# Show the first row to confirm the new column order.
adult_df_rev.head(1)


In [None]:
# Standardization of Data
# Rescale every feature column to (x - mean) / std and remember the statistics
# that were used in scaled_features as {column: [mean, std]}.
# NOTE(review): this cell overwrites the columns it reads, so running it a
# second time standardises already-standardised data and overwrites the
# stored statistics.
num_features = ['age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num',
                'marital_cat', 'occupation_cat', 'relationship_cat', 'race_cat',
                'sex_cat', 'capital_gain', 'capital_loss', 'hours_per_week',
                'native_country_cat']

scaled_features = {}
for each in num_features:
    mean, std = adult_df_rev[each].mean(), adult_df_rev[each].std()
    scaled_features[each] = [mean, std]
    # A constant column has std == 0; dividing by it would fill the column
    # with NaN, so such a column is only centred.
    divisor = std if std != 0 else 1.0
    # Replace the whole column rather than writing through .loc[:, each]:
    # the result is floating point, and an in-place write into an integer
    # column is an incompatible-dtype assignment.
    adult_df_rev[each] = (adult_df_rev[each] - mean) / divisor


In [None]:
# Data Slicing
# The first 14 columns are the features and the column at position 14 is the
# target. Slicing by position with iloc *before* converting to an array keeps
# the feature matrix at the columns' own dtype; calling .values on the whole
# frame first would produce one array spanning feature and target columns.
features = adult_df_rev.iloc[:, :14].values
target = adult_df_rev.iloc[:, 14].values

# Hold out 33% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.33, random_state=10)


In [None]:
# Gaussian Naive Bayes Implementation
# Fit the classifier on the training split (fit() returns the estimator
# itself), then predict labels for the held-out test features.
clf = GaussianNB().fit(features_train, target_train)
target_pred = clf.predict(features_test)


In [None]:
# Accuracy of our Gaussian Naive Bayes model on the test split.
# normalize=True is passed explicitly, as in the original call.
accuracy_score(y_true=target_test, y_pred=target_pred, normalize=True)