Naive Bayes
====


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import scipy.stats as stats

import time
import memory_profiler

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# open dataset and adding header 
headerList = ['age', 'workclass', 'fnlwgt', 'education', 'education-num' , 'marital-status','occupation', 'relationship' , 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','label' ] 

data=pd.read_csv('adult.data',names=headerList,sep=',',skipinitialspace=True)
print(data)

       age         workclass  fnlwgt   education  education-num  \
0       39         State-gov   77516   Bachelors             13   
1       50  Self-emp-not-inc   83311   Bachelors             13   
2       38           Private  215646     HS-grad              9   
3       53           Private  234721        11th              7   
4       28           Private  338409   Bachelors             13   
...    ...               ...     ...         ...            ...   
32556   27           Private  257302  Assoc-acdm             12   
32557   40           Private  154374     HS-grad              9   
32558   58           Private  151910     HS-grad              9   
32559   22           Private  201490     HS-grad              9   
32560   52      Self-emp-inc  287927     HS-grad              9   

           marital-status         occupation   relationship   race     sex  \
0           Never-married       Adm-clerical  Not-in-family  White    Male   
1      Married-civ-spouse    Exec-manag

#Data preprocessing  
======================

In [2]:
#Remove duplicate data 
data = data.drop_duplicates()

In [3]:
#change value '?' to mode value of that column
data[data == '?'] = np.nan
colWithMissingVal = ['workclass', 'occupation', 'native-country']

for column in colWithMissingVal:
    data[column].fillna(data[column].mode()[0], inplace=True)

In [4]:
#Cap outliers : cap age to 99 percentile
p_age = np.percentile(data['age'], 99)
print(p_age)

data['age'] = np.where(data['age'] > np.percentile(data['age'], 99), np.percentile(data['age'], 99), data['age'])

74.0


In [5]:
#Trim outliers: for 'education-num' and 'hours-per-week' column: only keep rows in dataframe with all z-scores less than absolute value of 3 

data = data[(np.abs(stats.zscore(data['education-num'])) < 3)]
data = data[(np.abs(stats.zscore(data['hours-per-week'])) < 3)]

In [6]:
#Convert label(income) column :'<=50K': 0, '>50K': 1}
data['label'] = data['label'].map({'<=50K': 0, '>50K': 1})

# Convert age and education.num to integer data type
data['age'] = data['age'].astype(int)
data['education-num'] = data['education-num'].astype(int)
data['hours-per-week'] = data['hours-per-week'].astype(int)

In [7]:
#for categorical Data, use LabelEncoder to transform these non-numerical labels.
categoricalData = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
label_encoder = LabelEncoder()
for i in categoricalData:
    data[i] = label_encoder.fit_transform(data[i])

#for continuous attribute: Standardize features by removing the mean and scaling to unit variance
scalerData= ['fnlwgt', 'capital-gain', 'capital-loss']
scaler = StandardScaler()
for i in scalerData:
    data[i] = scaler.fit_transform(data[i].values.reshape(-1, 1))

In [8]:
headerList = ['age', 'workclass', 'fnlwgt', 'education', 'education-num' , 'marital-status','occupation', 'relationship' , 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','label' ] 

for i in range(0, len(headerList)): 
    print( headerList[i], data[headerList[i]].unique() )

age [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 74 27 33 17 55 61 70 64 66 51 26 58
 60 65 62 63 67 72 69 71 68 73]
workclass [6 5 3 0 1 4 7 2]
fnlwgt [-1.0644036  -1.00937187  0.24733637 ... -1.47702289 -0.99655171
  0.64291908]
education [ 8 10  1 11  5  6  7  4  9 13 12  3  0  2]
education-num [13  9  7 14  5 12 11  4 16 10 15  3  6  8]
marital-status [4 2 0 3 5 1 6]
occupation [ 0  3  5  9  7 11  2 13  4  6 12 10  1  8]
relationship [1 0 5 3 4 2]
race [4 2 1 0 3]
sex [1 0]
capital-gain [ 1.49131647e-01 -1.45993581e-01  1.76594005e+00  5.56931180e-01
  5.34532071e-01  1.80761904e-01  1.80123561e+00  1.89354709e+00
  8.97669139e-01  4.48247682e+00  4.05703258e-01  4.49415458e-01
  8.44725791e-01  4.52812336e-02  3.52759909e-01 -2.77503622e-03
  3.24251953e-01  1.32298378e-01  1.49403152e-01 -6.53567888e-02
  2.57597328e+00  7.83773064e-01  4.10726088e-01  4.82708556e-03
  1.02337565e+00  3.17057087e-01  2.06554817e-01 

In [9]:
# Separate features (X) and target variable (y)
X = data.drop(['label'] , axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Building Naive Bayes Classifier
==

In [10]:
# Create a Naive Bayes Classifier with hyperparameter tuning

#load memory_profiler to calculate memory usage
%load_ext memory_profiler  
#%reload_ext memory_profiler

#state Cross-validation methods
cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=999)

#state range of 'var_smoothing' for testing
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

#=======================================================================================

#record start time
start_time = time.time()

%memit nb_classifier = GaussianNB()
%memit nbModel_grid = GridSearchCV(estimator=nb_classifier, param_grid=params_NB, cv=cv_method, verbose=1, scoring='accuracy') 
%memit nbModel_grid.fit(X_train, y_train)

#time taken performing tuning
tuning_time = time.time()

%memit best_nb = nbModel_grid.best_estimator_
%memit best_nb.fit(X_train, y_train)

#record end time
end_time = time.time()

#=======================================================================================

total_tuning_time = tuning_time - start_time #Calculate time for hyperparameter tuning
model_training_time = end_time - tuning_time #Calculate time for train the model 
total_time = end_time - start_time  # Calculate total time

print('=======================================================================================')
print("Total time taken to tune the hyperparameter: {:0.3f} seconds".format(total_tuning_time))
print("Total time taken to train the model: {:0.3f} seconds".format(model_training_time))
print("Total time taken to build the Naive Bayes model: {:0.3f} seconds".format(total_time))

print("Hyperparameter used: {}".format(nbModel_grid.best_params_))


peak memory: 173.81 MiB, increment: 0.07 MiB
peak memory: 173.82 MiB, increment: 0.00 MiB
Fitting 15 folds for each of 100 candidates, totalling 1500 fits
peak memory: 183.88 MiB, increment: 10.06 MiB
peak memory: 176.69 MiB, increment: 0.01 MiB
peak memory: 176.69 MiB, increment: 0.00 MiB
Total time taken to tune the hyperparameter: 21.661 seconds
Total time taken to train the model: 1.916 seconds
Total time taken to build the Naive Bayes model: 23.577 seconds
Hyperparameter used: {'var_smoothing': 0.0006579332246575676}


In [11]:
y_pred = best_nb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

accuracy, precision, recall, f1

(0.818880351262349,
 0.7388059701492538,
 0.38596491228070173,
 0.5070422535211266)

In [12]:
# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      4838
           1       0.74      0.39      0.51      1539

    accuracy                           0.82      6377
   macro avg       0.78      0.67      0.70      6377
weighted avg       0.81      0.82      0.80      6377



#Test using adult.test dataset
=

In [13]:
testData=pd.read_csv('adult.test',names=headerList,sep=',',skipinitialspace=True)
#testData.info()
print(testData.head())

#remove 1st row
testData = testData.iloc[1: , :]
print(testData.head())

                    age  workclass    fnlwgt     education  education-num  \
0  |1x3 Cross validator        NaN       NaN           NaN            NaN   
1                    25    Private  226802.0          11th            7.0   
2                    38    Private   89814.0       HS-grad            9.0   
3                    28  Local-gov  336951.0    Assoc-acdm           12.0   
4                    44    Private  160323.0  Some-college           10.0   

       marital-status         occupation relationship   race   sex  \
0                 NaN                NaN          NaN    NaN   NaN   
1       Never-married  Machine-op-inspct    Own-child  Black  Male   
2  Married-civ-spouse    Farming-fishing      Husband  White  Male   
3  Married-civ-spouse    Protective-serv      Husband  White  Male   
4  Married-civ-spouse  Machine-op-inspct      Husband  Black  Male   

   capital-gain  capital-loss  hours-per-week native-country   label  
0           NaN           NaN             NaN

In [14]:
#Convert 'label' column
testData['label'] = testData['label'].map({'<=50K.': 0, '>50K.': 1})

# Convert age and education.num to integer data type
testData['age'] = testData['age'].astype(int)
testData['education-num'] = testData['education-num'].astype(int)
testData['hours-per-week'] = testData['hours-per-week'].astype(int)

In [15]:
categoricalData = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
label_encoder = LabelEncoder()

for i in categoricalData:
    testData[i] = label_encoder.fit_transform(testData[i])

scalerData= ['fnlwgt', 'capital-gain', 'capital-loss']
scaler = StandardScaler()
for i in scalerData:
    testData[i] = scaler.fit_transform(testData[i].values.reshape(-1, 1))

#print(testData)

In [16]:
# Select features of test set
test_dataset_log = testData.drop(['label'], axis=1)
test_dataset_result = testData['label']

# Using trained SMV model to predict outcome
y_pred_test = best_nb.predict(test_dataset_log)

In [17]:
# Evaluate the model
accuracy = accuracy_score(test_dataset_result, y_pred_test)
precision = precision_score(test_dataset_result, y_pred_test)
recall = recall_score(test_dataset_result, y_pred_test)
f1 = f1_score(test_dataset_result, y_pred_test)

accuracy, precision, recall, f1

(0.8220011055831952,
 0.7073490813648294,
 0.4204368174726989,
 0.5273972602739726)