# Gaussian Naive Bayes

In [1]:
# Load Python libraries
import pandas as pd
import numpy as np

In [2]:
# Load dataset and display the first several data samples.
df = pd.read_csv("customer-behaviour.csv")
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


This is a dataset for predicting whether a user purchases a specific product.
- Gender: Customer gender (categorical: male, female)
- Age: Customer age (numeric)
- EstimatedSalary: Estimated customer salary (numeric)
- Purchased (Label): whether the customer has purchased the product (categorical: 0 (no), 1 (yes)).

In [3]:
# Get some basic data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
# Drop "User ID" column as it does not contain useful information for building the model
df = df.drop(columns=["User ID"], axis=1)

In [5]:
# Show the dataframe
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


# Model Training

In [6]:
# Make a data copy
df_copy = df.copy()

In [7]:
# The Pandas get_dummies function creates dummy variables from Pandas objects in Python,
# i.e., a dummy variable is a numeric variable that encodes categorical information.
# this is also sometimes referred to as “one-hot” encoding of categorical data.
data = pd.get_dummies(data=df_copy, columns=["Gender"])

# Show the data with dummy variables
data

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,19,19000,0,0,1
1,35,20000,0,0,1
2,26,43000,0,1,0
3,27,57000,0,1,0
4,19,76000,0,0,1
...,...,...,...,...,...
395,46,41000,1,1,0
396,51,23000,1,0,1
397,50,20000,1,1,0
398,36,33000,0,0,1


In [8]:
feature_names = data.columns.tolist()     # Get the list of data features from column names
feature_names.remove("Purchased")         # Remove the label column "Purchased" from the data features
X = data[feature_names].values            # Assign data feature values to variable X

y = data.Purchased.values                 # Assign data label values to variable y

In [9]:
# Show data feature shape
X.shape

(400, 4)

In [10]:
# Show label shape
y.shape

(400,)

In [11]:
# Split the data into train/test set using sklearn library
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [12]:
# Standardize the data using Standard scaler
from sklearn.preprocessing import StandardScaler
normalizer = StandardScaler()
X_normal_train = normalizer.fit_transform(X_train)     # Note that we use fit_transform() on training data so that it can learn the scaling parameters of that data.
X_normal_test = normalizer.transform(X_test)           # But we only transform() in test data using the learned scaling parameters.

In [13]:
# Initialize and train Gaussian Naive Bayes model using X_normal_train (data features) and y_train (data label)
from sklearn.naive_bayes import GaussianNB
naive_model = GaussianNB()
naive_model.fit(X_normal_train, y_train)

GaussianNB()

In [14]:
# Impport libraries to calculate evaluation metrics: precision, recall, f1 score.
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Make prediction on the test data
predicted_label = naive_model.predict(X_normal_test)

# Calculate evaluation metrics by comparing the prediction with the data label y_test
print(precision_score(predicted_label, y_test))
print(recall_score(predicted_label, y_test))
print(f1_score(predicted_label, y_test))
print(classification_report(predicted_label, y_test))

0.7916666666666666
0.8085106382978723
0.7999999999999999
              precision    recall  f1-score   support

           0       0.88      0.86      0.87        73
           1       0.79      0.81      0.80        47

    accuracy                           0.84       120
   macro avg       0.83      0.84      0.83       120
weighted avg       0.84      0.84      0.84       120

