<a href="https://colab.research.google.com/github/tiensu/DATA_SCIENCE_PJ/blob/master/Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: https://chrisalbon.com/machine_learning/naive_bayes/naive_bayes_classifier_from_scratch/

In [0]:
import pandas as pd
import numpy as np

In [3]:
# Toy training set: three numeric feature columns plus the class label (Gender)
data = pd.DataFrame({
    'Height': [6, 5.92, 5.58, 5.92, 5, 5.5, 5.42, 5.75],
    'Weight': [180, 190, 170, 165, 100, 150, 130, 150],
    'Foot_Size': [12, 11, 12, 10, 6, 8, 7, 9],
    'Gender': ['male', 'male', 'male', 'male', 'female', 'female', 'female', 'female'],
})

# Display the assembled table
data

Unnamed: 0,Height,Weight,Foot_Size,Gender
0,6.0,180,12,male
1,5.92,190,11,male
2,5.58,170,12,male
3,5.92,165,10,male
4,5.0,100,6,female
5,5.5,150,8,female
6,5.42,130,7,female
7,5.75,150,9,female


In [4]:
# Single unlabeled observation to classify, with the same feature columns as the training data
person = pd.DataFrame({'Height': [6], 'Weight': [130], 'Foot_Size': [8]})

# Display the observation
person

Unnamed: 0,Height,Weight,Foot_Size
0,6,130,8


In [0]:
# Class priors P(male) and P(female), estimated as class frequencies in the training data

# Rows labelled male (True counts as 1 when summing the boolean mask)
n_male = (data['Gender'] == 'male').sum()

# Rows labelled female
n_female = (data['Gender'] == 'female').sum()

# Rows that carry a non-null gender label
total_ppl = data['Gender'].count()

# Prior of each class = class count / total count
P_male, P_female = n_male/total_ppl, n_female/total_ppl

In [7]:
# Likelihood parameters, part 1: per-class mean of every feature column
data_means = data.groupby('Gender').agg('mean')

# Display the per-class means
data_means

Unnamed: 0_level_0,Height,Weight,Foot_Size
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,5.4175,132.5,7.5
male,5.855,176.25,11.25


In [22]:
# Likelihood parameters, part 2: per-class variance of every feature column
data_var = data.groupby('Gender').agg('var')

# Display the per-class variances
data_var

Unnamed: 0_level_0,Height,Weight,Foot_Size
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.097225,558.333333,1.666667
male,0.035033,122.916667,0.916667


In [0]:
# Per-feature means for the male class, read from the 'male' row of data_means
male_height_mean = data_means.loc['male', 'Height']
male_weight_mean = data_means.loc['male', 'Weight']
male_footsize_mean = data_means.loc['male', 'Foot_Size']

In [0]:
# Per-feature means for the female class, read from the 'female' row of data_means
female_height_mean = data_means.loc['female', 'Height']
female_weight_mean = data_means.loc['female', 'Weight']
female_footsize_mean = data_means.loc['female', 'Foot_Size']

In [0]:
# Per-feature variances for the male class, read from the 'male' row of data_var
male_height_var = data_var.loc['male', 'Height']
male_weight_var = data_var.loc['male', 'Weight']
male_footsize_var = data_var.loc['male', 'Foot_Size']

In [0]:
# Per-feature variances for the female class, read from the 'female' row of data_var
female_height_var = data_var.loc['female', 'Height']
female_weight_var = data_var.loc['female', 'Weight']
female_footsize_var = data_var.loc['female', 'Foot_Size']

In [0]:
# Gaussian probability density of a feature value given a class (e.g. p(height|female))
def p_x_given_y(x, mean_y, var_y):
  """Evaluate the normal density with mean `mean_y` and variance `var_y` at `x`.

  Args:
    x: Observed feature value.
    mean_y: Mean of the feature within the class.
    var_y: Variance of the feature within the class.

  Returns:
    1/sqrt(2*pi*var_y) * exp(-(x - mean_y)**2 / (2*var_y)).
  """
  # Normalising constant in front of the exponential
  normalizer = 1/np.sqrt(2*np.pi*var_y)
  # Negative squared deviation from the mean, scaled by twice the variance
  exponent = -((x-mean_y)**2)/(2*var_y)
  return normalizer*np.exp(exponent)
  

In [30]:
"""
Apply Bayes Classifier To New Data Point
Because the marginal probability (the denominator) is the same for every class, we can ignore it; what we are actually calculating is this:
numerator of the posterior=P(female)p(height∣female)p(weight∣female)p(foot size∣female)
"""
# Unnormalized posterior for "male": the class prior times the Gaussian density of each feature
p_male = (
    P_male
    * p_x_given_y(person.loc[0, 'Height'], male_height_mean, male_height_var)
    * p_x_given_y(person.loc[0, 'Weight'], male_weight_mean, male_weight_var)
    * p_x_given_y(person.loc[0, 'Foot_Size'], male_footsize_mean, male_footsize_var)
)

p_male

6.197071843878078e-09

In [31]:
# Unnormalized posterior for "female": the class prior times the Gaussian density of each feature
p_female = (
    P_female
    * p_x_given_y(person.loc[0, 'Height'], female_height_mean, female_height_var)
    * p_x_given_y(person.loc[0, 'Weight'], female_weight_mean, female_weight_var)
    * p_x_given_y(person.loc[0, 'Foot_Size'], female_footsize_mean, female_footsize_var)
)

p_female

0.0005377909183630018

In [0]:
# Because the numerator of the posterior is greater for female than for male, we predict that the person is female.

In [51]:
"""NBC with sklearn"""

# Everything this cell needs from scikit-learn, gathered in one place
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Iris dataset: feature matrix X and response vector y
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Gaussian Naive Bayes classifier on the training portion
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = gnb.predict(X_test)

# Report accuracy on the held-out samples as a percentage
print(
    'Gaussian Naive Bayes model accuracy (in %):',
    metrics.accuracy_score(y_test, y_pred)*100,
)

Gaussian Naive Bayes model accuracy (in %): 100.0
