## Gaussian Naive Bayes classifier

Gaussian Naive Bayes is a variant of the Naive Bayes (NB) algorithm used when the features are continuous. It assumes that, within each class, every feature follows a Gaussian (i.e., normal) distribution.

There are two ways of handling continuous features:

1. To fit an approximate distribution upon these features, most commonly the Gaussian (Normal) distribution. This is called Gaussian Naive Bayes.

2. Convert the continuous features into categorical features (e.g., by binning).

## Step-wise Implementation

### Import libraries

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Load the dataset

In [None]:
# Load the dataset from "Breast_cancer_data.csv" (path is relative to the
# working directory) and preview its first 10 rows.
data = pd.read_csv("Breast_cancer_data.csv")
data.head(10)

In [None]:
# Plot a histogram of the "diagnosis" column (the target used by the models
# below) to see how the target values are distributed.

data["diagnosis"].hist()

### Checking whether the features are independent using a heat map

In [None]:
# Pairwise Pearson correlation between all columns except the last one
# (presumably the target column -- confirm against the CSV layout).
corr = data.iloc[:, :-1].corr(method="pearson")

# Diverging colour map with a dark centre, returned as a matplotlib colormap.
cmap = sns.diverging_palette(
    h_neg=250, h_pos=354, s=80, l=60, center="dark", as_cmap=True
)

# Colour scale is clipped to [-0.5, 1].
sns.heatmap(
    corr,
    vmin=-0.5,
    vmax=1,
    cmap=cmap,
    square=True,
    linewidths=0.2,
)

The attributes "mean_perimeter" and "mean_area" are positively correlated with the attribute "mean_radius". However, there is no such strong correlation among the attributes "mean_radius", "mean_texture" and "mean_smoothness", so these three are kept as features.

In [None]:
# Keep only the three selected features plus the target column.
# ``.copy()`` turns the selection into an independent DataFrame, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
data = data[["mean_radius", "mean_texture", "mean_smoothness", "diagnosis"]].copy()
data.head(10)

### Approach 1: Fitting an approximate distribution upon these features

In [None]:
# One histogram (with a KDE overlay) per selected feature, side by side on a
# shared y-axis.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6), sharey=True)
panels = [("mean_radius", "r"), ("mean_smoothness", "b"), ("mean_texture", None)]
for ax, (column, color) in zip(axes, panels):
    # color=None lets seaborn fall back to its default colour.
    sns.histplot(data=data, x=column, kde=True, color=color, ax=ax)

All three attributes roughly follow a normal distribution, so we can model them with the Gaussian distribution.

### Calculating prior probabilities: P(Y=y) for all possible y

In [None]:
def calculate_prior(df, Y):
    """Estimate the prior probability P(Y=y) of every class.

    Args:
        df: DataFrame that contains the target column.
        Y: Name of the target column.

    Returns:
        A list with the relative frequency of each class, ordered by the
        sorted unique values of ``df[Y]``.
    """
    n_rows = len(df)
    target = df[Y]
    return [len(df[target == cls]) / n_rows for cls in sorted(target.unique())]

Calculating P(X=x|Y=y) using the Gaussian distribution. Under the Naive Bayes assumption, the features are treated as conditionally independent given the class.

In [None]:
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label):
    """Evaluate the class-conditional Gaussian density P(X=feat_val | Y=label).

    The mean and standard deviation of ``feat_name`` are estimated from the
    rows of ``df`` whose target equals ``label`` (``Series.std`` with pandas'
    defaults, i.e. the sample standard deviation), and the normal pdf with
    those parameters is evaluated at ``feat_val``.

    Args:
        df: Training DataFrame holding the feature and target columns.
        feat_name: Name of the continuous feature column.
        feat_val: Value of the feature at which to evaluate the density.
        Y: Name of the target column.
        label: Class label to condition on.

    Returns:
        The value of the fitted normal pdf at ``feat_val``. When the class
        has fewer than two rows or the feature has zero variance, the
        standard deviation is NaN or zero and the result is not meaningful.
    """
    class_rows = df[df[Y] == label]
    # Per-class parameters of the fitted normal distribution.
    mean, std = class_rows[feat_name].mean(), class_rows[feat_name].std()
    # pdf of the normal distribution N(mean, std**2) evaluated at feat_val.
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((feat_val - mean) ** 2 / (2 * std ** 2)))
    return p_x_given_y

### Calculating Posterior Probabilities: P(X1=x1|Y=y) · P(X2=x2|Y=y) · ... · P(Xn=xn|Y=y) · P(Y=y) for every y, then picking the y that gives the maximum

In [None]:
def naive_bayes_gaussian(df, X, Y):
    """Predict a class for every sample in ``X`` with Gaussian Naive Bayes.

    For each sample the unnormalised posterior
    ``P(Y=y) * prod_i P(X_i=x_i | Y=y)`` is computed for every class ``y``
    found in ``df[Y]``, and the class with the largest value is predicted.

    Args:
        df: Training DataFrame with the feature columns and the target column.
        X: Iterable of samples; each sample is indexable and holds the feature
            values in the order of the non-target columns of ``df``.
        Y: Name of the target column in ``df``.

    Returns:
        A numpy array with the predicted class label of every sample.
    """
    # Every column except the target is a feature (the target does not have
    # to be the last column).
    features = [col for col in df.columns if col != Y]

    # Class labels in sorted order; calculate_prior returns the priors in the
    # same order. Computed once, since they do not depend on the sample.
    labels = sorted(df[Y].unique())
    prior = calculate_prior(df, Y)

    Y_pred = []
    for x in X:
        # Unnormalised posterior of each class for this sample.
        post_prob = []
        for label, p_y in zip(labels, prior):
            likelihood = 1
            for i, feat_name in enumerate(features):
                likelihood *= calculate_likelihood_gaussian(df, feat_name, x[i], Y, label)
            post_prob.append(likelihood * p_y)

        # Map the winning position back to its class label, so targets that
        # are not coded exactly as 0..k-1 are predicted correctly.
        Y_pred.append(labels[np.argmax(post_prob)])

    return np.array(Y_pred)

### Testing the Gaussian model

In [None]:
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=41)

# The last column is used as the target, everything before it as features.
X_test, Y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values
Y_pred = naive_bayes_gaussian(train, X=X_test, Y="diagnosis")

# Report the confusion matrix and the F1 score on the held-out rows.
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))

### Approach 2: Convert continuous features to Categorical features

In [None]:
# Discretise each continuous feature into three equal-width bins coded 0, 1, 2.
continuous_features = ["mean_radius", "mean_texture", "mean_smoothness"]
binned = {
    f"cat_{name}": pd.cut(data[name].values, bins=3, labels=[0, 1, 2])
    for name in continuous_features
}

# ``assign`` returns a new DataFrame, so nothing is written in place into the
# existing frame (in-place writes can trigger pandas' SettingWithCopyWarning
# when the frame is a selection of another one). Selecting the binned columns
# plus the target also discards the original continuous columns.
data = data.assign(**binned)[[*binned, "diagnosis"]]
data.head(10)

### Calculating P(X=x|Y=y) categorically

In [None]:
def calculate_likelihood_categorical(df, feat_name, feat_val, Y, label):
    """Estimate P(X=feat_val | Y=label) as a relative frequency.

    Args:
        df: Training DataFrame holding the feature and target columns.
        feat_name: Name of the categorical feature column.
        feat_val: Feature value whose conditional probability is wanted.
        Y: Name of the target column.
        label: Class label to condition on.

    Returns:
        The fraction of rows with ``df[Y] == label`` whose ``feat_name``
        equals ``feat_val`` (0.0 when the value never occurs in that class).

    Raises:
        ZeroDivisionError: If no row of ``df`` has the given ``label``.
    """
    class_rows = df[df[Y] == label]
    p_x_given_y = len(class_rows[class_rows[feat_name] == feat_val]) / len(class_rows)
    return p_x_given_y

### Calculating P(X1=x1|Y=y) · P(X2=x2|Y=y) · ... · P(Xn=xn|Y=y) · P(Y=y) for every y, then picking the y that gives the maximum

In [None]:
def naive_bayes_categorical(df, X, Y):
    """Predict a class for every sample in ``X`` with categorical Naive Bayes.

    For each sample the unnormalised posterior
    ``P(Y=y) * prod_i P(X_i=x_i | Y=y)`` is computed for every class ``y``
    found in ``df[Y]``, with each likelihood estimated as a relative
    frequency, and the class with the largest value is predicted.

    Args:
        df: Training DataFrame with the feature columns and the target column.
        X: Iterable of samples; each sample is indexable and holds the feature
            values in the order of the non-target columns of ``df``.
        Y: Name of the target column in ``df``.

    Returns:
        A numpy array with the predicted class label of every sample.
    """
    # Every column except the target is a feature (the target does not have
    # to be the last column).
    features = [col for col in df.columns if col != Y]

    # Class labels in sorted order; calculate_prior returns the priors in the
    # same order. Computed once, since they do not depend on the sample.
    labels = sorted(df[Y].unique())
    prior = calculate_prior(df, Y)

    Y_pred = []
    for x in X:
        # Unnormalised posterior of each class for this sample.
        post_prob = []
        for label, p_y in zip(labels, prior):
            likelihood = 1
            for i, feat_name in enumerate(features):
                likelihood *= calculate_likelihood_categorical(df, feat_name, x[i], Y, label)
            post_prob.append(likelihood * p_y)

        # Map the winning position back to its class label, so targets that
        # are not coded exactly as 0..k-1 are predicted correctly.
        Y_pred.append(labels[np.argmax(post_prob)])

    return np.array(Y_pred)

### Testing the categorical model

In [None]:
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as for the Gaussian model, now on the binned data.
train, test = train_test_split(data, test_size=0.2, random_state=41)

# The last column is used as the target, everything before it as features.
X_test, Y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values
Y_pred = naive_bayes_categorical(train, X=X_test, Y="diagnosis")

# Report the confusion matrix and the F1 score on the held-out rows.
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))

## Evaluation Section:

### Build a Naive Bayes classifier from scratch to predict whether someone will attempt to evade taxes, based on the following features:

#### Refund: Categorical feature (Yes/No)
#### Marital Status: Categorical feature (Single/Married/Divorced)
#### Taxable Income: Continuous feature
#### Evade: Target class (Yes/No)

#### Assume that categorical features follow a categorical distribution and that continuous features follow a Gaussian distribution.

### Steps for Implementation:

#### 1. Prepare the dataset
#### 2. Calculate prior probabilities for each class (Evade = Yes/No)
#### 3. Calculate likelihoods for categorical features $P(X_i \mid Y)$
#### 4. Calculate likelihoods for continuous features $P(X_i \mid Y)$ using the Gaussian distribution
#### 5. Evaluate the model

In [None]:
import pandas as pd
import numpy as np
from math import sqrt, pi, exp
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

### 1. Prepare the dataset

In [None]:
# Toy tax-evasion dataset, written one record per line.
column_names = ['Refund', 'Marital Status', 'Taxable Income', 'Evade']
records = [
    ('Yes', 'Single', 125000, 'No'),
    ('No', 'Married', 100000, 'No'),
    ('No', 'Single', 70000, 'No'),
    ('Yes', 'Married', 120000, 'No'),
    ('No', 'Divorced', 95000, 'Yes'),
    ('No', 'Married', 60000, 'Yes'),
    ('Yes', 'Divorced', 130000, 'No'),
    ('No', 'Single', 75000, 'Yes'),
    ('Yes', 'Single', 115000, 'No'),
    ('No', 'Married', 90000, 'Yes'),
]

# Transpose the records into a column-name -> list-of-values mapping.
data = {name: list(values) for name, values in zip(column_names, zip(*records))}

# Build the DataFrame from the column mapping and show it.
df = pd.DataFrame(data)
print(df)

### 2. Calculating Prior Probabilities