# Lecture 7 Naive Bayes

In [1]:
%matplotlib inline
# Setup cell: render matplotlib figures inline and import the libraries used below.

# NOTE(review): the commented-out import below is not used anywhere in the visible notebook.
# from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import numpy as np

import warnings
# Suppress every warning for the rest of the session (presumably to keep the cell
# outputs uncluttered). Remove this line when debugging, since it also hides
# warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

## Naive Bayes

Naive Bayes can be implemented in several ways. We discuss two specific cases in today's class: (1) If all predictors are categorical, and (2) If some or all predictors are continuous.

#### Categorical Features
If all predictors are categorical, use syntax:

**<center>sklearn.naive_bayes.CategoricalNB()</center>**
- `alpha`: the additive (Laplace) smoothing parameter. The default value is 1. To reproduce a result calculated by hand without smoothing, set `alpha=0`.

#### Continuous Features
If some or all predictors are continuous, we need to use one of the methods below:
1. Bin the continuous variables first, then use the previously mentioned syntax, sklearn.naive_bayes.CategoricalNB(). *We do not discuss this method here.*

2. Assume each continuous predictor follows a normal distribution within each class. Then we can use the syntax below:

**<center>sklearn.naive_bayes.GaussianNB()</center>**

For both models (i.e., categorical NB and Gaussian NB), we can obtain the predicted probability for each class using `.predict_proba`.

#### Practice 1
Replicate the result of the in-class practice (i.e., firm report example, probability = 0.47).


In [4]:
# Training data for the firm report example: ten records, two binary predictors.
## Column 0 - Charges: 1 = yes, 0 = no
## Column 1 - Size:    1 = large, 0 = small
X = np.array([
    [1, 0],
    [0, 0],
    [0, 1],
    [0, 1],
    [0, 0],
    [0, 0],
    [1, 0],
    [1, 1],
    [0, 1],
    [1, 1],
])
## Y: 1 = T, 0 = F -- the first six records are T, the last four are F
Y = np.array([1] * 6 + [0] * 4)
X, Y

(array([[1, 0],
        [0, 0],
        [0, 1],
        [0, 1],
        [0, 0],
        [0, 0],
        [1, 0],
        [1, 1],
        [0, 1],
        [1, 1]]),
 array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0]))

In [6]:
from sklearn.naive_bayes import CategoricalNB

# alpha=0 switches smoothing off so the fitted probabilities match the hand calculation.
cat_nb = CategoricalNB(alpha=0).fit(X, Y)

# Classify a new record with Charges = yes (1) and Size = small (0).
new_record = [[1, 0]]
cat_nb.predict(new_record)

array([0])

In [9]:
# Show the class labels next to the predicted probabilities for the record
# Charges = yes (1), Size = small (0); the probability columns follow the order of classes_.
query = [[1, 0]]
cat_nb.classes_, cat_nb.predict_proba(query)

(array([0, 1]), array([[0.52941176, 0.47058824]]))

#### Practice 2
Using the iris data, train a Naive Bayes model. Assume variables are normally distributed.
- Split the data into training and test sets
- Train the model on the training set (Use Gaussian NB)

In [11]:
from sklearn.datasets import load_iris
iris = load_iris() 
# load_iris() returns the complete dataset bundle: the data itself plus
# descriptive information. The keys listed below show everything it contains.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [13]:
# Feature matrix: one row per sample, one column per measurement (names are in iris.feature_names).
# NOTE(review): this displays every row; iris.data[:5] would preview just the first five.
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [15]:
# Species names, followed by the integer label of every sample; label k refers to target_names[k].
iris.target_names, iris.target

(array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))

In [17]:
# (number of samples, number of features)
iris.data.shape

(150, 4)

In [18]:
# Hold out part of the iris data for testing.
from sklearn.model_selection import train_test_split

# test_size=0.25 is the library default, written out here so the split is explicit;
# random_state fixes the shuffle so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.25,
    random_state=22,
)

In [19]:
# Fit a Gaussian Naive Bayes classifier on the training split only.
from sklearn.naive_bayes import GaussianNB

g_nb = GaussianNB().fit(X_train, y_train)
g_nb  # display the fitted estimator

GaussianNB()

In [20]:
# Mean accuracy on the held-out test split, then on the training split.
test_accuracy = g_nb.score(X_test, y_test)
train_accuracy = g_nb.score(X_train, y_train)
test_accuracy, train_accuracy

(0.9736842105263158, 0.9553571428571429)