In [1]:
import joblib
import numpy
import pandas
import scipy
import scipy.stats

# Instructions

- Read **the train data** from the CSV file and properly set the index
- Use `joblib` to load the trained model and print out the model parameters
- Then print out the individual fitted parameters of the model (`theta_`, `var_`, `class_prior_`)

In [2]:
# Load the training table and use the CSV's `id` column as the row index.
data_train = pandas.read_csv('./data/features.train.csv', index_col='id')
data_train

Unnamed: 0_level_0,feature_1,feature_2,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
253,2.2189,0.3114,1
667,-0.2929,0.3086,1
85,0.6741,-0.4251,1
969,0.9695,-0.5618,1
75,-1.5488,0.6378,0
...,...,...,...
835,0.6986,-0.2398,1
192,-0.4636,0.5754,0
629,0.6531,0.2976,0
559,1.1244,1.4332,0


In [3]:
# Restore the previously trained model from disk and display it.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model_path = 'model/model.joblib'
model = joblib.load(model_path)
model

In [4]:
# Print each fitted parameter under its own heading. The labels keep their
# original spacing so the emitted text is unchanged.
for label, value in (
    ("model.theta_ :", model.theta_),
    ("model.var_   :", model.var_),
    ("model.class_prior_   :", model.class_prior_),
):
    print(f"{label}\n {value}")

model.theta_ :
 [[ 0.00135744  0.66153238]
 [ 0.99415068 -0.14765967]]
model.var_   :
 [[0.56437314 0.19552838]
 [0.58906966 0.17705458]]
model.class_prior_   :
 [0.51066667 0.48933333]


# Instructions
- Use `.predict_proba` to calculate prediction probabilities for all training samples
- Properly put the prediction probabilities $\mathrm{P}(y=1| \mathbf{x})$ into `pandas.DataFrame`, with a column named `probability`
- The prediction probabilities $\mathrm{P}(y=0| \mathbf{x})$ can be ignored

In [5]:
# Ask the model for class probabilities of every training sample.
# The displayed result has one row per sample and one column per class.
feature_matrix = data_train[['feature_1', 'feature_2']]
probabilities_from_sklearn = model.predict_proba(feature_matrix)
probabilities_from_sklearn


array([[0.05801393, 0.94198607],
       [0.8338463 , 0.1661537 ],
       [0.0430407 , 0.9569593 ],
       ...,
       [0.48951577, 0.51048423],
       [0.98841339, 0.01158661],
       [0.01192084, 0.98807916]])

In [6]:
# Keep only column 1 of the predict_proba output (taken here as P(y=1|x)) and
# wrap it in a single-column DataFrame indexed by sample id.
# NOTE(review): this rebinds `probabilities_from_sklearn` from an ndarray to a
# DataFrame, so re-running this cell without first re-running the
# predict_proba cell will presumably fail on the `[:, 1]` slice.
positive_class_probability = probabilities_from_sklearn[:, 1]
probabilities_from_sklearn = pandas.DataFrame(
    positive_class_probability,
    index=data_train.index,
    columns=['probability'],
)
probabilities_from_sklearn

Unnamed: 0_level_0,probability
id,Unnamed: 1_level_1
253,0.941986
667,0.166154
85,0.956959
969,0.984602
75,0.005969
...,...
835,0.916488
192,0.043767
629,0.510484
559,0.011587


# Instructions
- In Naive Bayes models with binary features, $P(x_i|y)$ can be estimated easily by counting
- However, when the features are continuous, normal distributions are used instead
- Now, use `scipy.stats.norm.pdf` to calculate the class-conditional probabilities $P(x_i|y)$ for all training samples, which are:
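  - $P(x_0|y=0) = \mathrm{scipy.stats.norm.pdf}(x_0, \mathrm{loc}=\mathrm{mean}_{0,0}, \mathrm{scale}=\mathrm{standard\_deviation}_{0,0})$
  - $P(x_1|y=0) = \mathrm{scipy.stats.norm.pdf}(x_1, \mathrm{loc}=\mathrm{mean}_{0,1}, \mathrm{scale}=\mathrm{standard\_deviation}_{0,1})$
  - $P(x_0|y=1) = \mathrm{scipy.stats.norm.pdf}(x_0, \mathrm{loc}=\mathrm{mean}_{1,0}, \mathrm{scale}=\mathrm{standard\_deviation}_{1,0})$
  - $P(x_1|y=1) = \mathrm{scipy.stats.norm.pdf}(x_1, \mathrm{loc}=\mathrm{mean}_{1,1}, \mathrm{scale}=\mathrm{standard\_deviation}_{1,1})$
  - Here $\mathrm{mean}_{k,i}$ and $\mathrm{standard\_deviation}_{k,i}$ are the mean and standard deviation of feature $x_i$ within class $y=k$, i.e. `model.theta_[k, i]` and the square root of `model.var_[k, i]`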
  - Hint: https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes 
- Then, **manually calculate** the prediction probabilities $P(y|x_0,x_1)$ for all training samples
- Also, properly put the data into `pandas.DataFrame`, with a column named `probability`


In [7]:
# Class-conditional Gaussian densities p(x_i | y=k) for every training sample.
#
# Each column is produced by ONE vectorized call to scipy.stats.norm.pdf over
# the whole feature column, instead of Series.apply invoking the pdf once per
# row. The four copy-pasted blocks are replaced by a single comprehension.
#
# Column naming: 'x{i}_given_y_is_{k}', where i is the feature position
# (0 -> 'feature_1', 1 -> 'feature_2') and k is the class row used to index
# model.theta_ / model.var_.
# NOTE(review): assumes row k of theta_/var_ corresponds to class label k
# (i.e. model.classes_ is [0, 1]) -- confirm against the trained model.
feature_columns = ['feature_1', 'feature_2']

proba = pandas.DataFrame(
    {
        f'x{feature_pos}_given_y_is_{class_pos}': scipy.stats.norm.pdf(
            data_train[feature_name].to_numpy(),
            loc=model.theta_[class_pos, feature_pos],
            # var_ holds variances; norm.pdf expects a standard deviation.
            scale=model.var_[class_pos, feature_pos] ** 0.5,
        )
        # Class 1 first, then class 0, to keep the original column order.
        for class_pos in (1, 0)
        for feature_pos, feature_name in enumerate(feature_columns)
    },
    # The pdf values are plain arrays, so attach the sample ids explicitly.
    index=data_train.index,
)

proba

Unnamed: 0_level_0,x0_given_y_is_1,x1_given_y_is_1,x0_given_y_is_0,x1_given_y_is_0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
253,0.145506,0.522879,0.006809,0.659414
667,0.127408,0.526677,0.491826,0.656102
85,0.476505,0.762875,0.355625,0.044054
969,0.519521,0.584126,0.231469,0.019647
75,0.002148,0.166038,0.063177,0.900906
...,...,...,...,...
835,0.482644,0.925644,0.345206,0.113001
192,0.085601,0.216598,0.438478,0.885250
629,0.470923,0.541635,0.364497,0.643005
559,0.512357,0.000816,0.173724,0.196786


In [8]:
# Bayes' rule under the naive (feature-independence) assumption:
#   P(y=1 | x0, x1) = p(x0|y=1) p(x1|y=1) P(y=1) / sum_k p(x0|y=k) p(x1|y=k) P(y=k)
# Each joint term is computed once and reused in the denominator.
joint_y_is_1 = proba['x0_given_y_is_1'] * proba['x1_given_y_is_1'] * model.class_prior_[1]
joint_y_is_0 = proba['x0_given_y_is_0'] * proba['x1_given_y_is_0'] * model.class_prior_[0]

probabilities_from_calculation = joint_y_is_1 / (joint_y_is_1 + joint_y_is_0)

In [9]:
# Wrap the manually computed posterior in a single-column DataFrame so it has
# the same layout as the sklearn-based table.
# NOTE(review): passing `columns=` alongside a Series presumably relies on the
# Series being unnamed -- confirm if the upstream calculation changes.
probabilities_from_calculation = pandas.DataFrame(
    probabilities_from_calculation,
    index=data_train.index,
    columns=['probability'],
)
probabilities_from_calculation

Unnamed: 0_level_0,probability
id,Unnamed: 1_level_1
253,0.941986
667,0.166154
85,0.956959
969,0.984602
75,0.005969
...,...
835,0.916488
192,0.043767
629,0.510484
559,0.011587


# Instructions
- Verify that the probabilities from `.predict_proba` and manual calculations are equal
    - Hint: use `numpy.allclose` to compare float data

In [10]:
# Float comparison with tolerance; rtol/atol are numpy.allclose's defaults,
# written out so the acceptance threshold is visible.
numpy.allclose(
    probabilities_from_sklearn,
    probabilities_from_calculation,
    rtol=1e-05,
    atol=1e-08,
)

True

# Instructions
- Use `.predict` to produce predictions for all training samples
- Properly put the predictions into `pandas.DataFrame`, with a column named `prediction`
- Now, produce predictions for all training samples using the manually calculated probabilities
- Also, properly put the predictions into another `pandas.DataFrame`, with a column named `prediction`
- Verify that the predictions from `.predict` and manual calculations are equal

In [11]:
# Hard class predictions from the model for every training sample.
feature_matrix = data_train[['feature_1', 'feature_2']]
predictions_from_sklearn = model.predict(feature_matrix)
predictions_from_sklearn

array([1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,

In [12]:
# Put the predicted labels into a single-column DataFrame keyed by sample id.
predictions_from_sklearn = pandas.DataFrame(
    predictions_from_sklearn,
    index=data_train.index,
    columns=['prediction'],
)
predictions_from_sklearn

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
253,1
667,0
85,1
969,1
75,0
...,...
835,1
192,0
629,1
559,0


In [13]:
# Turn the manual P(y=1|x) into hard labels: 1 when the probability is
# strictly greater than 0.5, otherwise 0.
predictions_from_calculation = (
    probabilities_from_calculation['probability']
    .gt(0.5)
    .astype(int)
    .to_frame(name='prediction')
)
predictions_from_calculation

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
253,1
667,0
85,1
969,1
75,0
...,...
835,1
192,0
629,1
559,0


In [14]:
# Element-wise comparison of the two prediction tables, reduced per column:
# the result is True for `prediction` only if every sample agrees.
predictions_match = predictions_from_sklearn == predictions_from_calculation
predictions_match.all()

prediction    True
dtype: bool