# Understanding Naive Bayes using Python & Pandas

## 1. Create Dataset

In [1]:
import pandas as pd


# Toy HR dataset. Every row is one (department, age band, salary band, status)
# combination; Count is the number of employees that share that combination,
# so all later statistics are weighted by Count rather than by row.
td = pd.DataFrame(
    [
        ('Sales',          '31-35', '46K-50K', 'Senior', 30),
        ('Sales',          '26-30', '26K-30K', 'Junior', 40),
        ('Sales',          '31-35', '31K-35K', 'Junior', 40),
        ('IT',             '21-25', '46K-50K', 'Junior', 20),
        ('IT',             '31-35', '66K-70K', 'Senior', 5),
        ('IT',             '26-30', '46K-50K', 'Junior', 3),
        ('IT',             '41-45', '66K-70K', 'Senior', 3),
        ('Marketing',      '36-40', '46K-50K', 'Senior', 10),
        ('Marketing',      '31-35', '41K-45K', 'Junior', 4),
        ('Administration', '46-50', '36K-40K', 'Senior', 4),
        ('Administration', '26-30', '26K-30K', 'Junior', 6),
    ],
    columns=['Department', 'Age', 'Salary', 'Status', 'Count'],
)
display(td)

Unnamed: 0,Department,Age,Salary,Status,Count
0,Sales,31-35,46K-50K,Senior,30
1,Sales,26-30,26K-30K,Junior,40
2,Sales,31-35,31K-35K,Junior,40
3,IT,21-25,46K-50K,Junior,20
4,IT,31-35,66K-70K,Senior,5
5,IT,26-30,46K-50K,Junior,3
6,IT,41-45,66K-70K,Senior,3
7,Marketing,36-40,46K-50K,Senior,10
8,Marketing,31-35,41K-45K,Junior,4
9,Administration,46-50,36K-40K,Senior,4


## 2. Calculate Prior Probabilities

In [5]:
# Prior P(status): each status' share of the total head count. Rows are
# weighted by Count because every row stands for Count employees.
status_priors = td.groupby('Status').Count.sum() / td.Count.sum()

# Select each prior by its label instead of unpacking by position, so the
# values cannot be swapped silently if the status labels (and therefore the
# order in which groupby returns them) ever change. float() keeps them plain
# Python floats, which is what unpacking the Series produced.
prior_junior = float(status_priors['Junior'])
prior_senior = float(status_priors['Senior'])

display('Prior Probability Junior: ' + str(prior_junior),
        'Prior Probability Senior: ' + str(prior_senior))

'Prior Probability Junior: 0.6848484848484848'

'Prior Probability Senior: 0.3151515151515151'

## 3. Splitting Features into DFs

In [28]:
# One contingency table per feature: a row per observed feature value, with
# the Junior / Senior columns holding the summed head count for that value.
cols = ['Department', 'Age', 'Salary']
feature_list = [
    td.groupby([col, 'Status']).Count.sum()
      .unstack(fill_value=0)               # statuses become columns; unseen pairs count as 0
      .reset_index()                       # feature value back to an ordinary column
      .rename_axis(None, axis='columns')   # drop the leftover 'Status' label on the column axis
    for col in cols
]

# display() takes several objects at once; unlike a list comprehension used
# for its side effects, this leaves no "[None, None, None]" cell output.
display(*feature_list)

Unnamed: 0,Department,Junior,Senior
0,Administration,6,4
1,IT,23,8
2,Marketing,4,10
3,Sales,80,30


Unnamed: 0,Age,Junior,Senior
0,21-25,20,0
1,26-30,49,0
2,31-35,44,35
3,36-40,0,10
4,41-45,0,3
5,46-50,0,4


Unnamed: 0,Salary,Junior,Senior
0,26K-30K,46,0
1,31K-35K,40,0
2,36K-40K,0,4
3,41K-45K,4,0
4,46K-50K,23,40
5,66K-70K,0,8


[None, None, None]

## 4. Missing Bins

In [31]:
# Bins that are valid values of a feature but never occur in td. They are
# appended with zero counts so that each feature table lists them as well.
department_bins = pd.DataFrame()  # no extra department values to add

age_bins = pd.DataFrame({'Age': ['51-55', '56-60', '61-65'],
                         'Junior': [0, 0, 0],
                         'Senior': [0, 0, 0]},
                        columns=['Age', 'Junior', 'Senior'])

salary_bins = pd.DataFrame({'Salary': ['21K-25K', '51K-55K', '56K-60K', '61K-65K'],
                            'Junior': [0, 0, 0, 0],
                            'Senior': [0, 0, 0, 0]},
                           columns=['Salary', 'Junior', 'Senior'])

# Same order as cols / feature_list, so the three can be zipped together.
bin_list = [department_bins, age_bins, salary_bins]

# An empty bin frame is skipped instead of being concatenated, so the result
# does not depend on how pd.concat treats empty inputs (recent pandas versions
# deprecate part of that behaviour). sort_values / reset_index return new
# frames, so feature_list is never modified by later in-place edits.
adjusted_list = [
    (feature_df if bin_df.empty else pd.concat([feature_df, bin_df]))
    .sort_values(by=[col])
    .reset_index(drop=True)
    for feature_df, bin_df, col in zip(feature_list, bin_list, cols)
]

# One display() call for all tables; avoids the "[None, None, None]" output
# that a side-effect list comprehension would leave behind.
display(*adjusted_list)

Unnamed: 0,Department,Junior,Senior
0,Administration,6,4
1,IT,23,8
2,Marketing,4,10
3,Sales,80,30


Unnamed: 0,Age,Junior,Senior
0,21-25,20,0
1,26-30,49,0
2,31-35,44,35
3,36-40,0,10
4,41-45,0,3
5,46-50,0,4
6,51-55,0,0
7,56-60,0,0
8,61-65,0,0


Unnamed: 0,Salary,Junior,Senior
0,21K-25K,0,0
1,26K-30K,46,0
2,31K-35K,40,0
3,36K-40K,0,4
4,41K-45K,4,0
5,46K-50K,23,40
6,51K-55K,0,0
7,56K-60K,0,0
8,61K-65K,0,0
9,66K-70K,0,8


[None, None, None]

## 5. Laplace Smoothing

In [33]:
# Add-one (Laplace) smoothing, applied per feature table and per class:
# if a class has at least one bin with a zero count, every bin of that class
# in that table is incremented by 1. Tables/classes without a zero count are
# left untouched (e.g. Department). The frames in adjusted_list are updated
# in place because the following cells read them from adjusted_list.
for df in adjusted_list:
    for status in ('Senior', 'Junior'):
        if (df[status] == 0).any():
            df[status] = df[status] + 1

# One display() call for all tables; avoids the "[None, None, None]" output
# that a side-effect list comprehension would leave behind.
display(*adjusted_list)

Unnamed: 0,Department,Junior,Senior
0,Administration,6,4
1,IT,23,8
2,Marketing,4,10
3,Sales,80,30


Unnamed: 0,Age,Junior,Senior
0,21-25,21,1
1,26-30,50,1
2,31-35,45,36
3,36-40,1,11
4,41-45,1,4
5,46-50,1,5
6,51-55,1,1
7,56-60,1,1
8,61-65,1,1


Unnamed: 0,Salary,Junior,Senior
0,21K-25K,1,1
1,26K-30K,47,1
2,31K-35K,41,1
3,36K-40K,1,5
4,41K-45K,5,1
5,46K-50K,24,41
6,51K-55K,1,1
7,56K-60K,1,1
8,61K-65K,1,1
9,66K-70K,1,9


[None, None, None]

## 6. Calculate Likelihoods

In [34]:
# Turn counts into likelihoods P(feature value | status): each class column
# is divided by its own column total, so every column sums to 1 within a
# feature table. The frames in adjusted_list are updated in place because the
# following cell reads them from adjusted_list.
for df in adjusted_list:
    for status in ('Junior', 'Senior'):
        df[status] = df[status] / df[status].sum()

# One display() call for all tables; avoids the "[None, None, None]" output
# that a side-effect list comprehension would leave behind.
display(*adjusted_list)

Unnamed: 0,Department,Junior,Senior
0,Administration,0.053097,0.076923
1,IT,0.20354,0.153846
2,Marketing,0.035398,0.192308
3,Sales,0.707965,0.576923


Unnamed: 0,Age,Junior,Senior
0,21-25,0.172131,0.016393
1,26-30,0.409836,0.016393
2,31-35,0.368852,0.590164
3,36-40,0.008197,0.180328
4,41-45,0.008197,0.065574
5,46-50,0.008197,0.081967
6,51-55,0.008197,0.016393
7,56-60,0.008197,0.016393
8,61-65,0.008197,0.016393


Unnamed: 0,Salary,Junior,Senior
0,21K-25K,0.00813,0.016129
1,26K-30K,0.382114,0.016129
2,31K-35K,0.333333,0.016129
3,36K-40K,0.00813,0.080645
4,41K-45K,0.04065,0.016129
5,46K-50K,0.195122,0.66129
6,51K-55K,0.00813,0.016129
7,56K-60K,0.00813,0.016129
8,61K-65K,0.00813,0.016129
9,66K-70K,0.00813,0.145161


[None, None, None]

## 7. Staple DFs back together

In [39]:
# Stack the per-feature tables into one lookup table. Each table's own key
# column (Department / Age / Salary) is renamed to the shared name 'Feature'.
# rename() returns a new frame here, so adjusted_list keeps its original
# column names and this cell can be re-run without side effects.
# NOTE: rows are later looked up by value only, which assumes that no value
# string is shared between two different features.
likelihood_table = pd.concat(
    [df.rename(columns={col: 'Feature'}) for df, col in zip(adjusted_list, cols)]
).reset_index(drop=True)

display(likelihood_table)

Unnamed: 0,Feature,Junior,Senior
0,Administration,0.053097,0.076923
1,IT,0.20354,0.153846
2,Marketing,0.035398,0.192308
3,Sales,0.707965,0.576923
4,21-25,0.172131,0.016393
5,26-30,0.409836,0.016393
6,31-35,0.368852,0.590164
7,36-40,0.008197,0.180328
8,41-45,0.008197,0.065574
9,46-50,0.008197,0.081967


## 8. Predict

In [58]:
def predict(department, age, salary, table=None, priors=None):
    """Score one employee profile with Naive Bayes.

    For each status the score is the prior multiplied by the likelihood of
    the given department, age band and salary band (in that order). The
    scores are unnormalised: compare them with each other, they do not sum
    to 1.

    Parameters
    ----------
    department, age, salary : str
        Feature values; each must match exactly one row of the 'Feature'
        column of the likelihood table.
    table : pandas.DataFrame, optional
        Likelihood table with columns 'Feature', 'Junior' and 'Senior'.
        Defaults to the notebook-level ``likelihood_table``.
    priors : mapping, optional
        Prior probabilities keyed by 'Junior' and 'Senior'. Defaults to the
        notebook-level ``prior_junior`` / ``prior_senior``.

    Returns
    -------
    tuple
        ``(junior_value, senior_value)``.

    Raises
    ------
    ValueError
        If a feature value does not match exactly one row of the table.
    """
    if table is None:
        table = likelihood_table
    if priors is None:
        priors = {'Junior': prior_junior, 'Senior': prior_senior}

    # Work on a copy so the caller's priors mapping is never modified.
    scores = {'Junior': priors['Junior'], 'Senior': priors['Senior']}

    for value in (department, age, salary):
        # Look each feature row up once and reuse it for both classes.
        rows = table.loc[table.Feature == value]
        if len(rows) != 1:
            raise ValueError(
                'expected exactly one likelihood row for feature value '
                + repr(value) + ', found ' + str(len(rows)))
        for status in scores:
            scores[status] = scores[status] * rows[status].item()

    return scores['Junior'], scores['Senior']

# Each example profile is defined once and used both for the prediction and
# for its printed heading, so a heading can no longer describe different
# arguments than the ones that were scored (the second heading previously
# read '36-40 - 41K-45K' although '26-30' / '51K-55K' was being scored).
case1 = ('Sales', '26-30', '46K-50K')
case2 = ('Administration', '26-30', '51K-55K')
case3 = ('IT', '21-25', '66K-70K')

j1, s1 = predict(*case1)
j2, s2 = predict(*case2)
j3, s3 = predict(*case3)

display(' - '.join(case1) + ':', 'J: ' + str(j1) + ' S: ' + str(s1),
        ' - '.join(case2) + ':', 'J: ' + str(j2) + ' S: ' + str(s2),
        ' - '.join(case3) + ':', 'J: ' + str(j3) + ' S: ' + str(s3))

'Sales - 26-30 - 46K-50K:'

'J: 0.03877236983994281 S: 0.001971059083697899'

'Administration - 36-40 - 41K-45K:'

'J: 0.0001211636557498213 S: 6.409948239667965e-06'

'IT - 21-25 - 66K-70K:'

'J: 0.0001950734857572123 S: 0.00011537906831402338'