# Naive Bayes

In [1]:
import pandas as pd
import io
import requests
import numpy as np


In [3]:
df = pd.read_table('../data/table7.txt', delim_whitespace=True)
df

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,>40,medium,no,fair,yes
3,>40,low,yes,fair,yes
4,>40,low,yes,excellent,no
5,31..40,low,yes,excellent,yes
6,<=30,medium,no,fair,no
7,<=30,low,yes,fair,yes
8,31..40,high,yes,fair,yes
9,>40,medium,no,excellent,no


In [4]:
def class_prior(class_list, class_value):
    class_p = class_list.count(class_value)/len(class_list)
    print(f"P({class_value})={class_list.count(class_value)}/{len(class_list)}={class_p}")
    return class_p

## Train and build distribution with labels

In [5]:
class_name = "buys_computer"
class_values = df[class_name].unique()
priors = {}
for class_value in class_values:    
    priors[class_value] = (class_prior(list(df[class_name]), class_value))

P(no)=6/11=0.5454545454545454
P(yes)=5/11=0.45454545454545453


In [6]:
p_yes_dic = {}
p_no_dic = {}
for class_value in class_values: 
    for column_name in df.drop(columns=[class_name]):
        for value in df[column_name].unique():
            value_count = len(df[(df[class_name] == class_value) & (df[column_name] == value)])
            class_count = len(df[(df[class_name] == class_value)])
            if class_value == "yes":
                p_yes_dic[value] = value_count/class_count
            if class_value == "no":
                p_no_dic[value] = value_count/class_count
print(f"yes: ", p_yes_dic)
print("no: ", p_no_dic)
            

yes:  {'<=30': 0.2, '>40': 0.4, '31..40': 0.4, 'high': 0.2, 'medium': 0.2, 'low': 0.6, 'no': 0.2, 'yes': 0.8, 'fair': 0.8, 'excellent': 0.2}
no:  {'<=30': 0.5, '>40': 0.3333333333333333, '31..40': 0.16666666666666666, 'high': 0.3333333333333333, 'medium': 0.3333333333333333, 'low': 0.3333333333333333, 'no': 0.8333333333333334, 'yes': 0.16666666666666666, 'fair': 0.5, 'excellent': 0.5}


## Classify new instances without label

In [7]:
df = pd.read_table('../data/table8.txt', delim_whitespace=True, index_col=0)
df

Unnamed: 0_level_0,age,income,student,creditrating
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CustomerA,31..40,high,no,fair
CustomerB,<=30,medium,yes,excellent
CustomerC,>40,medium,yes,fair
CustomerD,31..40,high,yes,excellent


In [8]:
for class_value in class_values:  
    for index, row in df.iterrows():
        p_new = []
        if class_value=="yes":
            p_dic = p_yes_dic
        else: 
            p_dic = p_no_dic
        for value in row: 
            p_new.append(p_dic[value])
        p_new = [round(num, 2) for num in p_new]
        print(f"P({class_value}|{index}) = Product({p_new})*{priors[class_value]} = {np.product(p_new)*priors[class_value]}")

P(no|CustomerA) = Product([0.17, 0.33, 0.83, 0.5])*0.5454545454545454 = 0.012698999999999998
P(no|CustomerB) = Product([0.5, 0.33, 0.17, 0.5])*0.5454545454545454 = 0.00765
P(no|CustomerC) = Product([0.33, 0.33, 0.17, 0.5])*0.5454545454545454 = 0.005049
P(no|CustomerD) = Product([0.17, 0.33, 0.17, 0.5])*0.5454545454545454 = 0.0026010000000000004
P(yes|CustomerA) = Product([0.4, 0.2, 0.2, 0.8])*0.45454545454545453 = 0.0058181818181818196
P(yes|CustomerB) = Product([0.2, 0.2, 0.8, 0.2])*0.45454545454545453 = 0.0029090909090909098
P(yes|CustomerC) = Product([0.4, 0.2, 0.8, 0.8])*0.45454545454545453 = 0.023272727272727278
P(yes|CustomerD) = Product([0.4, 0.2, 0.8, 0.2])*0.45454545454545453 = 0.0058181818181818196
