## Naive Bayes Classifier

Naive assumption: the features are conditionally independent given the class label, i.e. within each class there are no dependencies/correlations between features, so $P(X \mid C_i) = \prod_k P(x_k \mid C_i)$.

In [1]:
import pandas as pd
from pandas import DataFrame, Series

In [10]:
# Read the example table from example_data.csv (relative path).
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index written into the CSV; consider index_col=0 if it is not a feature.
df = pd.read_csv(filepath_or_buffer='example_data.csv')
# Preview up to the first 15 rows.
df.head(n=15)

Unnamed: 0,CID,age,income,student,credit_rating,buys_computer
0,1,<=30,high,no,fair,no
1,2,<=30,high,no,excellent,no
2,3,31-40,high,no,fair,yes
3,4,>40,medium,no,fair,yes
4,5,>40,low,yes,fair,yes
5,6,>40,low,yes,excellent,no
6,7,31-40,low,yes,excellent,yes
7,8,<=30,medium,no,fair,no
8,9,<=30,low,yes,fair,yes
9,10,>40,medium,yes,fair,yes


Example of a Naive Bayes classifier for a given $X = (\text{age} \le 30,\ \text{income} = \text{medium},\ \text{student} = \text{yes},\ \text{credit rating} = \text{fair})$

In [7]:
# Count how many rows fall into each distinct value of the 'age' column
# (value_counts orders the result from most to least frequent).
age_counts = df.loc[:, 'age'].value_counts()
# The index of the counts holds the distinct 'age' values.
age_counts.index


def cast_probability(data_frame: DataFrame, x_column: str, c_column: str, x_i) -> dict:
    """Estimate and print P(x_column = x_i | c_column = c_i) for every class c_i.

    Each probability is the relative frequency
    ``count(x_column == x_i and c_column == c_i) / count(c_column == c_i)``,
    computed over the rows in which both columns are non-null.

    Parameters
    ----------
    data_frame : DataFrame
        Table containing at least ``x_column`` and ``c_column``.
    x_column : str
        Name of the feature column.
    c_column : str
        Name of the class-label column.
    x_i
        Feature value to condition on.

    Returns
    -------
    dict
        Maps each class label, in descending sorted order, to the estimated
        conditional probability. A feature value that never occurs together
        with a class yields ``0.0`` for that class instead of raising
        ``KeyError``.
    """
    # Ignore rows with a missing feature or label (grouping on both columns
    # dropped them as well).
    pairs = data_frame[[x_column, c_column]].dropna()

    # Rows per class, and rows per class that also carry the requested value.
    class_totals = pairs[c_column].value_counts()
    # Guard against zero-count entries (e.g. unused categorical levels).
    class_totals = class_totals[class_totals > 0]
    matches = pairs.loc[pairs[x_column] == x_i, c_column].value_counts()

    print(f"P({x_column} = \"{x_i}\" | C_i)")
    ret = {}
    for c_i in sorted(class_totals.index, reverse=True):
        total_ci = int(class_totals[c_i])
        # A class in which x_i was never observed contributes a count of 0.
        count = int(matches.get(c_i, 0))
        probability = count / total_ci
        print(f"\tP({x_column} = \"{x_i}\" | {c_column} = \"{c_i}\") ="
              f" {count}/{total_ci} ="
              f" {probability:.3f}")
        ret[c_i] = probability
    return ret


# Conditional probability tables for each component of X given the class
# 'buys_computer'; the calls run in the order age, income, student,
# credit_rating.
p_age, p_income, p_student, p_credit_rating = (
    cast_probability(df, feature, 'buys_computer', observed)
    for feature, observed in (
        ('age', "<=30"),
        ('income', 'medium'),
        ('student', 'yes'),
        ('credit_rating', 'fair'),
    )
)


P(age = "<=30" | C_i)
	P(age = "<=30" | buys_computer = "yes") = 2/9 = 0.222
	P(age = "<=30" | buys_computer = "no") = 3/5 = 0.600
P(income = "medium" | C_i)
	P(income = "medium" | buys_computer = "yes") = 4/9 = 0.444
	P(income = "medium" | buys_computer = "no") = 2/5 = 0.400
P(student = "yes" | C_i)
	P(student = "yes" | buys_computer = "yes") = 6/9 = 0.667
	P(student = "yes" | buys_computer = "no") = 1/5 = 0.200
P(credit_rating = "fair" | C_i)
	P(credit_rating = "fair" | buys_computer = "yes") = 6/9 = 0.667
	P(credit_rating = "fair" | buys_computer = "no") = 2/5 = 0.400


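$P(X \mid C_i) = \prod_k P(x_k \mid C_i)$: the product of the per-feature conditional probabilities computed above.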

In [4]:
# Per-feature conditional probability tables, in the order they are multiplied.
p = [p_age, p_income, p_student, p_credit_rating]
# Class labels in descending sorted order.
c_is: list = sorted(p_age, reverse=True)
p_x_c = {}
for c_i in c_is:
    # Multiply the per-feature conditionals for this class.
    p_x_c_i = 1
    for table in p:
        p_x_c_i *= table[c_i]
    p_x_c[c_i] = p_x_c_i
    # Render the individual factors so the printed line shows the full product.
    inner_product = " x ".join(f"{p_i[c_i]:.3f}" for p_i in p)
    print(f"P(X|C_{c_i}) = {inner_product} = {p_x_c_i:.3f}")

P(X|C_yes) = 0.222 x 0.444 x 0.667 x 0.667 = 0.044
P(X|C_no) = 0.600 x 0.400 x 0.200 x 0.400 = 0.019


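$P(X \mid C_i)\,P(C_i)$: the class-conditional likelihood weighted by the class prior. This is proportional to the posterior $P(C_i \mid X)$, so the class with the largest value is the predicted one.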


In [5]:
# Class priors P(C_i): the relative frequency of each 'buys_computer' label.
p_c = df.loc[:, 'buys_computer'].value_counts(normalize=True)
for c_i in c_is:
    # Likelihood from the previous step and the prior for the same class.
    p_x_c_i, p_c_i = p_x_c[c_i], p_c[c_i]
    print(f"P(X|C_{c_i})P(C_{c_i}) = {p_x_c_i:.3f} x {p_c_i:.3f} = {p_x_c_i * p_c_i:.3f}")

P(X|C_yes)P(C_yes) = 0.044 x 0.643 = 0.028
P(X|C_no)P(C_no) = 0.019 x 0.357 = 0.007
