# 5 Naive Bayes Classifier from Scratch
Write a program to implement the naïve Bayesian classifier for a sample training data set stored
as a .CSV file. Compute the accuracy of the classifier, considering few test data sets.

In [7]:
import pandas as pd

In [8]:
def probAttr(data, attr, val):
    Total = data.shape[0]
    cnt = len(data[data[attr] == val])
    return cnt, cnt / Total

In [9]:
def train(data, Attr, conceptVals, concept):
    conceptProbs = {}
    countConcept = {}
    for cVal in conceptVals:
        countConcept[cVal], conceptProbs[cVal] = probAttr(data, concept, cVal)
    
    AttrConcept = {}
    probability_list = {}
    for att in Attr:
        probability_list[att] = {}
        AttrConcept[att] = {}
        for val in Attr[att]:
            AttrConcept[att][val] = {}
            a, probability_list[att][val] = probAttr(data, att, val)
            for cVal in conceptVals:
                dataTemp = data[data[att] == val]
                AttrConcept[att][val][cVal] = len(dataTemp[dataTemp[concept] == cVal]) / countConcept[cVal]
    print(f"P(A) : {conceptProbs}\n")
    print(f"P(X/A) : {AttrConcept}\n")
    print(f"P(X) : {probability_list}\n")
    return conceptProbs, AttrConcept, probability_list

In [10]:
def test(examples, Attr, concept_list, conceptProbs, AttrConcept,probability_list):
    misclassification_count = 0
    Total = len(examples)
    for ex in examples:
        px = {}
        for a in Attr:
            for x in ex:
                for c in concept_list:
                    if x in AttrConcept[a]:
                        if c not in px:
                            px[c] = conceptProbs[c] * AttrConcept[a][x][c] /probability_list[a][x]
                        else:
                            px[c] = px[c] * AttrConcept[a][x][c] /probability_list[a][x]
        print(px)
        classification = max(px, key=px.get)
        print(f"Classification : {classification} Expected : {ex[-1]}")
        if (classification != ex[-1]):
            misclassification_count += 1
    misclassification_rate = misclassification_count * 100 / Total
    accuracy = 100 - misclassification_rate
    print(f"Misclassification Count={misclassification_count}")
    print(f"Misclassification Rate={misclassification_rate}%")
    print(f"Accuracy={accuracy}%")

In [11]:

df = pd.read_csv('PlayTennis5.csv')
concept = str(list(df)[-1])
concept_list = set(df[concept])
Attr = {}

for a in df.columns[:-1]:
    Attr[a] = set(df[a])
    print(f"{a}: {Attr[a]}")
    
conceptProbs, AttrConcept, probability_list = train(df, Attr, concept_list,concept)
examples = pd.read_csv('PlayTennis5.csv')
test(examples.values, Attr, concept_list, conceptProbs, AttrConcept,probability_list)

Outlook: {'Rain', 'Sunny', 'Overcast'}
Temperature: {'Cool', 'Hot', 'Mild'}
Humidity: {'Normal', 'High'}
Wind: {'Strong', 'Weak'}
P(A) : {'No': 0.35714285714285715, 'Yes': 0.6428571428571429}

P(X/A) : {'Outlook': {'Rain': {'No': 0.4, 'Yes': 0.3333333333333333}, 'Sunny': {'No': 0.6, 'Yes': 0.2222222222222222}, 'Overcast': {'No': 0.0, 'Yes': 0.4444444444444444}}, 'Temperature': {'Cool': {'No': 0.2, 'Yes': 0.3333333333333333}, 'Hot': {'No': 0.4, 'Yes': 0.2222222222222222}, 'Mild': {'No': 0.4, 'Yes': 0.4444444444444444}}, 'Humidity': {'Normal': {'No': 0.2, 'Yes': 0.6666666666666666}, 'High': {'No': 0.8, 'Yes': 0.3333333333333333}}, 'Wind': {'Strong': {'No': 0.6, 'Yes': 0.3333333333333333}, 'Weak': {'No': 0.4, 'Yes': 0.6666666666666666}}}

P(X) : {'Outlook': {'Rain': 0.35714285714285715, 'Sunny': 0.35714285714285715, 'Overcast': 0.2857142857142857}, 'Temperature': {'Cool': 0.2857142857142857, 'Hot': 0.2857142857142857, 'Mild': 0.42857142857142855}, 'Humidity': {'Normal': 0.5, 'High': 0.5},