In [285]:
import numpy as np
import re
import pprint
import math

In [286]:

def is_number(s):
    try:
        complex(s) # for int, long, float and complex
    except ValueError:
        return False

    return True


def numericalize_table(records):
    records = records.copy()
    str_to_num ={}
    number=0
    for i in range(len(records[0])):
        for ele in set(records[:,i]):
            if not is_number(ele):
                str_to_num[ele] = number
                number+=1

    for i,row in enumerate(records) :
        for j, col in enumerate(row) :
            if not is_number(col):
                records[i][j] = int(str_to_num[col])
    records=records.astype(float)
    
    return records,str_to_num


def select_row(col_idx, col_val,operator,records):
    selected_records=None
    if operator == "==":
        selected_records = records[records[:,col_idx] == col_val]
    elif operator == "<=":
        selected_records = records[records[:,col_idx] <= col_val]
    elif operator == "<":
        selected_records = records[records[:,col_idx] < col_val]
    elif operator == ">":
        selected_records = records[records[:,col_idx] > col_val]
    elif operator == ">=":
        selected_records = records[records[:,col_idx] >= col_val]
    return selected_records


def print_format_division(a,b,laplaceC=0):
    if laplaceC:
        return str(len(a)+1)+"/"+str(len(b)+laplaceC)+"="+str(round((len(a)+1)/(len(b)+laplaceC),3))
    else:
        return str(len(a))+"/"+str(len(b))+"="+str(round(len(a)/len(b),3))    
    

def gaussian_pdf(mu,var,x):
    return (1/math.sqrt(2*math.pi*var))*math.exp((-math.pow(x-mu,2))/(2*var))
    
def naive_bayes_classifier(cols,records,instance,isGaussian=False,isLaplace=False):
    records = records.copy()
    classes = set(records[:,-1])
    str_to_num = {}
    records, str_to_num =numericalize_table(records)

    table={}
    for i, cls in enumerate(classes) :    
        cls_key = cls
        cls = int(str_to_num[cls])
        class_records = select_row(len(cols)-1, cls, "==",records)
        table[cls_key] = {cls_key:print_format_division(class_records,records)}
        prob = 1
        prob*=len(class_records)/len(records)
        for key, val in instance.items():
            if is_number(val):
                if not isGaussian:
                    s_records = select_row(cols.index(key), val, "<=",class_records)
                    p=len(s_records)/len(class_records)
                    prob*=p
                    table[cls_key][str(key)+"<="+str(val)] = print_format_division(s_records,class_records)
                else :
                    p=gaussian_pdf(np.mean(class_records[:,cols.index(key)]), np.var(class_records[:,cols.index(key)],ddof=1), val)
                    prob*=p
                    table[cls_key]["(G)"+str(key)+"=="+str(val)] = str(round(p,3))
            else :
                if not isLaplace:
                    s_records = select_row(cols.index(key), str_to_num[val], "==",class_records)
                    p=len(s_records)/len(class_records)
                    prob*=p
                    table[cls_key][str(key)+"="+str(val)] = print_format_division(s_records,class_records)
                else :
                    s_records = select_row(cols.index(key), str_to_num[val], "==",class_records)
                    p=(len(s_records))+1/(len(class_records)+len(classes))
                    prob*=p
                    table[cls_key][str(key)+"="+str(val)] = print_format_division(s_records,class_records,len(classes))
                    
                          
        print("P("+cls_key+"|X)="+str(round(prob,10)))
    pprint.pprint(table)

            

In [287]:

Cols = ["Outlook","Temp","Humidity","Windy", "Play"]
Records = np.array([
    ["sunny",85,85,"false","No"],
    ["sunny",80,90,"true","No"],
    ["overcast",83,78,"false","Yes"],
    ["rain",70,96,"false","Yes"],
    ["rain",68,80,"false","Yes"],
    ["rain",65,70,"true","No"],
    ["overcast",64,65,"true","Yes"],
    ["sunny",72,95,"false","No"],
    ["sunny",69,70,"false","Yes"],
    ["rain",75,80,"false","Yes"],
    ["sunny",75,70,"true","Yes"],
    ["overcast",72,90,"true","Yes"],
    ["overcast",81,75,"false","Yes"],
    ["rain",71,80,"true","No"],
])


X = {"Outlook":"sunny", "Temp":75,"Humidity":75,"Windy":"true"}
naive_bayes_classifier(Cols,Records,X)



X = {"Outlook":"rain", "Temp":87,"Humidity":90,"Windy":"false"}
naive_bayes_classifier(Cols,Records,X,True)

X = {"Outlook":"rain", "Temp":87,"Humidity":90,"Windy":"false"}
naive_bayes_classifier(Cols,Records,X,True,True)

P(Yes|X)=0.0164609053
P(No|X)=0.0154285714
{'No': {'Humidity<=75': '1/5=0.2',
        'No': '5/14=0.357',
        'Outlook=sunny': '3/5=0.6',
        'Temp<=75': '3/5=0.6',
        'Windy=true': '3/5=0.6'},
 'Yes': {'Humidity<=75': '4/9=0.444',
         'Outlook=sunny': '2/9=0.222',
         'Temp<=75': '7/9=0.778',
         'Windy=true': '3/9=0.333',
         'Yes': '9/14=0.643'}}
P(Yes|X)=1.3917e-05
P(No|X)=2.87094e-05
{'No': {'(G)Humidity==90': '0.034',
        '(G)Temp==87': '0.015',
        'No': '5/14=0.357',
        'Outlook=rain': '2/5=0.4',
        'Windy=false': '2/5=0.4'},
 'Yes': {'(G)Humidity==90': '0.02',
         '(G)Temp==87': '0.005',
         'Outlook=rain': '3/9=0.333',
         'Windy=false': '6/9=0.667',
         'Yes': '9/14=0.643'}}
P(Yes|X)=0.0011790355
P(No|X)=0.0008239301
{'No': {'(G)Humidity==90': '0.034',
        '(G)Temp==87': '0.015',
        'No': '5/14=0.357',
        'Outlook=rain': '3/7=0.429',
        'Windy=false': '3/7=0.429'},
 'Yes': {'(G)Humidity