In [1]:
import numpy as np
import pandas as pd
import tracemalloc
import copy
import re
import csv
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [17]:
def getData(dataset_name):
    attribute_file_name = 'Data/'+dataset_name+".attribute"
    dataset_file_name = 'Data/'+dataset_name+".data"
    att = pd.read_csv(attribute_file_name,
                      delim_whitespace=True,
                     header = None)
    attributes = {rows[0]:rows[1] for _,rows in att.iterrows()}
    dataset = pd.read_csv(dataset_file_name,
                      names=list(attributes.keys()))
#     print(dataset.head(5))
    return attributes, dataset

In [25]:
class NaiveBayesClassifier:

  def __init__(self, X, Y, discreteThreshold = 10, eps = 1e-6):
    self.X = X
    self.Y = Y
    self.discreteThreshold = discreteThreshold
    self.eps = eps
    self.attributeInfo = self.get_attribute_info()
    print(self.attributeInfo)

  def normal_PDF(self, val, mu, sigma):
    sigma = sigma if sigma != 0 else self.eps 
    exponentTerm = (-1) * ( ( (val-mu) ** 2 ) / ( 2 * (sigma ** 2) ) )
    return (1/(np.sqrt(2*np.pi) * sigma)) * np.exp(exponentTerm)

  def get_attribute_info(self):
    attributeInfo = []
    distinctClasses, classCounts = np.unique(self.Y, return_counts=True)
    for i in range(self.X.shape[1]):
      column = self.X[:, i]
      distinctValues = np.unique(column)
      attributeType = 'discrete' if len(distinctValues) <= self.discreteThreshold else 'continuous'
      classWiseMean = {}
      classWiseStd = {}
      if attributeType == 'continuous':
        for cls in distinctClasses:
          classWiseMean[cls] = np.mean(column[self.Y == cls])
          classWiseStd[cls] = np.std(column[self.Y == cls])

      attributeInfo.append({
          'idx' : i,
          'type' : attributeType,
          'distinctValues' : distinctValues if attributeType == 'discrete' else None,
          'classWiseMean' : classWiseMean,
          'classWiseStd' : classWiseStd
      })
    
    

    return np.array(attributeInfo)

  def predict_one(self, x):
    distinctClasses, classCounts = np.unique(self.Y, return_counts=True)
    classProbs = classCounts/np.sum(classCounts)
    maxPosterior = -np.inf
    winClass = None
    for i in range(len(distinctClasses)):
      likelihood = 0
      for j in range(len(self.attributeInfo)):
        if self.attributeInfo[j]['type'] == 'discrete':
          column = self.X[:, j]
          classCorresporendingValues = column[self.Y == distinctClasses[i]]
          conditionalProb = (classCorresporendingValues == x[j]).sum()/classCounts[i]
          conditionalProb = conditionalProb if conditionalProb != 0 else self.eps
          likelihood += np.log(conditionalProb)
          # print("attribute : {} and class : {} and Prob : {}".format(j, distinctClasses[i], conditionalProb))

        else:
          conditionalProb = self.normal_PDF(x[j], self.attributeInfo[j]['classWiseMean'][distinctClasses[i]], self.attributeInfo[j]['classWiseStd'][distinctClasses[i]])
          conditionalProb = conditionalProb if conditionalProb != 0 else self.eps
          likelihood += np.log(conditionalProb)
          # print("attribute : {} and class : {} and Prob : {}".format(j, distinctClasses[i], conditionalProb))

      # print("Class {} -> Likelihood {}".format(distinctClasses[i], likelihood))
      posterior = likelihood + np.log(classProbs[i]) 
      # print("Class {} -> Posterior {}".format(distinctClasses[i], posterior))
      if posterior >= maxPosterior:
        maxPosterior = posterior
        winClass = distinctClasses[i]
    
    # print("\nWinclass: {}\n".format(winClass))
    return winClass 

  def predict(self, XTest):
    YPred = []
    for x in XTest:
      YPred.append(self.predict_one(x))
    return np.array(YPred)  

In [125]:
filePath = 'Data/iris.data'
df = pd.read_csv(filePath, sep=",", header=None)
dfX = df.iloc[:,:-1]
dfY = df.iloc[:,-1]
# print(dfX.head())
# print(dfY.head())

attributes, dataset = getData('iris')
columns = dataset.columns
print(columns)
training_data, testing_data = train_test_split(dataset, test_size = 0.2)

print("here--------------------------------------------")
print(dataset.head(3))
print(att)


X = dfX.to_numpy()
Y = np.squeeze(dfY.to_numpy())
# print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

naiveBayesClassifier = NaiveBayesClassifier(X, Y, discreteThreshold = 10)
YPred = naiveBayesClassifier.predict(X)


Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'], dtype='object')
here--------------------------------------------
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
{'weather': 'category', 'tempareture': 'category', 'humidity': 'category', 'wind': 'category', 'class': 'category'}
[{'idx': 0, 'type': 'continuous', 'distinctValues': None, 'classWiseMean': {'Iris-setosa': 5.006, 'Iris-versicolor': 5.936, 'Iris-virginica': 6.587999999999998}, 'classWiseStd': {'Iris-setosa': 0.3489469873777391, 'Iris-versicolor': 0.5109833656783751, 'Iris-virginica': 0.6294886813914926}}
 {'idx': 1, 'type': 'continuous', 'distinctValues': None, 'classWiseMean': {'Iris-setosa': 3.418, 'Iris-versicolor': 2.7700000000000005, 'Iris-virginica': 2.974}, 'classWis

In [163]:
def getInfo(dataset, attributes):
    Info = {}
    mean = {}
    std = {}
#     grouped = dataset.group_by(dataset['class'])
    for column in dataset.columns:
        if column == 'class' or attributes[column] == 'category': continue
        mean[column] = dataset.groupby('class')[column].mean().to_dict()
        std[column] = dataset.groupby('class')[column].std().to_dict()
    Info['mean'] = mean
    Info['std'] = std
    return Info
    

In [164]:
Info = getInfo(dataset, attributes)
print(Info)

{'mean': {'sepal_length': {'Iris-setosa': 5.005999999999999, 'Iris-versicolor': 5.936, 'Iris-virginica': 6.587999999999998}, 'sepal_width': {'Iris-setosa': 3.4180000000000006, 'Iris-versicolor': 2.7700000000000005, 'Iris-virginica': 2.9739999999999998}, 'petal_length': {'Iris-setosa': 1.464, 'Iris-versicolor': 4.26, 'Iris-virginica': 5.552}, 'petal_width': {'Iris-setosa': 0.2439999999999999, 'Iris-versicolor': 1.3259999999999998, 'Iris-virginica': 2.026}}, 'std': {'sepal_length': {'Iris-setosa': 0.3524896872134513, 'Iris-versicolor': 0.5161711470638635, 'Iris-virginica': 0.635879593274432}, 'sepal_width': {'Iris-setosa': 0.38102439795469095, 'Iris-versicolor': 0.3137983233784114, 'Iris-virginica': 0.3224966381726375}, 'petal_length': {'Iris-setosa': 0.1735111594364455, 'Iris-versicolor': 0.46991097723995784, 'Iris-virginica': 0.5518946956639835}, 'petal_width': {'Iris-setosa': 0.1072095030816784, 'Iris-versicolor': 0.19775268000454407, 'Iris-virginica': 0.27465005563666745}}}


In [165]:
  def predict_one(x):
    distinctClasses, classCounts = np.unique(self.Y, return_counts=True)
    classProbs = classCounts/np.sum(classCounts)
    maxPosterior = -np.inf
    winClass = None
    for i in range(len(distinctClasses)):
      likelihood = 0
      for j in range(len(self.attributeInfo)):
        if self.attributeInfo[j]['type'] == 'discrete':
          column = self.X[:, j]
          classCorresporendingValues = column[self.Y == distinctClasses[i]]
          conditionalProb = (classCorresporendingValues == x[j]).sum()/classCounts[i]
          conditionalProb = conditionalProb if conditionalProb != 0 else self.eps
          likelihood += np.log(conditionalProb)
          # print("attribute : {} and class : {} and Prob : {}".format(j, distinctClasses[i], conditionalProb))

        else:
          conditionalProb = self.normal_PDF(x[j], self.attributeInfo[j]['classWiseMean'][distinctClasses[i]], self.attributeInfo[j]['classWiseStd'][distinctClasses[i]])
          conditionalProb = conditionalProb if conditionalProb != 0 else self.eps
          likelihood += np.log(conditionalProb)
          # print("attribute : {} and class : {} and Prob : {}".format(j, distinctClasses[i], conditionalProb))

      # print("Class {} -> Likelihood {}".format(distinctClasses[i], likelihood))
      posterior = likelihood + np.log(classProbs[i]) 
      # print("Class {} -> Posterior {}".format(distinctClasses[i], posterior))
      if posterior >= maxPosterior:
        maxPosterior = posterior
        winClass = distinctClasses[i]
    
    # print("\nWinclass: {}\n".format(winClass))
    return winClass 

In [166]:
def getPrediction(dataset, Info, x): 
    distinct_class = dataset['class'].value_counts()
    classProb = distinct_class/ distinct_class.sum()
    grouped = dataset.groupby(['class'])
    Winner = None
    maxPosterior = -np.inf
    
#     print(classProb.index)
    for att_class in distinct_class.index:
        like_hood = 0
        OnlyClassData = grouped.get_group(att_class)
#         print(OnlyClassData)
        for column in dataset.columns:
            if column == 'class': continue
            if attributes[column] == 'category':
                print("--------------------category--------------------")
#                 print(x[column])
                grouped_column = (OnlyClassData.groupby(column).count()+1e-6)/len(OnlyClassData)
#                 print(np.log(grouped_column['class']))
#                 print("#######################")
#                 print(np.log(grouped_column['class'][x[column]]))
                like_hood += np.log(grouped_column['class'][x[column]])
            else:
                conditionalProbability = normal_PDF(x[column],Info['mean'][column][att_class],Info['std'][column][att_class])
                conditionalProbability += 1e-6
                like_hood += np.log(conditionalProbability)
        posterior = like_hood+np.log(classProb[att_class])
        if posterior > maxPosterior: 
            maxPosterior = posterior
            Winner = att_class
    print(Winner)

In [167]:
print(dataset.iloc[144])
getPrediction(dataset, Info, dataset.iloc[144])

sepal_length               6.7
sepal_width                3.3
petal_length               5.7
petal_width                2.5
class           Iris-virginica
Name: 144, dtype: object
Iris-virginica


In [150]:
def normal_PDF(val, mu, sigma):
    sigma = sigma if sigma != 0 else self.eps 
    exponentTerm = (-1) * ( ( (val-mu) ** 2 ) / ( 2 * (sigma ** 2) ) )
    return (1/(np.sqrt(2*np.pi) * sigma)) * np.exp(exponentTerm)
