In [10]:
import pandas as pd 
import numpy as np 

class NB():
    """Naive Bayes classifier for categorical features.

    Every distinct value of every column of ``X`` is treated as its own
    category. ``fit`` estimates P(feature == value | class) from relative
    frequencies (no smoothing, so an unseen value/class pair has likelihood 0)
    and ``predict`` picks, for each row, the class with the largest
    ``prior * product of likelihoods`` score.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features, one column per feature.
    y : pandas.Series
        Training labels. Rows of ``y`` are selected with boolean masks built
        from ``X``, so both must share the same index.

    Attributes
    ----------
    features : list
        Column names of ``X`` in their original order.
    classes : list
        Distinct labels of ``y`` in order of first appearance.
    class_counts : dict
        Number of training rows per class, keyed by class label.
    class_0count, class_1count : int
        Row counts of ``classes[0]`` and ``classes[1]``; kept for callers that
        relied on the two-class attributes. 0 when that class does not exist.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.features = list(self.X.columns)

        # Distinct class labels, in order of first appearance in y.
        self.classes = list(self.y.unique())

        # Number of training rows belonging to each class.
        self.class_counts = {
            cls: int((self.y == cls).sum()) for cls in self.classes
        }

        # Legacy two-class attributes (the class is no longer limited to two).
        ordered_counts = [self.class_counts[cls] for cls in self.classes]
        self.class_0count = ordered_counts[0] if len(ordered_counts) > 0 else 0
        self.class_1count = ordered_counts[1] if len(ordered_counts) > 1 else 0

    def fit(self):
        """Estimate the conditional likelihood of every feature value.

        Returns
        -------
        dict
            ``{feature: {value: {class: P(feature == value | class)}}}`` with
            features, values and classes in order of first appearance.
        """
        likelihoods = {}
        for feature in self.features:
            column = self.X[feature]
            per_value = {}
            for value in column.unique():
                # Class frequencies among rows where the feature equals `value`.
                # reindex() inserts an explicit 0 for classes that never occur
                # with this value; indexing the raw value_counts() would raise
                # KeyError for them.
                counts = (
                    self.y[column == value]
                    .value_counts()
                    .reindex(self.classes, fill_value=0)
                )
                per_value[value] = {
                    cls: counts.loc[cls] / self.class_counts[cls]
                    for cls in self.classes
                }
            likelihoods[feature] = per_value
        return likelihoods

    def predict(self, X=None):
        """Predict a class label for every row.

        Parameters
        ----------
        X : pandas.DataFrame, optional
            Rows to classify. Must contain every training feature column.
            Defaults to the training data passed to the constructor.

        Returns
        -------
        pandas.DataFrame
            Single column ``'predictions'`` with a default integer index, one
            row per input row, in input order.

        Raises
        ------
        KeyError
            If a row contains a feature value that was not present in the
            training data.
        """
        likelihoods = self.fit()
        data = self.X if X is None else X[self.features]

        # Class priors do not depend on the row, so compute them once.
        n_samples = len(self.y)
        priors = {cls: self.class_counts[cls] / n_samples for cls in self.classes}

        preds = []
        for row in data.itertuples(index=False, name=None):
            # Look up the per-class likelihoods of each observed feature value.
            row_tables = []
            for feature, value in zip(self.features, row):
                by_value = likelihoods[feature]
                if value not in by_value:
                    raise KeyError(
                        f"value {value!r} of feature {feature!r} was not seen "
                        "in the training data"
                    )
                row_tables.append(by_value[value])

            best_class = None
            best_score = None
            for cls in self.classes:
                score = 1
                for table in row_tables:
                    score = score * table[cls]
                score = score * priors[cls]
                # `>=` sends ties to the later class, which matches the
                # original two-class rule (first class only if strictly larger).
                if best_score is None or score >= best_score:
                    best_class, best_score = cls, score
            preds.append(best_class)

        return pd.DataFrame(preds, columns=['predictions'])


In [11]:
import os

import pandas as pd 
# NOTE(review): numpy is aliased as `nb` here while the first cell uses `np`;
# the alias is not used in the cells visible in this notebook.
import numpy as nb 

# Path to heart.csv. Defaults to the original location; set the HEART_CSV
# environment variable to run the notebook on another machine.
heart_csv_path = os.environ.get(
    'HEART_CSV', '/home/thomaskutty/Gitrepo/Statistics/data/heart.csv'
)
df = pd.read_csv(heart_csv_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [12]:
# Print column dtypes and non-null counts to check the frame for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [13]:
# Feature matrix: remove the target plus five numeric measurement columns
# (presumably because NB treats every distinct value as a category - confirm).
X = df.drop(
    columns=['age', 'chol', 'trestbps', 'thalach', 'oldpeak', 'target'],
)
X.head()

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,ca,thal
0,1,3,1,0,0,0,0,1
1,1,2,0,1,0,0,0,2
2,0,1,0,0,0,2,0,2
3,1,1,0,1,0,2,0,2
4,0,0,0,1,1,2,0,2


In [14]:
# Label vector: the `target` column of the loaded frame.
y = df['target']

In [15]:
# Build the classifier; the constructor records the data, the class labels and
# the per-class row counts.
nb_model = NB(X,y)

In [17]:
# Display the nested {feature: {value: {class: ratio}}} table returned by fit().
# The return value is only shown here; predict() calls fit() again itself.
nb_model.fit() 

{'sex': {1: {1: 0.5636363636363636, 0: 0.8260869565217391},
  0: {1: 0.43636363636363634, 0: 0.17391304347826086}},
 'cp': {3: {1: 0.09696969696969697, 0: 0.050724637681159424},
  2: {1: 0.41818181818181815, 0: 0.13043478260869565},
  1: {1: 0.24848484848484848, 0: 0.06521739130434782},
  0: {1: 0.23636363636363636, 0: 0.7536231884057971}},
 'fbs': {1: {1: 0.1393939393939394, 0: 0.15942028985507245},
  0: {1: 0.8606060606060606, 0: 0.8405797101449275}},
 'restecg': {0: {1: 0.4121212121212121, 0: 0.572463768115942},
  1: {1: 0.5818181818181818, 0: 0.4057971014492754},
  2: {1: 0.006060606060606061, 0: 0.021739130434782608}},
 'exang': {0: {1: 0.8606060606060606, 0: 0.4492753623188406},
  1: {1: 0.1393939393939394, 0: 0.5507246376811594}},
 'slope': {0: {1: 0.05454545454545454, 0: 0.08695652173913043},
  2: {1: 0.6484848484848484, 0: 0.2536231884057971},
  1: {1: 0.296969696969697, 0: 0.6594202898550725}},
 'ca': {0: {1: 0.7878787878787878, 0: 0.32608695652173914},
  2: {1: 0.04242424242

In [20]:
# Classify the rows the model was built from; the result is a one-column
# DataFrame named 'predictions'.
predictions = nb_model.predict() 

In [24]:
# Put actual labels and predictions side by side. predict() returns its rows in
# the same order as y but with a fresh 0..n-1 index, so the predictions are
# attached by position (to_numpy) rather than by index label; label alignment
# would produce NaN or shifted rows whenever y has a non-default index.
result = pd.DataFrame({
    'actual': y,
    'predictions': predictions['predictions'].to_numpy(),
})
result

Unnamed: 0,actual,predictions
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
298,0,0
299,0,1
300,0,0
301,0,0


In [28]:
# Fraction of rows where the predicted label equals the actual label.
is_correct = result['actual'].eq(result['predictions'])
accuracy = int(is_correct.sum()) / len(result)
accuracy

0.834983498349835