In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [2]:
class NaiveBayes:
    """Naive Bayes classifier that treats every feature as categorical.

    ``fit`` estimates, for each class, the relative frequency of every
    distinct value found in each column of the training data. ``predict``
    returns, for each row, the class with the largest log-posterior
    (log prior plus the sum of the per-feature log conditionals).

    Parameters
    ----------
    epsilon : float, default 1e-6
        Constant added to each conditional probability before the logarithm
        is taken, so that zero frequencies do not produce ``log(0)``. A value
        that was never seen during training contributes ``log(epsilon)``.

    Attributes
    ----------
    prior : dict or None
        ``prior[label]`` is the fraction of training rows with that label.
        ``None`` until ``fit`` has been called.
    conditional_prob : dict or None
        ``conditional_prob[label][feature_position][value]`` is the fraction
        of rows of class ``label`` whose feature equals ``value``. Features
        are keyed by column position, not by column name.
    """

    def __init__(self, epsilon=1e-6):
        self.prior = None
        self.conditional_prob = None
        self.epsilon = epsilon

    def fit(self, X, y):
        """Estimate class priors and per-feature conditional frequencies.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like
            Training features, one row per sample.
        y : array-like of length ``len(X)``
            Class label of each row.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of rows.
        """
        X = pd.DataFrame(X)
        # A plain array makes the row masks positional, so the result does not
        # depend on X and y sharing the same pandas index.
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                "X and y must have the same number of rows "
                f"(got {n_samples} and {y.shape[0]})"
            )

        labels, label_counts = np.unique(y, return_counts=True)
        self.prior = {
            label: count / n_samples for label, count in zip(labels, label_counts)
        }

        # The set of values of a column does not depend on the class, so it is
        # collected once instead of once per label.
        vocabularies = [X.iloc[:, j].unique() for j in range(n_features)]

        self.conditional_prob = {}
        for label in labels:
            in_class = y == label
            class_size = in_class.sum()
            X_label = X[in_class]
            per_feature = {}
            for j, vocabulary in enumerate(vocabularies):
                # One counting pass per column instead of one scan per value.
                counts = X_label.iloc[:, j].value_counts().to_dict()
                # Values absent from this class keep an explicit 0 entry.
                per_feature[j] = {
                    value: counts.get(value, 0) / class_size for value in vocabulary
                }
            self.conditional_prob[label] = per_feature

    def predict(self, X):
        """Return the most probable class for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like
            Rows to classify; columns must be in the same order as in ``fit``.

        Returns
        -------
        numpy.ndarray
            One prediction per row. The array has float dtype when the class
            labels are numeric and object dtype otherwise (e.g. string labels).

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if self.prior is None or self.conditional_prob is None:
            raise RuntimeError("NaiveBayes.predict() called before fit()")

        X = pd.DataFrame(X)
        n_samples, n_features = X.shape
        epsilon = self.epsilon
        log_unseen = np.log(epsilon)

        # Take every logarithm once up front rather than once per sample.
        log_prior = {label: np.log(p) for label, p in self.prior.items()}
        log_conditional = {
            label: [
                {
                    value: np.log(p + epsilon)
                    for value, p in self.conditional_prob[label][j].items()
                }
                for j in range(n_features)
            ]
            for label in self.prior
        }

        predictions = []
        for row in X.itertuples(index=False, name=None):
            posteriors = {}
            for label, tables in log_conditional.items():
                likelihood = 0
                for value, table in zip(row, tables):
                    # Values never seen in training fall back to log(epsilon).
                    likelihood += table.get(value, log_unseen)
                posteriors[label] = log_prior[label] + likelihood
            # Ties resolve to the first label in self.prior's order.
            predictions.append(max(posteriors, key=posteriors.get))

        labels_are_numeric = np.asarray(list(self.prior)).dtype.kind in "biuf"
        if labels_are_numeric:
            return np.asarray(predictions, dtype=float)
        # Non-numeric labels cannot be stored in a float array.
        return np.asarray(predictions, dtype=object)

In [3]:

# Load the data set; "?" in the CSV marks a missing value.
# Alternative Kaggle location used previously: "/kaggle/input/new-datacsv/New.csv"
df = pd.read_csv("./adult.csv", na_values="?")

# Encode the target as 0 / 1. The result is assigned back to the column:
# calling replace(..., inplace=True) on df['income'] acts on a temporary
# object under pandas Copy-on-Write and would leave df unchanged.
income_codes = {'<=50K': 0, '>50K': 1}
df['income'] = df['income'].map(income_codes)
if df['income'].isna().any():
    raise ValueError(
        "column 'income' contains values other than '<=50K' and '>50K'"
    )
df['income'] = df['income'].astype(int)

# Fill missing categorical values with fixed categories (chosen by the author
# as the most frequent value of each column). Assigned back for the same
# Copy-on-Write reason as above.
df = df.fillna({
    "workclass": "Private",
    "occupation": "Prof-specialty",
    "native-country": "United-States",
})

# Size of the training split: the first 67% of the (shuffled) rows.
numOfDataPoints = df.shape[0]
trainingDataPoints = int(67 / 100 * numOfDataPoints)

# Number of random train/test splits to average over, and the running totals
# of each metric across those splits.
iters = 10
totalacc = 0
totalprec = 0
totalrec = 0
totalf1 = 0


In [4]:
# Base seed for the shuffles: run k uses SPLIT_SEED + k, so every execution of
# this cell evaluates the same sequence of train/test splits.
SPLIT_SEED = 42


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Reset the running totals here so that re-running this cell does not add a
# second batch of results on top of the first.
totalacc = 0
totalprec = 0
totalrec = 0
totalf1 = 0

for run in range(iters):
    # Shuffle into a new frame; df itself stays untouched so that each run
    # (and each re-execution of the cell) starts from the same data.
    shuffled = df.sample(frac=1, random_state=SPLIT_SEED + run).reset_index(drop=True)
    TrainingDataSet = shuffled[:trainingDataPoints]
    TestingDataSet = shuffled[trainingDataPoints:]
    TrainingX = TrainingDataSet.drop('income', axis=1)
    TrainingY = TrainingDataSet['income']
    TestingX = TestingDataSet.drop('income', axis=1)
    TestingY = TestingDataSet['income']

    NB = NaiveBayes()
    NB.fit(TrainingX, TrainingY)
    y_pred = NB.predict(TestingX)

    # Confusion-matrix counts, with class 1 as the positive class.
    predicted_positive = np.asarray(y_pred) == 1
    actual_positive = TestingY.to_numpy() == 1
    tpositive = int(np.sum(predicted_positive & actual_positive))
    fpositive = int(np.sum(predicted_positive & ~actual_positive))
    fnegative = int(np.sum(~predicted_positive & actual_positive))
    tnegative = int(np.sum(~predicted_positive & ~actual_positive))

    # Each ratio falls back to 0.0 when its denominator is zero (for example
    # when no sample is predicted positive) instead of raising.
    accuracy = _safe_ratio(
        tpositive + tnegative,
        tpositive + tnegative + fpositive + fnegative,
    )
    precision = _safe_ratio(tpositive, tpositive + fpositive)
    recall = _safe_ratio(tpositive, tpositive + fnegative)
    f1score = _safe_ratio(2 * precision * recall, precision + recall)

    totalacc += accuracy
    totalprec += precision
    totalrec += recall
    totalf1 += f1score

# Report each metric averaged over all runs.
print("Accuracy = ", totalacc / iters)
print("Precision = ", totalprec / iters)
print("Recall = ", totalrec / iters)
print("f1score = ", totalf1 / iters)
    

Accuracy =  0.8295135872937089
Precision =  0.617615118503917
Recall =  0.74728084515444
f1score =  0.6762687157904435
