Naive Bayes - Play Tennis Dataset 

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('/kaggle/input/play-tennis/playtennis.csv')
data.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Answer
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes


In [3]:
y = list(data['Answer'].values)
X = data.iloc[:,:-1].values
print(f'Target Values: {y}')
print(f'Features: \n{X}')


Target Values: ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
Features: 
[['sunny' 'hot' 'high' 'weak']
 ['sunny' 'hot' 'high' 'strong']
 ['overcast' 'hot' 'high' 'weak']
 ['rain' 'mild' 'high' 'weak']
 ['rain' 'cool' 'normal' 'weak']
 ['rain' 'cool' 'normal' 'strong']
 ['overcast' 'cool' 'normal' 'strong']
 ['sunny' 'mild' 'high' 'weak']
 ['sunny' 'cool' 'normal' 'weak']
 ['rain' 'mild' 'normal' 'weak']
 ['sunny' 'mild' 'normal' 'strong']
 ['overcast' 'mild' 'high' 'strong']
 ['overcast' 'hot' 'normal' 'weak']
 ['rain' 'mild' 'high' 'strong']]


In [4]:
y_train = y[:8]
y_val = y[8:]
X_train = X[:8]
X_val = X[8:]
print(f"Number of instances in training set: {len(X_train)}")
print(f"Number of instances in testing set: {len(X_val)}")

Number of instances in training set: 8
Number of instances in testing set: 6


In [5]:
class NaiveBayesClassifier:   
    def __init__(self, X, y):
        self.X, self.y = X, y 
        self.N = len(self.X)
        self.dim = len(self.X[0]) 
        self.attrs = [[] for _ in range(self.dim)] 
        self.output_dom = {} 
        self.data = []     
        for i in range(len(self.X)):
            for j in range(self.dim):
                if not self.X[i][j] in self.attrs[j]:
                    self.attrs[j].append(self.X[i][j])         
            if not self.y[i] in self.output_dom.keys():
                self.output_dom[self.y[i]] = 1
            else:
                self.output_dom[self.y[i]] += 1
            self.data.append([self.X[i], self.y[i]])
    def classify(self, entry):
        solve = None 
        max_arg = -1
        for y in self.output_dom.keys():
            prob = self.output_dom[y]/self.N 
            for i in range(self.dim):
                cases = [x for x in self.data if x[0][i] == entry[i] and x[1] == y] 
                n = len(cases)
                prob *= n/self.N     
            if prob > max_arg:
                max_arg = prob
                solve = y
        return solve

In [6]:
nbc = NaiveBayesClassifier(X_train, y_train)
total_cases = len(y_val)
correct = 0
wrong = 0
predictions = []
for i in range(total_cases):
    predict = nbc.classify(X_val[i])
    predictions.append(predict)
    if y_val[i] == predict:
        correct += 1
    else:
        wrong += 1
print('Predicted values:', predictions)
print('Actual values:', y_val)
print()
print('Total number of testing instances in the dataset:', total_cases)
print('Number of correct predictions:', correct)
print('Number of wrong predictions:', wrong)
print()
print('Accuracy of Bayes Classifier:', correct/total_cases)

Predicted values: ['no', 'yes', 'no', 'yes', 'yes', 'no']
Actual values: ['yes', 'yes', 'yes', 'yes', 'yes', 'no']

Total number of testing instances in the dataset: 6
Number of correct predictions: 4
Number of wrong predictions: 2

Accuracy of Bayes Classifier: 0.6666666666666666


Naive Bayes - Pima Indian Diabetes Dataset


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")
feature_col_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
predicted_class_names = ['Outcome']
X = df[feature_col_names].values 
y = df[predicted_class_names].values
print(df.head)
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.33)
print ('\nThe total number of Training Data:',ytrain.shape)
print ('The total number of Test Data:',ytest.shape)

<bound method NDFrame.head of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50 

In [8]:
clf = GaussianNB().fit(xtrain,ytrain.ravel())
predicted = clf.predict(xtest)
predictTestData= clf.predict([[6,148,72,35,0,33.6,0.627,50]])
print('\nConfusion matrix')
print(metrics.confusion_matrix(ytest,predicted))
print('\nAccuracy of the classifier:',metrics.accuracy_score(ytest,predicted))
print('The value of Precision:', metrics.precision_score(ytest,predicted))
print('The value of Recall:', metrics.recall_score(ytest,predicted))
print("Predicted Value for individual Test Data:", predictTestData)


Confusion matrix
[[130  35]
 [ 35  54]]

Accuracy of the classifier: 0.7244094488188977
The value of Precision: 0.6067415730337079
The value of Recall: 0.6067415730337079
Predicted Value for individual Test Data: [1]
