In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from scipy.stats import norm

In [2]:
# Synthetic dataset: 10,000 two-feature samples generated around 2 centers.
# The fixed random_state makes the draw repeatable.
X, y = make_blobs(
    n_samples=10000,
    n_features=2,
    centers=2,
    random_state=1,
)

# Shapes first, then the first five feature rows and the first five labels.
print(X.shape, y.shape)
for preview in (X[:5], y[:5]):
    print(preview)

(10000, 2) (10000,)
[[-3.08389358  5.70067218]
 [-8.80258525 -5.07389013]
 [-1.68452735  5.22511143]
 [-1.44683075  4.51471432]
 [-3.36067232  3.22371079]]
[0 1 0 0 0]


In [21]:
"""
1 - split data
2 - each column is a univariate gaussian - we need mean and std per column
3 - we need gaussian density function
4 - probability function - prior * likelihood(s)
5 - calculate prior 
6 - predict function
7 - fit function
"""

'\n1 - split data\n2 - each column is a univariate gaussian - we need mean and std per column\n3 - we need gaussian density function\n4 - probability function - prior * likelihood(s)\n5 - calculate prior \n6 - predict function\n7 - fit function\n'

In [3]:
class GaussianNaiveBayes:
    """Two-class (labels 0 and 1) Gaussian Naive Bayes classifier.

    Every feature column is modelled, per class, as an independent univariate
    normal distribution (``scipy.stats.norm``). The score of a class for a
    sample is ``prior * product(pdf_j(x_j))`` over all feature columns, and the
    predicted class is the one with the larger score.

    Attributes set by ``fit``:
        X_train, X_test, y_train, y_test: the 70/30 split of the stored data.
        X0_train, X1_train: training rows of class 0 / class 1.
        X0_prior, X1_prior: class frequencies in the training split.
        class0_dists, class1_dists: one frozen normal per feature column.
        X00_dist, X10_dist: class 0, feature 0 / feature 1 (legacy names,
            ``X<feature><class>_dist``); aliases into ``class0_dists``.
        X01_dist, X11_dist: class 1, feature 0 / feature 1 (legacy names).
    """

    # Lower bound for a fitted standard deviation. A column that is constant
    # within a class has std == 0, for which the normal pdf evaluates to NaN.
    MIN_STD = 1e-9

    def __init__(self, X, y):
        """Store the feature matrix ``X`` and label vector ``y``; nothing is fitted yet."""
        self.X = X
        self.y = y

    def dataSplit(self):
        """Return ``(X_train, X_test, y_train, y_test)`` using a shuffled 70/30 split.

        The split is seeded (``random_state=24``), so it is repeatable.
        """
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=24, shuffle=True)
        return X_train, X_test, y_train, y_test

    def fitDistribution(self, x):
        """Fit a univariate normal to the 1-D sample ``x`` and return it frozen.

        The standard deviation is the population estimate (``np.std`` default)
        and is floored at ``MIN_STD`` so a constant column still yields a
        usable (very narrow) density instead of NaN.
        """
        mean = np.mean(x)
        std = max(np.std(x), self.MIN_STD)
        return norm(mean, std)

    def _jointProbability(self, x, prior, dists):
        """Return ``prior * product(dists[j].pdf(x[j]))`` over every distribution in ``dists``."""
        probability = prior
        # Multiply left to right so the two-feature case reproduces
        # ``prior * dist1.pdf(x[0]) * dist2.pdf(x[1])`` bit for bit.
        for j, dist in enumerate(dists):
            probability = probability * dist.pdf(x[j])
        return probability

    def probabilityFunction(self, x, prior, dist1, dist2, *extra_dists):
        """Unnormalised class score: the prior times the per-feature likelihoods.

        Args:
            x: sample; ``x[j]`` is evaluated under the j-th distribution.
            prior: prior probability of the class.
            dist1, dist2: distributions for features 0 and 1.
            *extra_dists: optional distributions for features 2, 3, ...

        Returns:
            ``prior * dist1.pdf(x[0]) * dist2.pdf(x[1]) * ...``. This is a
            density-based score, not a probability normalised across classes.
        """
        return self._jointProbability(x, prior, (dist1, dist2) + extra_dists)

    def fit(self):
        """Split the data, then estimate class priors and per-feature normals.

        Raises:
            ValueError: if class 0 or class 1 has no rows in the training
                split (its distributions could not be estimated).
        """
        self.X_train, self.X_test, self.y_train, self.y_test = self.dataSplit()

        self.X0_train = self.X_train[self.y_train == 0]
        self.X1_train = self.X_train[self.y_train == 1]
        if len(self.X0_train) == 0 or len(self.X1_train) == 0:
            raise ValueError("the training split must contain samples of both class 0 and class 1")

        self.X0_prior = len(self.X0_train) / len(self.X_train)
        self.X1_prior = len(self.X1_train) / len(self.X_train)

        # One distribution per feature column and class, for any column count.
        n_features = self.X_train.shape[1]
        self.class0_dists = [self.fitDistribution(self.X0_train[:, j]) for j in range(n_features)]
        self.class1_dists = [self.fitDistribution(self.X1_train[:, j]) for j in range(n_features)]

        # Legacy attribute names, kept for code that reads them directly.
        if n_features > 0:
            self.X00_dist = self.class0_dists[0]
            self.X01_dist = self.class1_dists[0]
        if n_features > 1:
            self.X10_dist = self.class0_dists[1]
            self.X11_dist = self.class1_dists[1]

    def predict(self):
        """Print both class scores, the predicted class and the true label for each test row.

        The printed values are the unnormalised scores multiplied by 100.
        ``fit`` must have been called first.
        """
        for sample, target in zip(self.X_test, self.y_test):
            py0 = self._jointProbability(sample, self.X0_prior, self.class0_dists)
            py1 = self._jointProbability(sample, self.X1_prior, self.class1_dists)

            print("P(y=0|%s) = %.3f" % (sample, py0 * 100))
            print("P(y=1|%s) = %.3f" % (sample, py1 * 100))
            print("Model predicted class {} and the true label was {} \n".format(np.argmax([py0, py1]), target))



        

In [4]:
# Wrap the blob data in the classifier. fit() performs the 70/30 train/test
# split itself and then estimates the class priors and per-feature Gaussians.
clf = GaussianNaiveBayes(X, y)
clf.fit()

In [5]:
# Print, for every held-out test sample, both class scores, the predicted
# class and the true label.
clf.predict()

P(y=0|[-3.16550295  5.03811611]) = 2.097
P(y=1|[-3.16550295  5.03811611]) = 0.000
Model predicted class 0 and the true label was 0 

P(y=0|[-2.01222037  5.01766574]) = 6.160
P(y=1|[-2.01222037  5.01766574]) = 0.000
Model predicted class 0 and the true label was 0 

P(y=0|[-2.21390906  4.91317334]) = 5.975
P(y=1|[-2.21390906  4.91317334]) = 0.000
Model predicted class 0 and the true label was 0 

P(y=0|[-3.34125908  3.73665452]) = 1.553
P(y=1|[-3.34125908  3.73665452]) = 0.000
Model predicted class 0 and the true label was 0 

P(y=0|[-12.35996648  -3.98476476]) = 0.000
P(y=1|[-12.35996648  -3.98476476]) = 0.500
Model predicted class 1 and the true label was 1 

P(y=0|[-2.56835077  3.34833213]) = 2.999
P(y=1|[-2.56835077  3.34833213]) = 0.000
Model predicted class 0 and the true label was 0 

P(y=0|[-10.10551412  -5.25842152]) = 0.000
P(y=1|[-10.10551412  -5.25842152]) = 3.284
Model predicted class 1 and the true label was 1 

P(y=0|[-0.09714038  3.53568832]) = 1.569
P(y=1|[-0.09714038  

#### Discrete Naive Bayes Classifier

In [6]:
"""
Things to implement
- Split data
- likelihood(s)
- fit function
- predict function
"""

'\nThings to implement\n- Split data\n- likelihood(s)\n- fit function\n- predict function\n'

In [7]:
class DiscreteNaiveBayes:
    """Naive Bayes classifier for binary (0/1) features and two classes (0 and 1).

    For each class and each feature column the relative frequency of the
    values 1 and 0 is estimated from the training split. A sample is scored
    per class as ``prior * product(likelihood of each feature value)``.

    Note on smoothing: the smoothed estimate ``k / (n + 2k)`` is substituted
    only when the raw frequency is exactly zero; non-zero frequencies are left
    unsmoothed, so the two value-likelihoods of one feature need not sum to 1.
    """

    def __init__(self, X, y):
        """Store the feature matrix ``X`` and label vector ``y``; nothing is fitted yet."""
        self.X = X
        self.y = y

    def datasplit(self, X, y):
        """
        Split X and y into training and test sets (70% train / 30% test),
        with a fixed random_state so the split is repeatable.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
        return X_train, X_test, y_train, y_test

    def likelihood(self, x, target, k = 5):
        """
        Return the fraction of entries of the vector ``x`` equal to ``target``.

        If that fraction is zero (including when ``x`` is empty), the smoothed
        value ``k / (len(x) + 2k)`` is returned instead so an unseen value does
        not zero out the whole product.
        """
        n = len(x)
        count = int(np.count_nonzero(np.asarray(x) == target))
        # An empty column has no observations: treat the raw estimate as zero
        # rather than dividing by zero.
        likelihood = count / n if n else 0.0

        # If laplace smoothing is added to all variables then remove if statement
        if likelihood == 0:
            denominator = n + k * 2
            likelihood = k / denominator if denominator else 0.0
        return likelihood

    def _featureLikelihoods(self, X_class, value, k):
        """Return ``likelihood(column, value, k)`` for every column of ``X_class``."""
        return [self.likelihood(X_class[:, col], value, k) for col in range(X_class.shape[1])]

    def fit(self, smoothing_para = 5):
        """
        - splits the dataset
        - estimates the class priors from the training split
        - finds, per class and per feature, the likelihood of the values 1 and 0
        """
        # Splitting the data
        self.X_train, self.X_test, self.y_train, self.y_test = self.datasplit(self.X, self.y)

        # Separating the dataset by class
        self.X0_train = self.X_train[self.y_train == 0]
        self.X1_train = self.X_train[self.y_train == 1]

        # Finding the priors for each class
        self.X0_prior = len(self.X0_train) / len(self.X_train)
        self.X1_prior = len(self.X1_train) / len(self.X_train)

        # likelihood_<class>_<value>[i]: likelihood that feature i takes <value> in <class>
        self.likelihood_0_1 = self._featureLikelihoods(self.X0_train, 1, smoothing_para)
        self.likelihood_0_0 = self._featureLikelihoods(self.X0_train, 0, smoothing_para)

        self.likelihood_1_1 = self._featureLikelihoods(self.X1_train, 1, smoothing_para)
        self.likelihood_1_0 = self._featureLikelihoods(self.X1_train, 0, smoothing_para)

    def _classScores(self, x):
        """Return ``(score_class_0, score_class_1)`` for one sample ``x``.

        Any feature value other than 1 is scored with the value-0 likelihood.
        """
        probablity_class_0 = self.X0_prior
        probablity_class_1 = self.X1_prior
        for i in range(len(x)):
            if x[i] == 1:
                probablity_class_0 = probablity_class_0 * self.likelihood_0_1[i]
                probablity_class_1 = probablity_class_1 * self.likelihood_1_1[i]
            else:
                probablity_class_0 = probablity_class_0 * self.likelihood_0_0[i]
                probablity_class_1 = probablity_class_1 * self.likelihood_1_0[i]
        return probablity_class_0, probablity_class_1

    def _report(self, x, label):
        """Print both class scores (times 100), the predicted class and ``label`` for ``x``."""
        probablity_class_0, probablity_class_1 = self._classScores(x)
        print("P(y=0|%s) = %.3f" % (x, probablity_class_0 * 100))
        print("P(y=1|%s) = %.3f" % (x, probablity_class_1 * 100))
        print("Model predicted class {} and the true label was {} \n".format(np.argmax([probablity_class_0, probablity_class_1]), label))

    def predict(self):
        """
        Outputs predictions for each row of the test split (X_test), together
        with the matching true label from y_test. fit() must be called first.
        """
        # Pair rows and labels by position instead of keeping a manual counter.
        for x, label in zip(self.X_test, self.y_test):
            self._report(x, label)

    def specialpredict(self, x, y):
        """
        Outputs predictions for the given input x; y is the true label to
        print next to the prediction. fit() must be called first.
        """
        self._report(x, y)
    

#### Generating Discrete Dataset

In [8]:
# Seed the legacy global NumPy RNG so the draws below are repeatable.
np.random.seed(42)

# Dataset dimensions: 1000 samples, 2 binary features.
n_rows, n_cols = 1000, 2

# Draw order matters for reproducibility: the feature matrix first, then the labels.
# Both are uniform draws from {0, 1}.
X = np.random.randint(0, 2, size=(n_rows, n_cols))
y = np.random.randint(0, 2, size=n_rows)

# Peek at the first 5 rows of X and the first 5 labels.
print(X[:5])
print(y[:5])

[[0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]]
[0 1 1 1 1]


#### Testing

In [9]:
# Wrap the random binary dataset. The constructor only stores X and y;
# splitting and estimation happen in fit().
dnb = DiscreteNaiveBayes(X, y)

In [10]:
# Fit with the default smoothing parameter (5), then print the class scores,
# predicted class and true label for every row of the test split.
dnb.fit()
dnb.predict()

P(y=0|[0 1]) = 13.429
P(y=1|[0 1]) = 12.549
Model predicted class 0 and the true label was 1 

P(y=0|[0 0]) = 13.429
P(y=1|[0 0]) = 11.594
Model predicted class 0 and the true label was 0 

P(y=0|[0 1]) = 13.429
P(y=1|[0 1]) = 12.549
Model predicted class 0 and the true label was 1 

P(y=0|[1 0]) = 11.286
P(y=1|[1 0]) = 12.692
Model predicted class 1 and the true label was 0 

P(y=0|[0 1]) = 13.429
P(y=1|[0 1]) = 12.549
Model predicted class 0 and the true label was 0 

P(y=0|[0 0]) = 13.429
P(y=1|[0 0]) = 11.594
Model predicted class 0 and the true label was 1 

P(y=0|[1 0]) = 11.286
P(y=1|[1 0]) = 12.692
Model predicted class 1 and the true label was 1 

P(y=0|[0 1]) = 13.429
P(y=1|[0 1]) = 12.549
Model predicted class 0 and the true label was 0 

P(y=0|[0 0]) = 13.429
P(y=1|[0 0]) = 11.594
Model predicted class 0 and the true label was 0 

P(y=0|[1 0]) = 11.286
P(y=1|[1 0]) = 12.692
Model predicted class 1 and the true label was 0 

P(y=0|[0 0]) = 13.429
P(y=1|[0 0]) = 11.594
Model 

#### Testing Laplace Smoothing

In [11]:
# Build a float feature matrix whose first column is constant (all 1s), so the
# value 0 is never observed for feature 0; the second column is random 0/1.
X = np.ones((1000, 2))
X[:, 1] = np.random.randint(2, size=1000)

# Random binary labels, drawn after the feature column.
y = np.random.randint(2, size=1000)

# First six rows of X and first six labels.
print(X[:6, :])
print(y[:6])

[[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 0.]
 [1. 0.]]
[1 0 0 0 1 1]


In [12]:
# Feature 0 is always 1 in this dataset, so the query [0, 1] contains a value
# that never occurs in training: its likelihood falls back to the smoothed
# estimate (smoothing parameter 6) instead of zeroing out the product.
dnb = DiscreteNaiveBayes(X,y)
dnb.fit(smoothing_para = 6)
dnb.specialpredict([0, 1], 1)

P(y=0|[0, 1]) = 0.430
P(y=1|[0, 1]) = 0.425
Model predicted class 0 and the true label was 1 

