In [3]:
import numpy as np
import pandas as pd 
import random
from sklearn.naive_bayes import GaussianNB

np.random.seed(42)

# this function takes confusion matrix and outputs precision
def precison(cmm):
	tp = cmm[1][1]
	tn = cmm[0][0]
	fp = cmm[0][1]
	fn = cmm[1][0]
	return tp/(tp+fp)

# this function takes confusion matrix and outputs recall
def recall(cmm):
	tp = cmm[1][1]
	tn = cmm[0][0]
	fp = cmm[0][1]
	fn = cmm[1][0]
	return tp/(tp+fn)

def predict(x,mean,stdev,prob_Y):			#predict function to predict given a single record
	m=-10**300
	best_class=-1
	for i in range(0,2):
		temp=np.exp(-1*(np.square(x-mean[i])/(2*(np.square(stdev[i])))))/(np.sqrt(2*np.pi*np.square(stdev[i])))		#calculating probability for each label
		for j in range(len(temp)):						#handling 0/0 and 255/0 case
			if(temp[j]!=temp[j]):
				if(x[j]!=mean[i][j] and stdev[i][j]==0):
					# print(1,x[j],mean[i][j],stdev[i][j])
					temp[j]=10**(-6)
				elif(x[j]==mean[i][j] and stdev[i][j]==0):
					# print(2,x[j],mean[i][j],stdev[i][j])
					temp[j]=1
		temp=(np.sum(np.log(temp)))+np.log(prob_Y[i])
		if(temp>m):										#calculating best label for given row
			m=temp
			best_class=i
	return best_class									#returning best label

# scratch implementation of Naive Bayes
def scratch(X_train, Y_train, X_test, Y_test):
	prob_Y = {}
	means = {}
	stdev = {}
	confusion_matrix_train = np.array([[0,0],[0,0]])
	confusion_matrix_test = np.array([[0,0],[0,0]])
	for i in range(0,2):							#calculating mean and stdev for each label
		X_new = X_train[Y_train==i]
		means[i] = np.mean(X_new, axis=0)
		stdev[i] = np.std(X_new, axis=0)
		prob_Y[i] = len(X_new)/len(X_train)

	train_correct=0
	for i in range(len(X_train)):					#predicting on each train row
		pred=predict(X_train[i],means,stdev,prob_Y)
		confusion_matrix_train[Y_train[i]][pred] += 1
		if(pred==Y_train[i]):
			train_correct+=1
	train_accuracy = train_correct/X_train.shape[0]

	test_correct=0
	for i in range(len(X_test)):					#predicting on each test row
		pred=predict(X_test[i],means,stdev,prob_Y)
		confusion_matrix_test[Y_test[i]][pred] += 1
		if(pred==Y_test[i]):
			test_correct+=1
	test_accuracy = test_correct/X_test.shape[0]

	return train_accuracy, test_accuracy, confusion_matrix_train, confusion_matrix_test

# scratch implementation of k-fold
def k_fold(x, y, k):
	p = x.shape[0]//k
	x_ = x.tolist()
	y_ = y.tolist()
	min_acc = 0
	ret_val = (0,0,0,0)
	for i in range(k):
		x_train = np.array(x_[:i*p] + x_[(i+1)*p:], dtype=np.uint8)
		y_train = np.array(y_[:i*p] + y_[(i+1)*p:], dtype=np.uint8)

		x_test = np.array(x_[i*p:(i+1)*p], dtype=np.uint8)
		y_test = np.array(y_[i*p:(i+1)*p], dtype=np.uint8)

		train_accuracy, test_accuracy, cm_train, cm_test = scratch(x_train, y_train, x_test, y_test)

		if(test_accuracy > min_acc):
			min_acc = test_accuracy
			ret_val = (train_accuracy, test_accuracy, cm_train, cm_test)
	return ret_val


# pre-processing of data
df_train = pd.read_csv('fashion-mnist_train.csv')
df_train = df_train[df_train['label'].isin([1,2])]		# removing all the labels except 1 and 2
x_train = df_train[df_train.columns[1:]].to_numpy()		# converting X and y to numpy arrays
y_train = df_train[df_train.columns[0:1]].to_numpy().flatten()
print(x_train.shape, y_train.shape)

df_test = pd.read_csv('fashion-mnist_test.csv')
df_test = df_test[df_test['label'].isin([1,2])]
x_test = df_test[df_test.columns[1:]].to_numpy()
y_test = df_test[df_test.columns[0:1]].to_numpy().flatten()
print(x_test.shape, y_test.shape)

# binarization of image
threshold = 128

x_train[x_train<threshold] = 0
x_train[x_train>=threshold] = 255
x_test[x_test<threshold] = 0
x_test[x_test>=threshold] = 255

y_train[y_train==1] = 0
y_train[y_train==2] = 1
y_test[y_test==1] = 0
y_test[y_test==2] = 1

# randomizing the dataset
p1 = np.random.permutation(x_train.shape[0])
x_train, y_train = x_train[p1], y_train[p1]

p2 = np.random.permutation(x_test.shape[0])
x_test, y_test = x_test[p2], y_test[p2]

train_accuracy, test_accuracy, cm_train, cm_test = scratch(x_train, y_train, x_test, y_test)
print("Running Scratch Implementation on Train and Test FMNIST data.")
print("Train Accuracy =", train_accuracy)
print("Test Accuracy =", test_accuracy)
print("\n")

x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))

for i in range(3,11):
	train_accuracy, test_accuracy, cm_train, cm_test = k_fold(x, y, i)
	print("For k="+str(i)+", following are the results -")
	print("Train Accuracy =",train_accuracy)
	print("Test Accuracy =",test_accuracy)
	print("")

optimal_k = 5
train_accuracy, test_accuracy, cm_train, cm_test = k_fold(x, y, optimal_k)

print("\n\nResults on Optimal value of k -")
print("\nTrain Accuracy =",train_accuracy)
print("Train Confusion Matrix =",cm_train)
print("Train Precision =",precison(cm_train))
print("Train Recall =",recall(cm_train))

print("\nTest Accuracy =",test_accuracy)
print("Test Confusion Matrix =",cm_test)
print("Test Precision =",precison(cm_test))
print("Test Recall =",recall(cm_test))


(12000, 784) (12000,)
(2000, 784) (2000,)




Running Scratch Implementation on Train and Test FMNIST data.
Train Accuracy = 0.93575
Test Accuracy = 0.9295




KeyboardInterrupt: 