In [9]:
import numpy as np 
import pandas as pd 	
import matplotlib.pyplot as plt 
import math
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Naive Bayes - From Scratch

In [32]:
class  NaiveBayes(object):

	"""
		Bayes Theorem:
										Likelihood * Class prior probability
				Posterior Probability = -------------------------------------
											Predictor prior probability
				
							  			 P(x|c) * p(c)
							   P(c|x) = ------------------ 
											  P(x)
	"""

	def __init__(self):

		"""
			Attributes:
				likelihoods: Likelihood of each feature per class
				class_priors: Prior probabilities of classes 
				pred_priors: Prior probabilities of features 
				features: All features of dataset

		"""
		self.features = list
		self.likelihoods = {}
		self.class_priors = {}
		self.pred_priors = {}

		self.X_train = np.array
		self.y_train = np.array
		self.train_size = int
		self.num_feats = int

	def fit(self, X, y):

		self.features = list(X.columns)
		self.X_train = X
		self.y_train = y
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]

		for feature in self.features:
			self.likelihoods[feature] = {}
			self.pred_priors[feature] = {}

			for feat_val in np.unique(self.X_train[feature]):
				self.pred_priors[feature].update({feat_val: 0})
				print(feat_val)
				for outcome in np.unique(self.y_train):
					self.likelihoods[feature].update({feat_val+'_'+outcome:0})
					self.class_priors.update({outcome: 0})
					print('\t'+feat_val+'_'+outcome)

		self._calc_class_prior()
		self._calc_likelihoods()
		self._calc_predictor_prior()

	def _calc_class_prior(self):

		""" P(c) - Prior Class Probability """

		for outcome in np.unique(self.y_train):
			outcome_count = sum(self.y_train == outcome)
			self.class_priors[outcome] = outcome_count / self.train_size
			print(outcome, outcome_count, self.class_priors[outcome])

	def _calc_likelihoods(self):

		""" P(x|c) - Likelihood """

		for feature in self.features:

			for outcome in np.unique(self.y_train):
				outcome_count = sum(self.y_train == outcome)
				feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()
				print(outcome, feat_likelihood)

				for feat_val, count in feat_likelihood.items():
					self.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count
					#print(self.likelihoods[feature][feat_val + '_' + outcome])


	def _calc_predictor_prior(self):

		""" P(x) - Evidence """

		for feature in self.features:
			feat_vals = self.X_train[feature].value_counts().to_dict()
			print(feat_vals)

			for feat_val, count in feat_vals.items():
				self.pred_priors[feature][feat_val] = count/self.train_size
				print(self.pred_priors[feature][feat_val])


	def predict(self, X):

		""" Calculates Posterior probability P(c|x) """

		results = []
		X = np.array(X)
		print("Predict:")
		for query in X:
			probs_outcome = {}
			for outcome in np.unique(self.y_train):
				prior = self.class_priors[outcome]
				likelihood = 1
				evidence = 1

				for feat, feat_val in zip(self.features, query):
					likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
					evidence *= self.pred_priors[feat][feat_val]

				posterior = (likelihood * prior) / (evidence)

				probs_outcome[outcome] = posterior

			result = max(probs_outcome, key = lambda x: probs_outcome[x])
			results.append(result)

		return np.array(results)


In [11]:
def pre_processing(df):

	""" partioning data into features and target """

	X = df.drop([df.columns[-1]], axis = 1)
	y = df[df.columns[-1]]

	return X, y

## Check the Model

In [33]:
#Weather Dataset
print("\nWeather Dataset:")

df = pd.read_table("data/weather.txt")
#print(df)

#Split fearures and target
X,y  = pre_processing(df)

nb_clf = NaiveBayes()
nb_clf.fit(X, y)

print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))

#Query 1:
query = np.array([['Rainy','Mild', 'Normal', 't']])
print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

#Query 2:
query = np.array([['Overcast','Cool', 'Normal', 't']])
print("Query 2:- {} ---> {}".format(query, nb_clf.predict(query)))

#Query 3:
query = np.array([['Sunny','Hot', 'High', 't']])
print("Query 3:- {} ---> {}".format(query, nb_clf.predict(query)))


Weather Dataset:
Overcast
	Overcast_no
	Overcast_yes
Rainy
	Rainy_no
	Rainy_yes
Sunny
	Sunny_no
	Sunny_yes
Cool
	Cool_no
	Cool_yes
Hot
	Hot_no
	Hot_yes
Mild
	Mild_no
	Mild_yes
High
	High_no
	High_yes
Normal
	Normal_no
	Normal_yes
f
	f_no
	f_yes
t
	t_no
	t_yes
no 5 0.35714285714285715
yes 9 0.6428571428571429
no {'Rainy': 3, 'Sunny': 2}
yes {'Overcast': 4, 'Sunny': 3, 'Rainy': 2}
no {'Hot': 2, 'Mild': 2, 'Cool': 1}
yes {'Mild': 4, 'Cool': 3, 'Hot': 2}
no {'High': 4, 'Normal': 1}
yes {'Normal': 6, 'High': 3}
no {'t': 3, 'f': 2}
yes {'f': 6, 't': 3}
{'Rainy': 5, 'Sunny': 5, 'Overcast': 4}
0.35714285714285715
0.35714285714285715
0.2857142857142857
{'Mild': 6, 'Hot': 4, 'Cool': 4}
0.42857142857142855
0.2857142857142857
0.2857142857142857
{'High': 7, 'Normal': 7}
0.5
0.5
{'f': 8, 't': 6}
0.5714285714285714
0.42857142857142855
Train Accuracy: 0.9285714285714286
Query 1:- [['Rainy' 'Mild' 'Normal' 't']] ---> ['yes']
Query 2:- [['Overcast' 'Cool' 'Normal' 't']] ---> ['yes']
Query 3:- [['Sunny'

## Use the Model