# Naive Bayes

In [1]:
# Embed the linked YouTube video inline in the notebook output; the two
# numeric arguments (560, 315) are the frame's width and height.
from IPython.display import IFrame
IFrame('https://www.youtube.com/embed/CPqOCI0ahss',560,315)

## Probability Review

### Conditional Probability

$$P(A|B) = \frac{P(A\cap{B})}{P(B)} $$

## Bayes Rule

$$P(A|B) = \frac{P(A)\cdot P(B|A)}{P(B)}$$

We can expand the denominator using the law of total probability, $P(B) = P(A)\cdot P(B|A) + P(A^C)\cdot P(B|A^C)$:

$$P(A|B) = \frac{P(A)\cdot P(B|A)}{P(A)\cdot P(B|A) + P(A^C)\cdot P(B|A^C)}$$


## Independence Assumption

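$$P(A\cap B) = P(A)\cdot P(B)$$

Naive Bayes applies this assumption to the features *given the class*: $P(x_1, \dots, x_n \mid C) = \prod_i P(x_i \mid C)$.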

## Implementation in Python (An oversimplification)

In [107]:
import string
import math

class NaiveBayes:
    """Binary Bernoulli Naive Bayes classifier over text documents.

    Each document is reduced to its set of distinct lower-cased,
    punctuation-free tokens, so only the presence or absence of a token
    matters, not how many times it occurs. Labels are 1 (positive class)
    and 0 (negative class). No class prior is applied: the score compares
    the two class-conditional likelihoods only.

    Attributes:
        k: additive smoothing pseudo-count used in every probability estimate.
        pos_counts: token -> number of positive training documents containing it.
        neg_counts: token -> number of negative training documents containing it.
        pos: number of positive training documents seen so far.
        neg: number of negative training documents seen so far.
        tokens: vocabulary, i.e. every token seen in any training document.
    """

    def __init__(self, k=0.5):
        """Create an untrained classifier.

        Args:
            k: smoothing pseudo-count; must be positive so that no estimated
                probability is exactly 0 or 1 (whose logarithm is undefined).

        Raises:
            ValueError: if ``k`` is not positive.
        """
        if not k > 0:
            raise ValueError("k must be positive, got %r" % (k,))
        self.k = k
        self.pos_counts = dict()
        self.neg_counts = dict()
        self.pos, self.neg = 0, 0
        self.tokens = set()

    def fit(self, X, y):
        """Accumulate per-class document and token counts.

        Counts are added to whatever the instance already holds, so calling
        ``fit`` repeatedly trains incrementally.

        Args:
            X: sequence of document strings.
            y: sequence of labels aligned with ``X``; 1 marks the positive
                class and 0 the negative class. Documents with any other
                label are skipped.

        Raises:
            ValueError: if ``X`` and ``y`` differ in length.
        """
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got %d and %d"
                % (len(X), len(y))
            )

        for text, label in zip(X, y):
            if label == 1:  # Positive class
                self.pos += 1
                counts = self.pos_counts
            elif label == 0:  # Negative class
                self.neg += 1
                counts = self.neg_counts
            else:
                continue

            # tokenize() yields each distinct token once, so a count is the
            # number of documents of this class that contain the token.
            for token in self.tokenize(text):
                self.tokens.add(token)
                counts[token] = counts.get(token, 0) + 1

    def predict(self, X):
        """Return the positive-class score for the documents in ``X``.

        For every document, each vocabulary token contributes ``log(p)`` to a
        class's log-likelihood when the token is present in the document and
        ``log(1 - p)`` when it is absent, where ``p`` is that class's smoothed
        estimate from ``_predict``. Tokens that were never seen during
        training are outside the vocabulary and contribute nothing.

        The log-likelihoods of all documents in ``X`` are pooled into a single
        score, so pass a one-element list to score a single document.

        Args:
            X: iterable of document strings.

        Returns:
            float in [0, 1]: ``L_pos / (L_pos + L_neg)`` where ``L_*`` are the
            pooled class-conditional likelihoods; 0.5 when there is no
            evidence (empty ``X`` or empty vocabulary).
        """
        # Per-token log-probabilities do not depend on the document, so they
        # are computed once. Sorting fixes the floating-point summation order,
        # making the result reproducible across interpreter runs.
        table = []
        for token in sorted(self.tokens):
            p_pos, p_neg = self._predict(token)
            table.append((
                token,
                math.log(p_pos), math.log(1.0 - p_pos),
                math.log(p_neg), math.log(1.0 - p_neg),
            ))

        pos_log_prob, neg_log_prob = 0.0, 0.0
        for x in X:
            present = set(self.tokenize(x))
            for token, pos_in, pos_out, neg_in, neg_out in table:
                if token in present:
                    pos_log_prob += pos_in
                    neg_log_prob += neg_in
                else:
                    pos_log_prob += pos_out
                    neg_log_prob += neg_out

        # Normalise in log space: exponentiating each log-likelihood on its
        # own underflows to 0.0 for large vocabularies and would produce 0/0.
        # Only exp() of a non-positive number is ever taken, so it cannot
        # overflow either.
        diff = neg_log_prob - pos_log_prob
        if diff > 0:
            ratio = math.exp(-diff)
            return ratio / (1.0 + ratio)
        return 1.0 / (1.0 + math.exp(diff))

    def _predict(self, token):
        """Return smoothed ``(P(token | positive), P(token | negative))``.

        A token that was not counted for a class is treated as having a count
        of zero there, so the estimate falls back to the smoothing term
        instead of raising ``KeyError``.
        """
        pos = self.pos_counts.get(token, 0)
        neg = self.neg_counts.get(token, 0)

        prob_pos = (pos + self.k) / (self.pos + 2 * self.k)
        prob_neg = (neg + self.k) / (self.neg + 2 * self.k)

        return prob_pos, prob_neg

    def tokenize(self, text):
        """Split ``text`` into its distinct tokens.

        The text is lower-cased, every character in ``string.punctuation`` is
        removed, and the remainder is split on whitespace.

        Returns:
            list of unique tokens in sorted order.
        """
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return sorted(set(text.split()))
    

In [108]:
# Sample document; its mixed case and punctuation exercise the tokenizer.
t = "Data Science. is, science"

In [109]:
# Training corpus: the same sample document repeated 100 times.
X = [t]*100

In [110]:
# 100 labels in a repeating pattern: 80 positive (1) and 20 negative (0).
y = [1, 1, 0, 1, 1] * 20
    

In [111]:
# Fresh classifier with empty counts and vocabulary.
n = NaiveBayes()

In [112]:
# Accumulate per-class document and token counts from the training data.
n.fit(X,y)

In [113]:
# Score the sample sentence; the result is the positive-class score in [0, 1].
n.predict(["Data Science. is, science"])

Data Science. is, science


0.5134259577829403