# Naive Bayes
## November 29th, 2022
### Overview: Use Naive Bayes, Poisson Bayes, and SKLearn's Naive Bayes classifiers to predict whether an email is legitimate or spam

In [1]:
import warnings


def warn(*args, **kwargs):
    '''Accept any arguments, do nothing, and return None.'''
    return None


# Replace warnings.warn with the no-op above so that calls made through
# warnings.warn after this point produce no output.
warnings.warn = warn
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import ClassifierMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [2]:
# Read the SMS dataset from the CSV file in the working directory
df = pd.read_csv('sms_spam_collection.csv')
# Split the frame into the message column and the label column
X, y = df['Message'], df['Label']

In [3]:
class NaiveBayesFilter(ClassifierMixin):
    '''
    A Naive Bayes Classifier that sorts messages in to spam or ham.

    Messages are tokenized with str.split() (whitespace, case-sensitive).
    Any training label other than 'ham' is treated as 'spam'.

    Attributes set by fit:
        data (pd.DataFrame)(2,V): word counts; rows are 'ham' and 'spam',
            one column per distinct word seen in training
        ham_prior (float): fraction of training messages labeled 'ham'
        spam_prior (float): fraction of training messages not labeled 'ham'
    '''

    def __init__(self):
        return

    @staticmethod
    def _tally(words, counts=None):
        '''Add one occurrence of each word in `words` to the dict `counts`.'''
        if counts is None:
            counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

    def _class_tables(self):
        '''Return (ham counts, spam counts, word -> column position).'''
        ham = self.data.loc['ham'].to_numpy(dtype=float)
        spam = self.data.loc['spam'].to_numpy(dtype=float)
        position = {word: j for j, word in enumerate(self.data.columns)}
        return ham, spam, position

    def _known_words(self, message, position):
        '''
        Return (column positions, repeat counts) for the words of `message`
        that appear in the training vocabulary. Unseen words are dropped,
        which is the same as giving them a likelihood factor of 1.
        '''
        tally = self._tally(message.split())
        known = [word for word in tally if word in position]
        cols = np.array([position[word] for word in known], dtype=np.intp)
        reps = np.array([tally[word] for word in known], dtype=float)
        return cols, reps

    @staticmethod
    def _pick_labels(scores):
        '''Label a row 'ham' when column 0 >= column 1, otherwise 'spam'.'''
        return np.where(scores[:, 0] >= scores[:, 1], 'ham', 'spam')

    def fit(self, X, y):
        '''
        Create a table that will allow the filter to evaluate P(H), P(S)
        and P(w|C)

        Parameters:
            X (pd.Series): training data
            y (pd.Series): training labels

        Return:
            self: the fitted filter

        Raises:
            ValueError: if X and y differ in length or are empty
        '''
        if len(X) != len(y):
            raise ValueError('X and y must have the same length')
        if len(X) == 0:
            raise ValueError('cannot fit on an empty training set')

        word_counts = {'ham': {}, 'spam': {}}
        message_counts = {'ham': 0, 'spam': 0}
        for message, label in zip(X, y):
            group = 'ham' if label == 'ham' else 'spam'
            message_counts[group] += 1
            self._tally(message.split(), word_counts[group])

        # One row per class; a word never seen in a class becomes NaN and is
        # replaced by 0. (DataFrame.append was removed in pandas 2.0, so the
        # table is built directly from the two dicts.)
        self.data = pd.DataFrame(
            [word_counts['ham'], word_counts['spam']],
            index=['ham', 'spam'],
        ).fillna(0)

        # Class priors are fractions of *messages*, not of word tokens.
        n_messages = message_counts['ham'] + message_counts['spam']
        self.ham_prior = message_counts['ham'] / n_messages
        self.spam_prior = message_counts['spam'] / n_messages
        return self

    def predict_proba(self, X):
        '''
        Find P(C=k)P(x|C=k) for each x in X and for each class k. The
        values are proportional to P(C=k|x) but are not normalized.

        No smoothing is applied: a vocabulary word that never occurred in
        a class drives that class's value to 0. Words outside the training
        vocabulary are ignored. Products of many small factors can
        underflow for long messages; predict_log_proba avoids that.

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,2): Probability each message is ham, spam
                0 column is ham
                1 column is spam
        '''
        ham, spam, position = self._class_tables()
        ham_total, spam_total = ham.sum(), spam.sum()

        rows = []
        # A class with no training words gives 0/0; let it become nan
        # quietly instead of emitting a RuntimeWarning per message.
        with np.errstate(divide='ignore', invalid='ignore'):
            for message in X:
                cols, reps = self._known_words(message, position)
                ham_like = np.prod((ham[cols] / ham_total) ** reps)
                spam_like = np.prod((spam[cols] / spam_total) ** reps)
                rows.append((self.ham_prior * ham_like,
                             self.spam_prior * spam_like))

        # reshape keeps the (N,2) contract even when X is empty
        return np.array(rows, dtype=float).reshape(-1, 2)

    def predict(self, X):
        '''
        Use self.predict_proba to assign labels to X,
        the label will be a string that is either 'spam' or 'ham'
        (ties go to 'ham')

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,): label for each message
        '''
        return self._pick_labels(self.predict_proba(X))

    def predict_log_proba(self, X):
        '''
        Find ln(P(C=k)P(x|C=k)) for each x in X and for each class k,
        using the smoothed word probability (count + 1)/(class total + 2).
        Words outside the training vocabulary contribute 0 to both sums.

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,2): Log probability each message is ham, spam
                0 column is ham
                1 column is spam
        '''
        ham, spam, position = self._class_tables()
        log_ham_word = np.log((ham + 1) / (ham.sum() + 2))
        log_spam_word = np.log((spam + 1) / (spam.sum() + 2))

        # A class absent from training has prior 0, i.e. log prior -inf.
        with np.errstate(divide='ignore'):
            log_ham_prior = np.log(self.ham_prior)
            log_spam_prior = np.log(self.spam_prior)

        rows = []
        for message in X:
            cols, reps = self._known_words(message, position)
            rows.append((log_ham_prior + np.sum(reps * log_ham_word[cols]),
                         log_spam_prior + np.sum(reps * log_spam_word[cols])))

        return np.array(rows, dtype=float).reshape(-1, 2)

    def predict_log(self, X):
        '''
        Use self.predict_log_proba to assign labels to X,
        the label will be a string that is either 'spam' or 'ham'
        (ties go to 'ham')

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,): label for each message
        '''
        return self._pick_labels(self.predict_log_proba(X))

In [4]:
class PoissonBayesFilter(ClassifierMixin):
    '''
    A Naive Bayes Classifier that sorts messages in to spam or ham.
    This classifier assumes that words are distributed like
    Poisson random variables

    Messages are tokenized with str.split() (whitespace, case-sensitive).
    Any training label other than 'ham' is treated as 'spam'.

    Attributes set by fit:
        data (pd.DataFrame)(2,V): word counts; rows are 'ham' and 'spam'
        htot, stot (int): total number of word tokens in ham / spam
        ham_rates, spam_rates (dict): word -> smoothed per-token rate r,
            both keyed by the full training vocabulary
        ham_prior, spam_prior (float): fraction of training messages in
            each class
    '''

    def __init__(self):
        return

    def fit(self, X, y):
        '''
        Uses bayesian inference to find the poisson rate for each word
        found in the training set. For this we will use the formulation
        of l = rt since we have variable message lengths.

        This method creates a tool that will allow the filter to
        evaluate P(H), P(S), and P(w|C)

        Parameters:
            X (pd.Series): training data
            y (pd.Series): training labels

        Returns:
            self: the fitted filter

        Raises:
            ValueError: if X and y differ in length or are empty
        '''
        if len(X) != len(y):
            raise ValueError('X and y must have the same length')
        if len(X) == 0:
            raise ValueError('cannot fit on an empty training set')

        word_counts = {'ham': {}, 'spam': {}}
        token_totals = {'ham': 0, 'spam': 0}
        message_counts = {'ham': 0, 'spam': 0}
        for message, label in zip(X, y):
            group = 'ham' if label == 'ham' else 'spam'
            words = message.split()
            message_counts[group] += 1
            token_totals[group] += len(words)
            bucket = word_counts[group]
            for word in words:
                bucket[word] = bucket.get(word, 0) + 1

        # One row per class; a word never seen in a class becomes NaN and is
        # replaced by 0. (DataFrame.append was removed in pandas 2.0, so the
        # table is built directly from the two dicts.)
        self.data = pd.DataFrame(
            [word_counts['ham'], word_counts['spam']],
            index=['ham', 'spam'],
        ).fillna(0)
        self.htot = token_totals['ham']
        self.stot = token_totals['spam']

        # Class priors are fractions of *messages*, not of word tokens.
        n_messages = message_counts['ham'] + message_counts['spam']
        self.ham_prior = message_counts['ham'] / n_messages
        self.spam_prior = message_counts['spam'] / n_messages

        # Smoothed rate per token, r = (count + 1)/(class tokens + 2), for
        # every vocabulary word in both classes.
        ham_words, spam_words = word_counts['ham'], word_counts['spam']
        self.ham_rates = {word: (ham_words.get(word, 0) + 1) / (self.htot + 2)
                          for word in self.data.columns}
        self.spam_rates = {word: (spam_words.get(word, 0) + 1) / (self.stot + 2)
                           for word in self.data.columns}

        return self

    def predict_log_proba(self, X):
        '''
        Find ln(P(C=k)P(x|C=k)) for each x in X and for each class

        For a message of n tokens, a vocabulary word occurring ni times
        contributes the Poisson log-pmf of ni with mean r*n, where r is the
        class rate from fit. Words outside the training vocabulary
        contribute 0 to both classes.

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,2): Log probability each message is ham or spam
                column 0 is ham, column 1 is spam
        '''
        # A class absent from training has prior 0, i.e. log prior -inf.
        with np.errstate(divide='ignore'):
            log_ham_prior = np.log(self.ham_prior)
            log_spam_prior = np.log(self.spam_prior)

        rows = []
        for message in X:
            words = message.split()
            n = len(words)

            tally = {}
            for word in words:
                tally[word] = tally.get(word, 0) + 1
            # ham_rates and spam_rates share the same keys (see fit)
            known = [word for word in tally if word in self.ham_rates]

            ham_log = 0.0
            spam_log = 0.0
            if known:
                occurrences = np.array([tally[word] for word in known], dtype=int)
                ham_lam = n * np.array([self.ham_rates[word] for word in known])
                spam_lam = n * np.array([self.spam_rates[word] for word in known])
                # logpmf(k, lam) = k*ln(lam) - lam - ln(k!)
                ham_log = stats.poisson.logpmf(occurrences, ham_lam).sum()
                spam_log = stats.poisson.logpmf(occurrences, spam_lam).sum()

            rows.append((log_ham_prior + ham_log, log_spam_prior + spam_log))

        # reshape keeps the (N,2) contract even when X is empty
        return np.array(rows, dtype=float).reshape(-1, 2)

    def predict(self, X):
        '''
        Use self.predict_log_proba to assign labels to X
        (ties go to 'ham')

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,): label for each message
        '''
        scores = self.predict_log_proba(X)
        return np.where(scores[:, 0] >= scores[:, 1], 'ham', 'spam')

In [5]:
def sklearn_method(X_train, y_train, X_test):
    '''
    Classify X_test with sklearn: vectorize the messages with a
    CountVectorizer fitted on X_train, train a MultinomialNB model on
    the resulting counts, and predict labels for the transformed X_test.

    Parameters:
        X_train (pandas.Series): messages to train on
        y_train (pandas.Series): labels for X_train
        X_test  (pandas.Series): messages to classify

    Returns:
        (ndarray): classification of X_test
    '''
    # the vectorizer is fitted on the training messages only
    bag_of_words = CountVectorizer()
    training_matrix = bag_of_words.fit_transform(X_train)

    # train the multinomial naive Bayes model on the training counts
    model = MultinomialNB().fit(training_matrix, y_train)

    # encode the test messages with the same fitted vectorizer and classify
    return model.predict(bag_of_words.transform(X_test))

## Predictions from each of the above classifiers

In [11]:
# Naive Bayes, regular (non-log) prediction:
# fit on the first 300 messages, then label the five messages X[530:535]
NB = NaiveBayesFilter()
NB.fit(X[:300],y[:300])
NB.predict(X[530:535])

array(['ham', 'spam', 'ham', 'ham', 'ham'], dtype='<U4')

In [12]:
# Naive Bayes, log-probability prediction on the same five messages
# (uses the NB filter fitted in the previous cell)
NB.predict_log(X[530:535])

array(['ham', 'spam', 'ham', 'ham', 'ham'], dtype='<U4')

In [13]:
# Poisson Bayes prediction:
# fit on the first 300 messages, then label the five messages X[530:535]
PB = PoissonBayesFilter()
PB.fit(X[:300],y[:300])
PB.predict(X[530:535])

array(['ham', 'spam', 'ham', 'ham', 'ham'], dtype='<U4')

In [14]:
# sklearn prediction: same training slice (first 300 messages) and the same
# five test messages as the cells above
sklearn_method(X[:300],y[:300],X[530:535])

array(['ham', 'spam', 'ham', 'ham', 'ham'], dtype='<U4')

In [15]:
# true labels for the five test messages (they match the predictions
# shown above)
y[530:535]

530     ham
531    spam
532     ham
533     ham
534     ham
Name: Label, dtype: object