# Naive Bayes in Python


In [1]:
import numpy as np
import pandas as pd

### The Corpus

In [2]:
# Toy training set: each entry is a (text, label) pair whose label is "HAM" or "SPAM".
documents = [
    ("I ate dinner early", "HAM"),
    ("free money today", "SPAM"),
    ("I had a blast", "HAM"),
    # NOTE(review): shares "free" and "today" with the SPAM examples; confirm the HAM label is intended.
    ("sign up free early today", "HAM"),
    ("only free today", "SPAM")
]

In [3]:
# Vocabulary: every distinct whitespace-separated token found in any document.
# Tokens are taken as-is (no lower-casing or punctuation stripping), so "I" and "i"
# would be different words.
corpus = {word for document in documents for word in document[0].split()}

# Generate Conditional Probabilities
We first need to estimate the conditional probabilities $P(x|y)$. For instance, the likelihood of finding the word `free` in a document known to be `HAM` is written `P(x="free"|y="HAM")`.

In [4]:
# One row per vocabulary word and one column per class; cells are NaN until filled in.
# sorted() pins the row order, which would otherwise follow set iteration order and
# can change from one kernel session to the next.
conditional_probabilities = pd.DataFrame(index=sorted(corpus),
                                         columns=["likelihood_given_ham", "likelihood_given_spam"])

In [5]:
# Tally the documents of each class, echoing the running totals after every document.
spam_documents = ham_documents = 0
for document in documents:
    # "SPAM" marks spam; every other label is counted as ham.
    if document[1] != "SPAM":
        ham_documents += 1
    else:
        spam_documents += 1

    print(f"{document}")
    print(f"Spam documents: {spam_documents}")
    print(f"Ham documents: {ham_documents} \n\n")

# Class priors: the share of training documents that belongs to each class.
total_documents = spam_documents + ham_documents
p_ham = ham_documents / total_documents
p_spam = spam_documents / total_documents

('I ate dinner early', 'HAM')
Spam documents: 0
Ham documents: 1 


('free money today', 'SPAM')
Spam documents: 1
Ham documents: 1 


('I had a blast', 'HAM')
Spam documents: 1
Ham documents: 2 


('sign up free early today', 'HAM')
Spam documents: 1
Ham documents: 3 


('only free today', 'SPAM')
Spam documents: 2
Ham documents: 3 




In [15]:
# Estimate P(word | class) as the fraction of that class's documents containing the word.
# Each document is split once up front instead of once per vocabulary word.
tokenised_documents = [(set(document[0].split()), document[1]) for document in documents]

for word in corpus:

    ham_documents_with_word = 0
    spam_documents_with_word = 0

    for words, label in tokenised_documents:
        if word in words:
            # Same rule as the class tally: "SPAM" is spam, any other label is ham.
            # Using a different fallthrough here could push a likelihood above 1.
            if label == "SPAM":
                spam_documents_with_word += 1
            else:
                ham_documents_with_word += 1

    print(f"For word {word}, {ham_documents_with_word} ham out of {ham_documents} ham documents.")
    print(f"For word {word}, {spam_documents_with_word} spam out of {spam_documents} spam documents.\n")
    conditional_probabilities.loc[word, "likelihood_given_ham"] = ham_documents_with_word / ham_documents
    conditional_probabilities.loc[word, "likelihood_given_spam"] = spam_documents_with_word / spam_documents

For word ate, 1 ham out of 3 ham documents.
For word ate, 0 spam out of 2 spam documents.

For word up, 1 ham out of 3 ham documents.
For word up, 0 spam out of 2 spam documents.

For word a, 1 ham out of 3 ham documents.
For word a, 0 spam out of 2 spam documents.

For word only, 0 ham out of 3 ham documents.
For word only, 1 spam out of 2 spam documents.

For word free, 1 ham out of 3 ham documents.
For word free, 2 spam out of 2 spam documents.

For word dinner, 1 ham out of 3 ham documents.
For word dinner, 0 spam out of 2 spam documents.

For word had, 1 ham out of 3 ham documents.
For word had, 0 spam out of 2 spam documents.

For word blast, 1 ham out of 3 ham documents.
For word blast, 0 spam out of 2 spam documents.

For word early, 2 ham out of 3 ham documents.
For word early, 0 spam out of 2 spam documents.

For word sign, 1 ham out of 3 ham documents.
For word sign, 0 spam out of 2 spam documents.

For word money, 0 ham out of 3 ham documents.
For word money, 1 spam out of 

In [16]:
# Unlabelled message to classify; both of its words occur in the training documents.
test_document = "free today"

In [17]:
def get_likelihood(test_document, conditional_probabilities):
    """Multiply the per-word likelihoods of a document under each class.

    ``test_document`` is split on whitespace and every token is looked up in
    ``conditional_probabilities`` (rows are words; columns are
    ``likelihood_given_ham`` and ``likelihood_given_spam``). A token missing
    from the table makes the ``.loc`` lookup raise ``KeyError``.

    Returns the pair ``(likelihood_ham, likelihood_spam)``; both are ``1``
    for a document without tokens.
    """
    ham_product, spam_product = 1, 1
    for token in test_document.split():
        ham_product *= conditional_probabilities.loc[token, "likelihood_given_ham"]
        spam_product *= conditional_probabilities.loc[token, "likelihood_given_spam"]

    return ham_product, spam_product

In [18]:
# Per-class likelihood of the test message: the product of its word likelihoods.
likelihood_ham, likelihood_spam = get_likelihood(test_document, conditional_probabilities)

In [19]:
def get_posterior(likelihood_ham, likelihood_spam, p_ham, p_spam):
    """Combine class likelihoods and priors into posteriors with Bayes' rule.

    Parameters
    ----------
    likelihood_ham, likelihood_spam : float
        Likelihood of the document under each class.
    p_ham, p_spam : float
        Prior probability of each class.

    Returns
    -------
    (posterior_ham, posterior_spam) : tuple of float
        Each class's joint probability divided by their sum.

    Raises
    ------
    ZeroDivisionError
        If both joint probabilities are zero, i.e. neither class can produce
        the document, so the posterior is undefined.
    """
    joint_ham = likelihood_ham * p_ham
    joint_spam = likelihood_spam * p_spam
    # Shared denominator of both posteriors; computed once.
    evidence = joint_ham + joint_spam
    if evidence == 0:
        raise ZeroDivisionError(
            "posterior is undefined: the document has zero probability under both classes"
        )
    return joint_ham / evidence, joint_spam / evidence

In [20]:
# Posterior (ham, spam) for the test message; the tuple is shown as the cell output.
get_posterior(likelihood_ham, likelihood_spam, p_ham, p_spam)

(0.14285714285714285, 0.8571428571428572)

In [21]:
def fit_naive_bayes(documents):
    """Estimate class priors and per-word likelihoods for a HAM/SPAM naive Bayes model.

    Parameters
    ----------
    documents : iterable of sequences
        For each document, ``document[0]`` is the raw text (split on whitespace)
        and ``document[1]`` is the class label. The label ``"SPAM"`` marks spam;
        any other label is counted as ham, for the priors and the word counts alike.

    Returns
    -------
    conditional_probabilities : pandas.DataFrame
        One row per distinct word, in sorted order, with the columns
        ``likelihood_given_ham`` and ``likelihood_given_spam``. Each cell is the
        fraction of that class's documents containing the word. No smoothing is
        applied, so a word never seen in a class has likelihood 0.
    p_ham, p_spam : float
        Fraction of the documents that belongs to each class.

    Raises
    ------
    ZeroDivisionError
        If there is no ham document or no spam document.
    """
    ham_documents = 0
    spam_documents = 0
    ham_word_counts = {}
    spam_word_counts = {}

    for document in documents:
        # A set, because a word counts once per document however often it repeats.
        words = set(document[0].split())
        if document[1] == "SPAM":
            spam_documents += 1
            word_counts = spam_word_counts
        else:
            ham_documents += 1
            word_counts = ham_word_counts
        for word in words:
            word_counts[word] = word_counts.get(word, 0) + 1

    if ham_documents == 0 or spam_documents == 0:
        raise ZeroDivisionError(
            "cannot estimate likelihoods: need at least one HAM and one SPAM document "
            f"(got {ham_documents} ham, {spam_documents} spam)"
        )

    total_documents = ham_documents + spam_documents
    p_ham = ham_documents / total_documents
    p_spam = spam_documents / total_documents

    # Sorted so the row order does not depend on set/dict iteration order.
    vocabulary = sorted(set(ham_word_counts) | set(spam_word_counts))
    conditional_probabilities = pd.DataFrame(
        {
            "likelihood_given_ham": [
                ham_word_counts.get(word, 0) / ham_documents for word in vocabulary
            ],
            "likelihood_given_spam": [
                spam_word_counts.get(word, 0) / spam_documents for word in vocabulary
            ],
        },
        index=vocabulary,
        # object dtype keeps every cell a plain Python float.
        dtype=object,
    )

    return conditional_probabilities, p_ham, p_spam

In [22]:
# Refit on the toy corpus; the cell output is the (table, p_ham, p_spam) tuple.
fit_naive_bayes(documents)

(       likelihood_given_ham likelihood_given_spam
 ate                0.333333                     0
 up                 0.333333                     0
 a                  0.333333                     0
 only                      0                   0.5
 free               0.333333                     1
 dinner             0.333333                     0
 had                0.333333                     0
 blast              0.333333                     0
 early              0.666667                     0
 sign               0.333333                     0
 money                     0                   0.5
 today              0.333333                     1
 I                  0.666667                     0, 0.6, 0.4)

## Dealing with Non-Existent Words

From [Sebastian Raschka, Python Machine Learning](https://arxiv.org/pdf/1410.5329.pdf)
![Smoothing](images/smoothing.png "Smoothing for words that were not seen during training")
