<a href="https://colab.research.google.com/github/vpolkampally0125/aaru/blob/main/Naive_Bayes_w_Laplace_Smoothing_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import collections

import numpy as np

import util
import svm

In [2]:
def get_words(message):
    """Tokenize an SMS message into lowercase words.

    The message is lowercased first and then broken apart with ``str.split()``
    (no separator argument), so any run of whitespace acts as a delimiter and
    no empty tokens are produced.

    Args:
        message: A string containing an SMS message

    Returns:
        A list with the lowercased tokens of the message, in their original
        order. An empty or whitespace-only message yields an empty list.
    """
    lowered = message.lower()
    tokens = lowered.split()
    return tokens


In [4]:
def create_dictionary(messages, min_messages=5):
    """Create a dictionary mapping words to integer indices.

    Each message is tokenized with get_words. A word is added to the
    dictionary only if it appears in at least ``min_messages`` distinct
    messages; repeated occurrences inside a single message count once.

    Indices are consecutive integers starting at 0, assigned in the order in
    which the qualifying words were first seen across ``messages``.

    Args:
        messages: A list of strings containing SMS messages
        min_messages: Minimum number of messages a word must appear in to be
            kept (defaults to 5)

    Returns:
        A plain python dict mapping words to integers. Looking up a word that
        is not in the vocabulary raises KeyError rather than silently
        returning index 0.
    """
    # Document frequency: how many messages contain each word.
    message_counts = collections.Counter()
    for message in messages:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # resulting index assignment stays deterministic (a set would not).
        unique_words = dict.fromkeys(get_words(message))
        message_counts.update(list(unique_words))

    frequent_words = (
        word for word, count in message_counts.items() if count >= min_messages
    )
    return {word: index for index, word in enumerate(frequent_words)}

In [5]:
def fit_naive_bayes_model(matrix, labels):
    """Fit a naive bayes model with Laplace (add-one) smoothing.

    The per-word probabilities for each class are estimated as

        (1 + count of word k in the class) / (n + total word count in the class)

    where n is the number of columns of ``matrix``.

    Args:
        matrix: A 2-D numpy array containing word counts for the training
            data (one row per message, one column per word)
        labels: The binary (0 or 1) labels for that training data; any
            array-like (numpy array, list, tuple) is accepted

    Returns:
        A tuple ``(phi_y, phi_k_y1, phi_k_y0)`` where ``phi_y`` is the mean of
        the labels (fraction labelled 1), and ``phi_k_y1`` / ``phi_k_y0`` are
        the smoothed per-word probabilities for the label-1 and label-0 rows.
    """
    # Coerce to an array so that `labels == 1` is an element-wise mask; with a
    # plain list the comparison collapses to a single False and the row
    # selection below would silently pick nothing.
    labels = np.asarray(labels)
    vocab_size = matrix.shape[1]

    phi_y = np.mean(labels)

    spam_rows = matrix[labels == 1]
    ham_rows = matrix[labels == 0]

    phi_k_y1 = (1 + spam_rows.sum(axis=0)) / (vocab_size + spam_rows.sum())
    phi_k_y0 = (1 + ham_rows.sum(axis=0)) / (vocab_size + ham_rows.sum())
    return (phi_y, phi_k_y1, phi_k_y0)

In [6]:
def predict_from_naive_bayes_model(model, matrix):
    """Classify messages with a fitted Naive Bayes model.

    For every row of ``matrix`` the log-odds of the positive class are
    computed as the count-weighted sum of per-word log probability ratios
    plus the log prior odds. A row is predicted positive when its log-odds
    are greater than or equal to zero.

    Args:
        model: The ``(phi_y, phi_k_y1, phi_k_y0)`` tuple produced by
            fit_naive_bayes_model
        matrix: A numpy array containing word counts

    Returns: A numpy boolean array, True where the positive class is predicted
    """
    prior_spam, word_given_spam, word_given_ham = model

    # Working in log space keeps products of many small probabilities stable.
    word_log_ratio = np.log(word_given_spam) - np.log(word_given_ham)
    prior_log_odds = np.log(prior_spam / (1 - prior_spam))

    log_odds = matrix @ word_log_ratio + prior_log_odds
    return log_odds >= 0

In [7]:
def get_top_five_naive_bayes_words(model, dictionary):
    """Find the five words that most strongly indicate the spam (positive) class.

    A word's score is log(p(word | spam) / p(word | not spam)), taken from the
    per-word probabilities stored in the model. Words are returned from the
    highest score downwards.

    Args:
        model: The Naive Bayes model returned from fit_naive_bayes_model
        dictionary: A mapping of word to integer ids

    Returns: The top five most indicative words in sorted order with the most indicative first
    """
    _, spam_word_probs, ham_word_probs = model

    log_ratio = np.log(spam_word_probs) - np.log(ham_word_probs)

    # argsort is ascending, so sort the negated scores to rank high-to-low.
    ranked_indices = np.argsort(-log_ratio)

    # Invert word -> id into id -> word for the final lookup.
    index_to_word = dict(zip(dictionary.values(), dictionary.keys()))

    return [index_to_word[idx] for idx in ranked_indices[:5]]

In [8]:
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius that gives the highest validation accuracy.

    For every candidate radius an SVM is trained and evaluated through
    svm.train_and_predict_svm, and the fraction of validation predictions that
    equal ``val_labels`` is used as the score. The first radius reaching the
    highest score wins; if no radius scores above zero, the first candidate is
    returned.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy.
    """
    chosen_radius = radius_to_consider[0]
    top_score = .0

    for candidate in radius_to_consider:
        predictions = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        score = np.mean(predictions == val_labels)
        # Strict comparison: an equal score never displaces an earlier radius.
        if score > top_score:
            chosen_radius, top_score = candidate, score

    return chosen_radius
