In [6]:
import collections

import numpy as np

import utils
import svm

In [7]:
def get_words(message):
    """split message into words, normalize them (lowercase), return list.
    
    Args:
        message: string containing SMS message
        
    Returns: 
        list of normalized words
    """
    # we split by spaces " "
    words = message.split(" ")
    
    normalized_words = [word.lower() for word in words]
    
    return normalized_words 

In [8]:
def create_dictionary(messages):
    """Create a dict mapping words to integer indices
    
    We'll only add words that occur in at least five messages 

    Args:
        messages (list of strings): contains SMS messages
    
    Returns: 
        python dict mapping words to int
    """
    
    word_counts = collections.Counter()
    for message in messages:
        words = get_words(message)
        unique_words = set(words)
        for word in unique_words:
            word_counts[word] += 1
    
    dictionary = {}
    index = 0
    for word,count in word_counts.items():
        if count >= 5:
            dictionary[word] = index
            index += 1
    
    print(f'dictionary shape is: {len(dictionary)} and its content is: {dictionary}')
    return dictionary

In [9]:
def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array

    Args:
        messages (list of strings): each string is an SMS message
        word_dictionary (dict): mapping words to integers
        
    Returns:
         a numpy array marking the words present in each message
         where the component (i, j) is the number of occurences of the j-th vocabulary word in the i-th message
    """
    
    matrix = np.zeros((len(messages) , len(word_dictionary)))
    for i, message in enumerate(messages):
        words = get_words(message)
        unique_words = set(words)
        for word in unique_words:
            if word in word_dictionary:
                matrix[i, word_dictionary[word]] += 1
    print(f'matrix shape is: {matrix.shape}')
    print(f'matrix content is: {matrix}')
    
    return matrix

In [10]:
class NaiveBayesClassifier:
    
    def __init__(self, alpha=1):
        self.alpha = alpha
        self.phi_y1 = None
        self.phi_y0 = None
        self.phi_k_y1 = None
        self.phi_k_y0 = None
        self.vocabulary = None
        
    def fit(self, matrix, labels):
        """Fit a naive bayes model

        Args:
            matrix (numpy array): contains words for counts for the training data
            labels (array): labels for that training data 0s or 1s
        
        Returns the train model.
        Should return the state of that model as a dictionary with the following keys:

            phi_{y=1} - the model parameter that matches p(y=1)
            phi_{y=0} - the model parameter that matches p(y=0)
            phi_{k|y=1} - the model parameter that matches p(x_j = k|y = 1) for any j
            phi_{k|y=0} - the model parameter that matches p(x_j = k|y = 0) for any j
        """
        
        m , n = matrix.shape
        
        # Calculate phi_y=1 and phi_y=0
        self.phi_y1 = np.log(np.mean(labels == 1))
        # Better to use the exp of the log probability to avoid numerical instability
        self.phi_y0 = np.log(1 - np.exp(self.phi_y1))
        
        # Laplace smoothing
        alpha = self.alpha
        
        # Calculate phi_k|y=1 and phi_k|y=0
        phi_k_y1_numerator = matrix[labels==1].sum(axis=0) + alpha
        phi_k_y1_denominator = np.sum(matrix[labels == 1]) + alpha*n
        
        phi_k_y0_numerator = matrix[labels==0].sum(axis=0) + alpha
        phi_k_y0_denominator = np.sum(matrix[labels == 0]) + alpha*n
        
        # Store parameters using log probabilities
        self.phi_k_y1 = np.log(phi_k_y1_numerator) - np.log(phi_k_y1_denominator)
        self.phi_k_y0 = np.log(phi_k_y0_numerator) - np.log(phi_k_y0_denominator)
        
    def predict(self, matrix):
        """Predict labels for a given data matrix using the trained model

        Args:
            matrix (np array): i rows of messages x j columns of vocabulary dictionary
            
        Returns predictions
        """
        log_prob_1 = matrix @ self.phi_k_y1 + self.phi_y1
        log_prob_0 = matrix @ self.phi_k_y0 + self.phi_y0
        predictions = (log_prob_1 > log_prob_0).astype(int)
        return predictions
    
    def get_top_five_naive_bayes_words(self, dictionary):
        """Compute the top five words that are most indicative of the spam class

        Args:
            model (model): naive bayes model from fit function
            dictionary (dict): mapping of word to integer ids
            
        Returns: a list of the top five most indicative words.
        """
        self.vocabulary = dictionary
        
        metric = self.phi_k_y1 - self.phi_k_y0
        
        # sort words by their indicative metric
        sorted_words = sorted(self.vocabulary.keys(),
                              key=lambda word: metric[dictionary[word]],
                              reverse=True)
        
        # Make a list of the top 5 words
        top_5_words = sorted_words[:5]
        
        return top_5_words

In [14]:
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute optimal SVM radius using train and eval datasets.

    Args:
        train_matrix (matrix): word counts for the training data
        train_labels (list): The spam or not spam labels for the training data
        val_matrix (matrix): word counts for the validation data
        val_labels (list): spam or not spam labels for val data
        radius_to_consider (list): radius values to consider
    """
    best_radius = None
    best_accuracy = 0
    
    for radius in radius_to_consider:
        predictions = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, radius)
        
        accuracy = np.mean(predictions == val_labels)
        
        if accuracy > best_accuracy:
            
            best_accuracy = accuracy
            best_radius = radius
    return best_radius
    

In [18]:
train_messages, train_labels = utils.load_spam_dataset('../data/spam_train.tsv')
val_messages, val_labels = utils.load_spam_dataset('../data/spam_val.tsv')
test_messages, test_labels = utils.load_spam_dataset('../data/spam_test.tsv')

dictionary = create_dictionary(train_messages)

print(f'Size of dictionary: {len(dictionary)}')

utils.write_json('spam_dictionary_(soln)', dictionary)

train_matrix = transform_text(train_messages, dictionary)
val_matrix = transform_text(val_messages, dictionary)
test_matrix = transform_text(test_messages, dictionary)

np.savetxt('spam_sample_train_matrix_(soln)', train_matrix[:100, :])

naive_bayes_model = NaiveBayesClassifier()

naive_bayes_model.fit(train_matrix, train_labels)

nb_predictions = naive_bayes_model.predict(test_matrix)

np.savetxt('spam_nayve_bayes_predictions_(soln)', nb_predictions)

naive_bayes_accuracy = np.mean(nb_predictions == test_labels)

print(f'Naive Bayes had an accuracy of {naive_bayes_accuracy} on the testing set')

top_5_words = naive_bayes_model.get_top_five_naive_bayes_words(dictionary)

print(f'The top 5 indicative words for NB are: {top_5_words}')

utils.write_json('spam_top_indicative_words_(soln)', top_5_words)

optimal_radius = compute_best_svm_radius(train_matrix, train_labels, test_matrix, test_labels, [0.01, 0.1, 1, 10])

utils.write_json('spam_optimal_radius_(soln)', optimal_radius)

print(f'The optimal SVM radius was {optimal_radius}')

svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)

svm_accuracy = np.mean(svm_predictions == test_labels)

print(f'The SVM model had an accuracy of {svm_accuracy}')

dictionary shape is: 1717 and its content is: {'down': 0, 'da': 1, 'fone': 2, 'on': 3, '4': 4, 'thanx': 5, 'the': 6, 'so': 7, 'what': 8, 'up': 9, 'to.': 10, "how's": 11, 'and': 12, 'you': 13, 'how': 14, 'are': 15, 'john': 16, 'then': 17, 'father': 18, 'name': 19, 'of': 20, 'is': 21, 'see': 22, 'u': 23, 'in': 24, 'there,': 25, 'a': 26, 'almost': 27, 'study': 28, 'yes': 29, 'all': 30, 'can': 31, 'baby!': 32, 'we': 33, 'to': 34, 'great!': 35, 'any': 36, 'use': 37, 'dont': 38, 'it': 39, 'be': 40, 'now.': 41, 'shall': 42, 'have': 43, 'will': 44, 'i': 45, 'fine.': 46, 'world': 47, 'lot': 48, 'dis': 49, 'n': 50, 'with': 51, 'which': 52, 'thing': 53, 'only': 54, 'this': 55, 'number': 56, 'my': 57, 'by': 58, 'good': 59, 'wish': 60, 'd': 61, 'hi': 62, 'get': 63, 'soon': 64, 'me': 65, 'give': 66, 'well': 67, 'call': 68, 'r': 69, 'hope': 70, 'again': 71, 'long': 72, 'time': 73, 'del': 74, 'from': 75, 'u?': 76, 'hey': 77, 'an': 78, 'no': 79, 'tell': 80, 'your': 81, '@': 82, 'plan': 83, 'valentines'

ValueError: operands could not be broadcast together with shapes (557,) (558,) 