### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2025 Semester 1

## Assignment 1: Scam detection with naive Bayes


**Student ID(s):**     `1375531`


This iPython notebook is a template which you will use for your Assignment 1 submission.

**NOTE: YOU SHOULD ADD YOUR RESULTS, GRAPHS, AND FIGURES FROM YOUR OBSERVATIONS IN THIS FILE TO YOUR REPORT (the PDF file).** Results, figures, etc. which appear in this file but are NOT included in your report will not be marked.

**Adding proper comments to your code is MANDATORY.**

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Read data: each CSV under ./data is loaded into its own pandas DataFrame.
# NOTE(review): the third name is `unlabelled_def`, not `unlabelled_df` like
# its siblings -- probably a typo; left unchanged so later cells keep working.
train_df = pd.read_csv('./data/sms_supervised_train.csv')
test_df = pd.read_csv('./data/sms_test.csv')
unlabelled_def = pd.read_csv('./data/sms_unlabelled.csv') 

### Function definitions

In [7]:
def preprocess(data, column='textPreprocessed'):
    """
    Drops every row whose text column is missing, renumbers the remaining
    rows from 0, and splits each remaining text on whitespace.

    Args:
        - data: pandas DataFrame holding the messages
        - column: name of the text column to clean and tokenise

    Returns:
        - data: the cleaned pandas DataFrame (fresh 0..n-1 index)
        - tokenised_instances: a Series of lists of words, aligned with
          the cleaned DataFrame's index
    """
    # rows without text cannot be tokenised, so remove them first
    cleaned = data.dropna(subset=[column])
    cleaned = cleaned.reset_index(drop=True)

    token_lists = cleaned[column].map(lambda message: message.split())

    return cleaned, token_lists


def create_vocabulary(tokenised_instances):
    """
    Collects every distinct token that occurs in any of the tokenised
    instances.

    Args:
        - tokenised_instances: an iterable of token lists

    Returns:
        - vocabulary: a sorted list of unique words
    """
    # the set comprehension de-duplicates; sorting gives a stable word order
    unique_tokens = {
        token
        for instance in tokenised_instances
        for token in instance
    }
    return sorted(unique_tokens)


def create_count_matrix(tokenised_instances, vocabulary):
    """
    Creates a count matrix where the rows are the message instances, and the
    columns are all the unique vocabulary words. Each cell in the matrix
    represents the number of times a given word (matrix column) appeared in
    a given message (matrix row).

    Args:
        - tokenised_instances: pandas Series of token lists; its index
          becomes the row index of the result
        - vocabulary: list of words; becomes the column index of the result

    Returns:
        - count_matrix: pandas DataFrame of integer counts

    Raises:
        - KeyError: if an instance contains a token that is not in
          vocabulary
    """
    # map each word to its column position once, so lookups are O(1)
    column_of = {word: position for position, word in enumerate(vocabulary)}

    # Count by row *position* into a plain array and attach the index labels
    # afterwards. Writing through row labels using enumerate() positions is
    # only correct when the index happens to be 0..n-1.
    counts = np.zeros((len(tokenised_instances), len(vocabulary)), dtype=np.int64)
    for row, instance in enumerate(tokenised_instances):
        for token in instance:
            counts[row, column_of[token]] += 1

    return pd.DataFrame(counts, index=tokenised_instances.index, columns=vocabulary)


def compute_priors(data, class_column='class'):
    """
    Estimates the prior probability P(class=c) of every class c as the
    fraction of rows in data that carry that class label.

    Args:
        - data: pandas DataFrame containing the class labels
        - class_column: name of the label column

    Returns:
        - priors: a pandas Series indexed by class label, holding the
          prior probability of each class.
    """
    class_counts = data[class_column].value_counts()

    # dividing the whole Series at once scales every count by the row total
    return class_counts / len(data)

def compute_log_likelihoods(data, count_matrix, vocabulary, class_column='class', alpha=1):
    """
    Estimates P(word=w|class=c) for every vocabulary word w and every class
    c found in data, with additive (Laplace) smoothing controlled by alpha.
    The natural log of each probability is stored so that later products of
    many small probabilities can be computed as sums without underflow.

    Args:
        - data: pandas DataFrame with the class labels; its index must
          match the row index of count_matrix
        - count_matrix: pandas DataFrame of per-instance word counts
        - vocabulary: list of words to compute likelihoods for
        - class_column: name of the label column in data
        - alpha: smoothing constant added to every word count

    Returns:
        - log_likelihoods: a 2D dictionary, log_likelihoods[c][w], which
          stores all the log-likelihood values.
    """
    vocabulary_size = len(vocabulary)
    log_likelihoods = {}

    for label in data[class_column].unique():
        # rows of the count matrix that belong to this class
        rows_of_class = data[data[class_column] == label].index
        per_word_totals = count_matrix.loc[rows_of_class].sum()

        # smoothed denominator: all words seen in the class plus alpha
        # for every vocabulary entry
        denominator = per_word_totals.sum() + vocabulary_size * alpha

        log_likelihoods[label] = {
            word: np.log((per_word_totals[word] + alpha) / denominator)
            for word in vocabulary
        }

    return log_likelihoods

def create_count_vector(instance, vocabulary):
    """
    Builds the word-count vector of a single tokenised instance, i.e. one
    row of the count matrix: every vocabulary word starts at zero and is
    incremented once per occurrence in the instance.

    Args:
        - instance: list of tokens; every token must be in vocabulary
        - vocabulary: list of words used as the vector's index

    Returns:
        - count_vector: pandas Series indexed by vocabulary words, storing
          their respective word counts.
    """
    tally = pd.Series(0, index=vocabulary)

    for word in instance:
        # the read fails with KeyError for a word outside the vocabulary
        tally.at[word] = tally.at[word] + 1

    return tally

def compute_posteriors(count_vector, priors, log_likelihoods, vocabulary):
    """
    Scores one instance against every class: the log prior of the class
    plus, for each vocabulary word present in the instance, its count times
    the word's log-likelihood under that class.

    Args:
        - count_vector: word counts of the instance, indexed by word
        - priors: prior probability of each class, keyed by class label
        - log_likelihoods: 2D dictionary log_likelihoods[c][w]
        - vocabulary: list of words to score

    Returns:
        - posteriors: a dictionary which stores each class (key) and their
          respective log-posterior score (value).
    """
    scores = {}

    for label in priors.keys():
        # start from the class prior, then add the evidence word by word
        running_total = np.log(priors[label])

        for word in vocabulary:
            occurrences = count_vector[word]
            if not occurrences > 0:
                # words absent from the instance contribute nothing
                continue
            running_total += occurrences * log_likelihoods[label][word]

        scores[label] = running_total

    return scores


def train_naive_bayes(data):
    """
    Trains a multinomial Naive Bayes model by chaining the helper steps:
    clean and tokenise the data, build the vocabulary, then estimate the
    class priors and the per-class word log-likelihoods.

    Args:
        - data: pandas DataFrame of labelled training messages

    Returns:
        - priors: a pandas Series of the respective prior probabilities for
          each class.
        - log_likelihoods: a 2D dictionary which stores all the likelihood
          values.
        - vocabulary: a sorted list of unique words.
    """
    cleaned, token_lists = preprocess(data)
    vocabulary = create_vocabulary(token_lists)

    # priors need only the labels; likelihoods need the word counts
    priors = compute_priors(cleaned)
    word_counts = create_count_matrix(token_lists, vocabulary)
    log_likelihoods = compute_log_likelihoods(cleaned, word_counts, vocabulary)

    return priors, log_likelihoods, vocabulary


def predict_naive_bayes(data, priors, log_likelihoods, vocabulary):
    """
    Predict class labels for each instance in the test data using the
    multinomial Naive Bayes model.

    Args:
        - data: pandas DataFrame of messages to classify; it is passed
          through preprocess, so rows with missing text are dropped and
          receive no prediction
        - priors: prior probability of each class, keyed by class label
        - log_likelihoods: 2D dictionary log_likelihoods[c][w]
        - vocabulary: list of words known from training

    Returns:
        - predicted_values: a list of predicted class labels, one per row
          that survives preprocessing. The label can be NaN if an instance
          doesn't contain any words from the training data.
    """
    predicted_values = []
    data, tokenised_instances = preprocess(data)

    # build the lookup set once: membership tests on a list are O(len(vocabulary))
    known_words = set(vocabulary)

    for instance in tokenised_instances:
        # only keep tokens that were in the training set
        instance_cleaned = [token for token in instance if token in known_words]
        # skip instance entirely if it doesn't contain any words from the training set
        if not instance_cleaned:
            predicted_values.append(np.nan)
            continue

        # count only the known tokens; unseen tokens have no likelihood entry
        count_vector = create_count_vector(instance_cleaned, vocabulary)
        posteriors = compute_posteriors(count_vector, priors, log_likelihoods, vocabulary)

        # the class with the highest log-posterior wins
        predicted_class = max(posteriors, key=posteriors.get)
        predicted_values.append(predicted_class)

    return predicted_values


In [9]:
# Train on the supervised set. The returned (priors, log_likelihoods,
# vocabulary) tuple is not assigned to any name here.
train_naive_bayes(train_df)

hi


## 1. Supervised model training


## 2. Supervised model evaluation

## 3. Extending the model with semi-supervised training

## 4. Semi-supervised model evaluation