### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2025 Semester 1

## Assignment 1: Scam detection with Naive Bayes


**Student ID(s):**     1462539


## 0. Set-up

Let us load data from `sms_supervised_train.csv`

In [72]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from typing import List, Set, Dict, Tuple

# Load the labelled training messages into a DataFrame.
# Later cells read its 'textPreprocessed' and 'class' columns.
labeled_train = pd.read_csv('data/sms_supervised_train.csv')

Now that we have loaded our data, let us delete the rows where `textPreprocessed` is empty by calling `DataFrame.dropna`. These instances do not have any features we can learn from. We can then tokenise (split) the already preprocessed text for the next stage.

In [73]:
# Discard messages with no preprocessed text, then split the remaining
# messages on whitespace to obtain one token list per row.
labeled_train.dropna(subset=['textPreprocessed'], inplace=True)
labeled_train['tokens'] = labeled_train['textPreprocessed'].map(str.split)

Let us define the **vocabulary**: the set of every distinct word that occurs in the training data set.

In [88]:
# Collect every distinct token that appears in any training message.
vocabulary = {token for message_tokens in labeled_train['tokens'] for token in message_tokens}

Define the **count** matrix (Bag-of-Words feature matrix). Since the dataset is already preprocessed, the vocabulary is supplied directly to `CountVectorizer` via `vocabulary=vocabulary` instead of being learned with `fit()`, so that no vocabulary entries are pruned during fitting and the feature columns are exactly the training tokens.

In [90]:
# Build the Bag-of-Words count matrix over a fixed vocabulary.
#
# * sorted(vocabulary) makes the column order explicit and reproducible:
#   column i of X counts the i-th token in alphabetical order.
# * analyzer=str.split tokenises each message on whitespace, exactly as the
#   'tokens' column (and therefore the vocabulary) was built. The default
#   analyzer lower-cases the text and only keeps tokens of two or more
#   alphanumeric characters, so vocabulary entries such as one-letter words
#   would otherwise never be counted.
vectorizer: CountVectorizer = CountVectorizer(vocabulary=sorted(vocabulary), analyzer=str.split)

# transform() returns a SciPy sparse matrix of shape (n_messages, n_vocabulary),
# not a dense ndarray; calc_likelihood handles it via np.asarray(...).
X = vectorizer.transform(labeled_train['textPreprocessed'])

# Class label of every training message, aligned row-for-row with X.
y: np.ndarray = labeled_train['class'].values

## 1. Supervised model training


We now compute the prior probability of each class, $P(C)$. The function returns a dictionary mapping each class label to its prior probability, $\{C: P(C)\}$.

In [91]:
def calc_prior(data: pd.DataFrame, label_col: str = 'class') -> Dict[int, float]:
    """Estimate the prior probability P(C) of every class.

    Args:
        data: Frame holding one row per labelled instance.
        label_col: Name of the column that stores the class label.

    Returns:
        A dictionary mapping each class label found in ``label_col`` to its
        relative frequency in ``data``.
    """
    counts_per_class = data[label_col].value_counts()
    # Relative frequency = class count / number of labelled rows.
    relative_freq = counts_per_class.div(counts_per_class.sum())
    return relative_freq.to_dict()

The next set of probabilities we need to calculate are the conditional probabilities. We need to know the likelihood of each feature value given a specific label.

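For example, with the vocabulary `[hello, win, now]`, suppose a class contains two messages:

message1: [1, 0, 2]   → "hello now now"

message2: [0, 1, 1]   → "win now"

Summing the count vectors column-wise gives the per-class word counts:

[1 + 0, 0 + 1, 2 + 1] = [1, 1, 3]

Each `word_counts[i]` is exactly $count_{c,i}$ in the smoothed estimate below, where $total_c = \sum_i count_{c,i}$ is the total number of tokens in class $c$, $V$ is the vocabulary size and $\alpha$ is the smoothing parameter: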

$$
p_{c,i} = \frac{count_{c,i} + \alpha}{total_c + V \alpha}
$$




In [92]:
def calc_likelihood(X: np.ndarray, y: np.ndarray, alpha: float = 1.0) -> Dict[int, np.ndarray]:
    """Estimate the smoothed word likelihoods P(w_i | c) for every class.

    For each class ``c`` the count of every feature is summed over the rows
    labelled ``c`` and smoothed as
    ``(count_ci + alpha) / (total_c + alpha * V)``, where ``V`` is the number
    of columns of ``X``.

    Args:
        X: Count matrix with one row per instance and one column per word.
        y: Class label of every row of ``X``.
        alpha: Additive smoothing constant.

    Returns:
        A dictionary mapping each distinct label in ``y`` to a 1-D array with
        one probability per column of ``X``.
    """
    n_features = X.shape[1]
    per_class = {}

    for label in np.unique(y):
        member_rows = np.flatnonzero(y == label)
        # np.asarray(...).ravel() turns the column sums into a flat 1-D array
        # whatever array-like type the sum returns.
        counts = np.asarray(X[member_rows].sum(axis=0)).ravel()
        per_class[label] = (counts + alpha) / (counts.sum() + alpha * n_features)

    return per_class

We are ready to train our model, but before that, let us create functions to output the most probable words in each class and also the most predictive words, ranked by the likelihood ratio

$$
\text{Predictiveness} = \frac{P(w \mid \text{scam})}{P(w \mid \text{non-malicious})}
$$

The ratio tells us which words occur much more often in one class than in the other.

In [93]:
def get_top_words(likelihood, vocab, class_label, top_n=10):
    """Return the ``top_n`` most probable words of one class.

    Args:
        likelihood: Mapping from class label to an array of word probabilities.
        vocab: Indexable sequence of words; ``vocab[i]`` must be the word whose
            probability is stored at position ``i`` of each likelihood array.
        class_label: Key of the class to inspect.
        top_n: Maximum number of words to return.

    Returns:
        A list of ``(word, probability)`` pairs in descending probability order.
    """
    probs = likelihood[class_label]
    # Negating the array makes argsort rank the largest probabilities first.
    ranked = np.argsort(-probs)[:top_n]
    return [(vocab[idx], probs[idx]) for idx in ranked]

In [94]:
def get_predictive_words(likelihoods, vocab, top_n=10):
    """Return the words most predictive of class 1 and of class 0.

    Predictiveness is the ratio of a word's likelihood under one class to its
    likelihood under the other class.

    Args:
        likelihoods: Mapping holding word-probability arrays under keys 0 and 1.
        vocab: Indexable sequence of words; ``vocab[i]`` must be the word whose
            probability is stored at position ``i`` of each likelihood array.
        top_n: Maximum number of words to return per class.

    Returns:
        A pair ``(top_for_class_1, top_for_class_0)``; each element is a list
        of ``(word, ratio)`` pairs in descending ratio order.
    """
    class_1, class_0 = likelihoods[1], likelihoods[0]

    def _rank(ratio):
        # Negating the ratios makes argsort rank the largest values first.
        best = np.argsort(-ratio)[:top_n]
        return [(vocab[idx], ratio[idx]) for idx in best]

    return _rank(class_1 / class_0), _rank(class_0 / class_1)

In [95]:
# Fit the supervised Naive Bayes parameters on the labelled training set:
# class priors P(C), and word likelihoods P(w | C) with calc_likelihood's
# default smoothing (alpha = 1.0).
priors = calc_prior(labeled_train)
likelihoods = calc_likelihood(X, y)

# Optional report of the priors (left disabled):
# print("Prior probabilities:")
# for c, p in priors.items():
#     print(f"Class {c} ({'scam' if c==1 else 'non-malicious'}): {p:.4f}")


Compute (and optionally print) the most probable and the most predictive words for each class.

In [96]:
# Entry i of every likelihood vector belongs to the vectorizer's i-th feature
# name. list(vocabulary) iterates the set in arbitrary order, which does not
# match the vectorizer's column order and would pair each probability with the
# wrong word, so the names are taken from the vectorizer itself.
feature_names = vectorizer.get_feature_names_out()
top_words_0 = get_top_words(likelihoods, feature_names, 0)
top_words_1 = get_top_words(likelihoods, feature_names, 1)

# print("Top 10 most probable words in non-malicious class:")
# for word, prob in top_words_0:
#     print(f"{word}: {prob:.4f}")

# print("\nTop 10 most probable words in scam class:")
# for word, prob in top_words_1:
#     print(f"{word}: {prob:.4f}")

In [97]:
# Use the vectorizer's own feature names so that index i of each likelihood
# vector is paired with the word that column i actually counts;
# list(vocabulary) iterates the set in arbitrary order and would mislabel
# the ratios.
top_predictive_1, top_predictive_0 = get_predictive_words(
    likelihoods, vectorizer.get_feature_names_out()
)

# print("Top 10 most predictive words for scam class (P(w|1) / P(w|0)):")
# for word, ratio in top_predictive_1:
#     print(f"{word}: {ratio:.2f}")

# print("\nTop 10 most predictive words for non-malicious class (P(w|0) / P(w|1)):")
# for word, ratio in top_predictive_0:
#     print(f"{word}: {ratio:.2f}")

## 2. Supervised model evaluation

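$$P(c \mid doc) \propto P(c)\,P(doc \mid c)$$
$$\log{P(c \mid doc)} = \log{P(c)} + \sum_{i}{x_i \cdot \log{P(w_i \mid c)}} + \text{const}$$

where $x_i$ is the count of word $w_i$ in the document and the constant is the same for every class, so it can be ignored when comparing classes.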

In [98]:
def calc_posterior(counts: np.ndarray, priors: Dict[int, float], likelihoods: Dict[int, np.ndarray]) -> Dict[int, float]:
    """Compute the unnormalised log-posterior score of every class.

    The score of class ``c`` is ``log P(c) + sum_i counts[i] * log P(w_i | c)``.

    Args:
        counts: 1-D count vector of a single message.
        priors: Mapping from class label to prior probability.
        likelihoods: Mapping from class label to its word-probability array.

    Returns:
        A dictionary with one score per key of ``priors``, in the same order.
    """
    return {
        label: np.log(prior) + np.dot(counts, np.log(likelihoods[label]))
        for label, prior in priors.items()
    }

In [100]:
def predict_batch(texts: List[str], priors: Dict[int, float], likelihoods: Dict[int, np.ndarray], 
                  vectorizer: CountVectorizer) -> Tuple[np.ndarray, np.ndarray]:
    """Classify every message and report how strongly class 1 is favoured.

    Args:
        texts: Messages to classify.
        priors: Mapping from class label to prior probability.
        likelihoods: Mapping from class label to its word-probability array.
        vectorizer: Vectorizer used to turn a message into a count vector.

    Returns:
        A pair ``(predictions, confidence_ratios)`` of arrays with one entry
        per message: the label with the highest log-posterior score, and
        ``exp(score[1] - score[0])``, i.e. the posterior ratio of class 1 to
        class 0.
    """
    labels: List[int] = []
    ratios: List[float] = []

    for message in texts:
        # Dense 1-D count vector for this single message.
        count_vector: np.ndarray = vectorizer.transform([message]).toarray().ravel()
        log_scores: Dict[int, float] = calc_posterior(count_vector, priors, likelihoods)
        # Predicted class = key with the largest log-posterior score.
        labels.append(max(log_scores, key=log_scores.get))
        # A difference of log scores is the log of the posterior ratio.
        ratios.append(np.exp(log_scores[1] - log_scores[0]))

    return np.array(labels), np.array(ratios)

Now we load the test set and predict a class label and a confidence ratio for every test message.

In [101]:
# Load the held-out test set and drop messages that have no preprocessed text.
test_df = pd.read_csv('data/sms_test.csv')
test_df.dropna(subset=['textPreprocessed'], inplace=True)

# Ground-truth labels and raw texts, aligned row-for-row.
true_labels = test_df['class'].values
test_texts = test_df['textPreprocessed'].tolist()

# Classify every test message with the fitted priors and likelihoods.
predicted_labels, confidence_ratios = predict_batch(
    texts=test_texts,
    priors=priors,
    likelihoods=likelihoods,
    vectorizer=vectorizer,
)

I'll just use the `sklearn` library to evaluate my Naive Bayes classifier.

In [102]:
# Score the predictions against the ground truth with each scalar metric in turn.
acc, prec, rec, f1 = (
    metric(true_labels, predicted_labels)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# Confusion matrix of true labels against predicted labels.
conf_matrix = confusion_matrix(true_labels, predicted_labels)

In [103]:
# print(f"Accuracy:  {acc:.4f}")
# print(f"Precision: {prec:.4f}")
# print(f"Recall:    {rec:.4f}")
# print(f"F1 Score:  {f1:.4f}")
# print("\nConfusion Matrix:")

# # Plot it
# plt.figure(figsize=(6, 5))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
#             xticklabels=["Non-Malicious", "Scam"],
#             yticklabels=["Non-Malicious", "Scam"])
# plt.xlabel("Predicted Label")
# plt.ylabel("True Label")
# plt.tight_layout()
# plt.show()

In [105]:
# Count test messages affected by out-of-vocabulary (OOV) tokens:
#   skipped_messages - none of the message's tokens is in the vocabulary
#   oov_messages     - some, but not all, of its tokens are in the vocabulary
skipped_messages = 0
oov_messages = 0

for text in test_texts:
    tokens = text.split()
    tokens_in_vocab = [tok for tok in tokens if tok in vocabulary]
    n_known = len(tokens_in_vocab)
    if n_known == 0:
        skipped_messages += 1
    elif n_known < len(tokens):
        oov_messages += 1

print(f"OOV Messages (some tokens missing): {oov_messages}")
print(f"Skipped Messages (no tokens in vocab): {skipped_messages}")

OOV Messages (some tokens missing): 142
Skipped Messages (no tokens in vocab): 0


In [107]:
# Attach the model output to the test frame so individual messages can be inspected.
test_df = test_df.assign(prediction=predicted_labels, confidence_ratio=confidence_ratios)

# print("--- High Confidence Scam Predictions ---")
# high_conf_scam = test_df[test_df['prediction'] == 1].sort_values(by='confidence_ratio', ascending=False).head(3)
# print(high_conf_scam[['textPreprocessed', 'confidence_ratio']])

# print("\n--- High Confidence Non-Malicious Predictions ---")
# high_conf_nonmal = test_df[test_df['prediction'] == 0].sort_values(by='confidence_ratio', ascending=True).head(5)
# print(high_conf_nonmal[['textPreprocessed', 'confidence_ratio']])

# print("\n--- Boundary Cases (Confidence Ratio ≈ 1) ---")
# boundary = test_df[(test_df['confidence_ratio'] > 0.9) & (test_df['confidence_ratio'] < 1.1)].head(3)
# print(boundary[['textPreprocessed', 'confidence_ratio']])

## 3. Extending the model with semi-supervised training

## 4. Supervised model evaluation