In [22]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Load dataset

In [23]:
data = pd.read_csv('sms+spam+collection/SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])


print(data.head())
print(data['label'].value_counts())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64


# Clean and tokenize the messages

In [24]:
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  
    words = text.split() 
    return words

data['processed_message'] = data['message'].apply(preprocess_text)
print(data.head())

  label                                            message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                   processed_message  
0  [go, until, jurong, point, crazy, available, o...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, in, 2, a, wkly, comp, to, win, f...  
3  [u, dun, say, so, early, hor, u, c, already, t...  
4  [nah, i, dont, think, he, goes, to, usf, he, l...  


In [25]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Separate features and labels for training and testing
train_messages = train_data['message']
train_labels = train_data['label']
test_messages = test_data['message']
test_labels = test_data['label']


In [26]:
# Calculate priors
total_messages = len(train_data)
spam_messages = len(train_data[train_data['label'] == 'spam'])
ham_messages = len(train_data[train_data['label'] == 'ham'])

prior_spam = spam_messages / total_messages
prior_ham = ham_messages / total_messages

print(f"P(spam): {prior_spam}, P(ham): {prior_ham}")


P(spam): 0.1341025641025641, P(ham): 0.865897435897436


In [27]:
# Get word frequencies for spam and ham
spam_words = train_data[train_data['label'] == 'spam']['processed_message'].explode()
ham_words = train_data[train_data['label'] == 'ham']['processed_message'].explode()

spam_word_counts = spam_words.value_counts()
ham_word_counts = ham_words.value_counts()

# Vocabulary size and total words
vocab_size = len(set(spam_words).union(set(ham_words)))
total_spam_words = spam_words.count()
total_ham_words = ham_words.count()

# Likelihood calculation with Laplace smoothing
def likelihood(word, word_counts, total_words, alpha=1):
    return (word_counts.get(word, 0) + alpha) / (total_words + alpha * vocab_size)


In [28]:
# Compute posterior probabilities and classify
def classify_message(message):
    words = preprocess_text(message)
    spam_prob = np.log(prior_spam)
    ham_prob = np.log(prior_ham)
    
    for word in words:
        spam_prob += np.log(likelihood(word, spam_word_counts, total_spam_words))
        ham_prob += np.log(likelihood(word, ham_word_counts, total_ham_words))
    
    return 'spam' if spam_prob > ham_prob else 'ham'

# Apply classification
train_data['predicted_label'] = train_data['message'].apply(classify_message)


# Calculate metrics

In [30]:
# Classify messages in the test set
test_data['predicted_label'] = test_data['message'].apply(classify_message)

# Evaluate the model
accuracy = accuracy_score(test_labels, test_data['predicted_label'])
precision = precision_score(test_labels, test_data['predicted_label'], pos_label='spam')
recall = recall_score(test_labels, test_data['predicted_label'], pos_label='spam')
f1 = f1_score(test_labels, test_data['predicted_label'], pos_label='spam')
conf_matrix = confusion_matrix(test_labels, test_data['predicted_label'])

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")


Accuracy: 0.9778708133971292
Precision: 0.8945147679324894
Recall: 0.9464285714285714
F1 Score: 0.9197396963123644
Confusion Matrix:
[[1423   25]
 [  12  212]]
