### Spam Detection using Pytorch

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter

In [2]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [3]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
import torch.optim as optim


In [4]:
df = pd.read_csv('spam_or_ham.csv',encoding= 'latin1')
df

Unnamed: 0,mail type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
df['mail type'].value_counts()

ham     4825
spam     747
Name: mail type, dtype: int64

In [6]:
#Encode the labels
le = LabelEncoder()
df['mail type'] = le.fit_transform(df['mail type'])

## Text Preprocessing

In [7]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [8]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]+', ' ',text) #to remove punctuation
    text = re.sub(r'\d+(\.\d+)?', ' ', text)#to remove numbers
    text = ' '.join([word for word in text.split() if word not in stop_words])#to remove stopwords
    text = ' '.join([lemmatizer.lemmatize(y,'v') for y in text.split()]) #to lemmatize
    text = ' '.join([lemmatizer.lemmatize(y,'n') for y in text.split()]) #to lemmatize
    return text

In [9]:
df['cleaned_message'] = df['message'].apply(preprocess_text)

In [10]:
df.head()

Unnamed: 0,mail type,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think go usf live around though


In [11]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["cleaned_message"], df["mail type"], 
                                                                      test_size=0.2, random_state=42,
                                                                      stratify=df['mail type'])

In [12]:
tokenizer = get_tokenizer("basic_english")


In [13]:
# Build vocabulary
word_counts = Counter([word for text in train_texts for word in text.split(' ') ])
vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counts.most_common())}  # Start indexing from 1

# Convert text to sequence of indices
def text_to_sequence(text):
    tokens = word_tokenize(text)
    seq = [vocab.get(word, 0) for word in tokens]
    return  seq 

train_sequences = [text_to_sequence(text) for text in train_texts]
test_sequences = [text_to_sequence(text) for text in test_texts]


In [14]:
len(train_sequences), len(test_sequences)

(4457, 1115)

In [15]:
MAX_LEN = 80  # Truncate or pad sequences to this length

def pad_sequences(sequences, max_len=MAX_LEN):
    """This function takes a sequence and either truncates it or pads it with zeros 
    to make it the same length as the max_len"""
    return [seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in sequences]

train_sequences = pad_sequences(train_sequences)
test_sequences = pad_sequences(test_sequences)

# Convert the sequence to tensors
train_tensor = torch.tensor(train_sequences, dtype=torch.long)
test_tensor = torch.tensor(test_sequences, dtype=torch.long)
train_labels_tensor = torch.tensor(train_labels.values, dtype=torch.float32)
test_labels_tensor = torch.tensor(test_labels.values, dtype=torch.float32)

### Datasets and Dataloaders

In [16]:
# Create Custom pytorch Dataset
class MessageTextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create datasets
train_dataset = MessageTextDataset(train_tensor, train_labels_tensor)
test_dataset = MessageTextDataset(test_tensor, test_labels_tensor)

# Create dataloaders
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


### Spam classifier model using RNN

In [17]:
#Spam classifier model using RNN

class SimpleRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32, hidden_dim1=32, hidden_dim2=64, sequence_length=80):
        super(SimpleRNNModel, self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        
        self.rnn1 = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim1, batch_first=True)
        self.dropout1 = nn.Dropout(0.2) 

        self.rnn2 = nn.RNN(input_size=hidden_dim1, hidden_size=hidden_dim2, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)

        self.fc = nn.Linear(hidden_dim2 * sequence_length, 1)  
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x)
        
        x, _ = self.rnn1(x)
        x = self.dropout1(x)
        
        x, _ = self.rnn2(x)
        x = self.dropout2(x)

        x = x.reshape(x.shape[0], -1)  # Flatten
        x = self.fc(x)
        x = self.sigmoid(x)

        return x

vocab_size = len(vocab) + 1 
sequence_length = 80
model = SimpleRNNModel(vocab_size=vocab_size, sequence_length=sequence_length)

print(model)


SimpleRNNModel(
  (embedding): Embedding(5706, 32)
  (rnn1): RNN(32, 32, batch_first=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (rnn2): RNN(32, 64, batch_first=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=5120, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


## Model Training

In [18]:
# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
EPOCHS = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        labels = labels.view(-1, 1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_loader):.4f}")


Epoch 1, Loss: 0.2262
Epoch 2, Loss: 0.1171
Epoch 3, Loss: 0.0758
Epoch 4, Loss: 0.0578
Epoch 5, Loss: 0.0374
Epoch 6, Loss: 0.0310
Epoch 7, Loss: 0.0253
Epoch 8, Loss: 0.0174
Epoch 9, Loss: 0.0151
Epoch 10, Loss: 0.0096
Epoch 11, Loss: 0.0088
Epoch 12, Loss: 0.0086
Epoch 13, Loss: 0.0029
Epoch 14, Loss: 0.0014
Epoch 15, Loss: 0.0018
Epoch 16, Loss: 0.0044
Epoch 17, Loss: 0.0023
Epoch 18, Loss: 0.0041
Epoch 19, Loss: 0.0004
Epoch 20, Loss: 0.0001


## Model Evaluation

In [19]:
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        preds = model(texts).squeeze().round() 
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.9812
Precision: 0.9638
Recall: 0.8926
F1 Score: 0.9268


In [20]:
print(classification_report(all_labels, all_preds))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.99       966
         1.0       0.96      0.89      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

