In [None]:
from IPython.display import clear_output
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import spacy
import re
import string
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import tqdm

import warnings
warnings.filterwarnings('ignore')


# Fetch the UCI SMS Spam Collection archive, unpack it, and delete the readme and the archive.
# NOTE(review): the unzip/rm paths assume the notebook's working directory is /content — confirm.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip /content/smsspamcollection.zip
!rm /content/readme
!rm /content/smsspamcollection.zip

# Clear the shell output printed by the commands above from the cell.
clear_output()

# Fetch the GloVe 6B archive and unpack it, then delete the archive and the
# 100d/200d/300d vector files (the code further down loads glove.6B.50d.txt).
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip /content/glove.6B.zip
!rm -rf /content/glove.6B.zip
!rm /content/glove.6B.100d.txt
!rm /content/glove.6B.200d.txt
!rm /content/glove.6B.300d.txt

# Clear the shell output printed by the commands above from the cell.
clear_output()

# Parallel lists: message bodies and their binary labels (1 = spam, 0 = anything else).
text = []
label = []

# Each record is "<tag>\t<message>". The encoding is given explicitly so that
# decoding does not depend on the platform's default locale.
with open("/content/SMSSpamCollection", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Split once at the first tab; a tab inside the message stays in the message.
        tag, separator, message = line.partition("\t")
        if not separator:
            # Blank or malformed line (no tab): skip it so `text` and `label`
            # stay aligned instead of failing with IndexError.
            continue
        label.append(1 if tag == "spam" else 0)
        text.append(message)

# One row per SMS, plus the message length in characters.
sms = pd.DataFrame({"Text": text, "Label": label})
sms['Text_Length'] = sms['Text'].apply(len)


# Load spaCy's 'en_core_web_sm' pipeline once; `tokenize` calls it on every message.
spacy_tokenizer = spacy.load('en_core_web_sm')

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to `tokenize`.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def tokenize(text):
    """Turn a raw message into a list of lower-cased tokens.

    Steps, in order:
      1. Drop every non-ASCII character.
      2. Delete all characters in ``string.punctuation``.
      3. Tokenize with the module-level ``spacy_tokenizer``.
      4. Discard stop words and whitespace-only tokens, lower-case the rest.

    Args:
        text: The message as a ``str``.

    Returns:
        A list of lower-cased token strings.
    """
    text = text.encode('ascii', errors='ignore').decode('ascii')
    text = text.translate(_PUNCTUATION_TABLE)
    # Deleting punctuation can leave runs of spaces (e.g. "a - b" -> "a  b"),
    # which spaCy emits as whitespace tokens. They carry no meaning and would
    # otherwise take up positions in the fixed-length embedding window.
    return [
        token.text.lower()
        for token in spacy_tokenizer(text)
        if not (token.is_stop or token.is_space)
    ]


# Tokenize every message; the result is stored as a list of tokens per row.
sms["Tokenized_Text"] = sms["Text"].apply(tokenize)

def load_GloVe_embeddings(glove_file):
    """Read a whitespace-separated embeddings text file into a dictionary.

    Every non-empty line is expected to hold a word followed by its vector
    components. Blank lines are skipped.

    Args:
        glove_file: Path of the embeddings text file.

    Returns:
        A ``dict`` mapping each word (``str``) to a 1-D ``float32`` numpy
        array holding the remaining fields of its line.
    """
    embeddings = {}
    # Decode explicitly as UTF-8 so that reading does not depend on the
    # platform's default locale encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file).
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype="float32")
    return embeddings

# Load the 50-dimensional GloVe vectors into a {word: vector} dictionary.
glove_embeddings = load_GloVe_embeddings('/content/glove.6B.50d.txt')

def embed_text(tokenized_text, word_embeddings, max_text_length=20, embedding_size=50):
    """Build a fixed-size embedding matrix for one tokenized message.

    Only the first ``max_text_length`` tokens are considered. Row ``i`` of
    the result holds the vector of token ``i`` when that token is a key of
    ``word_embeddings``; rows for unknown tokens, and rows beyond the end of
    the message, stay all-zero.

    Args:
        tokenized_text: Sequence of token strings.
        word_embeddings: Mapping from token to its vector.
        max_text_length: Number of rows in the result.
        embedding_size: Number of columns in the result.

    Returns:
        A numpy array of shape ``(max_text_length, embedding_size)``.
    """
    matrix = np.zeros((max_text_length, embedding_size))
    window = tokenized_text[:max_text_length]
    for row, word in enumerate(window):
        if word not in word_embeddings:
            # Unknown word: leave its row as zeros.
            continue
        matrix[row] = word_embeddings[word]
    return matrix

# Embed every tokenized message with the default window (20 tokens x 50 dimensions).
sms["Embedded_Text"] = sms["Tokenized_Text"].apply(lambda x: embed_text(x, glove_embeddings))

class load_dataset(Dataset):
    """In-memory dataset pairing feature tensors with integer labels.

    Args:
        X: Features; anything ``numpy.asarray`` can turn into one numeric
            array (for example a list of equally shaped ndarrays).
        Y: Labels; anything ``numpy.asarray`` can turn into an integer array
            (for example a list or a pandas Series).

    Indexing with ``idx`` returns the tuple ``(X[idx], y[idx])`` where the
    features are ``torch.float32`` and the label is ``torch.long``.
    """

    def __init__(self, X, Y):
        # Go through a single ndarray first. torch.tensor() on a list of
        # ndarrays converts element by element and is very slow. For labels,
        # numpy takes the values positionally, independent of any pandas index.
        self.X = torch.tensor(np.asarray(X, dtype=np.float32), dtype=torch.float32)
        self.y = torch.tensor(np.asarray(Y), dtype=torch.long)

    def __len__(self):
        """Return the number of samples (length of the label tensor)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

class RNN(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        output_size: Number of output values produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # Keep the configuration on the instance so forward() does not depend
        # on module-level variables that happen to share these names.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)`` (the RNN is
                built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Zero initial hidden state, created with the input's dtype and device.
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            dtype=x.dtype,
            device=x.device,
        )
        out, _ = self.rnn(x, h0)
        # Only the output at the final time step feeds the linear layer.
        return self.fc(out[:, -1, :])

def train_model(num_epochs, train_loader, model, criterion, optimizer):
    """Train ``model`` and print the loss and accuracy after every epoch.

    Args:
        num_epochs: Number of passes over ``train_loader``.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        model: The ``nn.Module`` to train; batches are moved to the device
            of its first parameter (CPU if it has no parameters).
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer over the model's parameters.

    Returns:
        None. Progress is reported with ``print``. The printed loss is the
        sum of the per-batch losses for the epoch, not their mean.
    """
    # Follow the model's own device instead of relying on a module-level
    # `device` variable being defined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Predicted class = index of the largest output per sample.
            predicted = outputs.detach().argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # An empty loader yields no samples; report 0% instead of dividing by zero.
        accuracy = 100 * correct / total if total else 0.0
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Hold out 20% of the messages; random_state=42 fixes the shuffle.
# NOTE(review): X_test / y_test are not used anywhere in the visible code, so no
# held-out evaluation is run — confirm whether that happens elsewhere.
X_train, X_test, y_train, y_test = train_test_split(sms["Embedded_Text"].tolist(), sms["Label"], test_size=0.2, random_state=42)

# Wrap the training split in a Dataset and serve it in shuffled batches of 32.
train_dataset = load_dataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Hyperparameters.
# input_size matches embed_text's default embedding_size of 50.
# output_size is 2 because labels are 0 or 1.
input_size = 50
hidden_size = 128
num_layers = 2
output_size = 2
num_epochs = 10
learning_rate = 0.001

# Use the GPU when one is available; build the model, loss, and Adam optimizer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train; per-epoch loss and training accuracy are printed by train_model.
train_model(num_epochs, train_loader, model, criterion, optimizer)

# Persist only the learned parameters (state_dict), not the whole module object.
torch.save(model.state_dict(), 'rnn_model.pth')


Epoch [1/10], Loss: 29.1813, Accuracy: 91.99%
Epoch [2/10], Loss: 20.1410, Accuracy: 95.63%
Epoch [3/10], Loss: 21.9633, Accuracy: 94.98%
Epoch [4/10], Loss: 18.1557, Accuracy: 96.01%
Epoch [5/10], Loss: 17.7751, Accuracy: 96.14%
Epoch [6/10], Loss: 17.2675, Accuracy: 96.32%
Epoch [7/10], Loss: 17.0080, Accuracy: 96.48%
Epoch [8/10], Loss: 16.7840, Accuracy: 96.59%
Epoch [9/10], Loss: 17.0208, Accuracy: 96.41%
Epoch [10/10], Loss: 15.4770, Accuracy: 96.66%
