In [3]:
# Importing the necessary Libraries
!pip install imbalanced-learn
!pip install transformers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from xgboost import XGBClassifier
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report



In [4]:
# Load the dataset
# The CSV is expected to provide the 'Category' and 'Message' columns
# that the later cells read.
DATA_PATH = 'data.csv'
data = pd.read_csv(DATA_PATH)

In [5]:
# Preprocessing
# Binary target: 1 where Category is exactly 'spam', 0 for every other value.
is_spam = data['Category'].eq('spam')
data['label'] = is_spam.astype('int64')

In [6]:
# Split the dataset into train and test
# 80/20 split with a fixed seed so the partition is the same on every run.
messages = data['Message'].values
labels = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Tokenize the input sequences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: PyTorch tensors,
# padding enabled, and truncation at 512 tokens.
encode_kwargs = dict(
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)
# NOTE: X_train / X_test are rebound from raw messages to tokenizer output here.
X_train = tokenizer(list(X_train), **encode_kwargs)
X_test = tokenizer(list(X_test), **encode_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Create PyTorch datasets
# Each dataset item is an (input_ids, attention_mask) pair, in that order.
tensor_keys = ('input_ids', 'attention_mask')
train_dataset = TensorDataset(*(X_train[key] for key in tensor_keys))
test_dataset = TensorDataset(*(X_test[key] for key in tensor_keys))

In [9]:
# Create data loaders
# Number of tokenized messages passed to the encoder per step; adjust as needed.
batch_size = 1024
# No shuffling: rows must stay in the same order as y_train / y_test.
train_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=batch_size)
    for dataset in (train_dataset, test_dataset)
)

In [10]:
# Load the BERT model
model = BertModel.from_pretrained('bert-base-uncased')
# Make inference mode explicit (disables dropout) instead of relying on
# the state the model happens to be loaded in.
model.eval()


def embed_batches(encoder, dataloader):
    """Run every batch of `dataloader` through `encoder` and stack the pooled outputs.

    Args:
        encoder: Model called as `encoder(input_ids=..., attention_mask=...)`
            whose result exposes a 'pooler_output' entry.
        dataloader: Iterable yielding `(input_ids, attention_mask)` batches.

    Returns:
        A tensor holding the concatenation (along dim 0) of each batch's
        'pooler_output', in the order the batches were yielded.
    """
    pooled_batches = []
    # Embeddings are used as fixed features, so no gradients are needed.
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
            pooled_batches.append(outputs['pooler_output'])
    return torch.cat(pooled_batches, dim=0)


# Process data in batches
# NOTE: X_train / X_test are rebound from tokenizer output to embedding tensors.
X_train = embed_batches(model, train_dataloader)
X_test = embed_batches(model, test_dataloader)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# XGB classifier
# XGBoost is given NumPy arrays, so convert the embedding tensors first.
train_features = X_train.numpy()
test_features = X_test.numpy()

classifier = XGBClassifier()  # library-default hyperparameters
classifier.fit(train_features, y_train)

# Evaluate the classifier
y_pred = classifier.predict(test_features)
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.96      0.91      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [12]:
# Feed Forward Neural Network

# Hyperparameters
# Network shape: input feature width and the width used by both hidden layers.
embedding_dim = 768
hidden_dim = 128
# Optimisation settings: number of passes over the training set and learning rate.
num_epochs = 30
learning_rate = 1e-3
# Mini-batch size for the classifier; this rebinds the batch_size that was
# set earlier for the BERT data loaders.
batch_size = 128

class FeedForwardNN(nn.Module):
    """Fully connected network with two ReLU hidden layers and one output unit.

    Args:
        embedding_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()
        self.l1 = nn.Linear(embedding_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one raw score per input row; no sigmoid is applied here."""
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        return self.l3(hidden)
# Build the classifier instance that the training and evaluation cells use.
FFNN = FeedForwardNN(embedding_dim=embedding_dim, hidden_dim=hidden_dim)

In [13]:
# Training the FFNN
# Pair every embedding row with its 0/1 label.
train_data = TensorDataset(X_train, torch.tensor(y_train))
test_data = TensorDataset(X_test, torch.tensor(y_test))

# Reshuffle the training rows each epoch so mini-batches differ between
# epochs; the seeded generator keeps the shuffle order repeatable.
# The test loader stays unshuffled so predictions line up with y_test.
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
test_loader = DataLoader(test_data, batch_size=batch_size)

# BCEWithLogitsLoss takes the network's raw scores, so no sigmoid is applied
# during training.
criterion = nn.BCEWithLogitsLoss()
# Use the learning_rate hyperparameter rather than a repeated literal.
optimizer = torch.optim.Adam(FFNN.parameters(), lr=learning_rate)

FFNN.train()  # make training mode explicit
for epoch in range(num_epochs):
  for inputs, labels in train_loader:
    optimizer.zero_grad()
    # Reshape to (batch, 1) floats to match the shape of the network output.
    targets = labels.view(-1, 1).float()
    outputs = FFNN(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

In [14]:
# Evaluating the FFNN
# Put the classifier itself (not the BERT encoder) into evaluation mode.
FFNN.eval()
y_pred = []

# Gradients are not needed for inference, so disable them for the whole loop.
with torch.no_grad():
  for inputs, _ in test_loader:
    probabilities = torch.sigmoid(FFNN(inputs))
    # Threshold at 0.5, then flatten the (batch, 1) column to plain 0/1 ints
    # so the predictions have the same 1-D integer form as y_test.
    predicted = (probabilities >= 0.5).reshape(-1).int()
    y_pred.extend(predicted.tolist())

print(classification_report(y_test, np.array(y_pred)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.95      0.97       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

