# Classify Tweets using BERT
Author: Yoaz Menda

## Step 1: Import Necessary Libraries

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch.utils.data import Dataset, DataLoader

## Step 2: Load the Dataset

In [47]:
# Read the tweet dataset from disk into a DataFrame; later cells use its
# 'tweet' (text) and 'topic' (class name) columns.
df = pd.read_csv(filepath_or_buffer='../data/SportPolitics.csv')

## Step 3: Preprocessing
With BERT, preprocessing mainly involves tokenizing the text, which converts the raw text into a format that the model can understand (input IDs and attention masks).

In [48]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def preprocess_data(text):
    """Tokenize ``text`` with the module-level BERT ``tokenizer``.

    The text is truncated to at most 256 tokens and the encoding is requested
    as PyTorch tensors (``return_tensors='pt'``).

    Args:
        text: The raw text to encode.

    Returns:
        The tokenizer's output for ``text`` (for the example below it contains
        ``input_ids``, ``token_type_ids`` and ``attention_mask``).
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )
    return encoded


# Sanity check: encode the first tweet of the dataset and show the result
example_text = df['tweet'].iloc[0]
example_processed = preprocess_data(example_text)
print(example_processed)

{'input_ids': tensor([[  101,  5585,  2420,  2000,  2175,  2127,  1996,  2707,  1997,  1001,
         14931, 17134,  1012,  2106,  2017,  2113,  3782,  5637,  2571,  2003,
          1996,  2069,  2447,  1999,  1996,  2724,  1032, 23343, 24096,  2683,
          2015,  2381,  2000,  2022,  7219,  2005,  5585,  1029,  1005,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


## Step 4: Dataset Preparation
Create a PyTorch dataset to handle the tokenized text for training and validation.

In [49]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes a single tweet on every lookup.

    Args:
        tweets: Indexable collection of tweet texts.
        labels: Indexable collection of labels, aligned with ``tweets`` by position.
        tokenizer: Callable applied to the tweet text; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Forwarded to the tokenizer as ``max_length``; each tweet is
            padded/truncated to this length.
    """

    def __init__(self, tweets, labels, tokenizer, max_len):
        self.tweets, self.labels = tweets, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of tweets held by the dataset."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the encoded tweet at position ``item`` together with its label.

        Returns:
            dict with the tweet text (``'tweet_text'``), the flattened
            ``'input_ids'`` and ``'attention_mask'`` tensors, and the label as
            a ``torch.long`` tensor under ``'labels'``.
        """
        text = str(self.tweets[item])
        target = self.labels[item]

        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'return_attention_mask': True,
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(text, **tokenizer_options)

        sample = {'tweet_text': text}
        # The tokenizer output carries a leading batch axis of size 1; flatten
        # it away so that a DataLoader can stack samples into a batch itself.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


## Step 5: Split the Data

In [50]:
# Convert labels to numerical values
LABEL_MAP = {'Sports': 0, 'Politics': 1}
df['label'] = df['topic'].map(LABEL_MAP)

# `Series.map` yields NaN for any topic that is not a key of LABEL_MAP. Such a
# NaN would only surface much later, when the dataset casts the label to a
# long tensor, so stop here with a message that names the offending values.
unmapped_topics = df.loc[df['label'].isna(), 'topic'].unique().tolist()
if unmapped_topics:
    raise ValueError(
        f"Unexpected values in 'topic' column (expected one of {list(LABEL_MAP)}): {unmapped_topics}"
    )
# With no missing values left the labels can be stored as proper integers.
df['label'] = df['label'].astype('int64')

# Split the data (10% held out for validation, fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(df['tweet'], df['label'], test_size=0.1, random_state=42)

# Create datasets.
# `.to_numpy()` drops the shuffled pandas index so that TweetDataset can look
# samples up by position 0..len-1.
MAX_LEN = 256  # token length every tweet is padded/truncated to
train_dataset = TweetDataset(X_train.to_numpy(), y_train.to_numpy(), tokenizer, max_len=MAX_LEN)
val_dataset = TweetDataset(X_val.to_numpy(), y_val.to_numpy(), tokenizer, max_len=MAX_LEN)


## Step 6: Train the Model
Fine-tune BERT for our specific task with a manual PyTorch training loop (AdamW optimizer), reporting the validation accuracy after every epoch.

In [52]:
from tqdm import tqdm  # for progress bars

# Relies on `train_dataset` and `val_dataset` (TweetDataset instances) created
# in Step 5. `torch`, `DataLoader` and `BertForSequenceClassification` are
# imported in Step 1, and the tokenizer loaded in Step 3 is already held by the
# datasets, so nothing is re-imported or re-loaded here.


def train_one_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Model that returns an object with a ``loss`` attribute when
            called with ``input_ids``, ``attention_mask`` and ``labels``.
        loader: Iterable of batches (dicts with 'input_ids', 'attention_mask'
            and 'labels' tensors).
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0.0
    for batch in tqdm(loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(loader)


def evaluate_accuracy(model, loader, device):
    """Return the fraction of examples in ``loader`` that ``model`` classifies correctly.

    Correct predictions and examples are counted over the whole loader.
    Averaging per-batch accuracies instead would give the examples of a
    smaller final batch more weight than the others.

    Args:
        model: Model that returns an object with a ``logits`` attribute.
        loader: Iterable of batches (dicts with 'input_ids', 'attention_mask'
            and 'labels' tensors).
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Accuracy in [0, 1], or NaN when ``loader`` yields no examples.
    """
    model.eval()
    num_correct = 0
    num_examples = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Only the logits are needed, so the labels are not passed to the
            # model and no loss is computed.
            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions = torch.argmax(logits, dim=-1)
            num_correct += (predictions == labels).sum().item()
            num_examples += labels.size(0)
    if num_examples == 0:
        return float('nan')
    return num_correct / num_examples


# Seed PyTorch so that the newly initialised classifier head and the shuffling
# order of the training loader are repeatable between runs.
torch.manual_seed(42)

# Load the model (two output classes: Sports / Politics)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    avg_train_loss = train_one_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_train_loss:.4f}")

    # Validation step
    avg_val_accuracy = evaluate_accuracy(model, val_loader, device)
    print(f"Validation Accuracy: {avg_val_accuracy:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                      | 0/727 [00:00<?, ?it/s]

: 

: 

: 

## Step 7: Evaluate the Model
After training, evaluate the model's performance on the validation set.

In [None]:
def compute_metrics(pred):
    """Turn a prediction object into accuracy, F1, precision and recall.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last
            axis).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) of the returned tuple is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {'accuracy': accuracy}
    metrics['f1'] = prf_scores[2]
    metrics['precision'] = prf_scores[0]
    metrics['recall'] = prf_scores[1]
    return metrics

# No `trainer` object exists at this point (Step 6 trains with a hand-written
# loop), and `Trainer.evaluate()` does not accept a `compute_metrics` argument.
# Wrap the fine-tuned `model` in a Trainer that is used for evaluation only;
# the metric function is handed to the Trainer constructor instead.
eval_args = TrainingArguments(
    output_dir='./results',          # required by TrainingArguments
    per_device_eval_batch_size=8,    # same batch size as the loaders in Step 6
    report_to='none',                # do not send metrics to external loggers
)
trainer = Trainer(model=model, args=eval_args, compute_metrics=compute_metrics)

trainer.evaluate(eval_dataset=val_dataset, metric_key_prefix="eval")
