In [1]:
### Hate Tweet Classification using PyTorch
#### Step 1: Import Libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel
import re

In [2]:
#### Step 2: Load Dataset
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

In [3]:
#### Step 3: Preprocessing
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)  # Remove special characters
    return text

train_df['tweet'] = train_df['tweet'].apply(clean_text)
test_df['tweet'] = test_df['tweet'].apply(clean_text)

In [4]:
#### Step 4: Tokenization using BERT tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text, max_length=64):
    tokens = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
    return tokens.input_ids.squeeze(), tokens.attention_mask.squeeze()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
#### Step 5: Create Dataset Class
class TweetDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_ids, attention_mask = tokenize_text(self.texts[idx])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [6]:
#### Step 6: Train-Val Split

X_train, X_val, y_train, y_val = train_test_split(train_df['tweet'].values, train_df['label'].values, test_size=0.2)
train_dataset = TweetDataset(X_train, y_train)
val_dataset = TweetDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [7]:
#### Step 7: Define Model Class

class HateTweetClassifier(nn.Module):
    def __init__(self):
        super(HateTweetClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, 2)  # 2 classes (hate/non-hate)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        return self.fc(x)

In [8]:
#### Step 8: Train Model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HateTweetClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Loss: 0.1304609259361025
Epoch 2, Loss: 0.05634273243460286
Epoch 3, Loss: 0.01909496094166075


In [9]:
#### Step 9: Evaluate Model

model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].cpu().numpy()

        outputs = model(input_ids, attention_mask)
        predictions = torch.argmax(outputs, dim=1).cpu().numpy()

        y_true.extend(labels)
        y_pred.extend(predictions)

print("Classification Report:")
print(classification_report(y_true, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5955
           1       0.82      0.76      0.79       438

    accuracy                           0.97      6393
   macro avg       0.90      0.87      0.89      6393
weighted avg       0.97      0.97      0.97      6393



In [10]:
#### Step 11: Inference Function

def predict_tweet(text):
    model.eval()
    input_ids, attention_mask = tokenize_text(clean_text(text))
    input_ids, attention_mask = input_ids.unsqueeze(0).to(device), attention_mask.unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
        prediction = torch.argmax(output, dim=1).item()

    return "Hate Speech" if prediction == 1 else "Not Hate Speech"

# Example Usage
print(predict_tweet("I hate this!"))

Not Hate Speech


In [12]:
torch.save(model.state_dict(), "hate_tweet_model.pth")
print("Model saved successfully!")


Model saved successfully!


In [13]:
# Load the trained model
model.load_state_dict(torch.load("hate_tweet_model.pth"))
model.to(device)
model.eval()

# Function to classify test data
def classify_test_tweets(test_df):
    predictions = []

    with torch.no_grad():
        for tweet in test_df['tweet']:
            input_ids, attention_mask = tokenize_text(clean_text(tweet))
            input_ids, attention_mask = input_ids.unsqueeze(0).to(device), attention_mask.unsqueeze(0).to(device)

            output = model(input_ids, attention_mask)
            prediction = torch.argmax(output, dim=1).item()
            predictions.append(prediction)

    return predictions

# Apply classification on test.csv
test_df['label'] = classify_test_tweets(test_df)

# Save results to new CSV file
test_df.to_csv("test_predictions.csv", index=False)

print("Classification completed! Results saved in test_predictions.csv.")


Classification completed! Results saved in test_predictions.csv.
