In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD  
from sklearn.metrics import classification_report
from scipy.sparse import hstack
from torch.utils.data import Dataset

#### Loading Dataset

In [2]:
df_tr = pd.read_csv("/Users/triptibhardwaj/Downloads/train.csv")
df_va = pd.read_csv("/Users/triptibhardwaj/Downloads/dev.csv")
df_te = pd.read_csv("/Users/triptibhardwaj/Downloads/test.csv")

label_map = {"OFF": 1, "NOT": 0}
df_tr["label"] = df_tr["label"].map(label_map)
df_va["label"] = df_va["label"].map(label_map)
df_te["label"] = df_te["label"].map(label_map)

print("Train:", df_tr.shape, " Dev:", df_va.shape, " Test:", df_te.shape)
print(df_tr.sample(5), df_va.sample(5), df_te.sample(5))

Train: (60000, 6)  Dev: (20000, 6)  Test: (20000, 6)
            id                                               text        user  \
37708  45353.0  @USER @USER @USER @USER সাদা উদারপন্থী do not ...       ann87   
11367  90011.0                        @USER ভাই...সে है cray mehn  jennifer55   
7410       NaN  @USER @USER @USER @USER @USER @USER यह एंटीफा ...         NaN   
26700  27255.0             @USER कर सकना আপনি say DESPERATE?!?!?!       sam42   
57749  52628.0     @USER I’ll যাওয়া चुनना u up लानत है একটি उबेर     steve11   

      state      time  label  
37708    CT  12:09:07      0  
11367    FL   1:39:29      0  
7410    NaN       NaN      1  
26700    NY   0:22:52      0  
57749    VA  17:57:49      1               id                                               text        user  \
12041  16051.0                       @USER Exactly... वे সব कड़वा  jennifer55   
18558      NaN  .@USER है her own ব্যক্তি সে এটি না a @USER नक...         NaN   
8492   49300.0  @USER Well your

#### PyTorch Dataset

In [3]:
class TextDataset(Dataset):
    def __init__(self, df, vocab):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx].split()
        ids = [self.vocab.get(tok, 1) for tok in tokens]  # unk=1
        return {
            "doc_ids": torch.tensor(ids, dtype=torch.long),
            "y_off": torch.tensor(self.labels[idx], dtype=torch.float32),
        }

#### HAN

In [4]:
class WordAttention(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(2*hidden_dim, 1)

    def forward(self, x):
        emb = self.embed(x)  # [B, W, E]
        h, _ = self.rnn(emb) # [B, W, 2H]
        a = torch.softmax(self.attn(h), dim=1) # [B, W, 1]
        rep = (a * h).sum(dim=1)               # [B, 2H]
        return rep

class HAN(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hidden_dim=64):
        super().__init__()
        self.word_encoder = WordAttention(vocab_size, emb_dim, hidden_dim)

    def forward(self, doc_ids):
        return self.word_encoder(doc_ids)

#### Fusion Model

In [5]:
class FusionHANModel(nn.Module):
    def __init__(self, vocab_size, tfidf_dim, emb_dim=100, hidden_dim=64):
        super().__init__()
        self.han = HAN(vocab_size, emb_dim, hidden_dim)
        self.fc_off = nn.Linear(2*hidden_dim + tfidf_dim, 1)

    def forward(self, doc_ids, feats):
        han_vec = self.han(doc_ids)
        x = torch.cat([han_vec, feats], dim=1)
        return {"offense": self.fc_off(x).squeeze(1)}

#### Trainer

In [6]:
class Trainer:
    def __init__(self, model, device="cpu"):
        self.model = model.to(device)
        self.opt = torch.optim.Adam(model.parameters(), lr=1e-3)
        self.device = device

    def step_batch(self, batch, feats):
        self.model.train()
        self.opt.zero_grad()
        out = self.model(batch["doc_ids"].to(self.device), feats.to(self.device))
        loss = F.binary_cross_entropy_with_logits(out["offense"], batch["y_off"].to(self.device))
        loss.backward()
        self.opt.step()

    def evaluate(self, loader, feats):
        self.model.eval()
        ys, yh = [], []
        with torch.no_grad():
            for i, batch in enumerate(loader):
                f = feats[i*batch_size:(i+1)*batch_size]
                out = self.model(batch["doc_ids"].to(self.device), f.to(self.device))
                yh.extend(torch.sigmoid(out["offense"]).cpu().numpy() > 0.5)
                ys.extend(batch["y_off"].numpy())
        print(classification_report(ys, yh, digits=3))

#### Training Loop

In [7]:
def train_model(trainer, train_loader, dev_loader, Xtr, Xva, num_epochs=10, batch_size=32):
    for epoch in range(num_epochs):
        for i, batch in enumerate(train_loader):
            f = torch.tensor(Xtr[i*batch_size:(i+1)*batch_size], dtype=torch.float32)
            trainer.step_batch(batch, f)
        print(f"Epoch {epoch+1}/{num_epochs} completed.")  

#### Main

In [8]:
def main():
    # Vocabulary
    vocab = {"<pad>": 0, "<unk>": 1}
    for sent in df_tr["text"]:
        for tok in sent.split():
            if tok not in vocab:
                vocab[tok] = len(vocab)

    # TF-IDF
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
    tfidf.fit(df_tr["text"])
    Xtr = tfidf.transform(df_tr["text"]).toarray()
    Xva = tfidf.transform(df_va["text"]).toarray()
    Xte = tfidf.transform(df_te["text"]).toarray()

    # Datasets + Loaders
    train_set = TextDataset(df_tr, vocab)
    dev_set   = TextDataset(df_va, vocab)
    test_set  = TextDataset(df_te, vocab)

    global batch_size
    batch_size = 32
    tr_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=lambda x: {
        "doc_ids": nn.utils.rnn.pad_sequence([item["doc_ids"] for item in x], batch_first=True),
        "y_off": torch.stack([item["y_off"] for item in x])
    })
    va_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: {
        "doc_ids": nn.utils.rnn.pad_sequence([item["doc_ids"] for item in x], batch_first=True),
        "y_off": torch.stack([item["y_off"] for item in x])
    })
    te_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: {
        "doc_ids": nn.utils.rnn.pad_sequence([item["doc_ids"] for item in x], batch_first=True),
        "y_off": torch.stack([item["y_off"] for item in x])
    })

    # Model + Trainer
    model = FusionHANModel(len(vocab), tfidf_dim=5000)
    trainer = Trainer(model, device="cpu")

    # Train
    train_model(trainer, tr_loader, va_loader, Xtr, Xva, num_epochs=10, batch_size=batch_size)

    # Final Test
    print("\n=== Final Test Evaluation ===")
    trainer.evaluate(te_loader, torch.tensor(Xte, dtype=torch.float32))

if __name__ == "__main__":
    main()

Epoch 1/10 completed.
Epoch 2/10 completed.
Epoch 3/10 completed.
Epoch 4/10 completed.
Epoch 5/10 completed.
Epoch 6/10 completed.
Epoch 7/10 completed.
Epoch 8/10 completed.
Epoch 9/10 completed.
Epoch 10/10 completed.

=== Final Test Evaluation ===
              precision    recall  f1-score   support

         0.0      0.939     0.929     0.934     13340
         1.0      0.861     0.878     0.869      6660

    accuracy                          0.912     20000
   macro avg      0.900     0.904     0.902     20000
weighted avg      0.913     0.912     0.912     20000

