In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD  
from sklearn.metrics import classification_report
from scipy.sparse import hstack
from torch.utils.data import Dataset

#### Loading Dataset

In [2]:
df_tr = pd.read_csv("/Users/triptibhardwaj/Downloads/train.csv")
df_va = pd.read_csv("/Users/triptibhardwaj/Downloads/dev.csv")
df_te = pd.read_csv("/Users/triptibhardwaj/Downloads/test.csv")

label_map = {"OFF": 1, "NOT": 0}
df_tr["label"] = df_tr["label"].map(label_map)
df_va["label"] = df_va["label"].map(label_map)
df_te["label"] = df_te["label"].map(label_map)

print("Train:", df_tr.shape, " Dev:", df_va.shape, " Test:", df_te.shape)
print(df_tr.sample(5), df_va.sample(5), df_te.sample(5))

Train: (60000, 6)  Dev: (20000, 6)  Test: (20000, 6)
            id                                               text        user  \
51732  40223.0  @USER Ngl এটা দেখায় अच्छा किन्तु अनानास पेशाब...  jennifer55   
22250      NaN  @USER @USER I thought তারা বলেছিল नंबर थे আপ? ...         NaN   
19726      NaN  @USER He lets her ramble on for কিছুটা, হাসছে ...         NaN   
12676  21487.0  @USER प्राप्त करें that shit আমার বন্ধ ফোন বোন...      matt31   
40169      NaN  @USER @USER मैं खड़ा हूं corrected! Guess the ...         NaN   

      state      time  label  
51732    VA  20:53:45      0  
22250   NaN       NaN      0  
19726   NaN       NaN      0  
12676    FL   5:53:51      1  
40169   NaN       NaN      0               id                                               text    user  \
17297  76555.0  @USER I বলেছিলাম रोक लेना मेरा खा रहा है booty...   ann87   
12994  12914.0  @USER এটা আসছে আমার প্রিয় প্রেসিডেন্ট ওবামা! ...  matt31   
19412  94350.0  @USER @USER ওহ খুব handsome

#### PyTorch Dataset

In [3]:
class TextDataset(Dataset):
    def __init__(self, df, vocab):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx].split()
        ids = [self.vocab.get(tok, 1) for tok in tokens]
        return {
            "doc_ids": torch.tensor(ids, dtype=torch.long),
            "y_off": torch.tensor(self.labels[idx], dtype=torch.float32),
        }

#### HAN

In [4]:
class WordAttention(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(2*hidden_dim, 1)

    def forward(self, x):
        emb = self.embed(x)  # [B, W, E]
        h, _ = self.rnn(emb) # [B, W, 2H]
        a = torch.softmax(self.attn(h), dim=1) # [B, W, 1]
        rep = (a * h).sum(dim=1)               # [B, 2H]
        return rep

class HAN(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hidden_dim=64):
        super().__init__()
        self.word_encoder = WordAttention(vocab_size, emb_dim, hidden_dim)

    def forward(self, doc_ids):
        return self.word_encoder(doc_ids)

#### Fusion Model

In [5]:
class FusionHANModel(nn.Module):
    def __init__(self, vocab_size, tfidf_dim, emb_dim=100, hidden_dim=64):
        super().__init__()
        self.han = HAN(vocab_size, emb_dim, hidden_dim)
        self.fc_off = nn.Linear(2*hidden_dim + tfidf_dim, 1)

    def forward(self, doc_ids, feats):
        han_vec = self.han(doc_ids)
        x = torch.cat([han_vec, feats], dim=1)
        return {"offense": self.fc_off(x).squeeze(1)}

#### Trainer

In [6]:
class Trainer:
    def __init__(self, model, device="cpu"):
        self.model = model.to(device)
        self.opt = torch.optim.Adam(model.parameters(), lr=1e-3)
        self.device = device

    def step_batch(self, batch, feats):
        self.model.train()
        self.opt.zero_grad()
        out = self.model(batch["doc_ids"].to(self.device), feats.to(self.device))
        loss = F.binary_cross_entropy_with_logits(out["offense"], batch["y_off"].to(self.device))
        loss.backward()
        self.opt.step()

    def evaluate(self, loader, feats):
        self.model.eval()
        ys, yh = [], []
        with torch.no_grad():
            for i, batch in enumerate(loader):
                f = feats[i*batch_size:(i+1)*batch_size]
                out = self.model(batch["doc_ids"].to(self.device), f.to(self.device))
                yh.extend(torch.sigmoid(out["offense"]).cpu().numpy() > 0.5)
                ys.extend(batch["y_off"].numpy())
        print(classification_report(ys, yh, digits=3))

#### Training Loop

In [7]:
def train_model(trainer, train_loader, dev_loader, Xtr, Xva, num_epochs=30, batch_size=32):
    for epoch in range(num_epochs):
        for i, batch in enumerate(train_loader):
            f = torch.tensor(Xtr[i*batch_size:(i+1)*batch_size], dtype=torch.float32)
            trainer.step_batch(batch, f)
        print(f"Epoch {epoch+1}/{num_epochs} completed.")  

#### Main

In [8]:
def main():
    # Vocabulary
    vocab = {"<pad>": 0, "<unk>": 1}
    for sent in df_tr["text"]:
        for tok in sent.split():
            if tok not in vocab:
                vocab[tok] = len(vocab)

    # TF-IDF
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
    tfidf.fit(df_tr["text"])
    Xtr = tfidf.transform(df_tr["text"]).toarray()
    Xva = tfidf.transform(df_va["text"]).toarray()
    Xte = tfidf.transform(df_te["text"]).toarray()

    # Datasets + Loaders
    train_set = TextDataset(df_tr, vocab)
    dev_set   = TextDataset(df_va, vocab)
    test_set  = TextDataset(df_te, vocab)

    global batch_size
    batch_size = 32
    tr_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=lambda x: {
        "doc_ids": nn.utils.rnn.pad_sequence([item["doc_ids"] for item in x], batch_first=True),
        "y_off": torch.stack([item["y_off"] for item in x])
    })
    va_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: {
        "doc_ids": nn.utils.rnn.pad_sequence([item["doc_ids"] for item in x], batch_first=True),
        "y_off": torch.stack([item["y_off"] for item in x])
    })
    te_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: {
        "doc_ids": nn.utils.rnn.pad_sequence([item["doc_ids"] for item in x], batch_first=True),
        "y_off": torch.stack([item["y_off"] for item in x])
    })

    # Model + Trainer
    model = FusionHANModel(len(vocab), tfidf_dim=5000)
    trainer = Trainer(model, device="cpu")

    # Train
    train_model(trainer, tr_loader, va_loader, Xtr, Xva, num_epochs=30, batch_size=batch_size)

    # Final Test
    print("\n=== Final Test Evaluation ===")
    trainer.evaluate(te_loader, torch.tensor(Xte, dtype=torch.float32))

if __name__ == "__main__":
    main()

Epoch 1/30 completed.
Epoch 2/30 completed.
Epoch 3/30 completed.
Epoch 4/30 completed.
Epoch 5/30 completed.
Epoch 6/30 completed.
Epoch 7/30 completed.
Epoch 8/30 completed.
Epoch 9/30 completed.
Epoch 10/30 completed.
Epoch 11/30 completed.
Epoch 12/30 completed.
Epoch 13/30 completed.
Epoch 14/30 completed.
Epoch 15/30 completed.
Epoch 16/30 completed.
Epoch 17/30 completed.
Epoch 18/30 completed.
Epoch 19/30 completed.
Epoch 20/30 completed.
Epoch 21/30 completed.
Epoch 22/30 completed.
Epoch 23/30 completed.
Epoch 24/30 completed.
Epoch 25/30 completed.
Epoch 26/30 completed.
Epoch 27/30 completed.
Epoch 28/30 completed.
Epoch 29/30 completed.
Epoch 30/30 completed.

=== Final Test Evaluation ===
              precision    recall  f1-score   support

         0.0      0.911     0.969     0.939     13340
         1.0      0.929     0.810     0.865      6660

    accuracy                          0.916     20000
   macro avg      0.920     0.889     0.902     20000
weighted avg    