CharCNN + FastText + BiGRU + Attention + Aux features + TF-IDF (Tuned)

In [2]:
# Full corrected script: CharCNN + FastText + BiGRU + Attention + Aux features + TF-IDF (Tuned)
# Requirements:
# pip install torch scikit-learn gensim pandas tqdm

import os, time, random, json
from collections import Counter
from dataclasses import dataclass
from typing import List, Dict, Set

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import FastText
from tqdm import tqdm

# -------------------------
# Config
# -------------------------
@dataclass
class CFG:
    data_csv: str = "tamil_sentiment_full.csv"  # tab-separated file: label \t text
    output_dir: str = "outputs_char_bigru_tuned"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3

    # vocab / chars
    min_token_freq: int = 2
    max_chars_per_token: int = 12
    min_char_freq: int = 1

    # embedding / ft
    ft_dim: int = 300
    ft_min_count: int = 2
    ft_epochs: int = 8
    embedding_trainable: bool = True

    # model
    hidden_dim: int = 256
    gru_layers: int = 2 ### MODIFIED ###: Stacked 2 GRU layers
    char_emb_dim: int = 50
    char_out: int = 100
    attn_dim: int = 128
    aux_dim: int = 10      ### MODIFIED ###: Increased from 8 to 10 for lexicon features
    tfidf_dim: int = 5000
    tfidf_proj_dim: int = 64
    dropout: float = 0.3

    # training
    epochs: int = 6
    batch_size: int = 64
    lr_emb: float = 5e-5
    lr_head: float = 1e-3
    weight_decay: float = 1e-5
    use_sampler: bool = False
    use_focal: bool = True
    focal_gamma: float = 2.0
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# -------------------------
# Utilities
# -------------------------
def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
seed_everything(cfg.seed)

def read_data(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['label','text'], engine='python')
    df.dropna(subset=['text','label'], inplace=True)
    df['text'] = df['text'].astype(str)
    return df

# -------------------------
# Preprocessing / vocabs
# -------------------------
### NEW ###: Function for lexicon features
def add_lexicon_features(tokens: List[str]) -> List[float]:
    """
    Counts positive and negative words based on a placeholder lexicon.
    NOTE: Replace these placeholder sets with a real Tamil sentiment lexicon.
    """
    # Placeholder - please replace with a comprehensive Tamil lexicon
    POSITIVE_WORDS: Set[str] = {"நல்ல", "சிறந்த", "அற்புதம்", "மகிழ்ச்சி"}
    NEGATIVE_WORDS: Set[str] = {"மோசம்", "கெட்ட", "வருத்தம்", "தவறு"}

    pos_count = sum(1 for token in tokens if token in POSITIVE_WORDS)
    neg_count = sum(1 for token in tokens if token in NEGATIVE_WORDS)

    # Normalize by text length to prevent long texts from having unfairly high scores
    num_tokens = len(tokens) + 1e-6
    return [pos_count / num_tokens, neg_count / num_tokens]


def build_token_vocab(texts: List[str], min_freq=2):
    cnt = Counter()
    for t in texts:
        for w in t.split():
            cnt[w] += 1
    word2idx = {'<pad>':0, '<unk>':1}
    for w,c in cnt.items():
        if c>=min_freq:
            word2idx[w] = len(word2idx)
    return word2idx

def build_char_vocab(texts: List[str], min_freq=1, max_chars=12):
    cnt = Counter()
    for t in texts:
        for tok in t.split():
            for ch in list(tok)[:max_chars]:
                cnt[ch] += 1
    char2idx = {'<pad>':0, '<unk>':1}
    for ch,c in cnt.items():
        if c>=min_freq:
            char2idx[ch] = len(char2idx)
    return char2idx

def text_to_token_ids(text, word2idx, max_len):
    ids = [word2idx.get(w, word2idx['<unk>']) for w in text.split()]
    if len(ids) < max_len: ids += [word2idx['<pad>']] * (max_len - len(ids))
    else: ids = ids[:max_len]
    return ids

def text_to_char_ids(text, char2idx, max_len_tokens, max_chars_per_token):
    toks = text.split()
    char_ids = []
    for i in range(max_len_tokens):
        if i < len(toks):
            tok = toks[i][:max_chars_per_token]
            ids = [char2idx.get(ch, char2idx['<unk>']) for ch in tok]
            if len(ids) < max_chars_per_token:
                ids += [char2idx['<pad>']] * (max_chars_per_token - len(ids))
        else:
            ids = [char2idx['<pad>']] * max_chars_per_token
        char_ids.append(ids)
    return char_ids

# Auxiliary features generator
def compute_aux_features(text): ### MODIFIED ###
    toks = text.split()
    num_tokens = len(toks)
    num_chars = len(text)
    # Original features
    emoji_count = sum(1 for ch in text if ord(ch) > 10000)
    punct_count = sum(1 for ch in text if ch in '?!.,;:')
    has_english = 1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0
    has_tamil = 1.0 if any('\u0B80' <= ch <= '\u0BFF' for ch in text) else 0.0
    avg_token_len = (sum(len(t) for t in toks)/num_tokens) if num_tokens>0 else 0.0
    cap_ratio = sum(1 for ch in text if ch.isupper()) / (num_chars+1)
    base_features = [num_tokens, num_chars, emoji_count, punct_count, has_english, has_tamil, avg_token_len, cap_ratio]
    # New lexicon features
    lex_features = add_lexicon_features(toks)
    return base_features + lex_features

# -------------------------
# Dataset
# -------------------------
class CharTokenDataset(Dataset):
    def __init__(self, records, tfidf_matrix, label_map, word2idx, char2idx, max_len_tokens, max_chars_per_token, aux_dim):
        self.records = records
        self.tfidf_matrix = tfidf_matrix
        self.label_map = label_map
        self.word2idx = word2idx
        self.char2idx = char2idx
        self.max_len_tokens = max_len_tokens
        self.max_chars_per_token = max_chars_per_token
        self.aux_dim = aux_dim

        self.samples = []
        for i, r in enumerate(records):
            text = str(r['text'])
            label = label_map[r['label']]
            token_ids = text_to_token_ids(text, word2idx, max_len_tokens)
            char_ids = text_to_char_ids(text, char2idx, max_len_tokens, max_chars_per_token)
            aux = compute_aux_features(text)
            aux = (aux + [0.0]*aux_dim)[:aux_dim]
            tfidf_vec = self.tfidf_matrix[i].toarray().squeeze()
            self.samples.append((token_ids, char_ids, aux, tfidf_vec, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        token_ids, char_ids, aux, tfidf, label = self.samples[idx]
        token_ids = torch.tensor(token_ids, dtype=torch.long)
        char_ids = torch.tensor(char_ids, dtype=torch.long)
        aux = torch.tensor(aux, dtype=torch.float32)
        tfidf = torch.tensor(tfidf, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)
        return {"token_ids": token_ids, "char_ids": char_ids, "aux": aux, "tfidf": tfidf, "label": label}

# -------------------------
# FastText training / loading
# -------------------------
def train_or_load_fasttext(sentences, path, dim=300, min_count=2, epochs=6):
    if os.path.exists(path):
        print("Loading FastText from", path)
        return FastText.load(path)
    print("Training FastText...")
    tokenized = [s.split() for s in sentences]
    ft = FastText(vector_size=dim, window=5, min_count=min_count, workers=os.cpu_count(), epochs=epochs)
    ft.build_vocab(tokenized)
    ft.train(tokenized, total_examples=len(tokenized), epochs=epochs)
    ft.save(path)
    print("Saved FastText at", path)
    return ft

def build_embedding_matrix(word2idx, ft_model, dim):
    V = len(word2idx)
    mat = np.random.normal(scale=0.01, size=(V, dim)).astype(np.float32)
    found = 0
    for w,i in word2idx.items():
        if w in ft_model.wv:
            mat[i] = ft_model.wv[w]
            found += 1
    print(f"Found {found}/{V} tokens in FastText.")
    return mat

# -------------------------
# Model components
# -------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim=50, out_dim=100, kernel_sizes=(3,4,5), dropout=0.1, max_chars=12):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        k = len(kernel_sizes)
        base = out_dim // k
        extras = out_dim - (base * k)
        out_channels_list = [base + (1 if i < extras else 0) for i in range(k)]
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=out_channels_list[i], kernel_size=(kernel_sizes[i], char_emb_dim))
            for i in range(k)
        ])
        self.out_dim_actual = sum(out_channels_list)
        self.dropout = nn.Dropout(dropout)
        self.max_chars = max_chars

    def forward(self, x_char):
        B,T,C = x_char.size()
        x = self.char_emb(x_char)
        x = x.view(B*T, C, -1).unsqueeze(1)
        conv_outs = []
        for conv in self.convs:
            o = conv(x)
            o = F.relu(o.squeeze(-1))
            o = F.max_pool1d(o, o.size(2)).squeeze(2)
            conv_outs.append(o)
        out = torch.cat(conv_outs, dim=1)
        out = out.view(B, T, -1)
        out = self.dropout(out)
        return out

class BiGRUCharFastText(nn.Module): ### MODIFIED ###: Renamed from BiLSTM... and using GRU
    def __init__(self, emb_matrix, char_vocab_size, cfg, num_labels):
        super().__init__()
        emb_matrix = torch.tensor(emb_matrix)
        V, E = emb_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(emb_matrix, freeze=not cfg.embedding_trainable, padding_idx=0)
        self.char_cnn = CharCNN(char_vocab_size, char_emb_dim=cfg.char_emb_dim, out_dim=cfg.char_out, dropout=cfg.dropout, max_chars=cfg.max_chars_per_token)
        token_in_dim = E + self.char_cnn.out_dim_actual
        # Swapped LSTM for GRU and used gru_layers config
        self.bigru = nn.GRU(token_in_dim, cfg.hidden_dim//2, num_layers=cfg.gru_layers, bidirectional=True, batch_first=True, dropout=cfg.dropout if cfg.gru_layers>1 else 0)
        self.attn_proj = nn.Linear(cfg.hidden_dim, cfg.attn_dim)
        self.attn_v = nn.Linear(cfg.attn_dim, 1, bias=False)
        self.aux_proj = nn.Linear(cfg.aux_dim, 32)
        self.tfidf_proj = nn.Linear(cfg.tfidf_dim, cfg.tfidf_proj_dim)
        self.classifier = nn.Sequential(
            nn.Linear(cfg.hidden_dim + 32 + cfg.tfidf_proj_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, token_ids, char_ids, aux, tfidf):
        emb = self.embedding(token_ids)
        char_vec = self.char_cnn(char_ids)
        x = torch.cat([emb, char_vec], dim=-1)
        h, _ = self.bigru(x) # Using GRU here
        a = torch.tanh(self.attn_proj(h))
        scores = self.attn_v(a).squeeze(-1)
        mask = (token_ids != 0).float()
        scores = scores.masked_fill(mask==0, -1e9)
        alpha = torch.softmax(scores, dim=1).unsqueeze(-1)
        pooled = (h * alpha).sum(dim=1)
        aux_p = torch.relu(self.aux_proj(aux))
        tfidf_p = torch.relu(self.tfidf_proj(tfidf))
        cat = torch.cat([pooled, aux_p, tfidf_p], dim=1)
        logits = self.classifier(cat)
        return logits

# -------------------------
# Loss: Focal
# -------------------------
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None):
        super().__init__()
        self.gamma = gamma
        if alpha is not None:
            self.alpha = torch.tensor(alpha, dtype=torch.float32)
        else:
            self.alpha = None

    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)
        loss = ((1 - pt) ** self.gamma) * ce
        if self.alpha is not None:
            if self.alpha.device != targets.device:
                self.alpha = self.alpha.to(targets.device)
            loss = self.alpha[targets] * loss
        return loss.mean()

# -------------------------
# Evaluate
# -------------------------
def evaluate(model, dataloader, device, id2label):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            tokens = batch['token_ids'].to(device)
            chars  = batch['char_ids'].to(device)
            aux    = batch['aux'].to(device)
            tfidf  = batch['tfidf'].to(device)
            labels = batch['label'].cpu().numpy().tolist()
            logits = model(tokens, chars, aux, tfidf)
            preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()
            y_true.extend(labels); y_pred.extend(preds)
    report = classification_report(y_true, y_pred, target_names=list(id2label.values()), digits=4, output_dict=True, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    return report, cm, y_true, y_pred

# -------------------------
# Training loop
# -------------------------
def train(train_loader, val_loader, model, cfg, class_weights, id2label):
    device = cfg.device
    model.to(device)
    emb_params = list(model.embedding.parameters())
    other_params = [p for n,p in model.named_parameters() if not n.startswith('embedding.')]
    optimizer = torch.optim.AdamW([
        {"params": emb_params, "lr": cfg.lr_emb},
        {"params": other_params, "lr": cfg.lr_head}
    ], weight_decay=cfg.weight_decay)

    criterion = FocalLoss(cfg.focal_gamma, alpha=None) if cfg.use_focal else nn.CrossEntropyLoss()

    best_ckpt = None
    best_macro = -1.0
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}")
        for batch in pbar:
            tokens = batch['token_ids'].to(device)
            chars  = batch['char_ids'].to(device)
            aux    = batch['aux'].to(device)
            tfidf  = batch['tfidf'].to(device)
            labels = batch['label'].to(device)
            optimizer.zero_grad()
            logits = model(tokens, chars, aux, tfidf)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            pbar.set_postfix(loss=loss.item())
        report, cm, _, _ = evaluate(model, val_loader, cfg.device, id2label)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro:
            best_macro = macro_f1
            best_ckpt = os.path.join(cfg.output_dir, f"best_macro_{macro_f1:.4f}.pt")
            torch.save({"model_state_dict": model.state_dict(), "cfg": cfg.__dict__}, best_ckpt)
            print("Saved", best_ckpt)
    return best_ckpt

# -------------------------
# Pipeline orchestration
# -------------------------
def main():
    df = read_data(cfg.data_csv)
    cnt = df['label'].value_counts()
    keep = cnt[cnt >= cfg.min_class_samples].index.tolist()
    if len(keep) < len(cnt):
        df = df[df['label'].isin(keep)].reset_index(drop=True)
    print("Records:", len(df), "labels:", df['label'].nunique())

    labels_unique = sorted(df['label'].unique())
    label_map = {lab:i for i,lab in enumerate(labels_unique)}
    id2label = {v:k for k,v in label_map.items()}

    data = df.to_dict(orient='records')
    lablist = [label_map[r['label']] for r in data]
    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=lablist)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.15, random_state=cfg.seed, stratify=[lablist[i] for i in train_idx])
    train_records, val_records, test_records = [data[i] for i in train_idx], [data[i] for i in val_idx], [data[i] for i in test_idx]
    print("split sizes:", len(train_records), len(val_records), len(test_records))

    print("Computing TF-IDF features...")
    tfidf_vectorizer = TfidfVectorizer( ### MODIFIED ###
        max_features=cfg.tfidf_dim,
        ngram_range=(1, 2),
        token_pattern=r'(?u)\b\w+\b',
        min_df=3,         # Ignore terms that appear in less than 3 documents
        max_df=0.9        # Ignore terms that appear in more than 90% of documents
    )
    train_texts = [r['text'] for r in train_records]
    train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
    val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records])
    test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])
    print("TF-IDF matrix shape (train):", train_tfidf.shape)

    word2idx = build_token_vocab([r['text'] for r in train_records], cfg.min_token_freq)
    char2idx = build_char_vocab([r['text'] for r in train_records], cfg.min_char_freq, cfg.max_chars_per_token)
    print("Vocab sizes: tokens", len(word2idx), "chars", len(char2idx))

    ft_path = os.path.join(cfg.output_dir, "fasttext.model")
    ft = train_or_load_fasttext([r['text'] for r in data], ft_path, dim=cfg.ft_dim, min_count=cfg.ft_min_count, epochs=cfg.ft_epochs)
    emb_matrix = build_embedding_matrix(word2idx, ft, cfg.ft_dim)

    max_len = 64
    train_ds = CharTokenDataset(train_records, train_tfidf, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)
    val_ds   = CharTokenDataset(val_records, val_tfidf, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)
    test_ds  = CharTokenDataset(test_records, test_tfidf, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=2)

    model = BiGRUCharFastText(emb_matrix, char_vocab_size=len(char2idx), cfg=cfg, num_labels=len(label_map)) ### MODIFIED ###
    print("Model trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

    best_ckpt = train(train_loader, val_loader, model, cfg, None, id2label)

    if best_ckpt and os.path.exists(best_ckpt):
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])
        report, cm, y_true, y_pred = evaluate(model, test_loader, cfg.device, id2label)
        print("\nFinal Test Report:")
        print(classification_report(y_true, y_pred, target_names=list(id2label.values()), digits=4))
        print("\nConfusion Matrix:\n", cm)

        rows = []
        for rec, yt, yp in zip(test_records, y_true, y_pred):
            rows.append({"text": rec['text'], "label": id2label[yt], "pred": id2label[yp]})
        pd.DataFrame(rows).to_csv(os.path.join(cfg.output_dir, "test_preds.csv"), index=False)
        print("Saved test_preds.csv")
    print("Done.")

if __name__ == "__main__":
    main()

Records: 44019 labels: 5
split sizes: 31803 5613 6603
Computing TF-IDF features...
TF-IDF matrix shape (train): (31803, 5000)
Vocab sizes: tokens 21997 chars 657
Training FastText...
Saved FastText at outputs_char_bigru_tuned/fasttext.model
Found 21997/21997 tokens in FastText.
Model trainable params: 7800581


Train E1/6: 100%|██████████| 497/497 [06:47<00:00,  1.22it/s, loss=0.469]


Epoch 1 -> Val Macro F1: 0.4035
Saved outputs_char_bigru_tuned/best_macro_0.4035.pt


Train E2/6: 100%|██████████| 497/497 [06:45<00:00,  1.23it/s, loss=0.523]


Epoch 2 -> Val Macro F1: 0.4545
Saved outputs_char_bigru_tuned/best_macro_0.4545.pt


Train E3/6: 100%|██████████| 497/497 [06:45<00:00,  1.23it/s, loss=0.387]


Epoch 3 -> Val Macro F1: 0.4713
Saved outputs_char_bigru_tuned/best_macro_0.4713.pt


Train E4/6: 100%|██████████| 497/497 [06:45<00:00,  1.23it/s, loss=0.329]


Epoch 4 -> Val Macro F1: 0.4678


Train E5/6: 100%|██████████| 497/497 [06:45<00:00,  1.23it/s, loss=0.403]


Epoch 5 -> Val Macro F1: 0.4742
Saved outputs_char_bigru_tuned/best_macro_0.4742.pt


Train E6/6: 100%|██████████| 497/497 [06:45<00:00,  1.23it/s, loss=0.332]


Epoch 6 -> Val Macro F1: 0.4715





Final Test Report:
                precision    recall  f1-score   support

Mixed_feelings     0.3694    0.1340    0.1966       739
      Negative     0.4015    0.4005    0.4010       784
      Positive     0.7230    0.8486    0.7808      3731
     not-Tamil     0.6161    0.6613    0.6379       313
 unknown_state     0.4618    0.3736    0.4130      1036

      accuracy                         0.6320      6603
     macro avg     0.5144    0.4836    0.4859      6603
  weighted avg     0.5992    0.6320    0.6058      6603


Confusion Matrix:
 [[  99  140  393   13   94]
 [  43  314  323   14   90]
 [  53  207 3166   66  239]
 [   1    9   68  207   28]
 [  72  112  429   36  387]]
Saved test_preds.csv
Done.


MuRIL (google/muril-base-cased) + CharCNN + CNN (multi-kernel) + BiGRU + Attention + TF-IDF + Aux

In [3]:
# Full cleaned script: MuRIL (google/muril-base-cased) + CharCNN + CNN (multi-kernel) + BiGRU + Attention + TF-IDF + Aux (Late fusion)
# Requirements:
#   pip install torch transformers scikit-learn pandas tqdm
#
# Save as bert_charcnn_muril_hybrid.py and run. Edit CFG at top for paths / hyperparams.

import os
import random
from collections import Counter
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple, Set

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm.auto import tqdm

# -------------------------
# Config
# -------------------------
@dataclass
class CFG:
    data_csv: str = "tamil_sentiment_full.csv"    # TSV: label \t text
    output_dir: str = "outputs_muril_hybrid"
    model_name: str = "google/muril-base-cased"   # MuRIL for Indic languages
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    seed: int = 42

    # token / char
    max_len: int = 64
    max_char_per_token: int = 12
    min_class_samples: int = 3

    # architecture
    char_emb_dim: int = 50
    char_out: int = 96
    cnn_out_channels: int = 128
    cnn_kernel_sizes: Tuple[int, ...] = (2, 3, 4)
    hidden_dim: int = 256
    gru_layers: int = 1
    attn_dim: int = 128
    dropout: float = 0.3

    # TF-IDF / aux
    tfidf_dim: int = 5000
    tfidf_proj_dim: int = 64
    aux_dim: int = 10

    # training
    epochs: int = 4
    batch_size: int = 32
    lr_bert: float = 2e-5
    lr_head: float = 1e-3
    weight_decay: float = 1e-6
    warmup_ratio: float = 0.06
    grad_clip: float = 1.0
    num_workers: int = 0  # set to 0 for notebooks; >0 for scripts
    save_best: bool = True

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# -------------------------
# Utilities
# -------------------------
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_everything(cfg.seed)

def read_data(path: str):
    df = pd.read_csv(path, sep='\t', header=None, names=['label', 'text'], engine='python', on_bad_lines='skip')
    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)
    return df

# Aux features
def add_lexicon_features(tokens: List[str]) -> List[float]:
    POSITIVE_WORDS: Set[str] = {"நல்ல", "சிறந்த", "அற்புதம்", "மகிழ்ச்சி"}
    NEGATIVE_WORDS: Set[str] = {"மோசம்", "கெட்ட", "வருத்தம்", "தவறு"}
    pos_count = sum(1 for token in tokens if token in POSITIVE_WORDS)
    neg_count = sum(1 for token in tokens if token in NEGATIVE_WORDS)
    num_tokens = len(tokens) + 1e-6
    return [pos_count / num_tokens, neg_count / num_tokens]

def compute_aux_features(text: str) -> List[float]:
    toks = text.split()
    num_chars = len(text)
    base_features = [
        float(len(toks)),
        float(num_chars),
        float(sum(1 for ch in text if ord(ch) > 10000)),
        float(sum(1 for ch in text if ch in '?!.,;:')),
        1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0,
        1.0 if any('\u0B80' <= ch <= '\u0BFF' for ch in text) else 0.0,
        float((sum(len(t) for t in toks) / len(toks)) if toks else 0.0),
        float(sum(1 for ch in text if ch.isupper()) / (num_chars + 1)),
    ]
    lex_features = add_lexicon_features(toks)
    return base_features + lex_features

# -------------------------
# Tokenizer / char helpers
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

def build_char_vocab_from_token_strings(token_strings_list: List[List[str]], min_freq=1, max_chars=12):
    cnt = Counter()
    for toks in token_strings_list:
        for tok in toks:
            s = tok.replace("##", "")
            for ch in list(s)[:max_chars]:
                cnt[ch] += 1
    char2idx = {'<pad>': 0, '<unk>': 1}
    for ch, c in cnt.items():
        if c >= min_freq and ch not in char2idx:
            char2idx[ch] = len(char2idx)
    return char2idx

def tokens_to_char_ids(token_strings: List[str], char2idx: Dict[str,int], max_chars=12, max_tokens=64):
    char_ids = []
    for tok in token_strings[:max_tokens]:
        s = tok.replace("##", "")
        ids = [char2idx.get(ch, char2idx['<unk>']) for ch in list(s)[:max_chars]]
        ids += [char2idx['<pad>']] * (max_chars - len(ids))
        char_ids.append(ids)
    while len(char_ids) < max_tokens:
        char_ids.append([char2idx['<pad>']] * max_chars)
    return char_ids

# -------------------------
# Dataset
# -------------------------
class BertHybridDataset(Dataset):
    def __init__(self, records, label_map, tokenizer, tfidf_vectorizer, char2idx, cfg: CFG, tfidf_matrix=None):
        self.records = records
        self.label_map = label_map
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer
        self.char2idx = char2idx
        self.cfg = cfg

        texts = [r['text'] for r in records]
        enc = tokenizer(texts, padding='max_length', truncation=True, max_length=cfg.max_len, return_tensors='np')
        self.input_ids = enc['input_ids']
        self.attention_mask = enc['attention_mask']
        self.token_type_ids = enc['token_type_ids'] if 'token_type_ids' in enc else np.zeros_like(self.input_ids)

        token_strings_all = [tokenizer.convert_ids_to_tokens(seq) for seq in self.input_ids]
        self.char_ids = [tokens_to_char_ids(tokstr, char2idx, max_chars=cfg.max_char_per_token, max_tokens=cfg.max_len) for tokstr in token_strings_all]

        if tfidf_vectorizer is not None:
            self.tfidf = tfidf_vectorizer.transform(texts)
        else:
            self.tfidf = None

        self.aux = [compute_aux_features(t) for t in texts]

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.attention_mask[idx], dtype=torch.long)
        token_type_ids = torch.tensor(self.token_type_ids[idx], dtype=torch.long)

        char_ids = torch.tensor(self.char_ids[idx], dtype=torch.long)  # (T, Cchars)

        aux = self.aux[idx] + [0.0] * max(0, self.cfg.aux_dim - len(self.aux[idx]))
        aux = torch.tensor(aux[:self.cfg.aux_dim], dtype=torch.float32)

        if self.tfidf is not None:
            try:
                tfidf_vec = self.tfidf[idx].toarray().squeeze().astype(np.float32)
            except Exception:
                tfidf_vec = np.asarray(self.tfidf[idx]).squeeze().astype(np.float32)
        else:
            tfidf_vec = np.zeros(cfg.tfidf_dim, dtype=np.float32)
        tfidf_vec = torch.tensor(tfidf_vec, dtype=torch.float32)

        label = torch.tensor(self.label_map[self.records[idx]['label']], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'char_ids': char_ids,
            'aux': aux,
            'tfidf': tfidf_vec,
            'label': label
        }

# -------------------------
# Model components
# -------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size:int, cfg:CFG):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, cfg.char_emb_dim, padding_idx=0)
        self.kernel_sizes = (2, 3, 4)
        num_filters_each = max(1, cfg.char_out // len(self.kernel_sizes))
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters_each, (k, cfg.char_emb_dim)) for k in self.kernel_sizes])
        self.dropout = nn.Dropout(cfg.dropout)
        self.out_dim_actual = num_filters_each * len(self.kernel_sizes)

    def forward(self, x_char):
        B, T, C = x_char.size()
        emb = self.char_emb(x_char)              # (B, T, C, char_emb_dim)
        x = emb.view(B*T, C, -1).unsqueeze(1)    # (B*T, 1, C, char_emb_dim)
        conv_outs = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # each (B*T, filters, L_i)
        pooled = [F.max_pool1d(o, o.size(2)).squeeze(2) for o in conv_outs]  # each (B*T, filters)
        out = torch.cat(pooled, dim=1).view(B, T, -1)  # (B, T, out_dim)
        return self.dropout(out)

class BERTHybrid(nn.Module):
    def __init__(self, bert_model_name: str, char_vocab_size: int, cfg: CFG, num_labels: int):
        super().__init__()
        self.cfg = cfg
        self.bert = AutoModel.from_pretrained(bert_model_name)
        bert_hidden = self.bert.config.hidden_size

        self.char_cnn = CharCNN(char_vocab_size, cfg)
        token_in_dim = bert_hidden + self.char_cnn.out_dim_actual

        self.word_cnns = nn.ModuleList([
            nn.Conv1d(token_in_dim, cfg.cnn_out_channels, k, padding=k//2)
            for k in cfg.cnn_kernel_sizes
        ])
        cnn_output_dim = cfg.cnn_out_channels * len(cfg.cnn_kernel_sizes)

        self.bigru = nn.GRU(cnn_output_dim, cfg.hidden_dim // 2, num_layers=cfg.gru_layers,
                           bidirectional=True, batch_first=True, dropout=cfg.dropout if cfg.gru_layers > 1 else 0)
        self.attn_proj = nn.Linear(cfg.hidden_dim, cfg.attn_dim)
        self.attn_v = nn.Linear(cfg.attn_dim, 1, bias=False)

        self.tfidf_proj = nn.Linear(cfg.tfidf_dim, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, 32)

        fusion_dim = bert_hidden + cfg.hidden_dim + cfg.tfidf_proj_dim + 32
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, token_type_ids, char_ids, aux, tfidf):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask,
                             token_type_ids=token_type_ids if token_type_ids is not None else None, return_dict=True)
        last_hidden = bert_out.last_hidden_state    # (B, T, H)
        pooled_cls = getattr(bert_out, 'pooler_output', None)
        if pooled_cls is None:
            pooled_cls = last_hidden[:, 0, :]

        char_feats = self.char_cnn(char_ids)       # (B, T, char_dim)
        x = torch.cat([last_hidden, char_feats], dim=-1)  # (B, T, token_in_dim)
        x_permuted = x.permute(0, 2, 1)             # (B, token_in_dim, T)

        cnn_feats = [F.relu(conv(x_permuted)) for conv in self.word_cnns]  # list (B, out_ch, L_i)
        seq_lens = [f.size(2) for f in cnn_feats]
        min_len = min(seq_lens)
        if len(set(seq_lens)) != 1:
            aligned = []
            for f in cnn_feats:
                if f.size(2) > min_len:
                    aligned.append(f[:, :, :min_len])
                elif f.size(2) < min_len:
                    pad_amt = min_len - f.size(2)
                    aligned.append(F.pad(f, (0, pad_amt)))
                else:
                    aligned.append(f)
            cnn_feats = aligned

        cnn_cat = torch.cat(cnn_feats, dim=1)  # (B, out_ch * num_kernels, T')
        x_seq = cnn_cat.permute(0, 2, 1)       # (B, T', cnn_output_dim)

        h, _ = self.bigru(x_seq)               # (B, T', hidden_dim)
        a = torch.tanh(self.attn_proj(h))      # (B, T', attn_dim)
        scores = self.attn_v(a).squeeze(-1)    # (B, T')

        mask = attention_mask
        if mask.size(1) > scores.size(1):
            mask = mask[:, :scores.size(1)]
        elif mask.size(1) < scores.size(1):
            pad_amt = scores.size(1) - mask.size(1)
            mask = F.pad(mask, (0, pad_amt), value=0)

        scores = scores.masked_fill(mask == 0, float('-1e9'))
        alpha = torch.softmax(scores, dim=1).unsqueeze(-1)
        seq_pooled = (h * alpha).sum(dim=1)   # (B, hidden_dim)

        tfidf_p = torch.relu(self.tfidf_proj(tfidf))
        aux_p = torch.relu(self.aux_proj(aux))

        final_vec = torch.cat([pooled_cls, seq_pooled, tfidf_p, aux_p], dim=1)
        logits = self.classifier(final_vec)
        return logits

# -------------------------
# Training utils
# -------------------------
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma
    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)
        return (((1-pt)**self.gamma) * ce).mean()

def evaluate(model, dataloader, device):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            labels = batch['label'].to(device)
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
            logits = model(**inputs)
            preds = torch.argmax(logits, dim=1)
            y_true.extend(labels.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())
    return y_true, y_pred

# -------------------------
# Main pipeline
# -------------------------
def main():
    print("Reading CSV...")
    df = read_data(cfg.data_csv)

    cnt = df['label'].value_counts()
    keep = cnt[cnt >= cfg.min_class_samples].index.tolist()
    df = df[df['label'].isin(keep)].reset_index(drop=True)

    labels_unique = sorted(df['label'].unique())
    label_map = {lab: i for i, lab in enumerate(labels_unique)}
    id2label = {v:k for k,v in label_map.items()}

    data = df.to_dict(orient='records')
    lablist = [label_map[r['label']] for r in data]

    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=lablist)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=cfg.seed, stratify=[lablist[i] for i in train_idx])

    train_records = [data[i] for i in train_idx]
    val_records   = [data[i] for i in val_idx]
    test_records  = [data[i] for i in test_idx]
    print(f"Split sizes: Train={len(train_records)}, Val={len(val_records)}, Test={len(test_records)}")

    # TF-IDF
    tfidf_vec = TfidfVectorizer(max_features=cfg.tfidf_dim, ngram_range=(1,2), min_df=3, max_df=0.9)
    train_texts = [r['text'] for r in train_records]
    val_texts   = [r['text'] for r in val_records]
    test_texts  = [r['text'] for r in test_records]
    train_tfidf = tfidf_vec.fit_transform(train_texts)
    val_tfidf   = tfidf_vec.transform(val_texts)
    test_tfidf  = tfidf_vec.transform(test_texts)

    # build char vocab from training tokens
    print("Building char vocab...")
    enc_train = tokenizer(train_texts, padding='max_length', truncation=True, max_length=cfg.max_len, return_tensors='np')
    token_strings_train = [tokenizer.convert_ids_to_tokens(seq) for seq in enc_train['input_ids']]
    char2idx = build_char_vocab_from_token_strings(token_strings_train, min_freq=1, max_chars=cfg.max_char_per_token)
    print(f"Char vocab size: {len(char2idx)}")

    # datasets & loaders
    train_ds = BertHybridDataset(train_records, label_map, tokenizer, tfidf_vec, char2idx, cfg, tfidf_matrix=train_tfidf)
    val_ds   = BertHybridDataset(val_records, label_map, tokenizer, tfidf_vec, char2idx, cfg, tfidf_matrix=val_tfidf)
    test_ds  = BertHybridDataset(test_records, label_map, tokenizer, tfidf_vec, char2idx, cfg, tfidf_matrix=test_tfidf)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers, pin_memory=True)
    val_loader   = DataLoader(val_ds,   batch_size=cfg.batch_size*2, shuffle=False, num_workers=cfg.num_workers, pin_memory=True)
    test_loader  = DataLoader(test_ds,  batch_size=cfg.batch_size*2, shuffle=False, num_workers=cfg.num_workers, pin_memory=True)

    # model
    model = BERTHybrid(cfg.model_name, len(char2idx), cfg, num_labels=len(label_map))
    model.to(cfg.device)
    print(f"Model params (trainable): {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # optimizer groups
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n,p in model.named_parameters() if n.startswith("bert.") and not any(nd in n for nd in no_decay)],
         "lr": cfg.lr_bert, "weight_decay": cfg.weight_decay},
        {"params": [p for n,p in model.named_parameters() if n.startswith("bert.") and any(nd in n for nd in no_decay)],
         "lr": cfg.lr_bert, "weight_decay": 0.0},
        {"params": [p for n,p in model.named_parameters() if not n.startswith("bert.") and not any(nd in n for nd in no_decay)],
         "lr": cfg.lr_head, "weight_decay": cfg.weight_decay},
        {"params": [p for n,p in model.named_parameters() if not n.startswith("bert.") and any(nd in n for nd in no_decay)],
         "lr": cfg.lr_head, "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters)
    total_steps = len(train_loader) * cfg.epochs
    warmup_steps = int(total_steps * cfg.warmup_ratio) if total_steps > 0 else 0
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=max(1, total_steps))
    criterion = FocalLoss(2.0)

    best_macro = -1.0
    best_ckpt = None

    # training loop
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}", leave=False)
        for batch in pbar:
            labels = batch['label'].to(cfg.device)
            inputs = {k: v.to(cfg.device) for k,v in batch.items() if k != 'label'}
            optimizer.zero_grad()
            logits = model(**inputs)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            optimizer.step()
            scheduler.step()
            pbar.set_postfix(loss=float(loss.item()))

        # validation
        y_true, y_pred = evaluate(model, val_loader, cfg.device)
        report = classification_report(y_true, y_pred, target_names=list(label_map.keys()), output_dict=True, zero_division=0, digits=4)
        macro_f1 = float(report['macro avg']['f1-score'])
        print(f"Epoch {epoch+1}/{cfg.epochs} -> Val Macro F1: {macro_f1:.4f}")

        if cfg.save_best and macro_f1 > best_macro:
            best_macro = macro_f1
            best_ckpt = os.path.join(cfg.output_dir, f"best_macro_{macro_f1:.4f}.pt")
            torch.save({'model_state_dict': model.state_dict(), 'cfg': asdict(cfg), 'label_map': label_map}, best_ckpt)
            print(f"Saved checkpoint to {best_ckpt}")

    # final evaluation
    if best_ckpt is not None and os.path.exists(best_ckpt):
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])
    print("Running final evaluation on test set...")
    y_true, y_pred = evaluate(model, test_loader, cfg.device)
    print("\nFinal Test Report:")
    print(classification_report(y_true, y_pred, target_names=list(label_map.keys()), digits=4, zero_division=0))

if __name__ == "__main__":
    main()


Reading CSV...
Split sizes: Train=33674, Val=3742, Test=6603
Building char vocab...
Char vocab size: 455


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Model params (trainable): 239,625,635


Train E1/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 1/4 -> Val Macro F1: 0.4933
Saved checkpoint to outputs_muril_hybrid/best_macro_0.4933.pt


Train E2/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 2/4 -> Val Macro F1: 0.4928


Train E3/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 3/4 -> Val Macro F1: 0.4945
Saved checkpoint to outputs_muril_hybrid/best_macro_0.4945.pt


Train E4/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 4/4 -> Val Macro F1: 0.5120
Saved checkpoint to outputs_muril_hybrid/best_macro_0.5120.pt
Running final evaluation on test set...

Final Test Report:
                precision    recall  f1-score   support

Mixed_feelings     0.3699    0.1732    0.2359       739
      Negative     0.4319    0.4770    0.4533       784
      Positive     0.7512    0.8279    0.7877      3731
     not-Tamil     0.6886    0.6358    0.6611       313
 unknown_state     0.4525    0.4324    0.4423      1036

      accuracy                         0.6418      6603
     macro avg     0.5388    0.5093    0.5161      6603
  weighted avg     0.6208    0.6418    0.6261      6603

