CharCNN + FastText + BiGRU + Attention + Aux features + TF-IDF (Tuned)

MuRIL (google/muril-base-cased) + CharCNN + CNN (multi-kernel) + BiGRU + Attention + TF-IDF + Aux

In [3]:
# Full cleaned script: MuRIL (google/muril-base-cased) + CharCNN + CNN (multi-kernel) + BiGRU + Attention + TF-IDF + Aux (Late fusion)
# Requirements:
#   pip install torch transformers scikit-learn pandas tqdm
#
# Save as bert_charcnn_muril_hybrid.py and run. Edit CFG at top for paths / hyperparams.

import os
import random
from collections import Counter
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple, Set

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm.auto import tqdm

# -------------------------
# Config
# -------------------------
@dataclass
class CFG:
    """All paths and hyper-parameters for the MuRIL hybrid classifier.

    A single instance is created at module level and read by the dataset,
    the model classes and ``main``. Defaults are evaluated once, when the
    class body runs.
    """
    data_csv: str = "tamil_sentiment_full.csv"    # tab-separated despite the name: label \t text, read without a header row
    output_dir: str = "outputs_muril_hybrid"      # checkpoints are written here
    model_name: str = "google/muril-base-cased"   # MuRIL for Indic languages
    device: str = "cuda" if torch.cuda.is_available() else "cpu"  # resolved when the class is defined
    seed: int = 42

    # tokenisation / character features
    max_len: int = 64               # tokenizer max_length (inputs are padded/truncated to this)
    max_char_per_token: int = 12    # characters kept per token for the char-CNN
    min_class_samples: int = 3      # labels with fewer rows than this are dropped in main()

    # architecture
    char_emb_dim: int = 50
    char_out: int = 96              # split evenly across the char-CNN's kernel sizes
    cnn_out_channels: int = 128     # output channels per token-level kernel size
    cnn_kernel_sizes: Tuple[int, ...] = (2, 3, 4)
    hidden_dim: int = 256           # BiGRU output width (hidden_dim // 2 per direction)
    gru_layers: int = 1
    attn_dim: int = 128
    dropout: float = 0.3

    # TF-IDF / aux
    tfidf_dim: int = 5000           # TfidfVectorizer max_features and input width of the TF-IDF projection
    tfidf_proj_dim: int = 64
    aux_dim: int = 10               # aux vectors are zero-padded / truncated to this length

    # training
    epochs: int = 4
    batch_size: int = 32            # train batch size; val/test loaders use twice this
    lr_bert: float = 2e-5           # learning rate for parameters named "bert.*"
    lr_head: float = 1e-3           # learning rate for every other parameter
    weight_decay: float = 1e-6      # not applied to biases or LayerNorm weights
    warmup_ratio: float = 0.06      # fraction of total optimizer steps used for linear warm-up
    grad_clip: float = 1.0          # max gradient norm
    num_workers: int = 0  # set to 0 for notebooks; >0 for scripts
    save_best: bool = True          # checkpoint whenever validation macro-F1 improves

# Module-level configuration instance shared by everything below; the output
# directory is created as an import-time side effect.
cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# -------------------------
# Utilities
# -------------------------
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs with the same value.

    When CUDA is available, every GPU generator is seeded as well and cuDNN
    is switched to deterministic mode with auto-tuning (benchmark) disabled.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once at import time, before any split or model is created.
seed_everything(cfg.seed)

def read_data(path: str):
    """Load a header-less, tab-separated ``label<TAB>text`` file.

    Malformed lines are skipped by the parser, rows missing either field are
    dropped, and the ``text`` column is coerced to ``str``. The original row
    index is kept (it is not reset).
    """
    parsed = pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['label', 'text'],
        engine='python',
        on_bad_lines='skip',
    )
    complete = parsed.dropna(subset=['text', 'label'])
    return complete.assign(text=complete['text'].astype(str))

# Aux features
def add_lexicon_features(tokens: List[str]) -> List[float]:
    """Return ``[positive_ratio, negative_ratio]`` for a token list.

    Each ratio is the number of tokens found in a small built-in Tamil
    sentiment lexicon divided by ``len(tokens) + 1e-6`` (the epsilon keeps
    an empty token list from dividing by zero).
    """
    positive_lexicon = {"நல்ல", "சிறந்த", "அற்புதம்", "மகிழ்ச்சி"}
    negative_lexicon = {"மோசம்", "கெட்ட", "வருத்தம்", "தவறு"}

    pos_hits = 0
    neg_hits = 0
    for tok in tokens:
        if tok in positive_lexicon:
            pos_hits += 1
        if tok in negative_lexicon:
            neg_hits += 1

    denominator = len(tokens) + 1e-6
    return [pos_hits / denominator, neg_hits / denominator]

def compute_aux_features(text: str) -> List[float]:
    """Build the 10-element hand-crafted feature vector for one text.

    Order of the returned values:
      0. whitespace-token count
      1. character count
      2. count of characters whose code point exceeds 10000
         (presumably a proxy for emoji/symbols — confirm intent)
      3. count of ``?!.,;:`` characters
      4. 1.0 if any character lower-cases into the range 'a'..'z'
      5. 1.0 if any character lies in U+0B80..U+0BFF (the Tamil block)
      6. mean token length (0.0 when there are no tokens)
      7. upper-case characters divided by ``len(text) + 1``
      8-9. positive / negative lexicon ratios from ``add_lexicon_features``
    """
    words = text.split()
    n_chars = len(text)

    high_codepoints = sum(1 for ch in text if ord(ch) > 10000)
    punctuation = sum(1 for ch in text if ch in '?!.,;:')
    has_latin = any('a' <= ch.lower() <= 'z' for ch in text)
    has_tamil = any('\u0B80' <= ch <= '\u0BFF' for ch in text)
    mean_word_len = (sum(len(w) for w in words) / len(words)) if words else 0.0
    upper_ratio = sum(1 for ch in text if ch.isupper()) / (n_chars + 1)

    features = [
        float(len(words)),
        float(n_chars),
        float(high_codepoints),
        float(punctuation),
        1.0 if has_latin else 0.0,
        1.0 if has_tamil else 0.0,
        float(mean_word_len),
        float(upper_ratio),
    ]
    features.extend(add_lexicon_features(words))
    return features

# -------------------------
# Tokenizer / char helpers
# -------------------------
# Fast tokenizer for cfg.model_name, loaded once at import time and used as a
# module-level global by main().
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

def build_char_vocab_from_token_strings(token_strings_list: List[List[str]], min_freq=1, max_chars=12):
    """Build a character -> index map from tokenised texts.

    Every ``##`` marker is removed from each token and only its first
    ``max_chars`` characters are counted. Index 0 is ``<pad>`` and index 1
    is ``<unk>``; characters seen at least ``min_freq`` times follow in
    first-seen order.
    """
    freq = Counter(
        ch
        for token_strings in token_strings_list
        for piece in token_strings
        for ch in piece.replace("##", "")[:max_chars]
    )

    vocab = {'<pad>': 0, '<unk>': 1}
    frequent = [ch for ch, n in freq.items() if n >= min_freq]
    # Single characters can never collide with the two multi-character
    # special tokens, so indices simply continue from 2.
    vocab.update((ch, idx) for idx, ch in enumerate(frequent, start=len(vocab)))
    return vocab

def tokens_to_char_ids(token_strings: List[str], char2idx: Dict[str,int], max_chars=12, max_tokens=64):
    """Encode tokens as a ``max_tokens`` x ``max_chars`` grid of char ids.

    Each token has its ``##`` markers removed, is cut to ``max_chars``
    characters, mapped through ``char2idx`` (unknown characters become
    ``<unk>``) and right-padded with ``<pad>``. Tokens beyond ``max_tokens``
    are dropped; missing rows are filled with all-``<pad>`` rows.
    """
    def encode(piece):
        chars = piece.replace("##", "")[:max_chars]
        row = [char2idx.get(ch, char2idx['<unk>']) for ch in chars]
        return row + [char2idx['<pad>']] * (max_chars - len(row))

    rows = [encode(piece) for piece in token_strings[:max_tokens]]
    missing = max_tokens - len(rows)
    rows.extend([char2idx['<pad>']] * max_chars for _ in range(missing))
    return rows

# -------------------------
# Dataset
# -------------------------
class BertHybridDataset(Dataset):
    """Dataset that precomputes every input of the hybrid model.

    At construction time all texts are tokenised (padded/truncated to
    ``cfg.max_len``), converted to per-token character ids, and turned into
    auxiliary feature vectors. TF-IDF rows come from ``tfidf_matrix`` when it
    is supplied; otherwise they are computed with ``tfidf_vectorizer``.

    Args:
        records: sequence of dicts with at least ``'text'`` and ``'label'``.
        label_map: maps each record label to an integer class id.
        tokenizer: callable tokenizer that also offers
            ``convert_ids_to_tokens``.
        tfidf_vectorizer: fitted vectorizer with ``transform``, or ``None``.
        char2idx: character vocabulary used for the char-CNN inputs.
        cfg: configuration (``max_len``, ``max_char_per_token``,
            ``aux_dim`` and ``tfidf_dim`` are read).
        tfidf_matrix: optional precomputed TF-IDF matrix with one row per
            record; avoids transforming the texts a second time.

    Raises:
        ValueError: if ``tfidf_matrix`` does not have one row per record.
    """

    def __init__(self, records, label_map, tokenizer, tfidf_vectorizer, char2idx, cfg: CFG, tfidf_matrix=None):
        self.records = records
        self.label_map = label_map
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer
        self.char2idx = char2idx
        self.cfg = cfg

        texts = [r['text'] for r in records]
        enc = tokenizer(texts, padding='max_length', truncation=True, max_length=cfg.max_len, return_tensors='np')
        self.input_ids = enc['input_ids']
        self.attention_mask = enc['attention_mask']
        # Some tokenizers do not emit segment ids; fall back to all zeros.
        self.token_type_ids = enc['token_type_ids'] if 'token_type_ids' in enc else np.zeros_like(self.input_ids)

        self.char_ids = [
            tokens_to_char_ids(
                tokenizer.convert_ids_to_tokens(seq),
                char2idx,
                max_chars=cfg.max_char_per_token,
                max_tokens=cfg.max_len,
            )
            for seq in self.input_ids
        ]

        if tfidf_matrix is not None:
            # Reuse the caller's matrix instead of recomputing it.
            if tfidf_matrix.shape[0] != len(records):
                raise ValueError(
                    f"tfidf_matrix has {tfidf_matrix.shape[0]} rows but there are {len(records)} records"
                )
            self.tfidf = tfidf_matrix
        elif tfidf_vectorizer is not None:
            self.tfidf = tfidf_vectorizer.transform(texts)
        else:
            self.tfidf = None

        self.aux = [compute_aux_features(t) for t in texts]

    def __len__(self):
        return len(self.records)

    def _tfidf_row(self, idx):
        """Return row ``idx`` as a dense float32 vector of length ``cfg.tfidf_dim``.

        The row is zero-padded or truncated so its width always matches the
        model's TF-IDF projection, even when the fitted vocabulary is smaller
        than ``cfg.tfidf_dim``. Without a TF-IDF source the vector is all zeros.
        """
        dim = self.cfg.tfidf_dim
        dense = np.zeros(dim, dtype=np.float32)
        if self.tfidf is None:
            return dense
        row = self.tfidf[idx]
        # Sparse rows expose toarray(); dense inputs are used as they are.
        row = row.toarray() if hasattr(row, "toarray") else np.asarray(row)
        row = row.ravel().astype(np.float32)
        width = min(dim, row.shape[0])
        dense[:width] = row[:width]
        return dense

    def __getitem__(self, idx):
        """Return one example as a dict of tensors keyed like the model's forward arguments, plus ``'label'``."""
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.attention_mask[idx], dtype=torch.long)
        token_type_ids = torch.tensor(self.token_type_ids[idx], dtype=torch.long)

        char_ids = torch.tensor(self.char_ids[idx], dtype=torch.long)  # (T, Cchars)

        # Zero-pad, then truncate, so the aux vector is exactly aux_dim long.
        aux = self.aux[idx] + [0.0] * max(0, self.cfg.aux_dim - len(self.aux[idx]))
        aux = torch.tensor(aux[:self.cfg.aux_dim], dtype=torch.float32)

        tfidf_vec = torch.tensor(self._tfidf_row(idx), dtype=torch.float32)

        label = torch.tensor(self.label_map[self.records[idx]['label']], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'char_ids': char_ids,
            'aux': aux,
            'tfidf': tfidf_vec,
            'label': label
        }

# -------------------------
# Model components
# -------------------------
class CharCNN(nn.Module):
    """Character-level CNN producing one feature vector per token.

    Characters are embedded, convolved with kernels spanning 2, 3 and 4
    characters, max-pooled over the character axis and concatenated.
    ``out_dim_actual`` holds the resulting per-token feature width.
    """

    def __init__(self, char_vocab_size:int, cfg:CFG):
        super().__init__()
        widths = (2, 3, 4)
        # cfg.char_out is shared evenly between the kernel widths (at least 1 filter each).
        filters_per_width = max(1, cfg.char_out // len(widths))

        self.char_emb = nn.Embedding(char_vocab_size, cfg.char_emb_dim, padding_idx=0)
        self.kernel_sizes = widths
        conv_layers = []
        for width in widths:
            conv_layers.append(nn.Conv2d(1, filters_per_width, (width, cfg.char_emb_dim)))
        self.convs = nn.ModuleList(conv_layers)
        self.dropout = nn.Dropout(cfg.dropout)
        self.out_dim_actual = filters_per_width * len(widths)

    def forward(self, x_char):
        """Map char ids of shape (B, T, C) to token features of shape (B, T, out_dim_actual)."""
        batch, n_tokens, n_chars = x_char.size()
        embedded = self.char_emb(x_char)                                         # (B, T, C, char_emb_dim)
        windows = embedded.view(batch * n_tokens, n_chars, -1).unsqueeze(1)      # (B*T, 1, C, char_emb_dim)

        pooled_per_width = []
        for conv in self.convs:
            activated = F.relu(conv(windows)).squeeze(3)                         # (B*T, filters, L_i)
            pooled_per_width.append(F.max_pool1d(activated, activated.size(2)).squeeze(2))  # (B*T, filters)

        token_feats = torch.cat(pooled_per_width, dim=1).view(batch, n_tokens, -1)  # (B, T, out_dim)
        return self.dropout(token_feats)

class BERTHybrid(nn.Module):
    """Late-fusion classifier over a transformer encoder and side features.

    Per token, the encoder's last hidden state is concatenated with CharCNN
    features, passed through parallel 1-D convolutions, a bidirectional GRU
    and additive attention pooling. The pooled sequence vector is then fused
    with the encoder's pooled/[CLS] vector, a projected TF-IDF vector and a
    projected auxiliary feature vector, and classified by a small MLP.

    Args:
        bert_model_name: name or path passed to ``AutoModel.from_pretrained``.
        char_vocab_size: number of entries in the character vocabulary.
        cfg: configuration with the architecture hyper-parameters.
        num_labels: number of output classes.
    """

    def __init__(self, bert_model_name: str, char_vocab_size: int, cfg: CFG, num_labels: int):
        super().__init__()
        self.cfg = cfg
        self.bert = AutoModel.from_pretrained(bert_model_name)
        bert_hidden = self.bert.config.hidden_size

        self.char_cnn = CharCNN(char_vocab_size, cfg)
        token_in_dim = bert_hidden + self.char_cnn.out_dim_actual

        self.word_cnns = nn.ModuleList([
            nn.Conv1d(token_in_dim, cfg.cnn_out_channels, k, padding=k//2)
            for k in cfg.cnn_kernel_sizes
        ])
        cnn_output_dim = cfg.cnn_out_channels * len(cfg.cnn_kernel_sizes)

        # The BiGRU emits 2 * (hidden_dim // 2) features. Deriving downstream
        # widths from that value keeps the layers consistent even when
        # cfg.hidden_dim is odd (identical to hidden_dim when it is even).
        gru_hidden = cfg.hidden_dim // 2
        seq_dim = 2 * gru_hidden
        aux_proj_dim = 32

        self.bigru = nn.GRU(cnn_output_dim, gru_hidden, num_layers=cfg.gru_layers,
                           bidirectional=True, batch_first=True, dropout=cfg.dropout if cfg.gru_layers > 1 else 0)
        self.attn_proj = nn.Linear(seq_dim, cfg.attn_dim)
        self.attn_v = nn.Linear(cfg.attn_dim, 1, bias=False)

        self.tfidf_proj = nn.Linear(cfg.tfidf_dim, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, aux_proj_dim)

        fusion_dim = bert_hidden + seq_dim + cfg.tfidf_proj_dim + aux_proj_dim
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, token_type_ids, char_ids, aux, tfidf):
        """Return class logits of shape (B, num_labels).

        ``input_ids``, ``attention_mask`` and ``token_type_ids`` are (B, T);
        ``char_ids`` is (B, T, C); ``aux`` is (B, aux_dim) and ``tfidf`` is
        (B, tfidf_dim).
        """
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask,
                             token_type_ids=token_type_ids, return_dict=True)
        last_hidden = bert_out.last_hidden_state    # (B, T, H)
        # Prefer the encoder's pooler output; fall back to the first token's state.
        pooled_cls = getattr(bert_out, 'pooler_output', None)
        if pooled_cls is None:
            pooled_cls = last_hidden[:, 0, :]

        char_feats = self.char_cnn(char_ids)       # (B, T, char_dim)
        x = torch.cat([last_hidden, char_feats], dim=-1)  # (B, T, token_in_dim)
        x_permuted = x.permute(0, 2, 1)             # (B, token_in_dim, T)

        cnn_feats = [F.relu(conv(x_permuted)) for conv in self.word_cnns]  # list (B, out_ch, L_i)
        # padding=k//2 yields T+1 positions for even k and T for odd k, so
        # every map is cropped to the shortest length before concatenation.
        min_len = min(f.size(2) for f in cnn_feats)
        cnn_feats = [f[:, :, :min_len] for f in cnn_feats]

        cnn_cat = torch.cat(cnn_feats, dim=1)  # (B, out_ch * num_kernels, T')
        x_seq = cnn_cat.permute(0, 2, 1)       # (B, T', cnn_output_dim)

        h, _ = self.bigru(x_seq)               # (B, T', seq_dim)
        a = torch.tanh(self.attn_proj(h))      # (B, T', attn_dim)
        scores = self.attn_v(a).squeeze(-1)    # (B, T')

        # Align the padding mask with T' (T' can be T+1 when all kernels are even).
        mask = attention_mask
        if mask.size(1) > scores.size(1):
            mask = mask[:, :scores.size(1)]
        elif mask.size(1) < scores.size(1):
            pad_amt = scores.size(1) - mask.size(1)
            mask = F.pad(mask, (0, pad_amt), value=0)

        # Use the dtype's own minimum so masking also works in half precision,
        # where a literal -1e9 is not representable.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        alpha = torch.softmax(scores, dim=1).unsqueeze(-1)
        seq_pooled = (h * alpha).sum(dim=1)   # (B, seq_dim)

        tfidf_p = torch.relu(self.tfidf_proj(tfidf))
        aux_p = torch.relu(self.aux_proj(aux))

        final_vec = torch.cat([pooled_cls, seq_pooled, tfidf_p, aux_p], dim=1)
        logits = self.classifier(final_vec)
        return logits

# -------------------------
# Training utils
# -------------------------
class FocalLoss(nn.Module):
    """Focal loss: cross-entropy scaled by ``(1 - p_true) ** gamma``, averaged over the batch."""

    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits, targets):
        """Return the mean focal loss for ``logits`` against integer ``targets``."""
        per_sample_ce = F.cross_entropy(logits, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the true class.
        p_true = torch.exp(-per_sample_ce)
        focal_weight = (1 - p_true) ** self.gamma
        return torch.mean(focal_weight * per_sample_ce)

def evaluate(model, dataloader, device):
    """Run ``model`` over ``dataloader`` without gradients.

    The model is switched to eval mode (and left there). Every batch entry
    except ``'label'`` is moved to ``device`` and passed to the model as a
    keyword argument. Returns ``(y_true, y_pred)`` as two lists of class ids.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in dataloader:
            targets = batch['label'].to(device)
            features = {
                name: tensor.to(device)
                for name, tensor in batch.items()
                if name != 'label'
            }
            predictions = model(**features).argmax(dim=1)
            gold += targets.cpu().tolist()
            predicted += predictions.cpu().tolist()
    return gold, predicted

# -------------------------
# Main pipeline
# -------------------------
def main():
    """Train, validate and test the hybrid classifier end to end.

    Steps: read the data, drop labels with fewer than
    ``cfg.min_class_samples`` rows, make stratified train/val/test splits,
    fit TF-IDF and the character vocabulary on the training split, train for
    ``cfg.epochs`` epochs with focal loss, checkpoint on the best validation
    macro-F1 (when ``cfg.save_best``), then report metrics on the test split
    using the best checkpoint if one was written.
    """
    print("Reading CSV...")
    df = read_data(cfg.data_csv)

    cnt = df['label'].value_counts()
    keep = cnt[cnt >= cfg.min_class_samples].index.tolist()
    df = df[df['label'].isin(keep)].reset_index(drop=True)

    labels_unique = sorted(df['label'].unique())
    label_map = {lab: i for i, lab in enumerate(labels_unique)}
    # Explicit ids/names let the reports cover every class even when a split
    # happens to contain no sample (or no prediction) of some class.
    class_ids = list(range(len(labels_unique)))
    class_names = [str(lab) for lab in labels_unique]

    data = df.to_dict(orient='records')
    lablist = [label_map[r['label']] for r in data]

    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=lablist)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=cfg.seed, stratify=[lablist[i] for i in train_idx])

    train_records = [data[i] for i in train_idx]
    val_records   = [data[i] for i in val_idx]
    test_records  = [data[i] for i in test_idx]
    print(f"Split sizes: Train={len(train_records)}, Val={len(val_records)}, Test={len(test_records)}")

    # TF-IDF (fitted on the training split only)
    tfidf_vec = TfidfVectorizer(max_features=cfg.tfidf_dim, ngram_range=(1,2), min_df=3, max_df=0.9)
    train_texts = [r['text'] for r in train_records]
    val_texts   = [r['text'] for r in val_records]
    test_texts  = [r['text'] for r in test_records]
    train_tfidf = tfidf_vec.fit_transform(train_texts)
    val_tfidf   = tfidf_vec.transform(val_texts)
    test_tfidf  = tfidf_vec.transform(test_texts)

    # max_features is only an upper bound; keep the model's TF-IDF input
    # width in sync with the vocabulary that was actually fitted.
    fitted_tfidf_dim = train_tfidf.shape[1]
    if fitted_tfidf_dim != cfg.tfidf_dim:
        print(f"TF-IDF vocabulary has {fitted_tfidf_dim} features; using that instead of tfidf_dim={cfg.tfidf_dim}")
        cfg.tfidf_dim = fitted_tfidf_dim

    # build char vocab from training tokens
    print("Building char vocab...")
    enc_train = tokenizer(train_texts, padding='max_length', truncation=True, max_length=cfg.max_len, return_tensors='np')
    token_strings_train = [tokenizer.convert_ids_to_tokens(seq) for seq in enc_train['input_ids']]
    char2idx = build_char_vocab_from_token_strings(token_strings_train, min_freq=1, max_chars=cfg.max_char_per_token)
    print(f"Char vocab size: {len(char2idx)}")

    # datasets & loaders
    train_ds = BertHybridDataset(train_records, label_map, tokenizer, tfidf_vec, char2idx, cfg, tfidf_matrix=train_tfidf)
    val_ds   = BertHybridDataset(val_records, label_map, tokenizer, tfidf_vec, char2idx, cfg, tfidf_matrix=val_tfidf)
    test_ds  = BertHybridDataset(test_records, label_map, tokenizer, tfidf_vec, char2idx, cfg, tfidf_matrix=test_tfidf)

    # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
    pin_memory = str(cfg.device).startswith("cuda")
    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers, pin_memory=pin_memory)
    val_loader   = DataLoader(val_ds,   batch_size=cfg.batch_size*2, shuffle=False, num_workers=cfg.num_workers, pin_memory=pin_memory)
    test_loader  = DataLoader(test_ds,  batch_size=cfg.batch_size*2, shuffle=False, num_workers=cfg.num_workers, pin_memory=pin_memory)

    # model
    model = BERTHybrid(cfg.model_name, len(char2idx), cfg, num_labels=len(label_map))
    model.to(cfg.device)
    print(f"Model params (trainable): {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # optimizer groups: encoder vs. head learning rates, each split into
    # decayed and non-decayed (bias / LayerNorm weight) parameters
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n,p in model.named_parameters() if n.startswith("bert.") and not any(nd in n for nd in no_decay)],
         "lr": cfg.lr_bert, "weight_decay": cfg.weight_decay},
        {"params": [p for n,p in model.named_parameters() if n.startswith("bert.") and any(nd in n for nd in no_decay)],
         "lr": cfg.lr_bert, "weight_decay": 0.0},
        {"params": [p for n,p in model.named_parameters() if not n.startswith("bert.") and not any(nd in n for nd in no_decay)],
         "lr": cfg.lr_head, "weight_decay": cfg.weight_decay},
        {"params": [p for n,p in model.named_parameters() if not n.startswith("bert.") and any(nd in n for nd in no_decay)],
         "lr": cfg.lr_head, "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters)
    total_steps = len(train_loader) * cfg.epochs
    warmup_steps = int(total_steps * cfg.warmup_ratio) if total_steps > 0 else 0
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=max(1, total_steps))
    criterion = FocalLoss(2.0)

    best_macro = -1.0
    best_ckpt = None

    # training loop
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}", leave=False)
        for batch in pbar:
            labels = batch['label'].to(cfg.device)
            inputs = {k: v.to(cfg.device) for k,v in batch.items() if k != 'label'}
            optimizer.zero_grad()
            logits = model(**inputs)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            optimizer.step()
            scheduler.step()
            pbar.set_postfix(loss=float(loss.item()))

        # validation
        y_true, y_pred = evaluate(model, val_loader, cfg.device)
        report = classification_report(y_true, y_pred, labels=class_ids, target_names=class_names,
                                       output_dict=True, zero_division=0)
        macro_f1 = float(report['macro avg']['f1-score'])
        print(f"Epoch {epoch+1}/{cfg.epochs} -> Val Macro F1: {macro_f1:.4f}")

        if cfg.save_best and macro_f1 > best_macro:
            best_macro = macro_f1
            best_ckpt = os.path.join(cfg.output_dir, f"best_macro_{macro_f1:.4f}.pt")
            # char2idx is stored too: its size fixes the char-embedding shape,
            # so the model cannot be rebuilt for inference without it.
            torch.save({'model_state_dict': model.state_dict(), 'cfg': asdict(cfg), 'label_map': label_map,
                        'char2idx': char2idx}, best_ckpt)
            print(f"Saved checkpoint to {best_ckpt}")

    # final evaluation (with the best validation checkpoint when one exists)
    if best_ckpt is not None and os.path.exists(best_ckpt):
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])
    print("Running final evaluation on test set...")
    y_true, y_pred = evaluate(model, test_loader, cfg.device)
    print("\nFinal Test Report:")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names, digits=4, zero_division=0))

# Run the full pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Reading CSV...
Split sizes: Train=33674, Val=3742, Test=6603
Building char vocab...
Char vocab size: 455


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Model params (trainable): 239,625,635


Train E1/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 1/4 -> Val Macro F1: 0.4933
Saved checkpoint to outputs_muril_hybrid/best_macro_0.4933.pt


Train E2/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 2/4 -> Val Macro F1: 0.4928


Train E3/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 3/4 -> Val Macro F1: 0.4945
Saved checkpoint to outputs_muril_hybrid/best_macro_0.4945.pt


Train E4/4:   0%|          | 0/1053 [00:00<?, ?it/s]

Epoch 4/4 -> Val Macro F1: 0.5120
Saved checkpoint to outputs_muril_hybrid/best_macro_0.5120.pt
Running final evaluation on test set...

Final Test Report:
                precision    recall  f1-score   support

Mixed_feelings     0.3699    0.1732    0.2359       739
      Negative     0.4319    0.4770    0.4533       784
      Positive     0.7512    0.8279    0.7877      3731
     not-Tamil     0.6886    0.6358    0.6611       313
 unknown_state     0.4525    0.4324    0.4423      1036

      accuracy                         0.6418      6603
     macro avg     0.5388    0.5093    0.5161      6603
  weighted avg     0.6208    0.6418    0.6261      6603

