MuRIL Transformer Embeddings , TF-IDF, CharCNN, BiLSTM Hybrid Model



In [2]:
# --- Step 1: Install necessary libraries, including Hugging Face Transformers ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm import tqdm # <--- FIX: Added the missing import for the progress bar

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_muril_hybrid_advanced"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3
    max_token_len: int = 128
    max_char_len: int = 256
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64
    muril_hidden_size: int = 768
    lstm_hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4
    epochs: int = 5
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_recurrent: float = 1e-4
    lr_head: float = 1e-3
    label_smoothing: float = 0.1
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [num_tokens, num_chars, 1.0 if any('\u0D00' <= ch <= '\u0D7F' for ch in text) else 0.0, 1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0, (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0, sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6), sum(1 for ch in text if ch in '?!.,;:'), sum(1 for ch in text if ord(ch) > 10000)]

# ------------------------
# Dataset Class
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, char2idx, tokenizer, cfg):
        self.records, self.tfidf_vectors, self.label_map, self.char2idx, self.tokenizer, self.cfg = records, tfidf_vectors, label_map, char2idx, tokenizer, cfg
    def __len__(self): return len(self.records)
    def __getitem__(self, idx):
        record = self.records[idx]; text = str(record['text'])
        inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=self.cfg.max_token_len, padding='max_length', truncation=True, return_tensors='pt')
        char_ids = [self.char2idx.get(c, self.char2idx['<unk>']) for c in text][:self.cfg.max_char_len]
        padded_chars = char_ids + [0] * (self.cfg.max_char_len - len(char_ids))
        return {
            "input_ids": inputs['input_ids'].squeeze(), "attention_mask": inputs['attention_mask'].squeeze(),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__(); self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0); self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)
    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2); x = self.conv(x); return F.max_pool1d(x, x.size(2)).squeeze(2)

class MurilHybridClassifier(nn.Module):
    def __init__(self, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        self.bilstm = nn.LSTM(cfg.muril_hidden_size, cfg.lstm_hidden_dim // 2, num_layers=cfg.lstm_layers, bidirectional=True, batch_first=True)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)
        classifier_input_dim = cfg.lstm_hidden_dim + cfg.char_out_dim + cfg.aux_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, 256), nn.ReLU(), nn.Dropout(cfg.dropout), nn.Linear(256, num_labels))

    def forward(self, input_ids, attention_mask, char_ids, aux, tfidf):
        muril_embeddings = self.muril(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        _, (h_n, _) = self.bilstm(muril_embeddings)
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        char_vec = self.char_cnn(char_ids)
        aux_vec = F.relu(self.aux_proj(aux))
        tfidf_vec = F.relu(self.tfidf_proj(tfidf))
        combined_features = torch.cat([hidden, char_vec, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map, class_weights):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.bilstm.parameters(), 'lr': cfg.lr_recurrent},
        {'params': model.char_cnn.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])
    num_training_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=cfg.label_smoothing)

    best_macro_f1 = -1.0
    print(f"\n--- Starting Advanced Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); scheduler.step(); pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1; torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt")); print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}

    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    train_labels_for_weights = [label_map[r['label']] for r in train_records]
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels_for_weights), y=train_labels_for_weights)
    print("Calculated Class Weights:", class_weights)

    print("Fitting TF-IDF Vectorizer...")
    train_texts = [r['text'] for r in train_records]; tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2)); train_tfidf = tfidf_vectorizer.fit_transform(train_texts); val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records]); test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    char2idx = {c: i+2 for i, c in enumerate(Counter(c for text in train_texts for c in text).keys())}
    char2idx['<pad>'] = 0; char2idx['<unk>'] = 1

    train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, char2idx, tokenizer, cfg)
    val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, char2idx, tokenizer, cfg)
    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, char2idx, tokenizer, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    print(f"Loading MURIL model: {cfg.model_name}...")
    model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)

    train_loop(train_loader, val_loader, model, cfg, label_map, class_weights)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Calculated Class Weights: [2.02036199 0.66443452]
Fitting TF-IDF Vectorizer...
Loading MURIL tokenizer: google/muril-base-cased...
Loading MURIL model: google/muril-base-cased...

--- Starting Advanced Training on cuda ---


Epoch 1/5: 100%|██████████| 224/224 [02:43<00:00,  1.37it/s, loss=0.482]


Epoch 1 -> Val Macro F1: 0.7840
🚀 New best model saved with Macro F1: 0.7840


Epoch 2/5: 100%|██████████| 224/224 [02:45<00:00,  1.36it/s, loss=0.639]


Epoch 2 -> Val Macro F1: 0.8444
🚀 New best model saved with Macro F1: 0.8444


Epoch 3/5: 100%|██████████| 224/224 [02:45<00:00,  1.35it/s, loss=0.701]


Epoch 3 -> Val Macro F1: 0.8549
🚀 New best model saved with Macro F1: 0.8549


Epoch 4/5: 100%|██████████| 224/224 [02:45<00:00,  1.35it/s, loss=0.266]


Epoch 4 -> Val Macro F1: 0.8570
🚀 New best model saved with Macro F1: 0.8570


Epoch 5/5: 100%|██████████| 224/224 [02:45<00:00,  1.35it/s, loss=0.59]


Epoch 5 -> Val Macro F1: 0.8608
🚀 New best model saved with Macro F1: 0.8608

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Negative       0.76      0.83      0.79       520
    Positive       0.94      0.91      0.93      1582

    accuracy                           0.89      2102
   macro avg       0.85      0.87      0.86      2102
weighted avg       0.90      0.89      0.89      2102



MuRIL Transformer Embeddings, CharCNN, TF-IDF, BiLSTM +Attention Hybrid Models

In [3]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_muril_attention_hybrid"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3
    max_token_len: int = 128
    max_char_len: int = 256
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64
    muril_hidden_size: int = 768
    lstm_hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4
    epochs: int = 5
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_recurrent: float = 1e-4
    lr_head: float = 1e-3
    label_smoothing: float = 0.1
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [num_tokens, num_chars, 1.0 if any('\u0D00' <= ch <= '\u0D7F' for ch in text) else 0.0, 1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0, (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0, sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6), sum(1 for ch in text if ch in '?!.,;:'), sum(1 for ch in text if ord(ch) > 10000)]

# ------------------------
# Dataset Class
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, char2idx, tokenizer, cfg):
        self.records, self.tfidf_vectors, self.label_map, self.char2idx, self.tokenizer, self.cfg = records, tfidf_vectors, label_map, char2idx, tokenizer, cfg
    def __len__(self): return len(self.records)
    def __getitem__(self, idx):
        record = self.records[idx]; text = str(record['text'])
        inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=self.cfg.max_token_len, padding='max_length', truncation=True, return_tensors='pt')
        char_ids = [self.char2idx.get(c, self.char2idx['<unk>']) for c in text][:self.cfg.max_char_len]
        padded_chars = char_ids + [0] * (self.cfg.max_char_len - len(char_ids))
        return {
            "input_ids": inputs['input_ids'].squeeze(), "attention_mask": inputs['attention_mask'].squeeze(),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__(); self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0); self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)
    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2); x = self.conv(x); return F.max_pool1d(x, x.size(2)).squeeze(2)

# --- NEW: Attention Mechanism ---
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, lstm_output, attention_mask):
        # lstm_output shape: (B, T, H)
        scores = self.attention(lstm_output).squeeze(-1)  # (B, T)
        # Mask out the padding tokens so they don't get attention
        scores = scores.masked_fill(attention_mask == 0, -1e9)
        weights = torch.softmax(scores, dim=1)  # (B, T)
        # Create context vector
        context = torch.bmm(weights.unsqueeze(1), lstm_output).squeeze(1) # (B, H)
        return context

class MurilHybridClassifier(nn.Module):
    def __init__(self, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        self.bilstm = nn.LSTM(cfg.muril_hidden_size, cfg.lstm_hidden_dim // 2, num_layers=cfg.lstm_layers, bidirectional=True, batch_first=True)
        # --- NEW: Attention layer ---
        self.attention = Attention(cfg.lstm_hidden_dim)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)
        classifier_input_dim = cfg.lstm_hidden_dim + cfg.char_out_dim + cfg.aux_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, 256), nn.ReLU(), nn.Dropout(cfg.dropout), nn.Linear(256, num_labels))

    def forward(self, input_ids, attention_mask, char_ids, aux, tfidf):
        muril_embeddings = self.muril(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_output, _ = self.bilstm(muril_embeddings)
        # --- NEW: Apply Attention mechanism ---
        # We use the original attention_mask from the tokenizer
        context_vector = self.attention(lstm_output, attention_mask)
        # ------------------------------------
        char_vec = self.char_cnn(char_ids)
        aux_vec = F.relu(self.aux_proj(aux))
        tfidf_vec = F.relu(self.tfidf_proj(tfidf))
        combined_features = torch.cat([context_vector, char_vec, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation (Same as before)
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map, class_weights):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.bilstm.parameters(), 'lr': cfg.lr_recurrent},
        {'params': model.attention.parameters(), 'lr': cfg.lr_head},
        {'params': model.char_cnn.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])
    num_training_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=cfg.label_smoothing)
    best_macro_f1 = -1.0
    print(f"\n--- Starting Advanced Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); scheduler.step(); pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1; torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt")); print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}

    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    train_labels_for_weights = [label_map[r['label']] for r in train_records]
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels_for_weights), y=train_labels_for_weights)
    print("Calculated Class Weights:", class_weights)

    print("Fitting TF-IDF Vectorizer...")
    train_texts = [r['text'] for r in train_records]; tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2)); train_tfidf = tfidf_vectorizer.fit_transform(train_texts); val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records]); test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    char2idx = {c: i+2 for i, c in enumerate(Counter(c for text in train_texts for c in text).keys())}
    char2idx['<pad>'] = 0; char2idx['<unk>'] = 1

    train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, char2idx, tokenizer, cfg)
    val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, char2idx, tokenizer, cfg)
    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, char2idx, tokenizer, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    print(f"Loading MURIL model: {cfg.model_name}...")
    model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)

    train_loop(train_loader, val_loader, model, cfg, label_map, class_weights)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Calculated Class Weights: [2.02036199 0.66443452]
Fitting TF-IDF Vectorizer...
Loading MURIL tokenizer: google/muril-base-cased...
Loading MURIL model: google/muril-base-cased...

--- Starting Advanced Training on cuda ---


Epoch 1/5: 100%|██████████| 224/224 [02:41<00:00,  1.38it/s, loss=0.517]


Epoch 1 -> Val Macro F1: 0.7937
🚀 New best model saved with Macro F1: 0.7937


Epoch 2/5: 100%|██████████| 224/224 [02:45<00:00,  1.36it/s, loss=0.357]


Epoch 2 -> Val Macro F1: 0.8502
🚀 New best model saved with Macro F1: 0.8502


Epoch 3/5: 100%|██████████| 224/224 [02:45<00:00,  1.36it/s, loss=0.43]


Epoch 3 -> Val Macro F1: 0.8509
🚀 New best model saved with Macro F1: 0.8509


Epoch 4/5: 100%|██████████| 224/224 [02:45<00:00,  1.35it/s, loss=0.212]


Epoch 4 -> Val Macro F1: 0.8673
🚀 New best model saved with Macro F1: 0.8673


Epoch 5/5: 100%|██████████| 224/224 [02:45<00:00,  1.36it/s, loss=0.614]


Epoch 5 -> Val Macro F1: 0.8701
🚀 New best model saved with Macro F1: 0.8701

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Negative       0.75      0.84      0.79       520
    Positive       0.94      0.91      0.93      1582

    accuracy                           0.89      2102
   macro avg       0.85      0.87      0.86      2102
weighted avg       0.90      0.89      0.89      2102

