Muril Embedding + BiLSTM + CharCNN + Tf-IdF

In [1]:
# --- Step 1: Install necessary libraries, including Hugging Face Transformers ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F # <--- FIX: Added the missing import
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    # --- MODIFIED: Paths for Kannada data ---
    data_csv: str = "kannada_sentiment.csv"
    output_dir: str = "outputs_kannada_muril_hybrid_advanced"
    # --- End of Modification ---

    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 5 # Increased slightly for stability
    max_token_len: int = 128
    max_char_len: int = 256
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64
    muril_hidden_size: int = 768
    lstm_hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4
    epochs: int = 5
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_recurrent: float = 1e-4
    lr_head: float = 1e-3
    label_smoothing: float = 0.1
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

# --- MODIFIED: Label normalization for Kannada dataset ---
def normalize_label(l):
    s = str(l).lower().strip()
    if 'positive' in s: return 'Positive'
    if 'negative' in s: return 'Negative'
    if 'mixed feelings' in s or 'unknown state' in s: return 'Neutral'
    return 'Other' # 'not-Kannada' and any other labels will be filtered out

# --- MODIFIED: Auxiliary features to detect Kannada characters ---
def compute_aux_features(text: str) -> List[float]:
    toks = text.split()
    num_tokens = len(toks)
    num_chars = len(text)
    return [
        num_tokens,
        num_chars,
        1.0 if any('\u0C80' <= ch <= '\u0CFF' for ch in text) else 0.0, # Kannada Unicode range
        1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0,
        (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0,
        sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6),
        sum(1 for ch in text if ch in '?!.,;:'),
        sum(1 for ch in text if ord(ch) > 10000)
    ]
# --- End of Modification ---

# ------------------------
# Dataset Class
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, char2idx, tokenizer, cfg):
        self.records = records
        self.tfidf_vectors = tfidf_vectors
        self.label_map = label_map
        self.char2idx = char2idx
        self.tokenizer = tokenizer
        self.cfg = cfg

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        text = str(record['text'])

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.cfg.max_token_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        char_ids = [self.char2idx.get(c, self.char2idx['<unk>']) for c in text][:self.cfg.max_char_len]
        padded_chars = char_ids + [self.char2idx['<pad>']] * (self.cfg.max_char_len - len(char_ids))

        return {
            "input_ids": inputs['input_ids'].squeeze(),
            "attention_mask": inputs['attention_mask'].squeeze(),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)

    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2)
        x = self.conv(x)
        return F.max_pool1d(x, x.size(2)).squeeze(2)

class MurilHybridClassifier(nn.Module):
    def __init__(self, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        self.bilstm = nn.LSTM(cfg.muril_hidden_size, cfg.lstm_hidden_dim // 2, num_layers=cfg.lstm_layers, bidirectional=True, batch_first=True)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)

        classifier_input_dim = cfg.lstm_hidden_dim + cfg.char_out_dim + cfg.aux_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, char_ids, aux, tfidf):
        muril_embeddings = self.muril(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        _, (h_n, _) = self.bilstm(muril_embeddings)
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        char_vec = self.char_cnn(char_ids)
        aux_vec = F.relu(self.aux_proj(aux))
        tfidf_vec = F.relu(self.tfidf_proj(tfidf))

        combined_features = torch.cat([hidden, char_vec, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['char_ids'].to(device),
                batch['aux'].to(device),
                batch['tfidf'].to(device)
            )
            y_true.extend(batch['label'].numpy())
            y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map, class_weights):
    device = cfg.device
    model.to(device)

    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.bilstm.parameters(), 'lr': cfg.lr_recurrent},
        {'params': model.char_cnn.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])

    num_training_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=cfg.label_smoothing)

    best_macro_f1 = -1.0
    print(f"\n--- Starting Advanced Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['char_ids'].to(device),
                batch['aux'].to(device),
                batch['tfidf'].to(device)
            )
            loss = criterion(logits, batch['label'].to(device))
            loss.backward()
            optimizer.step()
            scheduler.step()
            pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt"))
            print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    try:
        df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], on_bad_lines='skip', engine='python')
    except FileNotFoundError:
        print(f"ERROR: The data file '{cfg.data_csv}' was not found.")
        print("Please make sure you have uploaded the Kannada sentiment dataset.")
        return

    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)
    df['label'] = df['label'].apply(normalize_label)
    df = df[df['label'] != 'Other']

    label_counts = df['label'].value_counts()
    classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index
    df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)

    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}
    print("Label Mapping:", label_map)

    data = df.to_dict(orient="records")
    labels = [label_map[r['label']] for r in data]

    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    train_labels_for_weights = [label_map[r['label']] for r in train_records]
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels_for_weights), y=train_labels_for_weights)
    print("Calculated Class Weights:", class_weights)

    print("Fitting TF-IDF Vectorizer...")
    train_texts = [r['text'] for r in train_records]
    tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2))
    train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
    val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records])
    test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

    char_counts = Counter(c for text in train_texts for c in text)
    char2idx = {c: i+2 for i, c in enumerate(char_counts.keys())}
    char2idx['<pad>'] = 0
    char2idx['<unk>'] = 1

    train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, char2idx, tokenizer, cfg)
    val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, char2idx, tokenizer, cfg)
    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, char2idx, tokenizer, cfg)

    # Use 0 workers if running in a notebook environment that has issues with multiprocessing
    num_workers = 0 if 'google.colab' in str(get_ipython()) else 2

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=num_workers)

    print(f"Initializing Hybrid MURIL model...")
    model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)

    train_loop(train_loader, val_loader, model, cfg, label_map, class_weights)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Label Mapping: {'Negative': 0, 'Neutral': 1, 'Positive': 2}
Calculated Class Weights: [1.46778989 1.42130518 0.61914716]
Fitting TF-IDF Vectorizer...
Loading MURIL tokenizer: google/muril-base-cased...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Initializing Hybrid MURIL model...


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]


--- Starting Advanced Training on cuda ---


Epoch 1/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5691
🚀 New best model saved with Macro F1: 0.5691


Epoch 2/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6307
🚀 New best model saved with Macro F1: 0.6307


Epoch 3/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6693
🚀 New best model saved with Macro F1: 0.6693


Epoch 4/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6724
🚀 New best model saved with Macro F1: 0.6724


Epoch 5/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 5 -> Val Macro F1: 0.6603

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Negative       0.60      0.71      0.65       297
     Neutral       0.63      0.59      0.61       307
    Positive       0.82      0.78      0.80       703

    accuracy                           0.72      1307
   macro avg       0.68      0.69      0.69      1307
weighted avg       0.73      0.72      0.72      1307



MuRIL Embedding + Tf-IdF + BiLSTM + Attention + CharCNN

In [2]:
# --- Step 1: Install necessary libraries, including Hugging Face Transformers ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "kannada_sentiment.csv"
    output_dir: str = "outputs_kannada_muril_attention_hybrid" # Updated output dir
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 5
    max_token_len: int = 128
    max_char_len: int = 256
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64
    muril_hidden_size: int = 768
    lstm_hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4
    epochs: int = 5
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_recurrent: float = 1e-4
    lr_head: float = 1e-3
    label_smoothing: float = 0.1
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower().strip()
    if 'positive' in s: return 'Positive'
    if 'negative' in s: return 'Negative'
    if 'mixed feelings' in s or 'unknown state' in s: return 'Neutral'
    return 'Other'

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [
        num_tokens, num_chars,
        1.0 if any('\u0C80' <= ch <= '\u0CFF' for ch in text) else 0.0,
        1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0,
        (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0,
        sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6),
        sum(1 for ch in text if ch in '?!.,;:'),
        sum(1 for ch in text if ord(ch) > 10000)
    ]

# ------------------------
# Dataset Class
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, char2idx, tokenizer, cfg):
        self.records, self.tfidf_vectors, self.label_map, self.char2idx, self.tokenizer, self.cfg = \
            records, tfidf_vectors, label_map, char2idx, tokenizer, cfg

    def __len__(self): return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]; text = str(record['text'])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.cfg.max_token_len,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        char_ids = [self.char2idx.get(c, self.char2idx['<unk>']) for c in text][:self.cfg.max_char_len]
        padded_chars = char_ids + [self.char2idx['<pad>']] * (self.cfg.max_char_len - len(char_ids))
        return {
            "input_ids": inputs['input_ids'].squeeze(),
            "attention_mask": inputs['attention_mask'].squeeze(),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)

    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2)
        x = self.conv(x)
        return F.max_pool1d(x, x.size(2)).squeeze(2)

# --- NEW: Attention Mechanism ---
class Attention(nn.Module):
    def __init__(self, feature_dim):
        super(Attention, self).__init__()
        self.attention_fc = nn.Linear(feature_dim, 1)

    def forward(self, features):
        # features shape: (batch_size, seq_len, feature_dim)
        attention_scores = self.attention_fc(features).squeeze(-1)
        attention_weights = F.softmax(attention_scores, dim=1).unsqueeze(1)
        # attention_weights shape: (batch_size, 1, seq_len)
        context_vector = torch.bmm(attention_weights, features).squeeze(1)
        # context_vector shape: (batch_size, feature_dim)
        return context_vector
# --- End of New Section ---

class MurilHybridClassifier(nn.Module):
    def __init__(self, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        self.bilstm = nn.LSTM(cfg.muril_hidden_size, cfg.lstm_hidden_dim // 2, num_layers=cfg.lstm_layers, bidirectional=True, batch_first=True)
        # --- MODIFIED: Added Attention Layer ---
        self.attention = Attention(cfg.lstm_hidden_dim)
        # --- End of Modification ---
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)

        classifier_input_dim = cfg.lstm_hidden_dim + cfg.char_out_dim + cfg.aux_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, char_ids, aux, tfidf):
        muril_embeddings = self.muril(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # --- MODIFIED: Apply attention to BiLSTM output ---
        lstm_out, _ = self.bilstm(muril_embeddings)
        attention_vec = self.attention(lstm_out)
        # --- End of Modification ---
        char_vec = self.char_cnn(char_ids)
        aux_vec = F.relu(self.aux_proj(aux))
        tfidf_vec = F.relu(self.tfidf_proj(tfidf))
        combined_features = torch.cat([attention_vec, char_vec, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                batch['input_ids'].to(device), batch['attention_mask'].to(device),
                batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device)
            )
            y_true.extend(batch['label'].numpy())
            y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map, class_weights):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.bilstm.parameters(), 'lr': cfg.lr_recurrent},
        {'params': model.attention.parameters(), 'lr': cfg.lr_recurrent}, # Added attention params
        {'params': model.char_cnn.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])
    num_training_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=cfg.label_smoothing)

    best_macro_f1 = -1.0
    print(f"\n--- Starting Training (with Attention) on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(
                batch['input_ids'].to(device), batch['attention_mask'].to(device),
                batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device)
            )
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); scheduler.step()
            pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model_attention.pt"))
            print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    try:
        df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], on_bad_lines='skip', engine='python')
    except FileNotFoundError:
        print(f"ERROR: The data file '{cfg.data_csv}' was not found.")
        print("Please make sure you have uploaded the Kannada sentiment dataset.")
        return

    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str)
    df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts()
    classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index
    df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}
    print("Label Mapping:", label_map)
    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])
    train_labels_for_weights = [label_map[r['label']] for r in train_records]
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels_for_weights), y=train_labels_for_weights)
    print("Calculated Class Weights:", class_weights)

    print("Fitting TF-IDF Vectorizer...")
    train_texts = [r['text'] for r in train_records]
    tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2))
    train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
    val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records])
    test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    char_counts = Counter(c for text in train_texts for c in text)
    char2idx = {c: i+2 for i, c in enumerate(char_counts.keys())}
    char2idx['<pad>'] = 0; char2idx['<unk>'] = 1

    train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, char2idx, tokenizer, cfg)
    val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, char2idx, tokenizer, cfg)
    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, char2idx, tokenizer, cfg)

    # Use 0 workers for compatibility with some notebook environments
    num_workers = 0
    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=num_workers)

    print(f"Initializing Hybrid MURIL model with Attention...")
    model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)

    train_loop(train_loader, val_loader, model, cfg, label_map, class_weights)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model_attention.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Label Mapping: {'Negative': 0, 'Neutral': 1, 'Positive': 2}
Calculated Class Weights: [1.46778989 1.42130518 0.61914716]
Fitting TF-IDF Vectorizer...
Loading MURIL tokenizer: google/muril-base-cased...
Initializing Hybrid MURIL model with Attention...

--- Starting Training (with Attention) on cuda ---


Epoch 1/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5479
🚀 New best model saved with Macro F1: 0.5479


Epoch 2/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6469
🚀 New best model saved with Macro F1: 0.6469


Epoch 3/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6412


Epoch 4/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6526
🚀 New best model saved with Macro F1: 0.6526


Epoch 5/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 5 -> Val Macro F1: 0.6603
🚀 New best model saved with Macro F1: 0.6603

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Negative       0.62      0.67      0.64       297
     Neutral       0.56      0.64      0.60       307
    Positive       0.81      0.73      0.77       703

    accuracy                           0.70      1307
   macro avg       0.66      0.68      0.67      1307
weighted avg       0.71      0.70      0.70      1307



MuRIL Embedding + Tf-IDF + BiLSTM + Attention + CharCNN + Cross-Folding

In [3]:
# --- Step 1: Install necessary libraries, including Hugging Face Transformers ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold # <--- NEW: Import for cross-validation
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "kannada_sentiment.csv"
    output_dir: str = "outputs_kannada_cv_attention_hybrid" # Updated output dir
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_folds: int = 5 # <--- NEW: Number of folds for cross-validation
    min_class_samples: int = 5
    max_token_len: int = 128
    max_char_len: int = 256
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64
    muril_hidden_size: int = 768
    lstm_hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4
    epochs: int = 4 # Reduced epochs per fold as we train multiple models
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_recurrent: float = 1e-4
    lr_head: float = 1e-3
    label_smoothing: float = 0.1
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower().strip()
    if 'positive' in s: return 'Positive'
    if 'negative' in s: return 'Negative'
    if 'mixed feelings' in s or 'unknown state' in s: return 'Neutral'
    return 'Other'

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [
        num_tokens, num_chars,
        1.0 if any('\u0C80' <= ch <= '\u0CFF' for ch in text) else 0.0,
        1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0,
        (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0,
        sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6),
        sum(1 for ch in text if ch in '?!.,;:'), sum(1 for ch in text if ord(ch) > 10000)
    ]

# ------------------------
# Dataset Class
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, char2idx, tokenizer, cfg):
        self.records, self.tfidf_vectors, self.label_map, self.char2idx, self.tokenizer, self.cfg = \
            records, tfidf_vectors, label_map, char2idx, tokenizer, cfg
    def __len__(self): return len(self.records)
    def __getitem__(self, idx):
        record = self.records[idx]; text = str(record['text'])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.cfg.max_token_len,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        char_ids = [self.char2idx.get(c, self.char2idx['<unk>']) for c in text][:self.cfg.max_char_len]
        padded_chars = char_ids + [self.char2idx['<pad>']] * (self.cfg.max_char_len - len(char_ids))
        return {
            "input_ids": inputs['input_ids'].squeeze(), "attention_mask": inputs['attention_mask'].squeeze(),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__(); self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0); self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)
    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2); x = self.conv(x); return F.max_pool1d(x, x.size(2)).squeeze(2)

class Attention(nn.Module):
    def __init__(self, feature_dim):
        super(Attention, self).__init__(); self.attention_fc = nn.Linear(feature_dim, 1)
    def forward(self, features):
        attn_scores = self.attention_fc(features).squeeze(-1); attn_weights = F.softmax(attn_scores, dim=1).unsqueeze(1); context_vec = torch.bmm(attn_weights, features).squeeze(1); return context_vec

class MurilHybridClassifier(nn.Module):
    def __init__(self, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        self.bilstm = nn.LSTM(cfg.muril_hidden_size, cfg.lstm_hidden_dim // 2, num_layers=cfg.lstm_layers, bidirectional=True, batch_first=True)
        self.attention = Attention(cfg.lstm_hidden_dim)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)
        classifier_input_dim = cfg.lstm_hidden_dim + cfg.char_out_dim + cfg.aux_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, 256), nn.ReLU(), nn.Dropout(cfg.dropout), nn.Linear(256, num_labels))
    def forward(self, input_ids, attention_mask, char_ids, aux, tfidf):
        muril_embeddings = self.muril(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_out, _ = self.bilstm(muril_embeddings); attention_vec = self.attention(lstm_out)
        char_vec = self.char_cnn(char_ids); aux_vec = F.relu(self.aux_proj(aux)); tfidf_vec = F.relu(self.tfidf_proj(tfidf))
        combined_features = torch.cat([attention_vec, char_vec, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

# --- MODIFIED: Train loop now accepts a fold number ---
def train_loop(train_loader, val_loader, model, cfg, label_map, class_weights, fold):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril}, {'params': model.bilstm.parameters(), 'lr': cfg.lr_recurrent},
        {'params': model.attention.parameters(), 'lr': cfg.lr_recurrent}, {'params': model.char_cnn.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head}, {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])
    num_training_steps = len(train_loader) * cfg.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, int(0.1 * num_training_steps), num_training_steps)
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=cfg.label_smoothing)
    best_macro_f1 = -1.0

    print(f"\n--- Starting Training for Fold {fold+1} on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            loss = criterion(logits, batch['label'].to(device)); loss.backward(); optimizer.step(); scheduler.step(); pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, f"best_model_fold_{fold+1}.pt"))
            print(f"🚀 New best model for Fold {fold+1} saved with Macro F1: {macro_f1:.4f}")
    return best_macro_f1
# --- End of Modification ---

# ------------------------
# Main Orchestration
# ------------------------
def main():
    try:
        df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], on_bad_lines='skip', engine='python')
    except FileNotFoundError:
        print(f"ERROR: The data file '{cfg.data_csv}' was not found."); return

    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str)
    df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts()
    classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index
    df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}
    print("Label Mapping:", label_map)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    char_counts = Counter(c for text in df['text'] for c in text)
    char2idx = {c: i+2 for i, c in enumerate(char_counts.keys())}; char2idx['<pad>'] = 0; char2idx['<unk>'] = 1

    # --- MODIFIED: Split data into a main training set and a final held-out test set ---
    train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=cfg.seed, stratify=df['label'])
    test_records = test_df.to_dict(orient="records")

    print("Fitting TF-IDF Vectorizer on the full training data...")
    tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2))
    tfidf_vectorizer.fit(train_val_df['text'])
    test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    # --- NEW: Cross-Validation Loop ---
    skf = StratifiedKFold(n_splits=cfg.num_folds, shuffle=True, random_state=cfg.seed)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_val_df, train_val_df['label'])):
        train_fold_df = train_val_df.iloc[train_idx]
        val_fold_df = train_val_df.iloc[val_idx]

        train_records = train_fold_df.to_dict(orient="records")
        val_records = val_fold_df.to_dict(orient="records")

        train_tfidf = tfidf_vectorizer.transform([r['text'] for r in train_records])
        val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records])

        train_labels = [label_map[r['label']] for r in train_records]
        class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
        print(f"\nFold {fold+1} Class Weights:", class_weights)

        train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, char2idx, tokenizer, cfg)
        val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, char2idx, tokenizer, cfg)

        train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=0)
        val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=0)

        # Re-initialize the model for each fold
        model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)

        fold_f1_score = train_loop(train_loader, val_loader, model, cfg, label_map, class_weights, fold)
        fold_scores.append(fold_f1_score)
    # --- End of Cross-Validation Section ---

    print(f"\n--- Cross-Validation Summary ---")
    print(f"Scores for each fold: {fold_scores}")
    print(f"Average Macro F1 over {cfg.num_folds} folds: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")

    print("\n--- Final Test Set Evaluation on Best Model ---")
    best_fold = np.argmax(fold_scores)
    print(f"Loading best model from Fold {best_fold+1}...")
    best_model_path = os.path.join(cfg.output_dir, f"best_model_fold_{best_fold+1}.pt")

    # Initialize a new model instance before loading the state_dict
    final_model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)
    final_model.load_state_dict(torch.load(best_model_path))
    final_model.to(cfg.device)

    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, char2idx, tokenizer, cfg)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=0)

    test_true, test_pred = evaluate(final_model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=list(label_map.keys()), zero_division=0))

if __name__ == '__main__':
    main()

Label Mapping: {'Negative': 0, 'Neutral': 1, 'Positive': 2}
Fitting TF-IDF Vectorizer on the full training data...

Fold 1 Class Weights: [1.46924603 1.41994247 0.61914716]

--- Starting Training for Fold 1 on cuda ---


Fold 1 Epoch 1/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5932
🚀 New best model for Fold 1 saved with Macro F1: 0.5932


Fold 1 Epoch 2/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6401
🚀 New best model for Fold 1 saved with Macro F1: 0.6401


Fold 1 Epoch 3/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6381


Fold 1 Epoch 4/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6547
🚀 New best model for Fold 1 saved with Macro F1: 0.6547

Fold 2 Class Weights: [1.46778989 1.42130518 0.61914716]

--- Starting Training for Fold 2 on cuda ---


Fold 2 Epoch 1/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5713
🚀 New best model for Fold 2 saved with Macro F1: 0.5713


Fold 2 Epoch 2/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6368
🚀 New best model for Fold 2 saved with Macro F1: 0.6368


Fold 2 Epoch 3/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6620
🚀 New best model for Fold 2 saved with Macro F1: 0.6620


Fold 2 Epoch 4/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6656
🚀 New best model for Fold 2 saved with Macro F1: 0.6656

Fold 3 Class Weights: [1.46778989 1.42130518 0.61914716]

--- Starting Training for Fold 3 on cuda ---


Fold 3 Epoch 1/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5863
🚀 New best model for Fold 3 saved with Macro F1: 0.5863


Fold 3 Epoch 2/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6582
🚀 New best model for Fold 3 saved with Macro F1: 0.6582


Fold 3 Epoch 3/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6616
🚀 New best model for Fold 3 saved with Macro F1: 0.6616


Fold 3 Epoch 4/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6551

Fold 4 Class Weights: [1.46778989 1.42130518 0.61914716]

--- Starting Training for Fold 4 on cuda ---


Fold 4 Epoch 1/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5930
🚀 New best model for Fold 4 saved with Macro F1: 0.5930


Fold 4 Epoch 2/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6952
🚀 New best model for Fold 4 saved with Macro F1: 0.6952


Fold 4 Epoch 3/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6882


Fold 4 Epoch 4/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.7017
🚀 New best model for Fold 4 saved with Macro F1: 0.7017

Fold 5 Class Weights: [1.46812025 1.42026206 0.61928651]

--- Starting Training for Fold 5 on cuda ---


Fold 5 Epoch 1/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5216
🚀 New best model for Fold 5 saved with Macro F1: 0.5216


Fold 5 Epoch 2/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6500
🚀 New best model for Fold 5 saved with Macro F1: 0.6500


Fold 5 Epoch 3/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6573
🚀 New best model for Fold 5 saved with Macro F1: 0.6573


Fold 5 Epoch 4/4:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6716
🚀 New best model for Fold 5 saved with Macro F1: 0.6716

--- Cross-Validation Summary ---
Scores for each fold: [0.6547289034194879, 0.665578614918806, 0.6615859218048938, 0.7016686577179684, 0.6715823130120313]
Average Macro F1 over 5 folds: 0.6710 (+/- 0.0163)

--- Final Test Set Evaluation on Best Model ---
Loading best model from Fold 4...
              precision    recall  f1-score   support

    Negative       0.61      0.72      0.66       223
     Neutral       0.59      0.65      0.62       230
    Positive       0.84      0.74      0.79       528

    accuracy                           0.71       981
   macro avg       0.68      0.70      0.69       981
weighted avg       0.73      0.71      0.72       981



MuRIL Embedding + Tf-IDF + BiGRU + attention + CharCNN 

In [5]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import random
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "kannada_sentiment.csv"
    output_dir: str = "outputs_kannada_bigru_attention_hybrid"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 5
    max_token_len: int = 128
    max_char_len: int = 256
    tfidf_max_features: int = 5000

    # Model Hyperparameters
    gru_hidden_dim: int = 256
    tfidf_proj_dim: int = 64
    char_out_dim: int = 100
    dropout: float = 0.3

    # Training Hyperparameters
    epochs: int = 5
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_recurrent: float = 1e-4
    lr_head: float = 1e-3
    label_smoothing: float = 0.1
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower().strip()
    if 'positive' in s: return 'Positive'
    if 'negative' in s: return 'Negative'
    if 'mixed feelings' in s or 'unknown state' in s: return 'Neutral'
    return 'Other'

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [
        num_tokens, num_chars, 1.0 if any('\u0C80' <= ch <= '\u0CFF' for ch in text) else 0.0,
        1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0,
        (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0,
        sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6),
        sum(1 for ch in text if ch in '?!.,;:'), sum(1 for ch in text if ord(ch) > 10000)
    ]

# ------------------------
# Dataset Class
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, char2idx, tokenizer, cfg):
        self.records, self.tfidf_vectors, self.label_map, self.char2idx, self.tokenizer, self.cfg = \
            records, tfidf_vectors, label_map, char2idx, tokenizer, cfg
    def __len__(self): return len(self.records)
    def __getitem__(self, idx):
        record = self.records[idx]; text = str(record['text'])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.cfg.max_token_len,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        char_ids = [self.char2idx.get(c, self.char2idx['<unk>']) for c in text][:self.cfg.max_char_len]
        padded_chars = char_ids + [self.char2idx['<pad>']] * (self.cfg.max_char_len - len(char_ids))
        return {
            "input_ids": inputs['input_ids'].squeeze(), "attention_mask": inputs['attention_mask'].squeeze(),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__(); self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0); self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)
    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2); x = self.conv(x); return F.max_pool1d(x, x.size(2)).squeeze(2)

class Attention(nn.Module):
    def __init__(self, feature_dim):
        super(Attention, self).__init__(); self.attention_fc = nn.Linear(feature_dim, 1)
    def forward(self, features):
        attn_scores = self.attention_fc(features).squeeze(-1); attn_weights = F.softmax(attn_scores, dim=1).unsqueeze(1); context_vec = torch.bmm(attn_weights, features).squeeze(1); return context_vec

class MurilHybridClassifier(nn.Module):
    def __init__(self, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        # --- MODIFIED: Replaced BiLSTM with BiGRU ---
        self.bigru = nn.GRU(
            self.muril.config.hidden_size,
            cfg.gru_hidden_dim // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )
        self.attention = Attention(cfg.gru_hidden_dim)
        # --- End of Modification ---
        self.char_cnn = CharCNN(char_vocab_size, 50, cfg.char_out_dim)
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(8, 8)
        classifier_input_dim = cfg.gru_hidden_dim + cfg.char_out_dim + 8 + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, 256), nn.ReLU(), nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, char_ids, aux, tfidf):
        muril_embeddings = self.muril(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # --- MODIFIED: Using BiGRU output ---
        gru_out, _ = self.bigru(muril_embeddings)
        attention_vec = self.attention(gru_out)
        # --- End of Modification ---
        char_vec = self.char_cnn(char_ids); aux_vec = F.relu(self.aux_proj(aux)); tfidf_vec = F.relu(self.tfidf_proj(tfidf))
        combined_features = torch.cat([attention_vec, char_vec, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.bigru.parameters(), 'lr': cfg.lr_recurrent},
        {'params': model.attention.parameters(), 'lr': cfg.lr_recurrent},
        {'params': model.char_cnn.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])
    num_training_steps = len(train_loader) * cfg.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, int(0.1 * num_training_steps), num_training_steps)
    criterion = nn.CrossEntropyLoss(label_smoothing=cfg.label_smoothing)
    best_macro_f1 = -1.0

    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            loss = criterion(logits, batch['label'].to(device)); loss.backward(); optimizer.step(); scheduler.step(); pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model_bigru.pt"))
            print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    try:
        df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], on_bad_lines='skip', engine='python')
    except FileNotFoundError:
        print(f"ERROR: The data file '{cfg.data_csv}' was not found."); return

    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str)
    df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts()
    classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index
    df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)

    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}
    print("Label Mapping:", label_map)

    # Simple train/val/test split
    train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=cfg.seed, stratify=df['label'])
    train_df, val_df = train_test_split(train_val_df, test_size=0.15, random_state=cfg.seed, stratify=train_val_df['label'])

    train_records = train_df.to_dict(orient="records")
    val_records = val_df.to_dict(orient="records")
    test_records = test_df.to_dict(orient="records")

    print("Fitting TF-IDF Vectorizer...")
    tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2))
    train_tfidf = tfidf_vectorizer.fit_transform([r['text'] for r in train_records])
    val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records])
    test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    char_counts = Counter(c for text in df['text'] for c in text)
    char2idx = {c: i+2 for i, c in enumerate(char_counts.keys())}; char2idx['<pad>'] = 0; char2idx['<unk>'] = 1

    train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, char2idx, tokenizer, cfg)
    val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, char2idx, tokenizer, cfg)
    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, char2idx, tokenizer, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=0)

    print(f"Initializing Hybrid MURIL-BiGRU model...")
    model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)

    train_loop(train_loader, val_loader, model, cfg, label_map)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model_bigru.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=list(label_map.keys()), zero_division=0))

if __name__ == '__main__':
    main()

Label Mapping: {'Negative': 0, 'Neutral': 1, 'Positive': 2}
Fitting TF-IDF Vectorizer...
Initializing Hybrid MURIL-BiGRU model...

--- Starting Training on cuda ---


Epoch 1/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5450
🚀 New best model saved with Macro F1: 0.5450


Epoch 2/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6665
🚀 New best model saved with Macro F1: 0.6665


Epoch 3/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6594


Epoch 4/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6659


Epoch 5/5:   0%|          | 0/139 [00:00<?, ?it/s]

Epoch 5 -> Val Macro F1: 0.6756
🚀 New best model saved with Macro F1: 0.6756

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Negative       0.66      0.66      0.66       297
     Neutral       0.61      0.52      0.56       307
    Positive       0.78      0.83      0.80       703

    accuracy                           0.72      1307
   macro avg       0.68      0.67      0.67      1307
weighted avg       0.71      0.72      0.71      1307

