FastText Embedding + BiLSTM + CharCNN Hybrid model

In [2]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn gensim pandas tqdm -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from tqdm import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    # --- ADAPTED FOR MALAYALAM ---
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_hybrid"
    # ---------------------------
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3

    # Vocab / Chars / Padding
    min_token_freq: int = 2
    max_chars_per_token: int = 20 # Max characters to consider in a single token
    max_length: int = 128         # <<--- FIX: ADDED THIS MISSING ATTRIBUTE

    # FastText
    ft_dim: int = 300
    ft_min_count: int = 2
    ft_epochs: int = 10

    # Model Architecture
    hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4

    # Training
    epochs: int = 8
    batch_size: int = 64
    lr: float = 1e-3
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities
# ------------------------
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

# ------------------------
# Preprocessing & Vocabs
# ------------------------
def build_vocab(texts: List[str], min_freq: int, vocab_type: str):
    print(f"Building {vocab_type} vocabulary...")
    counts = Counter()
    for text in texts:
        items = text.split() if vocab_type == 'token' else list("".join(text.split()))
        counts.update(items)

    vocab = {'<pad>': 0, '<unk>': 1}
    for item, count in counts.items():
        if count >= min_freq:
            vocab[item] = len(vocab)
    return vocab

def compute_aux_features(text: str) -> List[float]:
    toks = text.split()
    num_tokens = len(toks)
    num_chars = len(text)
    has_malayalam = 1.0 if any('\u0D00' <= ch <= '\u0D7F' for ch in text) else 0.0
    has_english = 1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0
    avg_token_len = (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0
    cap_ratio = sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6)
    punct_count = sum(1 for ch in text if ch in '?!.,;:')
    emoji_count = sum(1 for ch in text if ord(ch) > 10000)
    return [num_tokens, num_chars, has_malayalam, has_english, avg_token_len, cap_ratio, punct_count, emoji_count]

# ------------------------
# Dataset Class
# ------------------------
class SentimentDataset(Dataset):
    def __init__(self, records, label_map, word2idx, char2idx, cfg):
        self.samples = []
        for r in records:
            text = str(r['text'])
            token_ids = [word2idx.get(w, word2idx['<unk>']) for w in text.split()]
            char_ids = [char2idx.get(c, char2idx['<unk>']) for c in text]
            aux = compute_aux_features(text)
            label = label_map[r['label']]
            self.samples.append({"token_ids": token_ids, "char_ids": char_ids, "aux": aux, "label": label})
        self.cfg = cfg

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        token_ids, char_ids = sample["token_ids"], sample["char_ids"]

        # <<--- FIX: Use self.cfg.max_length for padding tokens ---
        padded_tokens = token_ids[:self.cfg.max_length] + [0] * (self.cfg.max_length - len(token_ids))
        # <<--- FIX: Use self.cfg.max_chars_per_token for padding chars ---
        padded_chars = char_ids[:self.cfg.max_chars_per_token] + [0] * (self.cfg.max_chars_per_token - len(char_ids))

        return {
            "token_ids": torch.tensor(padded_tokens, dtype=torch.long),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(sample["aux"], dtype=torch.float32),
            "label": torch.tensor(sample["label"], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)

    def forward(self, x_char):
        x = self.char_emb(x_char)
        x = x.transpose(1, 2)
        x = self.conv(x)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

class HybridSentimentClassifier(nn.Module):
    def __init__(self, emb_matrix, char_vocab_size, num_labels, cfg):
        super().__init__()
        emb_matrix = torch.tensor(emb_matrix, dtype=torch.float32)
        self.token_embedding = nn.Embedding.from_pretrained(emb_matrix, freeze=False, padding_idx=0)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)

        lstm_input_dim = emb_matrix.shape[1]
        self.bilstm = nn.LSTM(lstm_input_dim, cfg.hidden_dim // 2, num_layers=cfg.lstm_layers,
                              bidirectional=True, batch_first=True)

        classifier_input_dim = cfg.hidden_dim + cfg.char_out_dim + cfg.aux_dim
        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, token_ids, char_ids, aux):
        emb = self.token_embedding(token_ids)
        lstm_out, (h_n, _) = self.bilstm(emb)
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        char_vec = self.char_cnn(char_ids)
        combined_features = torch.cat([hidden, char_vec, aux], dim=1)
        logits = self.classifier(combined_features)
        return logits

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            token_ids = batch['token_ids'].to(device)
            char_ids = batch['char_ids'].to(device)
            aux = batch['aux'].to(device)
            labels = batch['label'].cpu().numpy()
            logits = model(token_ids, char_ids, aux)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            y_true.extend(labels)
            y_pred.extend(preds)
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map):
    device = cfg.device
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr)
    criterion = nn.CrossEntropyLoss()
    best_macro_f1 = -1.0

    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            token_ids = batch['token_ids'].to(device)
            char_ids = batch['char_ids'].to(device)
            aux = batch['aux'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            logits = model(token_ids, char_ids, aux)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")

        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt"))
            print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)
    df['label'] = df['label'].apply(normalize_label)
    df = df[df['label'] != 'Other']

    label_counts = df['label'].value_counts()
    classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index
    df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)

    label_map = {label: i for i, label in enumerate(df['label'].unique())}

    data = df.to_dict(orient="records")
    labels = [label_map[r['label']] for r in data]

    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    train_texts = [r['text'] for r in train_records]
    word2idx = build_vocab(train_texts, cfg.min_token_freq, 'token')
    char2idx = build_vocab(train_texts, 1, 'char')

    ft_path = os.path.join(cfg.output_dir, "fasttext.model")
    if not os.path.exists(ft_path):
        print("Training FastText model...")
        all_texts_for_ft = [r['text'].split() for r in data]
        ft_model = FastText(sentences=all_texts_for_ft, vector_size=cfg.ft_dim, window=5, min_count=cfg.ft_min_count, workers=4, sg=1, epochs=cfg.ft_epochs)
        ft_model.save(ft_path)
    else:
        print("Loading existing FastText model...")
        ft_model = FastText.load(ft_path)

    embedding_matrix = np.random.normal(scale=0.02, size=(len(word2idx), cfg.ft_dim))
    for word, i in word2idx.items():
        if word in ft_model.wv:
            embedding_matrix[i] = ft_model.wv[word]

    train_ds = SentimentDataset(train_records, label_map, word2idx, char2idx, cfg)
    val_ds = SentimentDataset(val_records, label_map, word2idx, char2idx, cfg)
    test_ds = SentimentDataset(test_records, label_map, word2idx, char2idx, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    model = HybridSentimentClassifier(embedding_matrix, len(char2idx), len(label_map), cfg)
    train_loop(train_loader, val_loader, model, cfg, label_map)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Building token vocabulary...
Building char vocabulary...
Loading existing FastText model...

--- Starting Training on cuda ---


Epoch 1/8: 100%|██████████| 112/112 [00:03<00:00, 34.47it/s, loss=0.492]


Epoch 1 -> Val Macro F1: 0.7226
🚀 New best model saved with Macro F1: 0.7226


Epoch 2/8: 100%|██████████| 112/112 [00:01<00:00, 67.88it/s, loss=0.347]


Epoch 2 -> Val Macro F1: 0.7711
🚀 New best model saved with Macro F1: 0.7711


Epoch 3/8: 100%|██████████| 112/112 [00:01<00:00, 61.65it/s, loss=0.233]


Epoch 3 -> Val Macro F1: 0.7966
🚀 New best model saved with Macro F1: 0.7966


Epoch 4/8: 100%|██████████| 112/112 [00:02<00:00, 55.16it/s, loss=0.153]


Epoch 4 -> Val Macro F1: 0.7789


Epoch 5/8: 100%|██████████| 112/112 [00:01<00:00, 67.12it/s, loss=0.135]


Epoch 5 -> Val Macro F1: 0.7641


Epoch 6/8: 100%|██████████| 112/112 [00:01<00:00, 67.72it/s, loss=0.028]


Epoch 6 -> Val Macro F1: 0.7535


Epoch 7/8: 100%|██████████| 112/112 [00:01<00:00, 67.57it/s, loss=0.0539]


Epoch 7 -> Val Macro F1: 0.7583


Epoch 8/8: 100%|██████████| 112/112 [00:01<00:00, 67.50it/s, loss=0.0406]


Epoch 8 -> Val Macro F1: 0.7751

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Positive       0.91      0.86      0.88      1582
    Negative       0.62      0.73      0.67       520

    accuracy                           0.82      2102
   macro avg       0.76      0.79      0.78      2102
weighted avg       0.84      0.82      0.83      2102



FastText Embedding + BiGRU + CharCNN Hybrid Model

In [3]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn gensim pandas tqdm -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from tqdm import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_hybrid_bigru"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3

    # Vocab / Chars / Padding
    min_token_freq: int = 2
    max_chars_per_token: int = 20
    max_length: int = 128

    # FastText
    ft_dim: int = 300
    ft_min_count: int = 2
    ft_epochs: int = 10

    # Model Architecture
    hidden_dim: int = 256
    # --- MODIFIED: This now controls the GRU layers ---
    gru_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4

    # Training
    epochs: int = 8
    batch_size: int = 64
    lr: float = 1e-3
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities
# ------------------------
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

# ------------------------
# Preprocessing & Vocabs
# ------------------------
def build_vocab(texts: List[str], min_freq: int, vocab_type: str):
    print(f"Building {vocab_type} vocabulary...")
    counts = Counter()
    for text in texts:
        items = text.split() if vocab_type == 'token' else list("".join(text.split()))
        counts.update(items)

    vocab = {'<pad>': 0, '<unk>': 1}
    for item, count in counts.items():
        if count >= min_freq:
            vocab[item] = len(vocab)
    return vocab

def compute_aux_features(text: str) -> List[float]:
    toks = text.split()
    num_tokens = len(toks)
    num_chars = len(text)
    has_malayalam = 1.0 if any('\u0D00' <= ch <= '\u0D7F' for ch in text) else 0.0
    has_english = 1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0
    avg_token_len = (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0
    cap_ratio = sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6)
    punct_count = sum(1 for ch in text if ch in '?!.,;:')
    emoji_count = sum(1 for ch in text if ord(ch) > 10000)
    return [num_tokens, num_chars, has_malayalam, has_english, avg_token_len, cap_ratio, punct_count, emoji_count]

# ------------------------
# Dataset Class
# ------------------------
class SentimentDataset(Dataset):
    def __init__(self, records, label_map, word2idx, char2idx, cfg):
        self.samples = []
        for r in records:
            text = str(r['text'])
            token_ids = [word2idx.get(w, word2idx['<unk>']) for w in text.split()]
            char_ids = [char2idx.get(c, char2idx['<unk>']) for c in text]
            aux = compute_aux_features(text)
            label = label_map[r['label']]
            self.samples.append({"token_ids": token_ids, "char_ids": char_ids, "aux": aux, "label": label})
        self.cfg = cfg

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        token_ids, char_ids = sample["token_ids"], sample["char_ids"]
        padded_tokens = token_ids[:self.cfg.max_length] + [0] * (self.cfg.max_length - len(token_ids))
        padded_chars = char_ids[:self.cfg.max_chars_per_token] + [0] * (self.cfg.max_chars_per_token - len(char_ids))
        return {
            "token_ids": torch.tensor(padded_tokens, dtype=torch.long),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(sample["aux"], dtype=torch.float32),
            "label": torch.tensor(sample["label"], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)

    def forward(self, x_char):
        x = self.char_emb(x_char)
        x = x.transpose(1, 2)
        x = self.conv(x)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

class HybridSentimentClassifier(nn.Module):
    def __init__(self, emb_matrix, char_vocab_size, num_labels, cfg):
        super().__init__()
        emb_matrix = torch.tensor(emb_matrix, dtype=torch.float32)
        self.token_embedding = nn.Embedding.from_pretrained(emb_matrix, freeze=False, padding_idx=0)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)

        recurrent_input_dim = emb_matrix.shape[1]

        # --- MODIFIED: Using BiGRU instead of BiLSTM ---
        self.bigru = nn.GRU(
            recurrent_input_dim,
            cfg.hidden_dim // 2,
            num_layers=cfg.gru_layers,
            bidirectional=True,
            batch_first=True
        )
        # ---------------------------------------------

        classifier_input_dim = cfg.hidden_dim + cfg.char_out_dim + cfg.aux_dim
        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, token_ids, char_ids, aux):
        emb = self.token_embedding(token_ids)

        # --- MODIFIED: Using BiGRU instead of BiLSTM ---
        # A GRU returns (output, h_n) instead of (output, (h_n, c_n))
        _, h_n = self.bigru(emb)
        # ---------------------------------------------

        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        char_vec = self.char_cnn(char_ids)
        combined_features = torch.cat([hidden, char_vec, aux], dim=1)
        logits = self.classifier(combined_features)
        return logits

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            token_ids = batch['token_ids'].to(device)
            char_ids = batch['char_ids'].to(device)
            aux = batch['aux'].to(device)
            labels = batch['label'].cpu().numpy()
            logits = model(token_ids, char_ids, aux)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            y_true.extend(labels)
            y_pred.extend(preds)
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map):
    device = cfg.device
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr)
    criterion = nn.CrossEntropyLoss()
    best_macro_f1 = -1.0

    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            token_ids = batch['token_ids'].to(device)
            char_ids = batch['char_ids'].to(device)
            aux = batch['aux'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            logits = model(token_ids, char_ids, aux)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            pbar.set_postfix(loss=loss.item())

        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")

        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt"))
            print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)
    df['label'] = df['label'].apply(normalize_label)
    df = df[df['label'] != 'Other']

    label_counts = df['label'].value_counts()
    classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index
    df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)

    label_map = {label: i for i, label in enumerate(df['label'].unique())}

    data = df.to_dict(orient="records")
    labels = [label_map[r['label']] for r in data]

    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    train_texts = [r['text'] for r in train_records]
    word2idx = build_vocab(train_texts, cfg.min_token_freq, 'token')
    char2idx = build_vocab(train_texts, 1, 'char')

    ft_path = os.path.join(cfg.output_dir, "fasttext.model")
    if not os.path.exists(ft_path):
        print("Training FastText model...")
        all_texts_for_ft = [r['text'].split() for r in data]
        ft_model = FastText(sentences=all_texts_for_ft, vector_size=cfg.ft_dim, window=5, min_count=cfg.ft_min_count, workers=4, sg=1, epochs=cfg.ft_epochs)
        ft_model.save(ft_path)
    else:
        print("Loading existing FastText model...")
        ft_model = FastText.load(ft_path)

    embedding_matrix = np.random.normal(scale=0.02, size=(len(word2idx), cfg.ft_dim))
    for word, i in word2idx.items():
        if word in ft_model.wv:
            embedding_matrix[i] = ft_model.wv[word]

    train_ds = SentimentDataset(train_records, label_map, word2idx, char2idx, cfg)
    val_ds = SentimentDataset(val_records, label_map, word2idx, char2idx, cfg)
    test_ds = SentimentDataset(test_records, label_map, word2idx, char2idx, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    model = HybridSentimentClassifier(embedding_matrix, len(char2idx), len(label_map), cfg)
    train_loop(train_loader, val_loader, model, cfg, label_map)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Building token vocabulary...
Building char vocabulary...
Training FastText model...

--- Starting Training on cuda ---


Epoch 1/8: 100%|██████████| 112/112 [00:01<00:00, 75.63it/s, loss=0.302]


Epoch 1 -> Val Macro F1: 0.7114
🚀 New best model saved with Macro F1: 0.7114


Epoch 2/8: 100%|██████████| 112/112 [00:01<00:00, 77.29it/s, loss=0.264]


Epoch 2 -> Val Macro F1: 0.7914
🚀 New best model saved with Macro F1: 0.7914


Epoch 3/8: 100%|██████████| 112/112 [00:01<00:00, 61.25it/s, loss=0.158]


Epoch 3 -> Val Macro F1: 0.7873


Epoch 4/8: 100%|██████████| 112/112 [00:01<00:00, 68.87it/s, loss=0.151]


Epoch 4 -> Val Macro F1: 0.7732


Epoch 5/8: 100%|██████████| 112/112 [00:01<00:00, 83.16it/s, loss=0.0421]


Epoch 5 -> Val Macro F1: 0.7853


Epoch 6/8: 100%|██████████| 112/112 [00:01<00:00, 82.44it/s, loss=0.0054]


Epoch 6 -> Val Macro F1: 0.7839


Epoch 7/8: 100%|██████████| 112/112 [00:01<00:00, 83.21it/s, loss=0.0219]


Epoch 7 -> Val Macro F1: 0.7770


Epoch 8/8: 100%|██████████| 112/112 [00:01<00:00, 81.38it/s, loss=0.139]


Epoch 8 -> Val Macro F1: 0.7687

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Positive       0.89      0.89      0.89      1582
    Negative       0.66      0.67      0.66       520

    accuracy                           0.83      2102
   macro avg       0.77      0.78      0.78      2102
weighted avg       0.83      0.83      0.83      2102



FastText Embedding + BiLSTM + CharCNN + TF-IDF Hybrid Model

In [4]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn gensim pandas tqdm -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # <-- NEW IMPORT
from gensim.models import FastText
from tqdm import tqdm

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_final_hybrid"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3

    # Vocab / Chars / Padding
    min_token_freq: int = 2
    max_chars_per_token: int = 20
    max_length: int = 128

    # --- NEW: TF-IDF Config ---
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64
    # --------------------------

    # FastText
    ft_dim: int = 300
    ft_min_count: int = 2
    ft_epochs: int = 10

    # Model Architecture
    hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4

    # Training
    epochs: int = 8
    batch_size: int = 64
    lr: float = 1e-3
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

# ------------------------
# Preprocessing & Vocabs
# ------------------------
def build_vocab(texts: List[str], min_freq: int, vocab_type: str):
    counts = Counter(item for text in texts for item in (text.split() if vocab_type == 'token' else list("".join(text.split()))))
    vocab = {'<pad>': 0, '<unk>': 1}
    for item, count in counts.items():
        if count >= min_freq: vocab[item] = len(vocab)
    return vocab

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [
        num_tokens, num_chars,
        1.0 if any('\u0D00' <= ch <= '\u0D7F' for ch in text) else 0.0, # has_malayalam
        1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0, # has_english
        (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0, # avg_token_len
        sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6), # cap_ratio
        sum(1 for ch in text if ch in '?!.,;:'), # punct_count
        sum(1 for ch in text if ord(ch) > 10000) # emoji_count
    ]

# ------------------------
# Dataset Class - MODIFIED for TF-IDF
# ------------------------
class SentimentDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, word2idx, char2idx, cfg):
        self.samples = []
        for i, r in enumerate(records):
            text = str(r['text'])
            self.samples.append({
                "token_ids": [word2idx.get(w, word2idx['<unk>']) for w in text.split()],
                "char_ids": [char2idx.get(c, char2idx['<unk>']) for c in text],
                "aux": compute_aux_features(text),
                "tfidf": tfidf_vectors[i].toarray().squeeze(), # <-- NEW
                "label": label_map[r['label']]
            })
        self.cfg = cfg

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        token_ids = sample["token_ids"][:self.cfg.max_length]
        char_ids = sample["char_ids"][:self.cfg.max_chars_per_token]
        return {
            "token_ids": torch.tensor(token_ids + [0] * (self.cfg.max_length - len(token_ids)), dtype=torch.long),
            "char_ids": torch.tensor(char_ids + [0] * (self.cfg.max_chars_per_token - len(char_ids)), dtype=torch.long),
            "aux": torch.tensor(sample["aux"], dtype=torch.float32),
            "tfidf": torch.tensor(sample["tfidf"], dtype=torch.float32), # <-- NEW
            "label": torch.tensor(sample["label"], dtype=torch.long)
        }

# ------------------------
# Model Architecture - MODIFIED for TF-IDF
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__(); self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0); self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)
    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2); x = self.conv(x); return F.max_pool1d(x, x.size(2)).squeeze(2)

class HybridSentimentClassifier(nn.Module):
    def __init__(self, emb_matrix, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.token_embedding = nn.Embedding.from_pretrained(torch.tensor(emb_matrix, dtype=torch.float32), freeze=False, padding_idx=0)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)
        self.bilstm = nn.LSTM(emb_matrix.shape[1], cfg.hidden_dim // 2, num_layers=cfg.lstm_layers, bidirectional=True, batch_first=True)
        # --- NEW: TF-IDF and Aux projections ---
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)
        # -----------------------------------
        classifier_input_dim = cfg.hidden_dim + cfg.char_out_dim + cfg.aux_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, 256), nn.ReLU(), nn.Dropout(cfg.dropout), nn.Linear(256, num_labels))

    def forward(self, token_ids, char_ids, aux, tfidf):
        emb = self.token_embedding(token_ids)
        _, (h_n, _) = self.bilstm(emb)
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        char_vec = self.char_cnn(char_ids)
        # --- NEW: Project and combine all features ---
        aux_vec = F.relu(self.aux_proj(aux))
        tfidf_vec = F.relu(self.tfidf_proj(tfidf))
        combined_features = torch.cat([hidden, char_vec, aux_vec, tfidf_vec], dim=1)
        # -------------------------------------------
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation - MODIFIED for TF-IDF
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['token_ids'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map):
    device = cfg.device; model.to(device); optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr); criterion = nn.CrossEntropyLoss(); best_macro_f1 = -1.0
    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['token_ids'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); pbar.set_postfix(loss=loss.item())
        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1; torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt")); print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(df['label'].unique())}
    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, train_labels, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    # --- NEW: Fit TF-IDF on training data only ---
    print("Fitting TF-IDF Vectorizer...")
    train_texts = [r['text'] for r in train_records]
    tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2))
    train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
    val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records])
    test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])
    # ---------------------------------------------

    train_texts = [r['text'] for r in train_records]
    word2idx = build_vocab(train_texts, cfg.min_token_freq, 'token'); char2idx = build_vocab(train_texts, 1, 'char')

    ft_path = os.path.join(cfg.output_dir, "fasttext.model")
    if not os.path.exists(ft_path):
        print("Training FastText model..."); all_texts_for_ft = [r['text'].split() for r in data]; ft_model = FastText(sentences=all_texts_for_ft, vector_size=cfg.ft_dim, window=5, min_count=cfg.ft_min_count, workers=4, sg=1, epochs=cfg.ft_epochs); ft_model.save(ft_path)
    else: print("Loading existing FastText model..."); ft_model = FastText.load(ft_path)

    embedding_matrix = np.random.normal(scale=0.02, size=(len(word2idx), cfg.ft_dim))
    for word, i in word2idx.items():
        if word in ft_model.wv: embedding_matrix[i] = ft_model.wv[word]

    train_ds = SentimentDataset(train_records, train_tfidf, label_map, word2idx, char2idx, cfg)
    val_ds = SentimentDataset(val_records, val_tfidf, label_map, word2idx, char2idx, cfg)
    test_ds = SentimentDataset(test_records, test_tfidf, label_map, word2idx, char2idx, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    model = HybridSentimentClassifier(embedding_matrix, len(char2idx), len(label_map), cfg)
    train_loop(train_loader, val_loader, model, cfg, label_map)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Fitting TF-IDF Vectorizer...
Training FastText model...

--- Starting Training on cuda ---


Epoch 1/8: 100%|██████████| 112/112 [00:02<00:00, 52.22it/s, loss=0.392]


Epoch 1 -> Val Macro F1: 0.7964
🚀 New best model saved with Macro F1: 0.7964


Epoch 2/8: 100%|██████████| 112/112 [00:02<00:00, 55.61it/s, loss=0.0998]


Epoch 2 -> Val Macro F1: 0.8092
🚀 New best model saved with Macro F1: 0.8092


Epoch 3/8: 100%|██████████| 112/112 [00:02<00:00, 55.41it/s, loss=0.265]


Epoch 3 -> Val Macro F1: 0.8231
🚀 New best model saved with Macro F1: 0.8231


Epoch 4/8: 100%|██████████| 112/112 [00:02<00:00, 50.82it/s, loss=0.0791]


Epoch 4 -> Val Macro F1: 0.8177


Epoch 5/8: 100%|██████████| 112/112 [00:02<00:00, 44.89it/s, loss=0.0179]


Epoch 5 -> Val Macro F1: 0.8044


Epoch 6/8: 100%|██████████| 112/112 [00:02<00:00, 55.54it/s, loss=0.0145]


Epoch 6 -> Val Macro F1: 0.8164


Epoch 7/8: 100%|██████████| 112/112 [00:02<00:00, 55.21it/s, loss=0.00149]


Epoch 7 -> Val Macro F1: 0.8145


Epoch 8/8: 100%|██████████| 112/112 [00:02<00:00, 55.05it/s, loss=0.00281]


Epoch 8 -> Val Macro F1: 0.7946

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Positive       0.91      0.90      0.90      1582
    Negative       0.70      0.71      0.70       520

    accuracy                           0.85      2102
   macro avg       0.80      0.81      0.80      2102
weighted avg       0.85      0.85      0.85      2102



MuRIL Embeddings + CharCNN + BiLSTM + TF-IDF Hybrid Model

In [6]:
# --- Step 1: Install necessary libraries, including Hugging Face Transformers ---
!pip install torch scikit-learn gensim pandas tqdm transformers sentencepiece -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_muril_hybrid"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3
    max_token_len: int = 128
    max_char_len: int = 256
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64
    muril_hidden_size: int = 768
    lstm_hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out_dim: int = 100
    aux_dim: int = 8
    dropout: float = 0.4
    epochs: int = 5
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_head: float = 1e-3
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

def build_char_vocab(texts: List[str]):
    counts = Counter(c for text in texts for c in text)
    vocab = {'<pad>': 0, '<unk>': 1}
    for char, count in counts.items():
        vocab[char] = len(vocab)
    return vocab

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [num_tokens, num_chars, 1.0 if any('\u0D00' <= ch <= '\u0D7F' for ch in text) else 0.0, 1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0, (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0, sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6), sum(1 for ch in text if ch in '?!.,;:'), sum(1 for ch in text if ord(ch) > 10000)]

# ------------------------
# Dataset Class
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, char2idx, tokenizer, cfg):
        self.records = records; self.tfidf_vectors = tfidf_vectors; self.label_map = label_map; self.char2idx = char2idx; self.tokenizer = tokenizer; self.cfg = cfg
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        record = self.records[idx]; text = str(record['text'])
        inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=self.cfg.max_token_len, padding='max_length', truncation=True, return_tensors='pt')
        char_ids = [self.char2idx.get(c, self.char2idx['<unk>']) for c in text][:self.cfg.max_char_len]
        padded_chars = char_ids + [0] * (self.cfg.max_char_len - len(char_ids))
        return {
            "input_ids": inputs['input_ids'].squeeze(), "attention_mask": inputs['attention_mask'].squeeze(),
            "char_ids": torch.tensor(padded_chars, dtype=torch.long),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class CharCNN(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, out_dim):
        super().__init__(); self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0); self.conv = nn.Conv1d(char_emb_dim, out_dim, kernel_size=3, padding=1)
    def forward(self, x_char):
        x = self.char_emb(x_char).transpose(1, 2); x = self.conv(x); return F.max_pool1d(x, x.size(2)).squeeze(2)

class MurilHybridClassifier(nn.Module):
    def __init__(self, char_vocab_size, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        self.bilstm = nn.LSTM(cfg.muril_hidden_size, cfg.lstm_hidden_dim // 2, num_layers=cfg.lstm_layers, bidirectional=True, batch_first=True)
        self.char_cnn = CharCNN(char_vocab_size, cfg.char_emb_dim, cfg.char_out_dim)
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)
        classifier_input_dim = cfg.lstm_hidden_dim + cfg.char_out_dim + cfg.aux_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, 256), nn.ReLU(), nn.Dropout(cfg.dropout), nn.Linear(256, num_labels))

    def forward(self, input_ids, attention_mask, char_ids, aux, tfidf):
        muril_output = self.muril(input_ids=input_ids, attention_mask=attention_mask)
        muril_embeddings = muril_output.last_hidden_state
        _, (h_n, _) = self.bilstm(muril_embeddings)
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        char_vec = self.char_cnn(char_ids)
        aux_vec = F.relu(self.aux_proj(aux))
        tfidf_vec = F.relu(self.tfidf_proj(tfidf))
        combined_features = torch.cat([hidden, char_vec, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.bilstm.parameters(), 'lr': cfg.lr_head},
        {'params': model.char_cnn.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])
    criterion = nn.CrossEntropyLoss(); best_macro_f1 = -1.0
    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['char_ids'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); pbar.set_postfix(loss=loss.item())
        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1; torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt")); print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(df['label'].unique())}
    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    print("Fitting TF-IDF Vectorizer...")
    train_texts = [r['text'] for r in train_records]; tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2)); train_tfidf = tfidf_vectorizer.fit_transform(train_texts); val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records]); test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    char2idx = build_char_vocab([r['text'] for r in train_records])

    train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, char2idx, tokenizer, cfg)
    val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, char2idx, tokenizer, cfg)
    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, char2idx, tokenizer, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    print(f"Loading MURIL model: {cfg.model_name}... (This may take a moment)")
    # --- FIX WAS APPLIED HERE ---
    model = MurilHybridClassifier(len(char2idx), len(label_map), cfg)
    # --------------------------
    train_loop(train_loader, val_loader, model, cfg, label_map)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Fitting TF-IDF Vectorizer...
Loading MURIL tokenizer: google/muril-base-cased...
Loading MURIL model: google/muril-base-cased... (This may take a moment)

--- Starting Training on cuda ---


Epoch 1/5: 100%|██████████| 224/224 [02:37<00:00,  1.43it/s, loss=0.696]


Epoch 1 -> Val Macro F1: 0.8462
🚀 New best model saved with Macro F1: 0.8462


Epoch 2/5: 100%|██████████| 224/224 [02:36<00:00,  1.43it/s, loss=0.0923]


Epoch 2 -> Val Macro F1: 0.8532
🚀 New best model saved with Macro F1: 0.8532


Epoch 3/5: 100%|██████████| 224/224 [02:36<00:00,  1.43it/s, loss=0.165]


Epoch 3 -> Val Macro F1: 0.8122


Epoch 4/5: 100%|██████████| 224/224 [02:35<00:00,  1.44it/s, loss=0.346]


Epoch 4 -> Val Macro F1: 0.8558
🚀 New best model saved with Macro F1: 0.8558


Epoch 5/5: 100%|██████████| 224/224 [02:36<00:00,  1.43it/s, loss=0.00552]


Epoch 5 -> Val Macro F1: 0.8530

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Positive       0.92      0.93      0.93      1582
    Negative       0.78      0.76      0.77       520

    accuracy                           0.89      2102
   macro avg       0.85      0.84      0.85      2102
weighted avg       0.89      0.89      0.89      2102



In [7]:
# --- Step 1: Install necessary libraries, including Hugging Face Transformers ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
import json
from collections import Counter
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_muril_encoder"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3

    # Padding / Length
    max_token_len: int = 128

    # TF-IDF Config
    tfidf_max_features: int = 5000
    tfidf_proj_dim: int = 64

    # Model Architecture
    muril_hidden_size: int = 768
    aux_dim: int = 8
    dropout: float = 0.3

    # Training
    epochs: int = 4 # Transformers fine-tune quickly
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_head: float = 1e-3
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities & Preprocessing
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

def compute_aux_features(text: str) -> List[float]:
    toks = text.split(); num_tokens = len(toks); num_chars = len(text)
    return [
        num_tokens, num_chars,
        1.0 if any('\u0D00' <= ch <= '\u0D7F' for ch in text) else 0.0,
        1.0 if any('a' <= ch.lower() <= 'z' for ch in text) else 0.0,
        (sum(len(t) for t in toks) / num_tokens) if num_tokens > 0 else 0.0,
        sum(1 for ch in text if ch.isupper()) / (num_chars + 1e-6),
        sum(1 for ch in text if ch in '?!.,;:'),
        sum(1 for ch in text if ord(ch) > 10000)
    ]

# ------------------------
# Dataset Class - Simplified
# ------------------------
class MurilHybridDataset(Dataset):
    def __init__(self, records, tfidf_vectors, label_map, tokenizer, cfg):
        self.records = records
        self.tfidf_vectors = tfidf_vectors
        self.label_map = label_map
        self.tokenizer = tokenizer
        self.cfg = cfg

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        text = str(record['text'])

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.cfg.max_token_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            "input_ids": inputs['input_ids'].squeeze(),
            "attention_mask": inputs['attention_mask'].squeeze(),
            "aux": torch.tensor(compute_aux_features(text), dtype=torch.float32),
            "tfidf": torch.tensor(self.tfidf_vectors[idx].toarray().squeeze(), dtype=torch.float32),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture - Simplified and Transformer-centric
# ------------------------
class MurilSentimentClassifier(nn.Module):
    def __init__(self, num_labels, cfg):
        super().__init__()
        # Main MURIL model
        self.muril = AutoModel.from_pretrained(cfg.model_name)

        # Projection layers for our extra features
        self.tfidf_proj = nn.Linear(cfg.tfidf_max_features, cfg.tfidf_proj_dim)
        self.aux_proj = nn.Linear(cfg.aux_dim, cfg.aux_dim)

        # Classifier head
        classifier_input_dim = cfg.muril_hidden_size + cfg.tfidf_proj_dim + cfg.aux_dim
        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, aux, tfidf):
        # Get output from MURIL
        muril_output = self.muril(input_ids=input_ids, attention_mask=attention_mask)

        # We use the pooler_output, which is the [CLS] token's embedding,
        # designed for sentence-level classification.
        cls_embedding = muril_output.pooler_output # Shape: (batch_size, 768)

        # Process auxiliary features
        aux_vec = F.relu(self.aux_proj(aux))
        tfidf_vec = F.relu(self.tfidf_proj(tfidf))

        # Combine all features and classify
        combined_features = torch.cat([cls_embedding, aux_vec, tfidf_vec], dim=1)
        return self.classifier(combined_features)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head},
        {'params': model.tfidf_proj.parameters(), 'lr': cfg.lr_head},
        {'params': model.aux_proj.parameters(), 'lr': cfg.lr_head}
    ])
    criterion = nn.CrossEntropyLoss(); best_macro_f1 = -1.0
    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['aux'].to(device), batch['tfidf'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); pbar.set_postfix(loss=loss.item())
        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1; torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt")); print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(df['label'].unique())}
    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    print("Fitting TF-IDF Vectorizer...")
    train_texts = [r['text'] for r in train_records]; tfidf_vectorizer = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(1, 2)); train_tfidf = tfidf_vectorizer.fit_transform(train_texts); val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records]); test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

    train_ds = MurilHybridDataset(train_records, train_tfidf, label_map, tokenizer, cfg)
    val_ds = MurilHybridDataset(val_records, val_tfidf, label_map, tokenizer, cfg)
    test_ds = MurilHybridDataset(test_records, test_tfidf, label_map, tokenizer, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    print(f"Loading MURIL model: {cfg.model_name}... (This may take a moment)")
    model = MurilSentimentClassifier(len(label_map), cfg)
    train_loop(train_loader, val_loader, model, cfg, label_map)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Fitting TF-IDF Vectorizer...
Loading MURIL tokenizer: google/muril-base-cased...
Loading MURIL model: google/muril-base-cased... (This may take a moment)

--- Starting Training on cuda ---


Epoch 1/4: 100%|██████████| 224/224 [02:29<00:00,  1.50it/s, loss=0.487]


Epoch 1 -> Val Macro F1: 0.7690
🚀 New best model saved with Macro F1: 0.7690


Epoch 2/4: 100%|██████████| 224/224 [02:27<00:00,  1.51it/s, loss=0.194]


Epoch 2 -> Val Macro F1: 0.8331
🚀 New best model saved with Macro F1: 0.8331


Epoch 3/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.019]


Epoch 3 -> Val Macro F1: 0.8176


Epoch 4/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.00402]


Epoch 4 -> Val Macro F1: 0.8306

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Positive       0.94      0.86      0.89      1582
    Negative       0.65      0.82      0.73       520

    accuracy                           0.85      2102
   macro avg       0.79      0.84      0.81      2102
weighted avg       0.87      0.85      0.85      2102



Vanilla Fine Tuning MuRIL

In [8]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_muril_pure"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3

    # Tokenizer/Padding
    max_len: int = 128

    # Model Architecture
    dropout: float = 0.2

    # Training
    epochs: int = 4
    batch_size: int = 32
    lr_muril: float = 2e-5  # Standard learning rate for fine-tuning transformers
    lr_head: float = 1e-3
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

# ------------------------
# Dataset Class - Simplified
# ------------------------
class MurilDataset(Dataset):
    def __init__(self, records, label_map, tokenizer, cfg):
        self.records = records
        self.label_map = label_map
        self.tokenizer = tokenizer
        self.cfg = cfg

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        text = str(record['text'])

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.cfg.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            "input_ids": inputs['input_ids'].squeeze(),
            "attention_mask": inputs['attention_mask'].squeeze(),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture - Simplified
# ------------------------
class MurilClassifier(nn.Module):
    def __init__(self, num_labels, cfg):
        super().__init__()
        # Main MURIL model
        self.muril = AutoModel.from_pretrained(cfg.model_name)

        # Simple classifier head
        self.dropout = nn.Dropout(cfg.dropout)
        self.classifier = nn.Linear(self.muril.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        muril_output = self.muril(input_ids=input_ids, attention_mask=attention_mask)

        # Use the pooler_output, which is the [CLS] token's embedding
        cls_embedding = muril_output.pooler_output

        # Classification
        pooled_output = self.dropout(cls_embedding)
        return self.classifier(pooled_output)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])
    criterion = nn.CrossEntropyLoss(); best_macro_f1 = -1.0
    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); pbar.set_postfix(loss=loss.item())
        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1; torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt")); print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(df['label'].unique())}
    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, _, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

    train_ds = MurilDataset(train_records, label_map, tokenizer, cfg)
    val_ds = MurilDataset(val_records, label_map, tokenizer, cfg)
    test_ds = MurilDataset(test_records, label_map, tokenizer, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    print(f"Loading MURIL model: {cfg.model_name}... (This may take a moment)")
    model = MurilClassifier(len(label_map), cfg)
    train_loop(train_loader, val_loader, model, cfg, label_map)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Loading MURIL tokenizer: google/muril-base-cased...
Loading MURIL model: google/muril-base-cased... (This may take a moment)

--- Starting Training on cuda ---


Epoch 1/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.666]


Epoch 1 -> Val Macro F1: 0.6238
🚀 New best model saved with Macro F1: 0.6238


Epoch 2/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.0553]


Epoch 2 -> Val Macro F1: 0.7525
🚀 New best model saved with Macro F1: 0.7525


Epoch 3/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.0497]


Epoch 3 -> Val Macro F1: 0.8242
🚀 New best model saved with Macro F1: 0.8242


Epoch 4/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.0564]


Epoch 4 -> Val Macro F1: 0.8430
🚀 New best model saved with Macro F1: 0.8430

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Positive       0.90      0.95      0.92      1582
    Negative       0.80      0.68      0.74       520

    accuracy                           0.88      2102
   macro avg       0.85      0.81      0.83      2102
weighted avg       0.88      0.88      0.88      2102



HyperParameter Tuning MuRIL

In [9]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_muril_advanced"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3

    # Tokenizer/Padding
    max_len: int = 128

    # Model Architecture
    dropout: float = 0.2

    # --- MODIFIED: Advanced Training Config ---
    epochs: int = 5 # Can increase to 5-6 with a good scheduler
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_head: float = 1e-3
    label_smoothing: float = 0.1 # New regularization technique
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

# ------------------------
# Dataset Class
# ------------------------
class MurilDataset(Dataset):
    def __init__(self, records, label_map, tokenizer, cfg):
        self.records = records
        self.label_map = label_map
        self.tokenizer = tokenizer
        self.cfg = cfg

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        text = str(record['text'])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.cfg.max_len,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        return {
            "input_ids": inputs['input_ids'].squeeze(),
            "attention_mask": inputs['attention_mask'].squeeze(),
            "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)
        }

# ------------------------
# Model Architecture
# ------------------------
class MurilClassifier(nn.Module):
    def __init__(self, num_labels, cfg):
        super().__init__()
        self.muril = AutoModel.from_pretrained(cfg.model_name)
        self.dropout = nn.Dropout(cfg.dropout)
        self.classifier = nn.Linear(self.muril.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        muril_output = self.muril(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = muril_output.pooler_output
        pooled_output = self.dropout(cls_embedding)
        return self.classifier(pooled_output)

# ------------------------
# Training & Evaluation
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            y_true.extend(batch['label'].numpy()); y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return y_true, y_pred

def train_loop(train_loader, val_loader, model, cfg, label_map, class_weights):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([
        {'params': model.muril.parameters(), 'lr': cfg.lr_muril},
        {'params': model.classifier.parameters(), 'lr': cfg.lr_head}
    ])

    # --- NEW: Setup Scheduler and Weighted Loss Function ---
    num_training_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
    )
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=cfg.label_smoothing)
    # --------------------------------------------------------

    best_macro_f1 = -1.0
    print(f"\n--- Starting Training on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward()
            optimizer.step()
            scheduler.step() # Update the learning rate
            pbar.set_postfix(loss=loss.item())
        val_true, val_pred = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1; torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_model.pt")); print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))} # Sort labels for consistency
    data = df.to_dict(orient="records"); labels = [label_map[r['label']] for r in data]
    train_records, test_records, train_labels, _ = train_test_split(data, labels, test_size=0.2, random_state=cfg.seed, stratify=labels)
    train_records, val_records = train_test_split(train_records, test_size=0.15, random_state=cfg.seed, stratify=[label_map[r['label']] for r in train_records])

    # --- NEW: Calculate Class Weights from the training set ---
    train_labels_for_weights = [label_map[r['label']] for r in train_records]
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels_for_weights),
        y=train_labels_for_weights
    )
    print("Calculated Class Weights:", class_weights)
    # --------------------------------------------------------

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

    train_ds = MurilDataset(train_records, label_map, tokenizer, cfg)
    val_ds = MurilDataset(val_records, label_map, tokenizer, cfg)
    test_ds = MurilDataset(test_records, label_map, tokenizer, cfg)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    print(f"Loading MURIL model: {cfg.model_name}... (This may take a moment)")
    model = MurilClassifier(len(label_map), cfg)
    # --- MODIFIED: Pass class_weights to the training loop ---
    train_loop(train_loader, val_loader, model, cfg, label_map, class_weights)

    print("\n--- Final Test Set Evaluation ---")
    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_model.pt")))
    test_true, test_pred = evaluate(model, test_loader, cfg.device)
    print(classification_report(test_true, test_pred, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Calculated Class Weights: [2.02036199 0.66443452]
Loading MURIL tokenizer: google/muril-base-cased...
Loading MURIL model: google/muril-base-cased... (This may take a moment)

--- Starting Training on cuda ---


Epoch 1/5: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.371]


Epoch 1 -> Val Macro F1: 0.7378
🚀 New best model saved with Macro F1: 0.7378


Epoch 2/5: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.516]


Epoch 2 -> Val Macro F1: 0.8324
🚀 New best model saved with Macro F1: 0.8324


Epoch 3/5: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.582]


Epoch 3 -> Val Macro F1: 0.8480
🚀 New best model saved with Macro F1: 0.8480


Epoch 4/5: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.233]


Epoch 4 -> Val Macro F1: 0.8528
🚀 New best model saved with Macro F1: 0.8528


Epoch 5/5: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.443]


Epoch 5 -> Val Macro F1: 0.8528

--- Final Test Set Evaluation ---
              precision    recall  f1-score   support

    Negative       0.75      0.80      0.78       520
    Positive       0.93      0.91      0.92      1582

    accuracy                           0.89      2102
   macro avg       0.84      0.86      0.85      2102
weighted avg       0.89      0.89      0.89      2102



MuRIL Fine Tuning with K Folds cross-validation

In [10]:
# --- Step 1: Install necessary libraries ---
!pip install torch scikit-learn pandas tqdm transformers sentencepiece -q

import os
import time
import random
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold # <-- NEW IMPORT
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

# ------------------------
# Configuration
# ------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "outputs_malayalam_muril_ensembled"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    min_class_samples: int = 3
    n_folds: int = 5 # Number of models to train for the ensemble

    # Tokenizer/Padding
    max_len: int = 128

    # Model Architecture
    dropout: float = 0.2

    # Training
    epochs: int = 4
    batch_size: int = 32
    lr_muril: float = 2e-5
    lr_head: float = 1e-3
    label_smoothing: float = 0.1
    seed: int = 42

cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# ------------------------
# Utilities, Dataset, Model (Same as before)
# ------------------------
def seed_everything(seed=42):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

seed_everything(cfg.seed)

def normalize_label(l):
    s = str(l).lower()
    if 'posit' in s: return 'Positive'
    if 'negat' in s: return 'Negative'
    if 'neu' in s or 'normal' in s: return 'Neutral'
    return 'Other'

class MurilDataset(Dataset):
    def __init__(self, records, label_map, tokenizer, cfg):
        self.records, self.label_map, self.tokenizer, self.cfg = records, label_map, tokenizer, cfg
    def __len__(self): return len(self.records)
    def __getitem__(self, idx):
        record = self.records[idx]; text = str(record['text'])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.cfg.max_len,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        return {"input_ids": inputs['input_ids'].squeeze(), "attention_mask": inputs['attention_mask'].squeeze(), "label": torch.tensor(self.label_map[record['label']], dtype=torch.long)}

class MurilClassifier(nn.Module):
    def __init__(self, num_labels, cfg):
        super().__init__(); self.muril = AutoModel.from_pretrained(cfg.model_name); self.dropout = nn.Dropout(cfg.dropout); self.classifier = nn.Linear(self.muril.config.hidden_size, num_labels)
    def forward(self, input_ids, attention_mask):
        return self.classifier(self.dropout(self.muril(input_ids=input_ids, attention_mask=attention_mask).pooler_output))

# ------------------------
# Training & Evaluation - MODIFIED to save fold model
# ------------------------
def evaluate(model, dataloader, device):
    model.eval(); y_true, y_pred, all_probs = [], [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            y_true.extend(batch['label'].numpy()); y_pred.extend(np.argmax(probs, axis=1)); all_probs.append(probs)
    return y_true, y_pred, np.concatenate(all_probs)

def train_loop(train_loader, val_loader, model, cfg, label_map, class_weights, fold):
    device = cfg.device; model.to(device)
    optimizer = torch.optim.AdamW([{'params': model.muril.parameters(), 'lr': cfg.lr_muril}, {'params': model.classifier.parameters(), 'lr': cfg.lr_head}])
    num_training_steps = len(train_loader) * cfg.epochs; num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=cfg.label_smoothing)
    best_macro_f1 = -1.0

    print(f"\n--- Starting Training for Fold {fold+1}/{cfg.n_folds} on {device} ---")
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}/{cfg.epochs}")
        for batch in pbar:
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            loss = criterion(logits, batch['label'].to(device))
            loss.backward(); optimizer.step(); scheduler.step(); pbar.set_postfix(loss=loss.item())

        val_true, val_pred, _ = evaluate(model, val_loader, device)
        report = classification_report(val_true, val_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Fold {fold+1} Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, f"best_model_fold_{fold}.pt"))
            print(f"🚀 New best model for Fold {fold+1} saved with Macro F1: {macro_f1:.4f}")

# ------------------------
# Main Orchestration - MODIFIED for Cross-Validation
# ------------------------
def main():
    df = pd.read_csv(cfg.data_csv, sep='\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True); df['text'] = df['text'].astype(str); df['label'] = df['label'].apply(normalize_label); df = df[df['label'] != 'Other']
    label_counts = df['label'].value_counts(); classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index; df = df[df['label'].isin(classes_to_keep)].reset_index(drop=True)
    label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}

    # We will split into train and test once
    data = df.to_dict(orient="records")
    labels = [label_map[r['label']] for r in data]
    train_val_records, test_records, train_val_labels, test_labels = train_test_split(data, labels, test_size=0.15, random_state=cfg.seed, stratify=labels)

    print(f"Loading MURIL tokenizer: {cfg.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

    # --- NEW: Cross-Validation Loop ---
    skf = StratifiedKFold(n_splits=cfg.n_folds, shuffle=True, random_state=cfg.seed)

    # Convert records to numpy array for easy indexing by StratifiedKFold
    train_val_records_np = np.array(train_val_records)
    train_val_labels_np = np.array(train_val_labels)

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_val_records_np, train_val_labels_np)):
        train_records_fold = train_val_records_np[train_idx].tolist()
        val_records_fold = train_val_records_np[val_idx].tolist()

        train_labels_fold = [label_map[r['label']] for r in train_records_fold]
        class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels_fold), y=train_labels_fold)

        train_ds = MurilDataset(train_records_fold, label_map, tokenizer, cfg)
        val_ds = MurilDataset(val_records_fold, label_map, tokenizer, cfg)

        train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

        model = MurilClassifier(len(label_map), cfg)
        train_loop(train_loader, val_loader, model, cfg, label_map, class_weights, fold)

    # --- NEW: Final Evaluation with Ensembling ---
    print("\n--- Final Test Set Evaluation with Ensembling ---")
    test_ds = MurilDataset(test_records, label_map, tokenizer, cfg)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False, num_workers=2)

    all_fold_probs = []
    for fold in range(cfg.n_folds):
        print(f"Loading model from fold {fold+1} for inference...")
        model = MurilClassifier(len(label_map), cfg)
        model.load_state_dict(torch.load(os.path.join(cfg.output_dir, f"best_model_fold_{fold}.pt")))
        model.to(cfg.device)

        _, _, fold_probs = evaluate(model, test_loader, cfg.device)
        all_fold_probs.append(fold_probs)

    # Average the probabilities from all models
    ensembled_probs = np.mean(all_fold_probs, axis=0)
    ensembled_preds = np.argmax(ensembled_probs, axis=1)

    print("\n--- Ensembled Classification Report ---")
    print(classification_report(test_labels, ensembled_preds, target_names=label_map.keys()))

if __name__ == '__main__':
    main()

Loading MURIL tokenizer: google/muril-base-cased...

--- Starting Training for Fold 1/5 on cuda ---


Fold 1 Epoch 1/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.405]


Fold 1 Epoch 1 -> Val Macro F1: 0.6662
🚀 New best model for Fold 1 saved with Macro F1: 0.6662


Fold 1 Epoch 2/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.292]


Fold 1 Epoch 2 -> Val Macro F1: 0.8191
🚀 New best model for Fold 1 saved with Macro F1: 0.8191


Fold 1 Epoch 3/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.232]


Fold 1 Epoch 3 -> Val Macro F1: 0.8575
🚀 New best model for Fold 1 saved with Macro F1: 0.8575


Fold 1 Epoch 4/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.365]


Fold 1 Epoch 4 -> Val Macro F1: 0.8596
🚀 New best model for Fold 1 saved with Macro F1: 0.8596

--- Starting Training for Fold 2/5 on cuda ---


Fold 2 Epoch 1/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.496]


Fold 2 Epoch 1 -> Val Macro F1: 0.7761
🚀 New best model for Fold 2 saved with Macro F1: 0.7761


Fold 2 Epoch 2/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.504]


Fold 2 Epoch 2 -> Val Macro F1: 0.8190
🚀 New best model for Fold 2 saved with Macro F1: 0.8190


Fold 2 Epoch 3/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.322]


Fold 2 Epoch 3 -> Val Macro F1: 0.8349
🚀 New best model for Fold 2 saved with Macro F1: 0.8349


Fold 2 Epoch 4/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.247]


Fold 2 Epoch 4 -> Val Macro F1: 0.8369
🚀 New best model for Fold 2 saved with Macro F1: 0.8369

--- Starting Training for Fold 3/5 on cuda ---


Fold 3 Epoch 1/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.559]


Fold 3 Epoch 1 -> Val Macro F1: 0.7607
🚀 New best model for Fold 3 saved with Macro F1: 0.7607


Fold 3 Epoch 2/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.386]


Fold 3 Epoch 2 -> Val Macro F1: 0.7830
🚀 New best model for Fold 3 saved with Macro F1: 0.7830


Fold 3 Epoch 3/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.704]


Fold 3 Epoch 3 -> Val Macro F1: 0.8419
🚀 New best model for Fold 3 saved with Macro F1: 0.8419


Fold 3 Epoch 4/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.334]


Fold 3 Epoch 4 -> Val Macro F1: 0.8427
🚀 New best model for Fold 3 saved with Macro F1: 0.8427

--- Starting Training for Fold 4/5 on cuda ---


Fold 4 Epoch 1/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.694]


Fold 4 Epoch 1 -> Val Macro F1: 0.1984
🚀 New best model for Fold 4 saved with Macro F1: 0.1984


Fold 4 Epoch 2/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.715]


Fold 4 Epoch 2 -> Val Macro F1: 0.1984


Fold 4 Epoch 3/4: 100%|██████████| 224/224 [02:26<00:00,  1.53it/s, loss=0.753]


Fold 4 Epoch 3 -> Val Macro F1: 0.1984


Fold 4 Epoch 4/4: 100%|██████████| 224/224 [02:26<00:00,  1.53it/s, loss=0.69]


Fold 4 Epoch 4 -> Val Macro F1: 0.1984

--- Starting Training for Fold 5/5 on cuda ---


Fold 5 Epoch 1/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.601]


Fold 5 Epoch 1 -> Val Macro F1: 0.6950
🚀 New best model for Fold 5 saved with Macro F1: 0.6950


Fold 5 Epoch 2/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.433]


Fold 5 Epoch 2 -> Val Macro F1: 0.7632
🚀 New best model for Fold 5 saved with Macro F1: 0.7632


Fold 5 Epoch 3/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.267]


Fold 5 Epoch 3 -> Val Macro F1: 0.8482
🚀 New best model for Fold 5 saved with Macro F1: 0.8482


Fold 5 Epoch 4/4: 100%|██████████| 224/224 [02:27<00:00,  1.52it/s, loss=0.217]


Fold 5 Epoch 4 -> Val Macro F1: 0.8523
🚀 New best model for Fold 5 saved with Macro F1: 0.8523

--- Final Test Set Evaluation with Ensembling ---
Loading model from fold 1 for inference...
Loading model from fold 2 for inference...
Loading model from fold 3 for inference...
Loading model from fold 4 for inference...
Loading model from fold 5 for inference...

--- Ensembled Classification Report ---
              precision    recall  f1-score   support

    Negative       0.75      0.81      0.78       390
    Positive       0.94      0.91      0.92      1187

    accuracy                           0.89      1577
   macro avg       0.84      0.86      0.85      1577
weighted avg       0.89      0.89      0.89      1577

