In [None]:
# Cell 3 — upload local file via browser
from google.colab import files
uploaded = files.upload()   # choose your tamil_sentiment_full.csv from your machine
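# After upload, the file is in the current working directory (e.g. '/content/tamil_sentiment_full.csv'); if that name already exists, Colab saves the new copy under a suffixed name such as 'tamil_sentiment_full (1).csv'.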

Saving tamil_sentiment_full.csv to tamil_sentiment_full (1).csv


In [None]:
# Step 1: Install necessary libraries
!pip install torch scikit-learn gensim tensorboardX pandas tqdm -q

# Step 2: The full Python script
import os
import json
import random
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from tqdm import tqdm
from tensorboardX import SummaryWriter

In [None]:
# uninstall then install a fresh wheel
!pip uninstall -y numpy
!pip cache purge

# install a stable release (choose one appropriate for your Python version)
!pip install numpy==1.26.4 --no-cache-dir

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Files removed: 24
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m172.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have nu

In [None]:
# -------------------------
# Config - edit these
# -------------------------
@dataclass
class CFG:
    """Central configuration for the BiLSTM + FastText sentiment pipeline.

    All fields have defaults, so ``CFG()`` yields a ready-to-use config.
    ``min_class_samples`` is appended as the last field so that existing
    positional construction keeps working.
    """
    # Data / paths
    # This path works directly with the file uploaded to Colab's root session storage
    data_csv: str = "tamil_sentiment_full.csv"
    labels_json: str = "data/labels.json"       # where read_data() stores the label map
    output_dir: str = "outputs_bilstm"          # checkpoints and TensorBoard logs
    fasttext_model_path: str = "outputs_bilstm/fasttext_tamil.model"
    # Evaluated once, when the class body is executed
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    # Model
    max_length: int = 100      # tokens per sample after padding / truncation
    embedding_dim: int = 300
    hidden_dim: int = 256      # the classifier gives each LSTM direction hidden_dim // 2
    n_layers: int = 2
    dropout: float = 0.4

    # Training
    epochs: int = 10
    batch_size: int = 32
    lr: float = 1e-3
    weight_decay: float = 1e-5
    warmup_steps: int = 100    # NOTE(review): not read by the training loops visible in this notebook
    seed: int = 42

    # Loss options (Focal Loss is enabled by default)
    use_class_weight: bool = False   # NOTE(review): not read by the visible code
    use_oversample: bool = False     # enables WeightedRandomSampler in build_dataloaders
    use_focal: bool = True
    focal_gamma: float = 2.0
    focal_alpha: List[float] = None  # None means "no explicit per-class alpha configured"

    # FastText
    ft_dim: int = 300
    ft_window: int = 5
    ft_min_count: int = 2
    ft_epochs: int = 10

    # Data cleaning: classes with fewer samples than this are dropped by main()
    # before the stratified split.
    min_class_samples: int = 3

cfg = CFG()

# Create directories for outputs
os.makedirs(cfg.output_dir, exist_ok=True)
os.makedirs("data", exist_ok=True)

In [None]:
# -------------------------
# Utilities
# -------------------------
def seed_everything(seed=42):
    """Seed every random number generator the pipeline relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy and PyTorch (CPU, plus all CUDA devices when CUDA is available).
    With CUDA present, cuDNN is additionally switched to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(cfg.seed)

def read_data(csv_path: str) -> Tuple[List[Dict], Dict[str, int]]:
    """
    Load the headerless two-column (label, text) dataset and derive its label map.

    The separator is the regex ``\\t`` (a tab), parsed with pandas' python engine.
    Rows missing either column are dropped and the text column is cast to str.
    The label map assigns consecutive ids to the sorted unique labels; it is
    written to ``cfg.labels_json`` only when that file does not exist yet.

    Returns:
        (records, label_map) where records is a list of {'label', 'text'} dicts.

    Raises:
        FileNotFoundError: if ``csv_path`` does not exist.
    """
    print(f"Reading data from {csv_path}...")
    # Fail early with a Colab-specific hint when the upload is missing
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Dataset not found at {csv_path}. Please make sure you have uploaded it to the Colab session.")

    frame = (
        pd.read_csv(csv_path, sep='\\t', header=None, names=['label', 'text'], engine='python')
        .dropna(subset=['text', 'label'])
        .assign(text=lambda part: part['text'].astype(str))
    )

    ordered_labels = sorted(frame['label'].unique())
    label_map = {name: position for position, name in enumerate(ordered_labels)}

    # An existing label file is left untouched
    if not os.path.exists(cfg.labels_json):
        with open(cfg.labels_json, 'w') as handle:
            json.dump(label_map, handle, indent=2)
        print(f"Created and saved label map to {cfg.labels_json}")

    return frame.to_dict(orient="records"), label_map

# -------------------------
# Vocabulary Builder
# -------------------------
class Vocabulary:
    """Whitespace-token vocabulary with reserved ``<pad>`` (0) and ``<unk>`` (1) entries.

    Tokens seen at least ``min_freq`` times across ``texts`` receive ids in
    order of first appearance; every other token maps to ``<unk>`` on lookup.
    """

    def __init__(self, texts: List[str], min_freq=2):
        self.pad_token_id = 0
        self.unk_token_id = 1
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}

        frequencies = Counter()
        for sentence in texts:
            frequencies.update(sentence.split())

        for token, seen in frequencies.items():
            if seen < min_freq:
                continue
            next_id = len(self.word2idx)
            self.word2idx[token] = next_id
            self.idx2word[next_id] = token

    def __len__(self):
        return len(self.word2idx)

    def text_to_sequence(self, text: str) -> List[int]:
        """Map each whitespace token of ``text`` to its id (``<unk>`` id if unknown)."""
        lookup = self.word2idx.get
        return [lookup(token, self.unk_token_id) for token in text.split()]

# -------------------------
# Dataset
# -------------------------
class CommentsDataset(Dataset):
    """Torch dataset yielding fixed-length token-id tensors with integer labels.

    Each record must provide ``"text"`` and ``"label"`` keys. Text that is empty
    after stripping is replaced by the literal ``"<pad>"`` token. Labels are
    translated through ``label_map`` when the dataset is built.
    """

    def __init__(self, records: List[Dict], label_map: Dict[str,int], vocab: Vocabulary, max_len=128):
        self.vocab = vocab
        self.max_len = max_len
        self.label_map = label_map
        self.samples = []
        for rec in records:
            raw = str(rec["text"])
            cleaned = "<pad>" if not raw.strip() else raw
            self.samples.append((cleaned, self.label_map[rec["label"]]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label = self.samples[idx]
        # Truncate first, then right-pad whatever is still missing up to max_len
        ids = self.vocab.text_to_sequence(text)[:self.max_len]
        ids += [self.vocab.pad_token_id] * (self.max_len - len(ids))
        return {
            "text": torch.tensor(ids, dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.long),
        }


In [None]:
# -------------------------
# FastText training helper
# -------------------------
def train_fasttext(sentences: List[str], save_path: str, dim=300, window=5, min_count=2, epochs=10):
    """Load a cached FastText model from ``save_path`` or train and save a new one.

    Args:
        sentences: raw texts; each one is tokenized with ``str.split()``.
        save_path: file used both as the cache to load from and the save target.
        dim, window, min_count, epochs: forwarded to gensim's ``FastText``
            (``dim`` is passed as ``vector_size``).

    Returns:
        The loaded or newly trained FastText model.

    Note:
        A cached model is returned as-is; its hyper-parameters are not compared
        with the arguments given here.
    """
    if os.path.exists(save_path):
        print(f"FastText model found at {save_path}, loading...")
        return FastText.load(save_path)

    print("Training new FastText model...")
    tokenized = [s.split() for s in sentences]
    # os.cpu_count() may return None; fall back to a single worker in that case
    ft = FastText(sentences=tokenized, vector_size=dim, window=window, min_count=min_count,
                  workers=os.cpu_count() or 1, epochs=epochs)

    # Make sure the target directory exists before saving
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    ft.save(save_path)
    print(f"FastText model saved to {save_path}")
    # Return the trained model: callers read ft.wv right after this call
    return ft

In [None]:
# -------------------------
# Model: BiLSTM + Attention
# -------------------------
class Attention(nn.Module):
    """Additive-free attention pooling: one learned score per time step.

    A bias-free linear layer maps every step's features to a scalar, the
    scalars are softmax-normalized along dim 1, and the input is summed along
    dim 1 using those weights.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, lstm_output):
        scores = self.attn(lstm_output)
        weights = scores.softmax(dim=1)
        return (weights * lstm_output).sum(dim=1)

class BiLSTMAttentionClassifier(nn.Module):
    """Embedding -> bidirectional LSTM -> attention pooling -> dropout -> linear head.

    Each LSTM direction uses ``hidden_dim // 2`` units, so the concatenated
    bidirectional output has ``hidden_dim`` features when ``hidden_dim`` is even.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, num_labels, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Inter-layer LSTM dropout is only enabled for stacked LSTMs
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=n_layers,
                            bidirectional=True, batch_first=True, dropout=dropout if n_layers > 1 else 0)
        self.attention = Attention(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def init_weights_from_fasttext(self, ft_model, vocab):
        """Copy FastText vectors into the embedding table.

        Args:
            ft_model: object exposing ``.wv`` supporting ``in`` and ``[]`` lookups.
            vocab: object exposing ``word2idx`` (token -> row) and ``__len__``.

        Rows for words not reported by ``ft_model.wv`` stay zero. The row at the
        embedding's ``padding_idx`` is always reset to zero afterwards so padding
        keeps contributing a zero vector.
        """
        print("Initializing embedding layer with FastText vectors...")
        # Size the matrix from the layer itself rather than a global config,
        # so the model and the matrix can never disagree.
        emb_dim = self.embedding.embedding_dim
        weights_matrix = np.zeros((len(vocab), emb_dim), dtype=np.float32)
        found_count = 0
        # NOTE(review): gensim FastText vectors may report membership for unseen
        # words via character n-grams, which would make this count equal
        # len(vocab) - confirm against the installed gensim version.
        for word, i in vocab.word2idx.items():
            if word in ft_model.wv:
                weights_matrix[i] = ft_model.wv[word]
                found_count += 1

        # Never let a looked-up vector overwrite the padding row
        pad_idx = self.embedding.padding_idx
        if pad_idx is not None:
            weights_matrix[pad_idx] = 0.0

        self.embedding.weight.data.copy_(torch.from_numpy(weights_matrix))
        print(f"Found {found_count}/{len(vocab)} words in FastText model.")

    def forward(self, text):
        """Return ``(logits, None)``; the second slot is a placeholder callers discard."""
        embedded = self.embedding(text)
        lstm_output, _ = self.lstm(embedded)
        context_vector = self.attention(lstm_output)
        dropped_out = self.dropout(context_vector)
        logits = self.fc(dropped_out)
        return logits, None

In [None]:
# -------------------------
# Loss implementation (Focal Loss)
# -------------------------
class FocalLoss(nn.Module):
    """Multi-class focal loss built on per-sample cross-entropy.

    Per sample: ``(1 - p_t) ** gamma * CE``, optionally scaled by ``alpha[target]``.

    Args:
        gamma: focusing exponent; ``0`` reduces to plain cross-entropy.
        alpha: optional per-class weights (list, NumPy array or tensor).
        reduction: ``'mean'`` averages, ``'none'`` returns per-sample losses,
            any other value sums (kept for backward compatibility).
        device: device the alpha tensor is initially placed on.
    """

    def __init__(self, gamma=2.0, alpha=None, reduction='mean', device='cpu'):
        super().__init__()
        self.gamma = gamma
        if alpha is not None:
            # as_tensor accepts lists, arrays and tensors without a copy warning
            self.alpha = torch.as_tensor(alpha, dtype=torch.float32).to(device)
        else:
            self.alpha = None
        self.reduction = reduction

    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # probability assigned to the true class
        loss = ((1 - pt) ** self.gamma) * ce
        if self.alpha is not None:
            # Index on the logits' device so a moved model does not trip a device mismatch
            loss = self.alpha.to(logits.device)[targets] * loss

        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'none':
            return loss
        return loss.sum()


In [None]:
# -------------------------
# Training & evaluation loops
# -------------------------
def evaluate(model, dataloader, device, id2label):
    """Run the model over ``dataloader`` and score its argmax predictions.

    Args:
        model: callable returning ``(logits, extra)`` for a batch of token ids.
        dataloader: yields dicts with ``'text'`` and ``'label'`` tensors.
        device: device the text batches are moved to.
        id2label: mapping from class id to class name.

    Returns:
        (report, cm): the sklearn classification report as a dict and the
        confusion matrix, both covering every class id in ``id2label``.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            text = batch['text'].to(device)
            logits, _ = model(text)
            y_true.extend(batch['label'].tolist())
            y_pred.extend(torch.argmax(logits, dim=1).tolist())

    # Pass the full class list explicitly: without `labels`, sklearn infers the
    # classes from the data and raises when their number differs from
    # target_names (e.g. a rare class absent from this split).
    class_ids = sorted(id2label)
    report = classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=[id2label[i] for i in class_ids],
        output_dict=True,
        zero_division=0,
    )
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    return report, cm

def train_loop(train_loader, val_loader, model, cfg, class_weights=None, id2label=None, writer=None):
    """Train with focal loss and checkpoint the best validation macro-F1.

    Args:
        train_loader, val_loader: loaders yielding ``'text'`` / ``'label'`` batches.
        model: network returning ``(logits, extra)``.
        cfg: config providing device, lr, weight_decay, focal_gamma, epochs, output_dir.
        class_weights: optional per-class weights; normalized to sum to 1 and
            used as focal alpha. ``None`` disables alpha weighting.
        id2label: class id -> name mapping forwarded to ``evaluate``.
        writer: optional summary writer; receives ``eval/macro_f1`` per epoch.

    Returns:
        Path of the best checkpoint, or ``None`` if no epoch produced one.
    """
    device = cfg.device
    model.to(device)
    print(f"Training on {device}...")

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

    # class_weights defaults to None, so only normalize when it was supplied
    alpha = None if class_weights is None else class_weights / class_weights.sum()
    criterion = FocalLoss(gamma=cfg.focal_gamma, alpha=alpha, device=cfg.device)

    best_macro, best_ckpt = -1.0, None
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}")
        for batch in pbar:
            text, labels = batch['text'].to(device), batch['label'].to(device)
            optimizer.zero_grad()
            logits, _ = model(text)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            pbar.set_postfix(loss=loss.item())

        # Index instead of unpacking: the notebook defines evaluate() variants
        # that return either 2 or 4 items, with the report always first.
        report = evaluate(model, val_loader, device, id2label)[0]
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")

        if writer is not None:
            writer.add_scalar("eval/macro_f1", macro_f1, epoch)

        if macro_f1 > best_macro:
            best_macro = macro_f1
            ckpt_path = os.path.join(cfg.output_dir, "best_model.pt")
            torch.save({"model_state_dict": model.state_dict()}, ckpt_path)
            # Remember the path so the caller can reload the best weights
            best_ckpt = ckpt_path
            print(f"✅ New best model saved with Macro F1: {macro_f1:.4f}")

    return best_ckpt

In [None]:
# -------------------------
# Main pipeline orchestration
# -------------------------
def main():
    """Run the whole pipeline: load, filter rare classes, split, train, test.

    Steps: read the dataset, drop classes with fewer than ``MIN_SAMPLES`` rows,
    build the label map, make stratified train/val/test splits, build the
    vocabulary from the training texts, train or load FastText, train the
    BiLSTM classifier and evaluate the best checkpoint on the test split.

    Raises:
        FileNotFoundError: if ``cfg.data_csv`` does not exist.
    """
    # Load the full dataset into a pandas DataFrame first
    if not os.path.exists(cfg.data_csv):
        raise FileNotFoundError(f"Dataset not found at {cfg.data_csv}. Please make sure you have uploaded it.")

    df = pd.read_csv(cfg.data_csv, sep='\\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)

    # --- Filter out classes that are too small for a stratified split ---
    print("Original dataset size:", len(df))
    label_counts = df['label'].value_counts()

    # Define a minimum number of samples for a class to be included
    MIN_SAMPLES = 3
    classes_to_keep = label_counts[label_counts >= MIN_SAMPLES].index.tolist()

    if len(classes_to_keep) < len(label_counts):
        print(f"Filtering out classes with less than {MIN_SAMPLES} samples.")
        removed_classes = label_counts[label_counts < MIN_SAMPLES].index.tolist()
        print("--> Removed classes:", removed_classes)
        df = df[df['label'].isin(classes_to_keep)]
        print("Cleaned dataset size:", len(df))
    else:
        print("All classes have sufficient samples for a stratified split.")

    # Create a new, clean label map from the filtered data
    unique_labels = sorted(df['label'].unique())
    label_map = {label: i for i, label in enumerate(unique_labels)}
    id2label = {v: k for k, v in label_map.items()}
    num_classes = len(label_map)
    print(f"\nProceeding with {num_classes} classes.")

    # Convert the cleaned DataFrame back to a list of records for the rest of the script
    data = df.to_dict(orient="records")
    labels = [label_map[r['label']] for r in data]

    # Now, we can safely stratify the split on the cleaned data
    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=labels)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.15, random_state=cfg.seed, stratify=[labels[i] for i in train_idx])

    train_records = [data[i] for i in train_idx]
    val_records = [data[i] for i in val_idx]
    test_records = [data[i] for i in test_idx]

    print(f"Data split -> Train: {len(train_records)}, Val: {len(val_records)}, Test: {len(test_records)}")

    vocab = Vocabulary([r['text'] for r in train_records], min_freq=cfg.ft_min_count)
    print(f"Vocabulary size: {len(vocab)}")

    # Inverse-frequency class weights, rescaled so they sum to num_classes
    cnt = Counter([label_map[r['label']] for r in train_records])
    class_counts = np.array([cnt.get(i, 0) for i in range(num_classes)])
    class_weights = (1.0 / (class_counts + 1e-9))
    class_weights = class_weights / class_weights.sum() * num_classes
    print("Class counts (train set):", dict(sorted(cnt.items())))

    # Forward the configured min_count so FastText and the vocabulary agree
    ft_model = train_fasttext([r['text'] for r in data], cfg.fasttext_model_path, dim=cfg.ft_dim,
                              min_count=cfg.ft_min_count, epochs=cfg.ft_epochs)

    model = BiLSTMAttentionClassifier(vocab_size=len(vocab), embedding_dim=cfg.embedding_dim, hidden_dim=cfg.hidden_dim,
                                      n_layers=cfg.n_layers, num_labels=num_classes, dropout=cfg.dropout)
    model.init_weights_from_fasttext(ft_model, vocab)

    # build_dataloaders reads cfg.use_oversample itself; it takes no use_sampler keyword
    train_loader, val_loader = build_dataloaders(train_records, val_records, label_map, vocab, cfg)
    writer = SummaryWriter(logdir=os.path.join(cfg.output_dir, "tb_logs"))

    print("\n--- Starting Training ---")
    # train_loop must accept a `writer` keyword (the later definition in this notebook does)
    try:
        best_ckpt = train_loop(train_loader, val_loader, model, cfg, class_weights=class_weights, id2label=id2label, writer=writer)
    finally:
        # Close the log writer even when training raises
        writer.close()

    print("\n--- Final Test Set Evaluation ---")
    if best_ckpt:
        print(f"Loading best checkpoint from: {best_ckpt}")
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])

        test_ds = CommentsDataset(test_records, label_map, vocab, max_len=cfg.max_length)
        test_loader = DataLoader(test_ds, batch_size=cfg.batch_size*2)
        # Take the first two results only: evaluate() variants return 2 or 4 items
        report, cm = evaluate(model, test_loader, cfg.device, id2label)[:2]

        print("\nTest Report:")
        print(json.dumps(report, indent=2))
        print("\nConfusion Matrix:\n", cm)
    else:
        print("Training did not complete successfully, no checkpoint to evaluate.")

In [None]:
# -------------------------
# Training & evaluation loops
# -------------------------
def evaluate(model, dataloader, device, id2label):
    """Score the model's argmax predictions on ``dataloader``.

    Args:
        model: callable returning ``(logits, extra)`` for a batch of token ids.
        dataloader: yields dicts with ``'text'`` and ``'label'`` tensors.
        device: device the text batches are moved to.
        id2label: mapping from class id to class name.

    Returns:
        (report, cm): classification report dict and confusion matrix, both
        computed over every class id present in ``id2label``.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            text = batch['text'].to(device)
            logits, _ = model(text)
            y_true.extend(batch['label'].tolist())
            y_pred.extend(torch.argmax(logits, dim=1).tolist())

    # Give sklearn the explicit class list; otherwise it infers classes from
    # the data and raises if that count differs from len(target_names).
    class_ids = sorted(id2label)
    class_names = [id2label[i] for i in class_ids]
    report = classification_report(y_true, y_pred, labels=class_ids, target_names=class_names,
                                   output_dict=True, zero_division=0)
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    return report, cm

def train_loop(train_loader, val_loader, model, cfg, class_weights=None, id2label=None, writer=None):
    """Train the model and checkpoint the epoch with the best validation macro-F1.

    Args:
        train_loader, val_loader: loaders yielding ``'text'`` / ``'label'`` batches.
        model: network returning ``(logits, extra)``.
        cfg: config providing device, lr, weight_decay, use_focal, focal_gamma,
            epochs and output_dir.
        class_weights: optional per-class weights; with focal loss they are
            normalized to sum to 1 and used as alpha. ``None`` means no alpha.
        id2label: class id -> name mapping forwarded to ``evaluate``.
        writer: optional summary writer; receives ``eval/macro_f1`` per epoch.

    Returns:
        Path of the best checkpoint, or ``None`` if no epoch produced one.
    """
    device = cfg.device
    model.to(device)
    print(f"Training on {device}...")

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

    if cfg.use_focal:
        # class_weights defaults to None, so only normalize when it was supplied
        alpha = None if class_weights is None else class_weights / class_weights.sum()
        criterion = FocalLoss(gamma=cfg.focal_gamma, alpha=alpha, device=cfg.device)
    else:  # Default to CrossEntropy
        criterion = nn.CrossEntropyLoss()

    best_macro, best_ckpt = -1.0, None
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}")
        for batch in pbar:
            text, labels = batch['text'].to(device), batch['label'].to(device)
            optimizer.zero_grad()
            logits, _ = model(text)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            pbar.set_postfix(loss=loss.item())

        # Index instead of unpacking: the notebook defines evaluate() variants
        # that return either 2 or 4 items, with the report always first.
        report = evaluate(model, val_loader, device, id2label)[0]
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")

        if writer:
            writer.add_scalar("eval/macro_f1", macro_f1, epoch)

        if macro_f1 > best_macro:
            best_macro = macro_f1
            ckpt_path = os.path.join(cfg.output_dir, "best_model.pt")
            torch.save({"model_state_dict": model.state_dict()}, ckpt_path)
            # Record the path; without this the function always returned None
            best_ckpt = ckpt_path
            print(f"✅ New best model saved with Macro F1: {macro_f1:.4f}")

    return best_ckpt

def build_dataloaders(train_records, val_records, label_map, vocab, cfg, use_sampler=None):
    """Build the training and validation DataLoaders.

    Args:
        train_records, val_records: lists of dicts with ``'text'`` / ``'label'``.
        label_map: label -> class id mapping.
        vocab: vocabulary handed to ``CommentsDataset``.
        cfg: config providing max_length, batch_size and use_oversample.
        use_sampler: optional override for oversampling; ``None`` (default)
            falls back to ``cfg.use_oversample``.

    Returns:
        (train_loader, val_loader). Training either shuffles or, with
        oversampling, draws samples with probability inversely proportional to
        class frequency. Validation uses twice the batch size and no shuffling.
    """
    train_ds = CommentsDataset(train_records, label_map, vocab, max_len=cfg.max_length)
    val_ds = CommentsDataset(val_records, label_map, vocab, max_len=cfg.max_length)

    oversample = cfg.use_oversample if use_sampler is None else bool(use_sampler)

    train_loader_args = {"batch_size": cfg.batch_size, "num_workers": 2, "pin_memory": True}
    if oversample:
        print("Using WeightedRandomSampler for oversampling minority classes.")
        labels = [label_map[rec['label']] for rec in train_records]
        cnt = Counter(labels)
        # Each sample is weighted by the inverse frequency of its class
        sample_weights = [1.0 / cnt[lbl] for lbl in labels]
        sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
        # A sampler and shuffle=True are mutually exclusive in DataLoader
        train_loader_args["sampler"] = sampler
    else:
        train_loader_args["shuffle"] = True

    train_loader = DataLoader(train_ds, **train_loader_args)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=2)
    return train_loader, val_loader

# -------------------------
# Main pipeline orchestration
# -------------------------
def main():
    """Run the whole pipeline: load, filter rare classes, split, train, test.

    Classes with fewer than ``cfg.min_class_samples`` rows (3 when the active
    config has no such field) are removed before the stratified
    train/val/test split. The best checkpoint reported by ``train_loop`` is
    reloaded and evaluated on the test split.

    Raises:
        FileNotFoundError: if ``cfg.data_csv`` does not exist.
    """
    if not os.path.exists(cfg.data_csv):
        raise FileNotFoundError(f"Dataset not found at {cfg.data_csv}. Please upload it to the Colab session first.")

    df = pd.read_csv(cfg.data_csv, sep='\\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)

    print("Original dataset size:", len(df))

    # The notebook defines more than one CFG; fall back to 3 when the active
    # one does not declare min_class_samples.
    min_samples = getattr(cfg, "min_class_samples", 3)

    # Convert value counts to a standard Python dictionary to avoid pandas indexing issues
    label_counts_dict = df['label'].value_counts().to_dict()

    # Identify classes to keep and remove by iterating through the dictionary
    classes_to_keep = [label for label, count in label_counts_dict.items() if count >= min_samples]

    if len(classes_to_keep) < len(label_counts_dict):
        removed_classes = [label for label, count in label_counts_dict.items() if count < min_samples]
        print(f"\nFiltering out classes with less than {min_samples} samples.")
        print("Removed classes:", removed_classes)

        # Filter the DataFrame using the list of classes to keep
        df = df[df['label'].isin(classes_to_keep)]
        print("Cleaned dataset size:", len(df))
    else:
        print("All classes have sufficient samples.")

    # Create a new, clean label map from the filtered data
    unique_labels = sorted(df['label'].unique())
    label_map = {label: i for i, label in enumerate(unique_labels)}
    id2label = {v: k for k, v in label_map.items()}
    num_classes = len(label_map)

    print(f"\nProcessing {num_classes} classes for training.")

    data = df.to_dict(orient="records")
    labels = [label_map[r['label']] for r in data]

    # Now, we can safely stratify the split on the cleaned data
    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=labels)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.15, random_state=cfg.seed, stratify=[labels[i] for i in train_idx])

    train_records, val_records, test_records = [data[i] for i in train_idx], [data[i] for i in val_idx], [data[i] for i in test_idx]
    print(f"Data split -> Train: {len(train_records)}, Val: {len(val_records)}, Test: {len(test_records)}")

    vocab = Vocabulary([r['text'] for r in train_records], min_freq=cfg.ft_min_count)

    # Inverse-frequency class weights computed on the training split only
    cnt = Counter([label_map[r['label']] for r in train_records])
    class_counts = np.array([cnt.get(i, 0) for i in range(num_classes)])
    class_weights = 1.0 / (class_counts + 1e-9)

    ft_model = train_fasttext([r['text'] for r in data], cfg.fasttext_model_path, dim=cfg.ft_dim, min_count=cfg.ft_min_count, epochs=cfg.ft_epochs)

    model = BiLSTMAttentionClassifier(vocab_size=len(vocab), embedding_dim=cfg.embedding_dim, hidden_dim=cfg.hidden_dim,
                                      n_layers=cfg.n_layers, num_labels=num_classes, dropout=cfg.dropout)
    model.init_weights_from_fasttext(ft_model, vocab)

    train_loader, val_loader = build_dataloaders(train_records, val_records, label_map, vocab, cfg)

    writer = SummaryWriter(logdir=os.path.join(cfg.output_dir, "tb_logs"))

    print("\n--- Starting Training ---")
    try:
        best_ckpt = train_loop(train_loader, val_loader, model, cfg, class_weights=class_weights, id2label=id2label, writer=writer)
    finally:
        # Close the log writer even when training raises
        writer.close()

    print("\n--- Final Test Set Evaluation ---")
    if best_ckpt:
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])
        test_ds = CommentsDataset(test_records, label_map, vocab, max_len=cfg.max_length)
        test_loader = DataLoader(test_ds, batch_size=cfg.batch_size*2)
        # Take the first two results only: evaluate() variants return 2 or 4 items
        report, cm = evaluate(model, test_loader, cfg.device, id2label)[:2]

        print("\nTest Report:")
        print(json.dumps(report, indent=2))
        print("\nConfusion Matrix:\n", cm)
    else:
        print("Training did not complete successfully, no checkpoint to evaluate.")

In [None]:
# Single Colab cell: full pipeline (BiLSTM + FastText) with corrected evaluate()
# Step 1: Install necessary libraries (uncomment if running fresh)
# !pip install torch scikit-learn gensim tensorboardX pandas tqdm -q

import os
import json
import random
from collections import Counter
from dataclasses import dataclass
from typing import List, Dict, Tuple
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from tqdm import tqdm
from tensorboardX import SummaryWriter

# -------------------------
# Config - edit these
# -------------------------
@dataclass
class CFG:
    """Settings for the single-cell BiLSTM + FastText pipeline.

    Every field has a default, so ``CFG()`` is directly usable. Field order is
    part of the interface because the dataclass-generated ``__init__`` accepts
    positional arguments.
    """
    # --- Paths, device and data cleaning ---
    data_csv: str = "tamil_sentiment_full.csv"
    output_dir: str = "outputs_bilstm"
    fasttext_model_path: str = "outputs_bilstm/fasttext_tamil.model"
    # Chosen once, when the class body runs
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Classes with fewer rows than this are filtered out in main()
    min_class_samples: int = 3

    # --- Network architecture ---
    max_length: int = 100
    embedding_dim: int = 300
    hidden_dim: int = 256
    n_layers: int = 2
    dropout: float = 0.4

    # --- Optimization ---
    epochs: int = 10
    batch_size: int = 64
    lr: float = 1e-3
    seed: int = 42

    # --- Loss ---
    use_focal: bool = True
    focal_gamma: float = 2.0

    # --- FastText embeddings ---
    ft_dim: int = 300
    ft_min_count: int = 2
    ft_epochs: int = 10

cfg = CFG()

os.makedirs(cfg.output_dir, exist_ok=True)
os.makedirs("data", exist_ok=True)

# -------------------------
# Utilities & helpers
# -------------------------
def seed_everything(seed=42):
    """Make runs repeatable by seeding Python, NumPy and PyTorch.

    Also exports ``PYTHONHASHSEED``. When CUDA is available, every CUDA
    generator is seeded and cuDNN is put in deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)
        torch.backends.cudnn.deterministic = True

seed_everything(cfg.seed)

class Vocabulary:
    """Token-to-id table with ``<pad>`` at 0 and ``<unk>`` at 1.

    Whitespace tokens that occur at least ``min_freq`` times in ``texts`` get
    the next free id, in order of first appearance.
    """

    def __init__(self, texts: List[str], min_freq=2):
        self.pad_token_id = 0
        self.unk_token_id = 1
        self.word2idx = {'<pad>': 0, '<unk>': 1}

        tally = Counter()
        for line in texts:
            tally.update(line.split())

        for token, seen in tally.items():
            if seen < min_freq:
                continue
            self.word2idx[token] = len(self.word2idx)

    def __len__(self):
        return len(self.word2idx)

    def text_to_sequence(self, text: str) -> List[int]:
        """Return the id of every whitespace token, using the ``<unk>`` id for unknown ones."""
        fallback = self.unk_token_id
        return [self.word2idx.get(token, fallback) for token in text.split()]

class CommentsDataset(Dataset):
    """Torch dataset producing fixed-length token-id tensors and integer labels.

    Records need ``"text"`` and ``"label"`` keys; labels are translated through
    ``label_map`` when the dataset is constructed.
    """

    def __init__(self, records: List[Dict], label_map: Dict[str,int], vocab: Vocabulary, max_len=128):
        self.vocab = vocab
        self.max_len = max_len
        self.label_map = label_map
        self.samples = []
        for rec in records:
            self.samples.append((str(rec["text"]), self.label_map[rec["label"]]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label = self.samples[idx]
        # Cut to max_len first, then right-pad the remainder with the pad id
        ids = self.vocab.text_to_sequence(text)[:self.max_len]
        ids += [self.vocab.pad_token_id] * (self.max_len - len(ids))
        return {
            "text": torch.tensor(ids, dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.long),
        }

def train_fasttext(sentences: List[str], save_path: str, dim=300, min_count=2, epochs=10):
    """Return a FastText model, loading it from ``save_path`` when that file exists.

    Otherwise a new model is trained on the whitespace-tokenized ``sentences``
    (window fixed at 5, one worker per reported CPU), saved to ``save_path``
    and returned.
    """
    if not os.path.exists(save_path):
        print(f"⏳ Training new FastText model for {epochs} epochs... (This can take 5-10 minutes)")
        corpus = [sentence.split() for sentence in sentences]
        trained = FastText(
            sentences=corpus,
            vector_size=dim,
            window=5,
            min_count=min_count,
            workers=os.cpu_count(),
            epochs=epochs,
        )
        trained.save(save_path)
        print("✅ FastText model training complete.")
        return trained

    print(f"✅ FastText model found at {save_path}, loading...")
    return FastText.load(save_path)

class Attention(nn.Module):
    """Attention pooling over dim 1 using one learned, bias-free score per step."""

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, lstm_output):
        # Scores are normalized across dim 1, then used to weight and sum the inputs
        step_scores = self.attn(lstm_output)
        step_weights = step_scores.softmax(dim=1)
        weighted = step_weights * lstm_output
        return weighted.sum(dim=1)

class BiLSTMAttentionClassifier(nn.Module):
    """Embedding -> bidirectional LSTM -> attention pooling -> dropout -> linear head.

    Each LSTM direction uses ``hidden_dim // 2`` units, so the concatenated
    bidirectional output has ``hidden_dim`` features when ``hidden_dim`` is even.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, num_labels, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Inter-layer LSTM dropout is only enabled for stacked LSTMs
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=n_layers, bidirectional=True,
                            batch_first=True, dropout=dropout if n_layers > 1 else 0)
        self.attention = Attention(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def init_weights_from_fasttext(self, ft_model, vocab):
        """Copy FastText vectors into the embedding table.

        Args:
            ft_model: object exposing ``.wv`` supporting ``in`` and ``[]`` lookups.
            vocab: object exposing ``word2idx`` (token -> row) and ``__len__``.

        Rows for words not reported by ``ft_model.wv`` keep a standard-normal
        random initialization. The row at the embedding's ``padding_idx`` is
        reset to zero afterwards so padding keeps contributing a zero vector.
        """
        print("⏳ Initializing embedding layer with FastText vectors...")
        # Size the matrix from the layer itself rather than a global config,
        # so the model and the matrix can never disagree.
        emb_dim = self.embedding.embedding_dim
        weights_matrix = np.random.normal(size=(len(vocab), emb_dim)).astype(np.float32)
        found_count = 0
        # NOTE(review): gensim FastText vectors may report membership for unseen
        # words via character n-grams, which would make this count equal
        # len(vocab) - confirm against the installed gensim version.
        for word, i in vocab.word2idx.items():
            if word in ft_model.wv:
                weights_matrix[i] = ft_model.wv[word]
                found_count += 1

        # Neither a random nor a looked-up vector may occupy the padding row
        pad_idx = self.embedding.padding_idx
        if pad_idx is not None:
            weights_matrix[pad_idx] = 0.0

        self.embedding.weight.data.copy_(torch.from_numpy(weights_matrix))
        print(f"✅ Found {found_count}/{len(vocab)} words in FastText model.")

    def forward(self, text):
        """Return ``(logits, None)``; the second slot is a placeholder callers discard."""
        embedded = self.embedding(text)
        lstm_output, _ = self.lstm(embedded)
        context_vector = self.attention(lstm_output)
        return self.fc(self.dropout(context_vector)), None

class FocalLoss(nn.Module):
    """Mean-reduced multi-class focal loss.

    Per sample the loss is ``(1 - p_t) ** gamma * CE``; when ``alpha`` is
    given, each sample is additionally scaled by ``alpha[target]``.
    """

    def __init__(self, gamma=2.0, alpha=None, device='cpu'):
        super().__init__()
        self.gamma = gamma
        self.alpha = None
        if alpha is not None:
            self.alpha = torch.tensor(alpha, dtype=torch.float32).to(device)

    def forward(self, logits, targets):
        per_sample_ce = F.cross_entropy(logits, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the true class
        p_true = torch.exp(-per_sample_ce)
        focal = (1 - p_true).pow(self.gamma) * per_sample_ce
        if self.alpha is None:
            return focal.mean()
        return (self.alpha[targets] * focal).mean()

# -------------------------
# Corrected evaluate() -> returns 4 items
# -------------------------
def evaluate(model, dataloader, device, id2label):
    """
    Run the model over ``dataloader`` and score its argmax predictions.

    Args:
      - model: callable returning ``(logits, extra)`` for a batch of token ids
      - dataloader: yields dicts with ``'text'`` and ``'label'`` tensors
      - device: device the text batches are moved to
      - id2label (dict): class id -> class name

    Returns:
      - report (dict): sklearn classification_report as dict (output_dict=True)
      - cm (np.array): confusion matrix
      - y_true (list): ground-truth labels
      - y_pred (list): predicted labels

    The report and matrix cover every class id in ``id2label``, including
    classes that do not occur in this split.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            text = batch['text'].to(device)
            logits, _ = model(text)
            y_true.extend(batch['label'].tolist())
            y_pred.extend(torch.argmax(logits, dim=1).tolist())

    # Pass the full class list explicitly: without `labels`, sklearn infers the
    # classes from the data and raises when their number differs from
    # target_names (e.g. a rare class absent from this split).
    class_ids = sorted(id2label)
    report = classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=[id2label[i] for i in class_ids],
        digits=4,
        output_dict=True,
        zero_division=0
    )
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    return report, cm, y_true, y_pred

# -------------------------
# Training loop
# -------------------------
def train_loop(train_loader, val_loader, model, cfg, class_weights, id2label):
    """Train ``model`` with AdamW + focal loss, checkpointing on best validation macro-F1.

    Args:
        train_loader: yields dicts with ``'text'`` and ``'label'`` tensors.
        val_loader: same format; scored after every epoch via ``evaluate``.
        model: module returning ``(logits, extra)``; moved to ``cfg.device``.
        cfg: config exposing ``device``, ``lr``, ``focal_gamma``, ``epochs``
            and ``output_dir``.
        class_weights: per-class weights; normalised to sum to 1 and used as
            the focal-loss ``alpha``.
        id2label: class id -> class name, forwarded to ``evaluate``.

    Returns:
        Path of the checkpoint written during THIS call, or ``None`` if no
        checkpoint was written (e.g. ``cfg.epochs == 0``). Returning ``None``
        keeps the caller from loading a stale ``best_model.pt`` left behind by
        an earlier run.
    """
    device = cfg.device
    model.to(device)
    print(f"✅ All setup complete. Starting training on {device}...")
    # Presumably a short pause so the banner is flushed before the first progress bar.
    time.sleep(1)

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr)
    alpha = class_weights / class_weights.sum()
    criterion = FocalLoss(gamma=cfg.focal_gamma, alpha=alpha, device=cfg.device)

    ckpt_path = os.path.join(cfg.output_dir, "best_model.pt")
    best_ckpt = None
    best_macro = -1.0
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}")
        for batch in pbar:
            text, labels = batch['text'].to(device), batch['label'].to(device)
            optimizer.zero_grad()
            logits, _ = model(text)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            pbar.set_postfix(loss=loss.item())

        report, _, _, _ = evaluate(model, val_loader, device, id2label)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro:
            best_macro = macro_f1
            # Make sure the target directory exists before the first save.
            os.makedirs(cfg.output_dir, exist_ok=True)
            torch.save({"model_state_dict": model.state_dict()}, ckpt_path)
            best_ckpt = ckpt_path
            print(f"🚀 New best model saved with Macro F1: {macro_f1:.4f}")
    return best_ckpt

# -------------------------
# Main pipeline
# -------------------------
def main():
    """End-to-end pipeline: load data, split, build vocab, train FastText, train the classifier, evaluate on test.

    Reads all settings from the module-level ``cfg`` and prints progress; returns nothing.
    """
    start_time = time.time()
    print("--- Step 1: Loading and Cleaning Data ---")
    # Headerless two-column input: first column is the label, second the text.
    df = pd.read_csv(cfg.data_csv, sep='\\t', header=None, names=['label', 'text'], engine='python')
    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)

    # Drop classes that have fewer than cfg.min_class_samples rows.
    label_counts = df['label'].value_counts()
    classes_to_keep = label_counts[label_counts >= cfg.min_class_samples].index.tolist()
    if len(classes_to_keep) < len(label_counts):
        print(f"Filtering out classes with less than {cfg.min_class_samples} samples.")
        df = df[df['label'].isin(classes_to_keep)]
    print(f"✅ Data loading complete. Kept {len(df)} records.")

    print("\n--- Step 2: Splitting Data and Building Vocabulary ---")
    # Class ids are assigned in sorted label order.
    unique_labels = sorted(df['label'].unique())
    label_map = {label: i for i, label in enumerate(unique_labels)}
    id2label = {v: k for k, v in label_map.items()}

    data = df.to_dict(orient="records")
    labels = [label_map[r['label']] for r in data]
    # Stratified split: 15% test first, then 15% of the remainder as validation.
    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=labels)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.15, random_state=cfg.seed, stratify=[labels[i] for i in train_idx])
    train_records, val_records, test_records = [data[i] for i in train_idx], [data[i] for i in val_idx], [data[i] for i in test_idx]

    # The vocabulary is built from the training split only.
    vocab = Vocabulary([r['text'] for r in train_records], min_freq=cfg.ft_min_count)
    print(f"✅ Vocabulary built with {len(vocab)} unique tokens.")

    print("\n--- Step 3: Training FastText Embeddings ---")
    # NOTE(review): the embeddings are fit on ALL records (train + val + test), not just the training split.
    ft_model = train_fasttext([r['text'] for r in data], cfg.fasttext_model_path, dim=cfg.ft_dim, min_count=cfg.ft_min_count, epochs=cfg.ft_epochs)

    print("\n--- Step 4: Building Model and DataLoaders ---")
    model = BiLSTMAttentionClassifier(vocab_size=len(vocab), embedding_dim=cfg.embedding_dim, hidden_dim=cfg.hidden_dim, n_layers=cfg.n_layers, num_labels=len(label_map), dropout=cfg.dropout)
    model.init_weights_from_fasttext(ft_model, vocab)

    train_ds = CommentsDataset(train_records, label_map, vocab, max_len=cfg.max_length)
    val_ds = CommentsDataset(val_records, label_map, vocab, max_len=cfg.max_length)
    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=0)

    # Inverse training-set class frequency; the epsilon avoids division by zero for an empty class.
    cnt = Counter([label_map[r['label']] for r in train_records])
    class_weights = np.array([cnt.get(i, 0) for i in range(len(label_map))])
    class_weights = 1.0 / (class_weights + 1e-9)

    # --- Step 5: Starting the Training Loop ---
    best_ckpt = train_loop(train_loader, val_loader, model, cfg, class_weights=class_weights, id2label=id2label)

    # --- Step 6: Final Evaluation ---
    print("\n--- Final Test Set Evaluation ---")
    # The test set is only scored when the best checkpoint exists on disk; otherwise this step is skipped silently.
    if best_ckpt and os.path.exists(best_ckpt):
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])
        test_ds = CommentsDataset(test_records, label_map, vocab, max_len=cfg.max_length)
        test_loader = DataLoader(test_ds, batch_size=cfg.batch_size*2, num_workers=0)
        report, cm, y_true, y_pred = evaluate(model, test_loader, cfg.device, id2label)

        print("\nClassification Report:\n")
        # Recomputed as text here because evaluate() returns the report as a dict.
        print(classification_report(y_true, y_pred,target_names=list(id2label.values()),digits=4))

        print("\nConfusion Matrix:\n", cm)

    total_time = time.time() - start_time
    print(f"\nTotal execution time: {total_time/60:.2f} minutes.")

# --- Run the main function ---
# Guarded so that importing this code as a module does not start a training run.
if __name__ == '__main__':
    main()


--- Step 1: Loading and Cleaning Data ---
Filtering out classes with less than 3 samples.
✅ Data loading complete. Kept 44019 records.

--- Step 2: Splitting Data and Building Vocabulary ---
✅ Vocabulary built with 21993 unique tokens.

--- Step 3: Training FastText Embeddings ---
✅ FastText model found at outputs_bilstm/fasttext_tamil.model, loading...

--- Step 4: Building Model and DataLoaders ---
⏳ Initializing embedding layer with FastText vectors...
✅ Found 21993/21993 words in FastText model.
✅ All setup complete. Starting training on cuda...


Train E1/10: 100%|██████████| 497/497 [00:11<00:00, 44.67it/s, loss=0.108]


Epoch 1 -> Val Macro F1: 0.4199
🚀 New best model saved with Macro F1: 0.4199


Train E2/10: 100%|██████████| 497/497 [00:11<00:00, 44.07it/s, loss=0.0611]


Epoch 2 -> Val Macro F1: 0.4175


Train E3/10: 100%|██████████| 497/497 [00:11<00:00, 43.93it/s, loss=0.0434]


Epoch 3 -> Val Macro F1: 0.4253
🚀 New best model saved with Macro F1: 0.4253


Train E4/10: 100%|██████████| 497/497 [00:11<00:00, 44.61it/s, loss=0.0374]


Epoch 4 -> Val Macro F1: 0.4415
🚀 New best model saved with Macro F1: 0.4415


Train E5/10: 100%|██████████| 497/497 [00:11<00:00, 44.75it/s, loss=0.0207]


Epoch 5 -> Val Macro F1: 0.4274


Train E6/10: 100%|██████████| 497/497 [00:10<00:00, 45.49it/s, loss=0.0185]


Epoch 6 -> Val Macro F1: 0.4371


Train E7/10: 100%|██████████| 497/497 [00:10<00:00, 45.47it/s, loss=0.0124]


Epoch 7 -> Val Macro F1: 0.4397


Train E8/10: 100%|██████████| 497/497 [00:11<00:00, 44.80it/s, loss=0.0178]


Epoch 8 -> Val Macro F1: 0.4332


Train E9/10: 100%|██████████| 497/497 [00:11<00:00, 44.72it/s, loss=0.0145]


Epoch 9 -> Val Macro F1: 0.4212


Train E10/10: 100%|██████████| 497/497 [00:11<00:00, 44.82it/s, loss=0.00613]


Epoch 10 -> Val Macro F1: 0.4240

--- Final Test Set Evaluation ---





Classification Report:

                precision    recall  f1-score   support

Mixed_feelings     0.1996    0.3938    0.2649       739
      Negative     0.3410    0.3954    0.3662       784
      Positive     0.8133    0.5162    0.6316      3731
     not-Tamil     0.3870    0.6837    0.4942       313
 unknown_state     0.3863    0.4903    0.4322      1036

      accuracy                         0.4920      6603
     macro avg     0.4255    0.4959    0.4378      6603
  weighted avg     0.6014    0.4920    0.5212      6603


Confusion Matrix:
 [[ 291  145  144   23  136]
 [ 202  310  115   25  132]
 [ 752  323 1926  234  496]
 [  20   10   26  214   43]
 [ 193  121  157   57  508]]

Total execution time: 2.09 minutes.


In [None]:
# Full corrected script: CharCNN + FastText + BiLSTM + Attention + Aux features
# Requirements:
# pip install torch scikit-learn gensim pandas tqdm

import os, time, random, json
from collections import Counter
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from tqdm import tqdm

# -------------------------
# Config
# -------------------------
@dataclass
class CFG:
    """All tunable settings for the CharCNN + FastText + BiLSTM pipeline in this cell."""
    data_csv: str = "tamil_sentiment_full.csv"   # tab-separated file: label \t text
    output_dir: str = "outputs_char_bilstm"      # checkpoints, FastText model and predictions are written here
    device: str = "cuda" if torch.cuda.is_available() else "cpu"  # evaluated once, when the class is defined
    min_class_samples: int = 3                   # labels with fewer rows than this are dropped in main()

    # vocab / chars
    min_token_freq: int = 2       # minimum count for a token to enter the word vocabulary
    max_chars_per_token: int = 12 # characters kept per token (longer tokens are truncated)
    min_char_freq: int = 1        # minimum count for a character to enter the char vocabulary

    # embedding / ft
    ft_dim: int = 300                 # FastText vector size (also the word-embedding width)
    ft_min_count: int = 2             # FastText min_count
    ft_epochs: int = 8                # FastText training epochs
    embedding_trainable: bool = True  # False freezes the pretrained word embeddings

    # model
    hidden_dim: int = 256  # BiLSTM output width (hidden_dim // 2 per direction)
    lstm_layers: int = 1
    char_emb_dim: int = 50 # character embedding width inside CharCNN
    char_out: int = 100    # total CharCNN output channels, split across kernel sizes
    attn_dim: int = 128    # width of the attention projection
    aux_dim: int = 8      # number of auxiliary features
    dropout: float = 0.3

    # training
    epochs: int = 6
    batch_size: int = 64         # eval loaders use twice this
    lr_emb: float = 5e-5         # learning rate for the word-embedding parameter group
    lr_head: float = 1e-3        # learning rate for every other parameter
    weight_decay: float = 1e-5
    use_sampler: bool = False    # True -> WeightedRandomSampler instead of plain shuffling
    use_focal: bool = True       # True -> FocalLoss, False -> unweighted CrossEntropyLoss
    focal_gamma: float = 2.0
    seed: int = 42

# Instantiate the config with its defaults and make sure the output directory exists.
cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# -------------------------
# Utilities
# -------------------------
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs (plus every CUDA device when available)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Seed all RNGs once, at definition time, using the configured seed.
seed_everything(cfg.seed)

def read_data(path):
    """Load a headerless, tab-separated ``label<TAB>text`` file.

    Rows missing either field are dropped (the original row index is kept)
    and the ``text`` column is coerced to ``str``.
    """
    raw = pd.read_csv(path, sep='\t', header=None, names=['label','text'], engine='python')
    complete = raw.dropna(subset=['text','label'])
    return complete.assign(text=complete['text'].astype(str))

# -------------------------
# Preprocessing / vocabs
# -------------------------
def build_token_vocab(texts: List[str], min_freq=2):
    """Map whitespace tokens to integer ids.

    Ids 0 and 1 are reserved for ``<pad>`` and ``<unk>``. Every token seen at
    least ``min_freq`` times gets the next free id, in first-seen order.
    """
    freq = Counter(tok for line in texts for tok in line.split())
    word2idx = {'<pad>':0, '<unk>':1}
    for tok, n_seen in freq.items():
        if n_seen < min_freq:
            continue
        word2idx[tok] = len(word2idx)
    return word2idx

def build_char_vocab(texts: List[str], min_freq=1, max_chars=12):
    """Map characters to integer ids.

    Only the first ``max_chars`` characters of each whitespace token are
    counted. Ids 0 and 1 are reserved for ``<pad>`` and ``<unk>``; characters
    seen at least ``min_freq`` times follow in first-seen order.
    """
    freq = Counter()
    for line in texts:
        for tok in line.split():
            # Counter.update on a string counts its characters one by one.
            freq.update(tok[:max_chars])
    char2idx = {'<pad>':0, '<unk>':1}
    for ch, n_seen in freq.items():
        if n_seen < min_freq:
            continue
        char2idx[ch] = len(char2idx)
    return char2idx

def text_to_token_ids(text, word2idx, max_len):
    """Encode ``text`` as exactly ``max_len`` token ids.

    Unknown tokens map to ``<unk>``; short inputs are right-padded with
    ``<pad>`` and long inputs are truncated.
    """
    ids = [word2idx.get(tok, word2idx['<unk>']) for tok in text.split()[:max_len]]
    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids.extend([word2idx['<pad>']] * shortfall)
    return ids

def text_to_char_ids(text, char2idx, max_len_tokens, max_chars_per_token):
    """Encode ``text`` as a ``max_len_tokens`` x ``max_chars_per_token`` grid of char ids.

    Each whitespace token is truncated / right-padded to
    ``max_chars_per_token`` ids; missing tokens become all-``<pad>`` rows.
    """
    grid = []
    # zip() stops at whichever runs out first: the token budget or the tokens.
    for _, tok in zip(range(max_len_tokens), text.split()):
        row = [char2idx.get(ch, char2idx['<unk>']) for ch in tok[:max_chars_per_token]]
        missing = max_chars_per_token - len(row)
        if missing > 0:
            row.extend([char2idx['<pad>']] * missing)
        grid.append(row)
    for _ in range(max_len_tokens - len(grid)):
        grid.append([char2idx['<pad>']] * max_chars_per_token)
    return grid  # shape (max_len_tokens, max_chars_per_token)

# Auxiliary features generator (simple, extensible)
def compute_aux_features(text):
    """Return 8 hand-crafted surface features of ``text`` as a list.

    Order: token count, character count, emoji count (code points > 10000,
    a crude heuristic), punctuation count (``?!.,;:``), has-ASCII-letter flag,
    has-Tamil-block flag (U+0B80..U+0BFF), mean token length, and
    uppercase ratio (uppercase chars / (chars + 1)).
    """
    tokens = text.split()
    n_tokens = len(tokens)
    n_chars = len(text)

    n_emoji = n_punct = n_upper = 0
    seen_english = seen_tamil = False
    # Single pass over the characters collecting every per-character statistic.
    for ch in text:
        if ord(ch) > 10000:
            n_emoji += 1
        if ch in '?!.,;:':
            n_punct += 1
        if ch.isupper():
            n_upper += 1
        if not seen_english and 'a' <= ch.lower() <= 'z':
            seen_english = True
        if not seen_tamil and '\u0B80' <= ch <= '\u0BFF':
            seen_tamil = True

    if n_tokens > 0:
        mean_token_len = sum(len(tok) for tok in tokens) / n_tokens
    else:
        mean_token_len = 0.0

    return [
        n_tokens,
        n_chars,
        n_emoji,
        n_punct,
        1.0 if seen_english else 0.0,
        1.0 if seen_tamil else 0.0,
        mean_token_len,
        n_upper / (n_chars + 1),
    ]

# -------------------------
# Dataset
# -------------------------
class CharTokenDataset(Dataset):
    """Dataset that pre-encodes every record once, at construction time.

    Each record (a dict with ``'text'`` and ``'label'``) becomes a tuple of
    token ids, per-token character ids, auxiliary features (zero-padded or
    truncated to ``aux_dim``) and the integer label. ``__getitem__`` wraps
    those in tensors.
    """

    def __init__(self, records, label_map, word2idx, char2idx, max_len_tokens, max_chars_per_token, aux_dim):
        self.records = records
        self.label_map = label_map
        self.word2idx = word2idx
        self.char2idx = char2idx
        self.max_len_tokens = max_len_tokens
        self.max_chars_per_token = max_chars_per_token
        self.aux_dim = aux_dim

        self.samples = [self._encode(rec) for rec in records]

    def _encode(self, rec):
        """Turn one raw record into ``(token_ids, char_ids, aux, label)``."""
        text = str(rec['text'])
        label_id = self.label_map[rec['label']]
        token_ids = text_to_token_ids(text, self.word2idx, self.max_len_tokens)
        char_ids = text_to_char_ids(text, self.char2idx, self.max_len_tokens, self.max_chars_per_token)
        features = compute_aux_features(text)
        # Pad with zeros, then cut, so the vector is always exactly aux_dim long.
        features = (features + [0.0] * self.aux_dim)[:self.aux_dim]
        return (token_ids, char_ids, features, label_id)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        token_ids, char_ids, aux, label = self.samples[idx]
        return {
            "token_ids": torch.tensor(token_ids, dtype=torch.long),
            "char_ids": torch.tensor(char_ids, dtype=torch.long),  # shape (T, C)
            "aux": torch.tensor(aux, dtype=torch.float32),
            "label": torch.tensor(label, dtype=torch.long),
        }

# -------------------------
# FastText training / loading
# -------------------------
def train_or_load_fasttext(sentences, path, dim=300, min_count=2, epochs=6):
    """Return a gensim FastText model, reusing the one cached at ``path`` when possible.

    A cached model is only reused if its vector size equals ``dim``. A model
    trained with a different size would otherwise be loaded silently and fail
    later, when its vectors are copied into a ``dim``-wide embedding matrix.
    On mismatch the model is retrained and the cache overwritten.

    Args:
        sentences: iterable of raw strings; each is split on whitespace.
        path: file the model is loaded from / saved to.
        dim: embedding size (``vector_size``).
        min_count: minimum token frequency for the FastText vocabulary.
        epochs: number of training epochs.

    Returns:
        The loaded or freshly trained ``FastText`` model.
    """
    if os.path.exists(path):
        print("Loading FastText from", path)
        cached = FastText.load(path)
        if cached.wv.vector_size == dim:
            return cached
        print(f"Cached FastText has vector size {cached.wv.vector_size}, expected {dim}; retraining.")
    print("Training FastText...")
    tokenized = [s.split() for s in sentences]
    # os.cpu_count() may return None when the count cannot be determined.
    n_workers = os.cpu_count() or 1
    ft = FastText(vector_size=dim, window=5, min_count=min_count, workers=n_workers, epochs=epochs)
    ft.build_vocab(tokenized)
    ft.train(tokenized, total_examples=len(tokenized), epochs=epochs)
    ft.save(path)
    print("Saved FastText at", path)
    return ft

def build_embedding_matrix(word2idx, ft_model, dim):
    """Build a ``(len(word2idx), dim)`` float32 matrix of initial word embeddings.

    Rows start as small Gaussian noise and are overwritten with the FastText
    vector of each real token. Two details differ from a naive lookup:

    * ``word in ft_model.wv`` is always True for a FastText model that uses
      character n-grams, so it cannot be used to count hits. Genuine
      vocabulary hits are counted through ``key_to_index``; the remaining
      tokens still receive their subword-composed vector.
    * The special tokens are never looked up: ``<pad>`` is forced to an
      all-zero row (so padding contributes nothing even though the embedding
      layer is built from this matrix), and ``<unk>`` keeps its random init
      instead of a vector composed from the literal string "<unk>".

    Args:
        word2idx: token -> row index, including ``<pad>`` / ``<unk>``.
        ft_model: object exposing ``.wv`` (gensim FastText keyed vectors).
        dim: embedding width; must match the FastText vector size.

    Returns:
        np.ndarray of shape ``(len(word2idx), dim)`` and dtype float32.
    """
    V = len(word2idx)
    mat = np.random.normal(scale=0.01, size=(V, dim)).astype(np.float32)
    wv = ft_model.wv
    # Absent on non-gensim stand-ins; then every supplied vector counts as a hit.
    known = getattr(wv, "key_to_index", None)
    in_vocab = 0
    from_subwords = 0
    for w, i in word2idx.items():
        if w in ('<pad>', '<unk>'):
            continue
        if w not in wv:
            continue
        mat[i] = wv[w]
        if known is None or w in known:
            in_vocab += 1
        else:
            from_subwords += 1
    if '<pad>' in word2idx:
        mat[word2idx['<pad>']] = 0.0
    print(f"Found {in_vocab}/{V} tokens in FastText vocabulary ({from_subwords} more composed from subword n-grams).")
    return mat

# -------------------------
# Model components (robust CharCNN & BiLSTM)
# -------------------------
class CharCNN(nn.Module):
    """Character-level CNN producing one feature vector per token.

    Characters are embedded, convolved with one filter bank per kernel size
    (each spanning the full embedding width), ReLU-activated and max-pooled
    over character positions. ``out_dim`` channels are split as evenly as
    possible across the kernel sizes, earlier kernels taking the remainder.
    """

    def __init__(self, char_vocab_size, char_emb_dim=50, out_dim=100, kernel_sizes=(3,4,5), dropout=0.1, max_chars=12):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        n_kernels = len(kernel_sizes)
        share, remainder = divmod(out_dim, n_kernels)
        widths = [share + 1 if pos < remainder else share for pos in range(n_kernels)]
        banks = []
        for ksize, width in zip(kernel_sizes, widths):
            banks.append(nn.Conv2d(in_channels=1, out_channels=width, kernel_size=(ksize, char_emb_dim)))
        self.convs = nn.ModuleList(banks)
        self.out_dim_actual = sum(widths)
        self.dropout = nn.Dropout(dropout)
        self.max_chars = max_chars

    def forward(self, x_char):
        """Map char ids ``(B, T, C)`` to token features ``(B, T, out_dim_actual)``."""
        n_batch, n_tok, n_chr = x_char.size()
        embedded = self.char_emb(x_char)                                   # (B, T, C, E)
        # Fold batch and token axes together and add the single input channel.
        planes = embedded.reshape(n_batch * n_tok, 1, n_chr, embedded.size(-1))
        pooled = []
        for bank in self.convs:
            fmap = F.relu(bank(planes).squeeze(-1))                        # (B*T, out_ch, L)
            pooled.append(F.max_pool1d(fmap, fmap.size(2)).squeeze(2))     # (B*T, out_ch)
        token_feats = torch.cat(pooled, dim=1).view(n_batch, n_tok, -1)    # (B, T, out_dim_actual)
        return self.dropout(token_feats)

class BiLSTMCharFastText(nn.Module):
    """Word + character BiLSTM classifier with additive attention and auxiliary features.

    Each token is represented by its pretrained word embedding concatenated
    with a CharCNN vector. A bidirectional LSTM encodes the sequence,
    attention (with padding positions masked out) pools it to one vector, and
    that vector is concatenated with a projection of the auxiliary features
    before the MLP classifier.
    """

    def __init__(self, emb_matrix, char_vocab_size, cfg, num_labels):
        super().__init__()
        pretrained = torch.tensor(emb_matrix)
        _, word_dim = pretrained.shape
        self.embedding = nn.Embedding.from_pretrained(
            pretrained,
            freeze=not cfg.embedding_trainable,
            padding_idx=0,
        )
        self.char_cnn = CharCNN(
            char_vocab_size,
            char_emb_dim=cfg.char_emb_dim,
            out_dim=cfg.char_out,
            dropout=cfg.dropout,
            max_chars=cfg.max_chars_per_token,
        )
        # Inter-layer LSTM dropout only makes sense with more than one layer.
        lstm_dropout = cfg.dropout if cfg.lstm_layers>1 else 0
        self.bilstm = nn.LSTM(
            word_dim + self.char_cnn.out_dim_actual,
            cfg.hidden_dim//2,
            num_layers=cfg.lstm_layers,
            bidirectional=True,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.attn_proj = nn.Linear(cfg.hidden_dim, cfg.attn_dim)
        self.attn_v = nn.Linear(cfg.attn_dim, 1, bias=False)
        self.aux_proj = nn.Linear(cfg.aux_dim, 32)
        self.classifier = nn.Sequential(
            nn.Linear(cfg.hidden_dim + 32, 256),
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, token_ids, char_ids, aux):
        """Return logits ``(B, num_labels)`` for token ids ``(B, T)``, char ids ``(B, T, C)`` and aux ``(B, aux_dim)``."""
        word_vecs = self.embedding(token_ids)                      # (B, T, E)
        char_vecs = self.char_cnn(char_ids)                        # (B, T, char_out_actual)
        states, _ = self.bilstm(torch.cat([word_vecs, char_vecs], dim=-1))  # (B, T, H)

        scores = self.attn_v(torch.tanh(self.attn_proj(states))).squeeze(-1)  # (B, T)
        # Token id 0 is padding: push its score to -1e9 so softmax ignores it.
        scores = scores.masked_fill(token_ids == 0, -1e9)
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)       # (B, T, 1)
        pooled = (states * weights).sum(dim=1)                     # (B, H)

        aux_vec = torch.relu(self.aux_proj(aux))                   # (B, 32)
        return self.classifier(torch.cat([pooled, aux_vec], dim=1))  # (B, num_labels)

# -------------------------
# Loss: Focal
# -------------------------
# Device-agnostic FocalLoss: alpha is created on the CPU and moved to the targets' device on first use.

class FocalLoss(nn.Module):
    """Multi-class focal loss: cross-entropy scaled by ``(1 - p_true) ** gamma``.

    ``alpha`` (optional per-class weights) is kept as a CPU tensor until the
    first forward pass, where it is moved to the device of ``targets``.
    """

    def __init__(self, gamma=2.0, alpha=None):
        super().__init__()
        self.gamma = gamma
        # Created on the CPU; forward() relocates it lazily.
        self.alpha = None if alpha is None else torch.tensor(alpha, dtype=torch.float32)

    def forward(self, logits, targets):
        """
        logits: (B, C)
        targets: (B,) long, on some device (cpu/cuda)
        Returns the mean focal loss over the batch.
        """
        per_sample_ce = F.cross_entropy(logits, targets, reduction='none')  # (B,)
        prob_true = torch.exp(-per_sample_ce)
        focal = torch.pow(1 - prob_true, self.gamma) * per_sample_ce

        if self.alpha is None:
            return focal.mean()

        # Indexing requires alpha and targets to live on the same device.
        if self.alpha.device != targets.device:
            self.alpha = self.alpha.to(targets.device)
        return (self.alpha[targets] * focal).mean()


# -------------------------
# Evaluate (returns y_true, y_pred)
# -------------------------
def evaluate(model, dataloader, device, id2label):
    """Run the model over ``dataloader`` in eval mode and score its predictions.

    The full set of class ids from ``id2label`` is passed explicitly to
    scikit-learn. Without it, ``classification_report`` raises ``ValueError``
    whenever a class is absent from both the ground truth and the predictions
    (the inferred label count no longer matches ``target_names``), and the
    confusion matrix silently changes shape.

    Args:
        model: callable taking ``(token_ids, char_ids, aux)`` and returning logits.
        dataloader: yields dicts with ``'token_ids'``, ``'char_ids'``, ``'aux'``
            and ``'label'`` tensors.
        device: device the inputs are moved to.
        id2label: class id -> class name.

    Returns:
        ``(report, cm, y_true, y_pred)``: the classification report as a dict,
        the confusion matrix (one row/column per id in ``id2label``), and the
        flat lists of true and predicted class ids.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            tokens = batch['token_ids'].to(device)
            chars  = batch['char_ids'].to(device)
            aux    = batch['aux'].to(device)
            labels = batch['label'].cpu().numpy().tolist()
            logits = model(tokens, chars, aux)
            preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()
            y_true.extend(labels)
            y_pred.extend(preds)
    # Fix the label set so rows of the report / matrix always line up with the names.
    class_ids = sorted(id2label)
    report = classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=[id2label[i] for i in class_ids],
        digits=4,
        output_dict=True,
        zero_division=0,
    )
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    return report, cm, y_true, y_pred

# -------------------------
# Training loop
# -------------------------
def train(train_loader, val_loader, model, cfg, class_weights, id2label):
    """Train ``model`` and checkpoint it whenever validation macro-F1 improves.

    The word-embedding parameters are optimised with ``cfg.lr_emb`` and all
    other parameters with ``cfg.lr_head``. ``class_weights`` holds per-class
    counts; with ``cfg.use_focal`` their normalised inverse becomes the
    focal-loss ``alpha``. Gradients are clipped to norm 1.0.

    Returns:
        Path of the best checkpoint written, or ``None`` if none was written.
    """
    device = cfg.device
    model.to(device)

    embedding_group = {"params": list(model.embedding.parameters()), "lr": cfg.lr_emb}
    head_group = {
        "params": [p for name, p in model.named_parameters() if not name.startswith('embedding.')],
        "lr": cfg.lr_head,
    }
    optimizer = torch.optim.AdamW([embedding_group, head_group], weight_decay=cfg.weight_decay)

    if not cfg.use_focal:
        criterion = nn.CrossEntropyLoss()
    else:
        # Inverse frequency, normalised to sum to 1.
        inv_freq = (1.0 / (class_weights + 1e-9))
        criterion = FocalLoss(cfg.focal_gamma, alpha=inv_freq / inv_freq.sum())

    best_ckpt, best_macro = None, -1.0
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}")
        for batch in pbar:
            tokens = batch['token_ids'].to(device)
            chars = batch['char_ids'].to(device)
            aux = batch['aux'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            loss = criterion(model(tokens, chars, aux), labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            pbar.set_postfix(loss=loss.item())

        report = evaluate(model, val_loader, cfg.device, id2label)[0]
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 <= best_macro:
            continue
        best_macro = macro_f1
        # The score is baked into the filename, so each improvement writes a new file.
        best_ckpt = os.path.join(cfg.output_dir, f"best_macro_{macro_f1:.4f}.pt")
        torch.save({"model_state_dict": model.state_dict(), "cfg": cfg.__dict__}, best_ckpt)
        print("Saved", best_ckpt)
    return best_ckpt

# -------------------------
# Pipeline orchestration
# -------------------------
def main():
    """End-to-end pipeline: load, split, build vocabularies and embeddings, train, evaluate, export predictions.

    Reads all settings from the module-level ``cfg``. Writes checkpoints and
    ``test_preds.csv`` under ``cfg.output_dir`` and prints progress; returns nothing.
    """
    # load
    df = read_data(cfg.data_csv)
    # filter tiny classes (fewer than cfg.min_class_samples rows)
    cnt = df['label'].value_counts()
    keep = cnt[cnt >= cfg.min_class_samples].index.tolist()
    if len(keep) < len(cnt):
        df = df[df['label'].isin(keep)].reset_index(drop=True)
    print("Records:", len(df), "labels:", df['label'].nunique())

    # label mapping: class ids follow sorted label order
    labels_unique = sorted(df['label'].unique())
    label_map = {lab:i for i,lab in enumerate(labels_unique)}
    id2label = {v:k for k,v in label_map.items()}

    # splits: stratified, 15% test first, then 15% of the remainder as validation
    data = df.to_dict(orient='records')
    lablist = [label_map[r['label']] for r in data]
    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=lablist)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.15, random_state=cfg.seed, stratify=[lablist[i] for i in train_idx])
    train_records = [data[i] for i in train_idx]
    val_records   = [data[i] for i in val_idx]
    test_records  = [data[i] for i in test_idx]
    print("split sizes:", len(train_records), len(val_records), len(test_records))

    # vocabs: built from the training split only
    word2idx = build_token_vocab([r['text'] for r in train_records], cfg.min_token_freq)
    char2idx = build_char_vocab([r['text'] for r in train_records], cfg.min_char_freq, cfg.max_chars_per_token)
    print("Vocab sizes: tokens", len(word2idx), "chars", len(char2idx))

    # fasttext
    # NOTE(review): the embeddings are fit on ALL records (train + val + test), not just the training split.
    ft_path = os.path.join(cfg.output_dir, "fasttext.model")
    ft = train_or_load_fasttext([r['text'] for r in data], ft_path, dim=cfg.ft_dim, min_count=cfg.ft_min_count, epochs=cfg.ft_epochs)
    emb_matrix = build_embedding_matrix(word2idx, ft, cfg.ft_dim)

    # datasets
    max_len = 64  # token length cap - tune as needed
    train_ds = CharTokenDataset(train_records, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)
    val_ds   = CharTokenDataset(val_records, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)
    test_ds  = CharTokenDataset(test_records, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)

    if cfg.use_sampler:
        # Draw each sample with probability inversely proportional to its class size.
        labels = [s[3] for s in train_ds.samples]
        cnts = Counter(labels)
        sample_weights = [1.0 / cnts[l] for l in labels]
        sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, sampler=sampler, num_workers=2)
    else:
        train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)

    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=2)

    # class weight vector (counts)
    # Despite the name these are raw per-class training counts; train() inverts them.
    train_labels = [s[3] for s in train_ds.samples]
    cnts = np.array([Counter(train_labels).get(i,0) for i in range(len(label_map))])
    class_weights = cnts.astype(np.float32)

    # model
    model = BiLSTMCharFastText(emb_matrix, char_vocab_size=len(char2idx), cfg=cfg, num_labels=len(label_map))
    # quick shape sanity check: an all-padding dummy batch, run before the model is moved to cfg.device
    print("embedding dim:", emb_matrix.shape[1])
    print("char_cnn out dim actual:", model.char_cnn.out_dim_actual)
    sample_token_ids = torch.zeros((2, 8), dtype=torch.long)
    sample_char_ids = torch.zeros((2, 8, cfg.max_chars_per_token), dtype=torch.long)
    sample_aux = torch.zeros((2, cfg.aux_dim), dtype=torch.float)
    with torch.no_grad():
        logits_shape = model(sample_token_ids, sample_char_ids, sample_aux).shape
    print("logits shape (sanity):", logits_shape)
    print("Model trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

    # train
    best_ckpt = train(train_loader, val_loader, model, cfg, class_weights, id2label)

    # test eval: restore the best checkpoint if there is one, otherwise score the model as it stands
    if best_ckpt and os.path.exists(best_ckpt):
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])
    report, cm, y_true, y_pred = evaluate(model, test_loader, cfg.device, id2label)
    print("\nFinal Test Report:")
    # Recomputed as text here because evaluate() returns the report as a dict.
    print(classification_report(y_true, y_pred, target_names=list(id2label.values()), digits=4))
    print("\nConfusion Matrix:\n", cm)

    # save predictions csv for inspection
    # test_loader is not shuffled, so predictions line up with test_records by position.
    rows = []
    for rec, yt, yp in zip(test_records, y_true, y_pred):
        rows.append({"text": rec['text'], "label": id2label[yt], "pred": id2label[yp]})
    pd.DataFrame(rows).to_csv(os.path.join(cfg.output_dir, "test_preds.csv"), index=False)
    print("Saved test_preds.csv")
    print("Done.")

# Guarded so that importing this code as a module does not start a training run.
if __name__ == "__main__":
    main()


Records: 44019 labels: 5
split sizes: 31803 5613 6603
Vocab sizes: tokens 21997 chars 657
Loading FastText from outputs_char_bilstm/fasttext.model
Found 21997/21997 tokens in FastText.
embedding dim: 300
char_cnn out dim actual: 100
logits shape (sanity): torch.Size([2, 5])
Model trainable params: 7303301


Train E1/6: 100%|██████████| 497/497 [07:03<00:00,  1.17it/s, loss=0.0879]


Epoch 1 -> Val Macro F1: 0.3860
Saved outputs_char_bilstm/best_macro_0.3860.pt


Train E2/6: 100%|██████████| 497/497 [07:08<00:00,  1.16it/s, loss=0.0687]


Epoch 2 -> Val Macro F1: 0.3646


Train E3/6: 100%|██████████| 497/497 [07:08<00:00,  1.16it/s, loss=0.0893]


Epoch 3 -> Val Macro F1: 0.4286
Saved outputs_char_bilstm/best_macro_0.4286.pt


Train E4/6: 100%|██████████| 497/497 [07:08<00:00,  1.16it/s, loss=0.0952]


Epoch 4 -> Val Macro F1: 0.4014


Train E5/6: 100%|██████████| 497/497 [07:07<00:00,  1.16it/s, loss=0.0822]


Epoch 5 -> Val Macro F1: 0.4619
Saved outputs_char_bilstm/best_macro_0.4619.pt


Train E6/6: 100%|██████████| 497/497 [07:08<00:00,  1.16it/s, loss=0.0595]


Epoch 6 -> Val Macro F1: 0.4359





Final Test Report:
                precision    recall  f1-score   support

Mixed_feelings     0.2225    0.2842    0.2496       739
      Negative     0.3159    0.5536    0.4022       784
      Positive     0.8604    0.5090    0.6396      3731
     not-Tamil     0.4488    0.6997    0.5468       313
 unknown_state     0.3535    0.5425    0.4280      1036

      accuracy                         0.5034      6603
     macro avg     0.4402    0.5178    0.4532      6603
  weighted avg     0.6253    0.5034    0.5302      6603


Confusion Matrix:
 [[ 210  251  131   19  128]
 [ 110  434   73   23  144]
 [ 481  479 1899  169  703]
 [   7   13   21  219   53]
 [ 136  197   83   58  562]]
Saved test_preds.csv
Done.


In [None]:
# Full corrected script: CharCNN + FastText + BiLSTM + Attention + Aux features + TF-IDF
# Requirements:
# pip install torch scikit-learn gensim pandas tqdm

import os, time, random, json
from collections import Counter
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer ### NEW ###
from gensim.models import FastText
from tqdm import tqdm

# -------------------------
# Config
# -------------------------
@dataclass
class CFG:
    """All tunable settings for the CharCNN + FastText + BiLSTM + TF-IDF variant of the pipeline."""
    data_csv: str = "tamil_sentiment_full.csv"  # tab-separated file: label \t text
    output_dir: str = "outputs_char_bilstm_tfidf"  # created right after the config is instantiated
    device: str = "cuda" if torch.cuda.is_available() else "cpu"  # evaluated once, when the class is defined
    min_class_samples: int = 3

    # vocab / chars
    min_token_freq: int = 2
    max_chars_per_token: int = 12
    min_char_freq: int = 1

    # embedding / ft
    ft_dim: int = 300
    ft_min_count: int = 2
    ft_epochs: int = 8
    embedding_trainable: bool = True

    # model
    hidden_dim: int = 256
    lstm_layers: int = 1
    char_emb_dim: int = 50
    char_out: int = 100
    attn_dim: int = 128
    aux_dim: int = 8      # number of auxiliary features
    tfidf_dim: int = 5000 ### NEW ###: Max features for TF-IDF
    tfidf_proj_dim: int = 64 ### NEW ###: Dimension to project TF-IDF features to
    dropout: float = 0.3

    # training
    epochs: int = 6
    batch_size: int = 64
    lr_emb: float = 5e-5   # presumably the embedding learning rate, lr_head the rest; confirm against the training loop
    lr_head: float = 1e-3
    weight_decay: float = 1e-5
    use_sampler: bool = False
    use_focal: bool = True
    focal_gamma: float = 2.0
    seed: int = 42

# Instantiate the config with its defaults and make sure the output directory exists.
cfg = CFG()
os.makedirs(cfg.output_dir, exist_ok=True)

# -------------------------
# Utilities
# -------------------------
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs (plus every CUDA device when available)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Seed all RNGs once, at definition time, using the configured seed.
seed_everything(cfg.seed)

def read_data(path):
    """Load a headerless, tab-separated ``label<TAB>text`` file.

    Rows missing either field are dropped (the original row index is kept)
    and the ``text`` column is coerced to ``str``.
    """
    raw = pd.read_csv(path, sep='\t', header=None, names=['label','text'], engine='python')
    complete = raw.dropna(subset=['text','label'])
    return complete.assign(text=complete['text'].astype(str))

# -------------------------
# Preprocessing / vocabs
# -------------------------
def build_token_vocab(texts: List[str], min_freq=2):
    """Map whitespace tokens to integer ids.

    Ids 0 and 1 are reserved for ``<pad>`` and ``<unk>``. Every token seen at
    least ``min_freq`` times gets the next free id, in first-seen order.
    """
    freq = Counter(tok for line in texts for tok in line.split())
    word2idx = {'<pad>':0, '<unk>':1}
    for tok, n_seen in freq.items():
        if n_seen < min_freq:
            continue
        word2idx[tok] = len(word2idx)
    return word2idx

def build_char_vocab(texts: List[str], min_freq=1, max_chars=12):
    """Map characters to integer ids.

    Only the first ``max_chars`` characters of each whitespace token are
    counted. Ids 0 and 1 are reserved for ``<pad>`` and ``<unk>``; characters
    seen at least ``min_freq`` times follow in first-seen order.
    """
    freq = Counter()
    for line in texts:
        for tok in line.split():
            # Counter.update on a string counts its characters one by one.
            freq.update(tok[:max_chars])
    char2idx = {'<pad>':0, '<unk>':1}
    for ch, n_seen in freq.items():
        if n_seen < min_freq:
            continue
        char2idx[ch] = len(char2idx)
    return char2idx

def text_to_token_ids(text, word2idx, max_len):
    """Encode `text` as exactly `max_len` token ids.

    Tokens come from `text.split()`. Unknown tokens map to the '<unk>' id,
    short sequences are right-padded with the '<pad>' id and long ones are
    truncated.
    """
    encoded = [word2idx.get(tok, word2idx['<unk>']) for tok in text.split()]
    shortfall = max_len - len(encoded)
    if shortfall > 0:
        return encoded + [word2idx['<pad>']] * shortfall
    return encoded[:max_len]

def text_to_char_ids(text, char2idx, max_len_tokens, max_chars_per_token):
    """Encode `text` as a `max_len_tokens` x `max_chars_per_token` grid of char ids.

    Row i describes the i-th whitespace token: its characters (truncated to
    `max_chars_per_token`) mapped through `char2idx`, with '<unk>' for unseen
    characters and right-padded with '<pad>'. Rows past the last token are all
    '<pad>'.
    """
    tokens = text.split()
    grid = []
    for position in range(max_len_tokens):
        if position >= len(tokens):
            # No token at this position: emit a fully padded row.
            grid.append([char2idx['<pad>']] * max_chars_per_token)
            continue
        clipped = tokens[position][:max_chars_per_token]
        row = [char2idx.get(ch, char2idx['<unk>']) for ch in clipped]
        missing = max_chars_per_token - len(row)
        if missing > 0:
            row.extend([char2idx['<pad>']] * missing)
        grid.append(row)
    return grid  # shape (max_len_tokens, max_chars_per_token)

# Auxiliary features generator (simple, extensible)
def compute_aux_features(text):
    """Compute eight hand-crafted surface features for `text`.

    Returns a list, in this order:
        token count, character count, count of characters with code point
        above 10000 (a crude emoji heuristic), count of '?!.,;:' characters,
        1.0/0.0 flag for any a-z letter (case-insensitive), 1.0/0.0 flag for
        any character in U+0B80..U+0BFF (Tamil block), mean token length,
        and uppercase-character ratio (denominator is character count + 1).
    """
    tokens = text.split()
    num_tokens = len(tokens)
    num_chars = len(text)

    emoji_count = 0
    punct_count = 0
    upper_count = 0
    saw_english = False
    saw_tamil = False
    # Single pass over the characters gathers every per-character statistic.
    for ch in text:
        if ord(ch) > 10000:
            emoji_count += 1
        if ch in '?!.,;:':
            punct_count += 1
        if ch.isupper():
            upper_count += 1
        if 'a' <= ch.lower() <= 'z':
            saw_english = True
        if '\u0B80' <= ch <= '\u0BFF':
            saw_tamil = True

    if num_tokens > 0:
        avg_token_len = sum(len(tok) for tok in tokens) / num_tokens
    else:
        avg_token_len = 0.0

    return [
        num_tokens,
        num_chars,
        emoji_count,
        punct_count,
        1.0 if saw_english else 0.0,
        1.0 if saw_tamil else 0.0,
        avg_token_len,
        upper_count / (num_chars + 1),
    ]

# -------------------------
# Dataset
# -------------------------
class CharTokenDataset(Dataset): ### MODIFIED to include TF-IDF ###
    """Dataset yielding token ids, char ids, auxiliary features, TF-IDF vector and label.

    Token/char ids and auxiliary features are pre-computed once in `__init__`.
    The TF-IDF matrix is kept as passed in (sparse) and only the requested row is
    densified in `__getitem__`. Densifying every row up front would hold
    `len(records) * tfidf_dim` values in memory.

    `samples[k]` is the tuple `(token_ids, char_ids, aux, tfidf_row, label)`,
    where `tfidf_row` is the row index into `tfidf_matrix`. The label stays at
    position 4.

    Args:
        records: sequence of dicts with the keys 'text' and 'label'.
        tfidf_matrix: matrix with one row per record, in the same order
            (scipy sparse matrix or a 2-D array).
        label_map: mapping from raw label to integer class id.
        word2idx / char2idx: token and character vocabularies.
        max_len_tokens: number of token positions per sample.
        max_chars_per_token: number of character positions per token.
        aux_dim: length the auxiliary feature list is padded/truncated to.

    Raises:
        ValueError: if `tfidf_matrix` does not have exactly one row per record.
    """
    def __init__(self, records, tfidf_matrix, label_map, word2idx, char2idx, max_len_tokens, max_chars_per_token, aux_dim):
        if tfidf_matrix.shape[0] != len(records):
            raise ValueError(
                f"tfidf_matrix has {tfidf_matrix.shape[0]} rows but there are {len(records)} records"
            )
        self.records = records
        # CSR makes the per-item row slicing in __getitem__ cheap.
        self.tfidf_matrix = tfidf_matrix.tocsr() if hasattr(tfidf_matrix, "tocsr") else tfidf_matrix
        self.label_map = label_map
        self.word2idx = word2idx
        self.char2idx = char2idx
        self.max_len_tokens = max_len_tokens
        self.max_chars_per_token = max_chars_per_token
        self.aux_dim = aux_dim

        self.samples = []
        for i, r in enumerate(records):
            text = str(r['text'])
            label = label_map[r['label']]
            token_ids = text_to_token_ids(text, word2idx, max_len_tokens)
            char_ids = text_to_char_ids(text, char2idx, max_len_tokens, max_chars_per_token)
            aux = compute_aux_features(text)
            # Pad with zeros / truncate so every sample has exactly aux_dim features.
            aux = (aux + [0.0]*aux_dim)[:aux_dim]
            # Store only the row index; the dense vector is built on demand.
            self.samples.append((token_ids, char_ids, aux, i, label))

    def __len__(self):
        return len(self.samples)

    def _dense_tfidf(self, row):
        """Return row `row` of the TF-IDF matrix as a 1-D float32 array."""
        vec = self.tfidf_matrix[row]
        if hasattr(vec, "toarray"):
            vec = vec.toarray()
        # reshape(-1) keeps the result 1-D even when the matrix has one column.
        return np.asarray(vec, dtype=np.float32).reshape(-1)

    def __getitem__(self, idx):
        token_ids, char_ids, aux, tfidf_row, label = self.samples[idx]
        token_ids = torch.tensor(token_ids, dtype=torch.long)
        char_ids = torch.tensor(char_ids, dtype=torch.long)
        aux = torch.tensor(aux, dtype=torch.float32)
        tfidf = torch.tensor(self._dense_tfidf(tfidf_row), dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)
        return {"token_ids": token_ids, "char_ids": char_ids, "aux": aux, "tfidf": tfidf, "label": label}

# -------------------------
# FastText training / loading
# -------------------------
def train_or_load_fasttext(sentences, path, dim=300, min_count=2, epochs=6):
    """Return a FastText model, loading it from `path` when that file exists.

    Otherwise a new model is trained on the whitespace-tokenised `sentences`
    (window 5, `dim`-dimensional vectors, one worker per CPU reported by
    `os.cpu_count()`), saved to `path` and returned. A cached model is returned
    as-is; its settings are not compared against `dim`/`min_count`/`epochs`.
    """
    if not os.path.exists(path):
        print("Training FastText...")
        corpus = [sentence.split() for sentence in sentences]
        model = FastText(
            vector_size=dim,
            window=5,
            min_count=min_count,
            workers=os.cpu_count(),
            epochs=epochs,
        )
        model.build_vocab(corpus)
        model.train(corpus, total_examples=len(corpus), epochs=epochs)
        model.save(path)
        print("Saved FastText at", path)
        return model

    print("Loading FastText from", path)
    return FastText.load(path)

def build_embedding_matrix(word2idx, ft_model, dim):
    """Create the (vocab_size, dim) float32 matrix used to initialise the embedding layer.

    Rows start as small Gaussian noise (std 0.01) and are overwritten with the
    FastText vector of every token that `ft_model.wv` reports as present. The
    '<pad>' row, when the vocabulary has one, is left at exactly zero.

    Args:
        word2idx: token -> row index mapping.
        ft_model: object exposing `.wv` with `in` and `[]` lookups
            (a gensim FastText model in this notebook).
        dim: embedding dimensionality; must match the FastText vector size.

    Returns:
        np.ndarray of shape (len(word2idx), dim), dtype float32.
    """
    V = len(word2idx)
    mat = np.random.normal(scale=0.01, size=(V, dim)).astype(np.float32)
    found = 0
    for w, i in word2idx.items():
        # FastText can synthesise a vector for any string from character
        # n-grams (the logged run reports every vocabulary entry as found), so
        # the padding symbol would otherwise get a non-zero embedding that the
        # BiLSTM then reads at every padded position.
        if w == '<pad>':
            continue
        if w in ft_model.wv:
            mat[i] = ft_model.wv[w]
            found += 1
    pad_idx = word2idx.get('<pad>')
    if pad_idx is not None:
        mat[pad_idx] = 0.0
    print(f"Found {found}/{V} tokens in FastText.")
    return mat

# -------------------------
# Model components (robust CharCNN & BiLSTM)
# -------------------------
class CharCNN(nn.Module):
    """Character-level CNN producing one feature vector per token.

    Each token's characters are embedded, convolved with one filter bank per
    entry in `kernel_sizes`, ReLU-activated and max-pooled over the character
    axis. `out_dim` channels are divided as evenly as possible across the filter
    banks; the concatenated width is exposed as `out_dim_actual`.
    """
    def __init__(self, char_vocab_size, char_emb_dim=50, out_dim=100, kernel_sizes=(3,4,5), dropout=0.1, max_chars=12):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        n_banks = len(kernel_sizes)
        share, remainder = divmod(out_dim, n_banks)
        # The first `remainder` banks each take one extra channel so widths sum to out_dim.
        widths = [share + 1 if pos < remainder else share for pos in range(n_banks)]
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=width, kernel_size=(height, char_emb_dim))
            for height, width in zip(kernel_sizes, widths)
        ])
        self.out_dim_actual = sum(widths)
        self.dropout = nn.Dropout(dropout)
        self.max_chars = max_chars

    def forward(self, x_char):
        """Map char ids of shape (B, T, C) to token features of shape (B, T, out_dim_actual)."""
        n_batch, n_tokens, n_chars = x_char.size()
        embedded = self.char_emb(x_char)                                          # (B, T, C, E)
        planes = embedded.view(n_batch * n_tokens, n_chars, -1).unsqueeze(1)      # (B*T, 1, C, E)
        pooled = []
        for conv in self.convs:
            fmap = F.relu(conv(planes).squeeze(-1))                               # (B*T, out_ch, L)
            pooled.append(F.max_pool1d(fmap, fmap.size(2)).squeeze(2))            # (B*T, out_ch)
        features = torch.cat(pooled, dim=1).view(n_batch, n_tokens, -1)           # (B, T, out_dim_actual)
        return self.dropout(features)

class BiLSTMCharFastText(nn.Module): ### MODIFIED to include TF-IDF ###
    """BiLSTM sentence classifier over word + char-CNN features, with attention pooling.

    The attention-pooled BiLSTM state is concatenated with a 32-d projection of
    the auxiliary features and a `cfg.tfidf_proj_dim`-d projection of the TF-IDF
    vector, then passed through a two-layer classifier head.
    """
    def __init__(self, emb_matrix, char_vocab_size, cfg, num_labels):
        super().__init__()
        pretrained = torch.tensor(emb_matrix)
        _, emb_dim = pretrained.shape
        self.embedding = nn.Embedding.from_pretrained(
            pretrained,
            freeze=not cfg.embedding_trainable,
            padding_idx=0,
        )
        self.char_cnn = CharCNN(
            char_vocab_size,
            char_emb_dim=cfg.char_emb_dim,
            out_dim=cfg.char_out,
            dropout=cfg.dropout,
            max_chars=cfg.max_chars_per_token,
        )
        lstm_input_dim = emb_dim + self.char_cnn.out_dim_actual
        # Each direction gets half of hidden_dim so the concatenated output is hidden_dim wide.
        self.bilstm = nn.LSTM(
            lstm_input_dim,
            cfg.hidden_dim//2,
            num_layers=cfg.lstm_layers,
            bidirectional=True,
            batch_first=True,
            dropout=cfg.dropout if cfg.lstm_layers>1 else 0,
        )
        self.attn_proj = nn.Linear(cfg.hidden_dim, cfg.attn_dim)
        self.attn_v = nn.Linear(cfg.attn_dim, 1, bias=False)
        self.aux_proj = nn.Linear(cfg.aux_dim, 32)
        self.tfidf_proj = nn.Linear(cfg.tfidf_dim, cfg.tfidf_proj_dim) ### NEW ###
        fused_dim = cfg.hidden_dim + 32 + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(
            nn.Linear(fused_dim, 256), ### MODIFIED ###
            nn.ReLU(),
            nn.Dropout(cfg.dropout),
            nn.Linear(256, num_labels)
        )

    def forward(self, token_ids, char_ids, aux, tfidf): ### MODIFIED ###
        """Return class logits of shape (B, num_labels)."""
        word_vecs = self.embedding(token_ids)                          # (B, T, E)
        char_vecs = self.char_cnn(char_ids)                            # (B, T, char_out_actual)
        states, _ = self.bilstm(torch.cat([word_vecs, char_vecs], dim=-1))  # (B, T, H)

        # Additive attention; positions holding the pad id (0) are masked out.
        energies = self.attn_v(torch.tanh(self.attn_proj(states))).squeeze(-1)  # (B, T)
        is_pad = token_ids == 0
        energies = energies.masked_fill(is_pad, -1e9)
        weights = torch.softmax(energies, dim=1).unsqueeze(-1)         # (B, T, 1)
        sentence_vec = (states * weights).sum(dim=1)                   # (B, H)

        aux_vec = torch.relu(self.aux_proj(aux))                       # (B, 32)
        tfidf_vec = torch.relu(self.tfidf_proj(tfidf))                 # (B, tfidf_proj_dim)
        fused = torch.cat([sentence_vec, aux_vec, tfidf_vec], dim=1)   # (B, H + 32 + tfidf_proj_dim)
        return self.classifier(fused)                                  # (B, num_labels)

# -------------------------
# Loss: Focal
# -------------------------
class FocalLoss(nn.Module):
    """Multi-class focal loss: per-sample cross-entropy scaled by (1 - p_true) ** gamma.

    When `alpha` (one weight per class) is given, each sample's loss is also
    multiplied by the weight of its target class. The batch mean is returned.
    """
    def __init__(self, gamma=2.0, alpha=None):
        super().__init__()
        self.gamma = gamma
        self.alpha = None if alpha is None else torch.tensor(alpha, dtype=torch.float32)

    def forward(self, logits, targets):
        per_sample_ce = F.cross_entropy(logits, targets, reduction='none')
        # exp(-CE) recovers the softmax probability assigned to the true class.
        prob_true = torch.exp(-per_sample_ce)
        focal = ((1 - prob_true) ** self.gamma) * per_sample_ce
        if self.alpha is None:
            return focal.mean()
        # alpha is a plain attribute (not a buffer), so move it to the batch device lazily.
        if self.alpha.device != targets.device:
            self.alpha = self.alpha.to(targets.device)
        return (self.alpha[targets] * focal).mean()


# -------------------------
# Evaluate (returns y_true, y_pred)
# -------------------------
def evaluate(model, dataloader, device, id2label): ### MODIFIED to include TF-IDF ###
    """Run `model` over `dataloader` in eval mode and score the predictions.

    Args:
        model: module called as `model(token_ids, char_ids, aux, tfidf)` that
            returns logits of shape (batch, num_labels).
        dataloader: iterable of batches with the keys 'token_ids', 'char_ids',
            'aux', 'tfidf' and 'label'.
        device: device the batch tensors are moved to.
        id2label: mapping from integer class id to label name.

    Returns:
        (report, cm, y_true, y_pred): the classification report as a dict, the
        confusion matrix, and the flat lists of true and predicted class ids.
        The report and the confusion matrix always cover every class in
        `id2label`, in ascending id order.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            tokens = batch['token_ids'].to(device)
            chars  = batch['char_ids'].to(device)
            aux    = batch['aux'].to(device)
            tfidf  = batch['tfidf'].to(device)
            labels = batch['label'].cpu().numpy().tolist()
            logits = model(tokens, chars, aux, tfidf)
            preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()
            y_true.extend(labels); y_pred.extend(preds)
    # Pin the label set explicitly. Without `labels=`, scikit-learn derives it
    # from the ids present in y_true/y_pred, so a split that lacks a class makes
    # target_names the wrong length and shrinks the confusion matrix.
    label_ids = sorted(id2label)
    target_names = [id2label[i] for i in label_ids]
    report = classification_report(y_true, y_pred, labels=label_ids, target_names=target_names, digits=4, output_dict=True, zero_division=0)
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    return report, cm, y_true, y_pred

# -------------------------
# Training loop
# -------------------------
def train(train_loader, val_loader, model, cfg, class_weights, id2label): ### MODIFIED to include TF-IDF ###
    """Train `model` for `cfg.epochs` epochs and checkpoint the best validation macro-F1.

    Args:
        train_loader / val_loader: loaders yielding dict batches with the keys
            'token_ids', 'char_ids', 'aux', 'tfidf' and 'label'.
        model: network exposing an `embedding` sub-module; called as
            `model(token_ids, char_ids, aux, tfidf)`.
        cfg: run configuration (reads device, lr_emb, lr_head, weight_decay,
            use_focal, focal_gamma, epochs and output_dir).
        class_weights: per-class array; the caller in this file passes the
            training-set sample counts per class.
        id2label: class id -> label name, forwarded to `evaluate`.

    Returns:
        Path of the checkpoint with the highest validation macro-F1, or None if
        no checkpoint was written (e.g. `cfg.epochs` is 0).
    """
    device = cfg.device
    model.to(device)
    # Two parameter groups: the embedding table gets its own learning rate,
    # everything else uses the head learning rate.
    emb_params = list(model.embedding.parameters())
    other_params = [p for n,p in model.named_parameters() if not n.startswith('embedding.')]
    optimizer = torch.optim.AdamW([
        {"params": emb_params, "lr": cfg.lr_emb},
        {"params": other_params, "lr": cfg.lr_head}
    ], weight_decay=cfg.weight_decay)

    if cfg.use_focal:
        # Inverse of class_weights (epsilon guards against division by zero),
        # normalised to sum to 1, becomes the per-class alpha of the focal loss.
        alpha = (1.0 / (class_weights + 1e-9))
        alpha = alpha / alpha.sum()
        criterion = FocalLoss(cfg.focal_gamma, alpha=alpha)
    else:
        criterion = nn.CrossEntropyLoss()

    best_ckpt = None
    best_macro = -1.0
    for epoch in range(cfg.epochs):
        # evaluate() switches the model to eval mode, so re-enable training mode each epoch.
        model.train()
        pbar = tqdm(train_loader, desc=f"Train E{epoch+1}/{cfg.epochs}")
        for batch in pbar:
            tokens = batch['token_ids'].to(device)
            chars  = batch['char_ids'].to(device)
            aux    = batch['aux'].to(device)
            tfidf  = batch['tfidf'].to(device)
            labels = batch['label'].to(device)
            optimizer.zero_grad()
            logits = model(tokens, chars, aux, tfidf)
            loss = criterion(logits, labels)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            pbar.set_postfix(loss=loss.item())
        # Model selection is based on validation macro-F1 after every epoch.
        report, cm, _, _ = evaluate(model, val_loader, cfg.device, id2label)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro:
            best_macro = macro_f1
            # Each improvement is written to a new file named after its score;
            # earlier checkpoints are not removed here.
            best_ckpt = os.path.join(cfg.output_dir, f"best_macro_{macro_f1:.4f}.pt")
            torch.save({"model_state_dict": model.state_dict(), "cfg": cfg.__dict__}, best_ckpt)
            print("Saved", best_ckpt)
    return best_ckpt

# -------------------------
# Pipeline orchestration
# -------------------------
def main():
    """Run the full experiment: load data, build features, train, and report on the test split.

    Steps, in order: read the corpus and drop classes with fewer than
    `cfg.min_class_samples` rows; make stratified train/val/test splits; fit
    TF-IDF on the training texts; build token and char vocabularies from the
    training texts; train or load FastText and build the embedding matrix;
    build datasets and loaders; train; reload the best checkpoint, print the
    test report and write `test_preds.csv` into `cfg.output_dir`.
    """
    # load
    df = read_data(cfg.data_csv)
    # filter tiny classes
    cnt = df['label'].value_counts()
    keep = cnt[cnt >= cfg.min_class_samples].index.tolist()
    if len(keep) < len(cnt):
        df = df[df['label'].isin(keep)].reset_index(drop=True)
    print("Records:", len(df), "labels:", df['label'].nunique())

    # label mapping
    labels_unique = sorted(df['label'].unique())
    label_map = {lab:i for i,lab in enumerate(labels_unique)}
    id2label = {v:k for k,v in label_map.items()}

    # splits: 15% test, then 15% of the remainder for validation, both stratified
    data = df.to_dict(orient='records')
    lablist = [label_map[r['label']] for r in data]
    train_idx, test_idx = train_test_split(range(len(data)), test_size=0.15, random_state=cfg.seed, stratify=lablist)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.15, random_state=cfg.seed, stratify=[lablist[i] for i in train_idx])
    train_records = [data[i] for i in train_idx]
    val_records   = [data[i] for i in val_idx]
    test_records  = [data[i] for i in test_idx]
    print("split sizes:", len(train_records), len(val_records), len(test_records))

    ### NEW ###: Compute TF-IDF features (fit on the training texts only)
    print("Computing TF-IDF features...")
    tfidf_vectorizer = TfidfVectorizer(
        max_features=cfg.tfidf_dim,
        ngram_range=(1, 2),
        token_pattern=r'(?u)\b\w+\b'
    )
    train_texts = [r['text'] for r in train_records]
    train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
    val_tfidf = tfidf_vectorizer.transform([r['text'] for r in val_records])
    test_tfidf = tfidf_vectorizer.transform([r['text'] for r in test_records])
    print("TF-IDF matrix shape (train):", train_tfidf.shape)

    # vocabs
    word2idx = build_token_vocab([r['text'] for r in train_records], cfg.min_token_freq)
    char2idx = build_char_vocab([r['text'] for r in train_records], cfg.min_char_freq, cfg.max_chars_per_token)
    print("Vocab sizes: tokens", len(word2idx), "chars", len(char2idx))

    # fasttext
    # NOTE(review): FastText is fit on every record, including val/test text
    # (labels are not used). Pass train_records instead if strict split
    # isolation is required.
    ft_path = os.path.join(cfg.output_dir, "fasttext.model")
    ft = train_or_load_fasttext([r['text'] for r in data], ft_path, dim=cfg.ft_dim, min_count=cfg.ft_min_count, epochs=cfg.ft_epochs)
    emb_matrix = build_embedding_matrix(word2idx, ft, cfg.ft_dim)

    # datasets ### MODIFIED to pass TF-IDF matrices ###
    max_len = 64  # token length cap - tune as needed
    train_ds = CharTokenDataset(train_records, train_tfidf, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)
    val_ds   = CharTokenDataset(val_records, val_tfidf, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)
    test_ds  = CharTokenDataset(test_records, test_tfidf, label_map, word2idx, char2idx, max_len, cfg.max_chars_per_token, cfg.aux_dim)

    # Training labels and their per-class counts, computed once and shared by the
    # optional sampler and the class-weight vector. The label is element 4 of
    # every sample tuple.
    train_labels = [s[4] for s in train_ds.samples]
    label_counts = Counter(train_labels)

    if cfg.use_sampler:
        # Draw each sample with probability inversely proportional to its class size.
        sample_weights = [1.0 / label_counts[l] for l in train_labels]
        sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, sampler=sampler, num_workers=2)
    else:
        train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=2)

    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=cfg.batch_size*2, shuffle=False, num_workers=2)

    # class weight vector (counts), indexed by class id
    class_weights = np.array([label_counts.get(i, 0) for i in range(len(label_map))]).astype(np.float32)

    # model
    model = BiLSTMCharFastText(emb_matrix, char_vocab_size=len(char2idx), cfg=cfg, num_labels=len(label_map))
    # quick shape sanity check on an all-padding dummy batch
    print("embedding dim:", emb_matrix.shape[1])
    print("char_cnn out dim actual:", model.char_cnn.out_dim_actual)
    sample_token_ids = torch.zeros((2, 8), dtype=torch.long)
    sample_char_ids = torch.zeros((2, 8, cfg.max_chars_per_token), dtype=torch.long)
    sample_aux = torch.zeros((2, cfg.aux_dim), dtype=torch.float)
    sample_tfidf = torch.zeros((2, cfg.tfidf_dim), dtype=torch.float) ### NEW ###
    with torch.no_grad():
        logits_shape = model(sample_token_ids, sample_char_ids, sample_aux, sample_tfidf).shape ### MODIFIED ###
    print("logits shape (sanity):", logits_shape)
    print("Model trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

    # train
    best_ckpt = train(train_loader, val_loader, model, cfg, class_weights, id2label)

    # test eval with the best validation checkpoint
    if best_ckpt and os.path.exists(best_ckpt):
        ckpt = torch.load(best_ckpt, map_location=cfg.device)
        model.load_state_dict(ckpt['model_state_dict'])
        report, cm, y_true, y_pred = evaluate(model, test_loader, cfg.device, id2label)
        print("\nFinal Test Report:")
        # Pin the label set so the names stay aligned with their class ids even
        # when a class is missing from the test predictions and targets, and
        # silence undefined-metric warnings the same way evaluate() does.
        label_ids = sorted(id2label)
        print(classification_report(y_true, y_pred, labels=label_ids, target_names=[id2label[i] for i in label_ids], digits=4, zero_division=0))
        print("\nConfusion Matrix:\n", cm)

        # save predictions csv for inspection
        rows = []
        for rec, yt, yp in zip(test_records, y_true, y_pred):
            rows.append({"text": rec['text'], "label": id2label[yt], "pred": id2label[yp]})
        pd.DataFrame(rows).to_csv(os.path.join(cfg.output_dir, "test_preds.csv"), index=False)
        print("Saved test_preds.csv")
    print("Done.")

# Run the full pipeline when this module is executed as a script; in a notebook
# cell `__name__` is "__main__", so running the cell also starts it.
if __name__ == "__main__":
    main()

Records: 44019 labels: 5
split sizes: 31803 5613 6603
Computing TF-IDF features...
TF-IDF matrix shape (train): (31803, 5000)
Vocab sizes: tokens 21997 chars 657
Training FastText...
Saved FastText at outputs_char_bilstm_tfidf/fasttext.model
Found 21997/21997 tokens in FastText.
embedding dim: 300
char_cnn out dim actual: 100
logits shape (sanity): torch.Size([2, 5])
Model trainable params: 7639749


Train E1/6: 100%|██████████| 497/497 [07:02<00:00,  1.17it/s, loss=0.0734]


Epoch 1 -> Val Macro F1: 0.4474
Saved outputs_char_bilstm_tfidf/best_macro_0.4474.pt


Train E2/6: 100%|██████████| 497/497 [07:09<00:00,  1.16it/s, loss=0.0515]


Epoch 2 -> Val Macro F1: 0.4585
Saved outputs_char_bilstm_tfidf/best_macro_0.4585.pt


Train E3/6: 100%|██████████| 497/497 [07:09<00:00,  1.16it/s, loss=0.0418]


Epoch 3 -> Val Macro F1: 0.4445


Train E4/6: 100%|██████████| 497/497 [07:09<00:00,  1.16it/s, loss=0.0526]


Epoch 4 -> Val Macro F1: 0.4382


Train E5/6: 100%|██████████| 497/497 [07:09<00:00,  1.16it/s, loss=0.0431]


Epoch 5 -> Val Macro F1: 0.4228


Train E6/6: 100%|██████████| 497/497 [07:08<00:00,  1.16it/s, loss=0.0288]


Epoch 6 -> Val Macro F1: 0.4244





Final Test Report:
                precision    recall  f1-score   support

Mixed_feelings     0.2392    0.3288    0.2769       739
      Negative     0.3090    0.6199    0.4124       784
      Positive     0.8576    0.5615    0.6787      3731
     not-Tamil     0.3902    0.7668    0.5172       313
 unknown_state     0.4372    0.4035    0.4197      1036

      accuracy                         0.5273      6603
     macro avg     0.4466    0.5361    0.4610      6603
  weighted avg     0.6351    0.5273    0.5538      6603


Confusion Matrix:
 [[ 243  275  130   22   69]
 [ 118  486   76   23   81]
 [ 503  557 2095  213  363]
 [   7   15   26  240   25]
 [ 145  240  116  117  418]]
Saved test_preds.csv
Done.
