In [4]:
# ==============================================================================
# Step 1: Install Dependencies
# ==============================================================================
!pip install torch scikit-learn pandas tqdm transformers sentencepiece demoji -q

# ==============================================================================
# Step 2: Two-Stage Pipeline for Malayalam Dataset (NN Feature Extractor + RF Classifier)
# ==============================================================================
import os
import random
import json
import joblib
import re
import demoji
from collections import Counter
from dataclasses import dataclass
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm.auto import tqdm

# -------------------------
# Config
# -------------------------
@dataclass
class CFG:
    data_csv: str = "mal_full_sentiment.tsv"
    output_dir: str = "malayalam_model_artifacts_char_tfidf_rf"
    model_name: str = "google/muril-base-cased"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    seed: int = 42
    max_len: int = 64
    max_char_per_token: int = 12
    char_emb_dim: int = 50
    char_out: int = 96
    cnn_out_channels: int = 128
    cnn_kernel_sizes: Tuple[int, ...] = (2, 3, 4)
    hidden_dim: int = 256
    gru_layers: int = 1
    attn_dim: int = 128
    dropout: float = 0.3
    tfidf_dim: int = 5000
    tfidf_proj_dim: int = 64
    epochs: int = 5
    batch_size: int = 32
    lr_bert: float = 2e-5
    lr_head: float = 1e-3
    weight_decay: float = 1e-6
    grad_clip: float = 1.0
    num_workers: int = 2

# (Helper functions, Dataset, and Model classes remain the same)
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def clean_text(text: str) -> str:
    text = demoji.replace_with_desc(text, sep=" ")
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def read_data(path: str):
    df = pd.read_csv(path, sep='\t', header=None, names=['label', 'text'], engine='python', on_bad_lines='skip')
    df.dropna(subset=['text', 'label'], inplace=True)
    df['text'] = df['text'].astype(str)
    print("Cleaning text data...")
    df['text'] = df['text'].apply(clean_text)
    return df

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, use_fast=True)

def build_char_vocab_from_token_strings(token_strings_list: List[List[str]]):
    cnt = Counter(ch for toks in token_strings_list for tok in toks for ch in tok.replace("##", ""))
    char2idx = {'<pad>': 0, '<unk>': 1}
    for ch in cnt:
        if ch not in char2idx: char2idx[ch] = len(char2idx)
    return char2idx

def tokens_to_char_ids(token_strings: List[str], char2idx, max_chars=12, max_tokens=64):
    char_ids = []
    for tok in token_strings[:max_tokens]:
        s = tok.replace("##", "")
        ids = [char2idx.get(ch, char2idx['<unk>']) for ch in list(s)[:max_chars]]
        ids.extend([char2idx['<pad>']] * (max_chars - len(ids)))
        char_ids.append(ids)
    char_ids.extend([[char2idx['<pad>']] * max_chars] * (max_tokens - len(char_ids)))
    return char_ids

class MurilHybridDataset(Dataset):
    def __init__(self, records, label_map, tokenizer, tfidf_vectorizer, char2idx, cfg: CFG):
        self.records, self.label_map = records, label_map
        texts = [r['text'] for r in records]
        enc = tokenizer(texts, padding='max_length', truncation=True, max_length=cfg.max_len, return_tensors='pt')
        self.input_ids, self.attention_mask = enc['input_ids'], enc['attention_mask']
        self.token_type_ids = enc.get('token_type_ids', torch.zeros_like(self.input_ids))
        token_strings_all = [tokenizer.convert_ids_to_tokens(s.tolist()) for s in self.input_ids]
        self.char_ids = torch.tensor([tokens_to_char_ids(ts, char2idx, max_chars=cfg.max_char_per_token, max_tokens=cfg.max_len) for ts in token_strings_all], dtype=torch.long)
        self.tfidf = tfidf_vectorizer.transform(texts)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx], 'attention_mask': self.attention_mask[idx],
            'token_type_ids': self.token_type_ids[idx], 'char_ids': self.char_ids[idx],
            'tfidf': torch.from_numpy(self.tfidf[idx].toarray().squeeze()).float(),
            'label': torch.tensor(self.label_map[self.records[idx]['label']], dtype=torch.long)
        }

class CharCNN(nn.Module):
    def __init__(self, char_vocab_size: int, cfg: CFG):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, cfg.char_emb_dim, padding_idx=0)
        num_filters_each = cfg.char_out // 3
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters_each, (k, cfg.char_emb_dim)) for k in (2, 3, 4)])
        self.dropout = nn.Dropout(cfg.dropout)
        self.out_dim_actual = num_filters_each * 3

    def forward(self, x_char):
        B, T, C = x_char.size()
        emb = self.char_emb(x_char).view(B * T, C, -1).unsqueeze(1)
        conv_outs = [F.relu(conv(emb)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(o, o.size(2)).squeeze(2) for o in conv_outs]
        return self.dropout(torch.cat(pooled, dim=1).view(B, T, -1))

class MurilHybridClassifier(nn.Module):
    def __init__(self, num_labels: int, cfg: CFG, char_vocab_size: int, actual_tfidf_dim: int):
        super().__init__()
        self.bert = AutoModel.from_pretrained(cfg.model_name)
        bert_hidden = self.bert.config.hidden_size
        self.char_cnn = CharCNN(char_vocab_size, cfg)
        token_in_dim = bert_hidden + self.char_cnn.out_dim_actual
        self.word_cnns = nn.ModuleList([nn.Conv1d(token_in_dim, cfg.cnn_out_channels, k, padding=k//2) for k in cfg.cnn_kernel_sizes])
        cnn_output_dim = cfg.cnn_out_channels * len(cfg.cnn_kernel_sizes)
        self.bigru = nn.GRU(cnn_output_dim, cfg.hidden_dim // 2, num_layers=cfg.gru_layers, bidirectional=True, batch_first=True, dropout=cfg.dropout if cfg.gru_layers > 1 else 0)
        self.attn_proj = nn.Linear(cfg.hidden_dim, cfg.attn_dim)
        self.attn_v = nn.Linear(cfg.attn_dim, 1, bias=False)
        self.tfidf_proj = nn.Linear(actual_tfidf_dim, cfg.tfidf_proj_dim)
        fusion_dim = bert_hidden + cfg.hidden_dim + cfg.tfidf_proj_dim
        self.classifier = nn.Sequential(nn.Linear(fusion_dim, 256), nn.ReLU(), nn.Dropout(cfg.dropout), nn.Linear(256, num_labels))

    def forward(self, input_ids, attention_mask, token_type_ids, char_ids, tfidf):
        final_vec = self.extract_features(input_ids, attention_mask, token_type_ids, char_ids, tfidf)
        logits = self.classifier(final_vec)
        return logits

    def extract_features(self, input_ids, attention_mask, token_type_ids, char_ids, tfidf):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=True)
        x = torch.cat([bert_out.last_hidden_state, self.char_cnn(char_ids)], dim=-1).permute(0, 2, 1)
        cnn_feats = [F.relu(conv(x)) for conv in self.word_cnns]
        min_len = min(f.size(2) for f in cnn_feats)
        x_seq = torch.cat([f[:, :, :min_len] for f in cnn_feats], dim=1).permute(0, 2, 1)
        h, _ = self.bigru(x_seq)
        mask = attention_mask[:, :h.size(1)]
        scores = self.attn_v(torch.tanh(self.attn_proj(h))).squeeze(-1).masked_fill(mask == 0, float('-1e9'))
        seq_pooled = (h * torch.softmax(scores, dim=1).unsqueeze(-1)).sum(dim=1)
        final_vec = torch.cat([bert_out.pooler_output, seq_pooled, torch.relu(self.tfidf_proj(tfidf))], dim=1)
        return final_vec

def main():
    cfg = CFG()
    os.makedirs(cfg.output_dir, exist_ok=True)
    seed_everything(cfg.seed)

    df = read_data(cfg.data_csv)

    print("\nOriginal class distribution:")
    print(df['label'].value_counts())
    label_counts = df['label'].value_counts()
    labels_to_keep = label_counts[label_counts >= 2].index
    original_len = len(df)
    df = df[df['label'].isin(labels_to_keep)].reset_index(drop=True)
    if original_len > len(df):
        print(f"\nRemoved {original_len - len(df)} rows belonging to classes with < 2 samples.")
        print("\nNew class distribution:")
        print(df['label'].value_counts())

    labels_unique, data = sorted(df['label'].unique()), df.to_dict('records')
    label_map = {label: i for i, label in enumerate(labels_unique)}

    # --- STAGE 1: TRAIN NEURAL NETWORK FEATURE EXTRACTOR ---
    print("\n--- Stage 1: Training Neural Network Feature Extractor ---")

    train_val_idx, test_idx = train_test_split(range(len(data)), test_size=0.2, random_state=cfg.seed, stratify=df['label'])
    train_idx, val_idx = train_test_split(train_val_idx, test_size=0.1, random_state=cfg.seed, stratify=df['label'].iloc[train_val_idx])
    train_records, val_records, test_records = [data[i] for i in train_idx], [data[i] for i in val_idx], [data[i] for i in test_idx]

    # --- MODIFIED: Use (1-6)-gram character-level TF-IDF ---
    print("\nInitializing (1-6)-gram character-level TF-IDF Vectorizer...")
    tfidf_vec = TfidfVectorizer(
        analyzer='char',
        ngram_range=(1, 6),
        max_features=cfg.tfidf_dim,
        min_df=3
    )

    train_texts = [r['text'] for r in train_records]
    tfidf_vec.fit(train_texts)
    actual_tfidf_dim = len(tfidf_vec.vocabulary_)
    print(f"Actual Character TF-IDF vocabulary size: {actual_tfidf_dim}")


    enc_train = tokenizer(train_texts, padding=False, truncation=False)
    token_strings_train = [tokenizer.convert_ids_to_tokens(seq) for seq in enc_train['input_ids']]
    char2idx = build_char_vocab_from_token_strings(token_strings_train)

    class_weights = compute_class_weight('balanced', classes=np.array(labels_unique), y=[r['label'] for r in train_records])
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(cfg.device)

    train_ds = MurilHybridDataset(train_records, label_map, tokenizer, tfidf_vec, char2idx, cfg)
    val_ds = MurilHybridDataset(val_records, label_map, tokenizer, tfidf_vec, char2idx, cfg)
    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, pin_memory=True, num_workers=cfg.num_workers)
    val_loader = DataLoader(val_ds, batch_size=cfg.batch_size * 2, shuffle=False, pin_memory=True, num_workers=cfg.num_workers)

    model = MurilHybridClassifier(len(labels_unique), cfg, len(char2idx), actual_tfidf_dim=actual_tfidf_dim).to(cfg.device)
    optimizer = AdamW([{'params': model.bert.parameters(), 'lr': cfg.lr_bert}, {'params': [p for n, p in model.named_parameters() if 'bert' not in n], 'lr': cfg.lr_head}], weight_decay=cfg.weight_decay)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    best_macro_f1 = 0.0
    for epoch in range(cfg.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"NN Train E{epoch+1}/{cfg.epochs}")
        for batch in pbar:
            labels = batch.pop('label').to(cfg.device)
            inputs = {k: v.to(cfg.device) for k, v in batch.items()}
            optimizer.zero_grad()
            loss = criterion(model(**inputs), labels)
            loss.backward()
            optimizer.step()

        model.eval()
        y_true_val, y_pred_val = [], []
        with torch.no_grad():
            for batch in val_loader:
                labels = batch.pop('label').to(cfg.device)
                inputs = {k: v.to(cfg.device) for k, v in batch.items()}
                preds = torch.argmax(model(**inputs), dim=1)
                y_true_val.extend(labels.cpu().numpy())
                y_pred_val.extend(preds.cpu().numpy())

        report = classification_report(y_true_val, y_pred_val, output_dict=True, zero_division=0)
        macro_f1 = report['macro avg']['f1-score']
        print(f"Epoch {epoch+1} -> Val Macro F1: {macro_f1:.4f}")
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), os.path.join(cfg.output_dir, "best_nn_model.pt"))
            print(f"ðŸš€ Saved new best NN model with Macro F1: {macro_f1:.4f}")

    # --- STAGE 2: EXTRACT FEATURES AND TRAIN RANDOM FOREST ---
    print("\n--- Stage 2: Training Random Forest Classifier on Extracted Features ---")

    model.load_state_dict(torch.load(os.path.join(cfg.output_dir, "best_nn_model.pt")))
    model.eval()

    full_train_records = train_records + val_records
    full_train_ds = MurilHybridDataset(full_train_records, label_map, tokenizer, tfidf_vec, char2idx, cfg)
    test_ds = MurilHybridDataset(test_records, label_map, tokenizer, tfidf_vec, char2idx, cfg)

    train_loader_full = DataLoader(full_train_ds, batch_size=cfg.batch_size * 2, shuffle=False)
    test_loader_full = DataLoader(test_ds, batch_size=cfg.batch_size * 2, shuffle=False)

    def get_features_and_labels(loader, model, device):
        all_features, all_labels = [], []
        with torch.no_grad():
            for batch in tqdm(loader, desc="Extracting features"):
                labels = batch.pop('label').numpy()
                inputs = {k: v.to(device) for k, v in batch.items()}
                features = model.extract_features(**inputs)
                all_features.append(features.cpu().numpy())
                all_labels.append(labels)
        return np.concatenate(all_features), np.concatenate(all_labels)

    X_train, y_train = get_features_and_labels(train_loader_full, model, cfg.device)
    X_test, y_test = get_features_and_labels(test_loader_full, model, cfg.device)

    print(f"Extracted feature shapes: X_train: {X_train.shape}, X_test: {X_test.shape}")

    print("\nTraining Random Forest model...")
    rf_classifier = RandomForestClassifier(n_estimators=200,
                                           max_depth=10,
                                           random_state=cfg.seed,
                                           n_jobs=-1)

    rf_classifier.fit(X_train, y_train)

    print("\n--- Final Test Set Evaluation (Random Forest) ---")
    y_pred_rf = rf_classifier.predict(X_test)

    print(classification_report(y_test, y_pred_rf, target_names=[labels_unique[i] for i in sorted(np.unique(y_test))], digits=4))

if __name__ == '__main__':
    main()

Cleaning text data...

Original class distribution:
label
Positive          7907
unknown_state     6502
Negative          2600
not-malayalam     1445
Mixed_feelings    1162
Name: count, dtype: int64

--- Stage 1: Training Neural Network Feature Extractor ---

Initializing (1-6)-gram character-level TF-IDF Vectorizer...
Actual Character TF-IDF vocabulary size: 5000


Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


NN Train E1/5:   0%|          | 0/442 [00:00<?, ?it/s]

Epoch 1 -> Val Macro F1: 0.5787
ðŸš€ Saved new best NN model with Macro F1: 0.5787


NN Train E2/5:   0%|          | 0/442 [00:00<?, ?it/s]

Epoch 2 -> Val Macro F1: 0.6118
ðŸš€ Saved new best NN model with Macro F1: 0.6118


NN Train E3/5:   0%|          | 0/442 [00:00<?, ?it/s]

Epoch 3 -> Val Macro F1: 0.6691
ðŸš€ Saved new best NN model with Macro F1: 0.6691


NN Train E4/5:   0%|          | 0/442 [00:00<?, ?it/s]

Epoch 4 -> Val Macro F1: 0.6641


NN Train E5/5:   0%|          | 0/442 [00:00<?, ?it/s]

Epoch 5 -> Val Macro F1: 0.6821
ðŸš€ Saved new best NN model with Macro F1: 0.6821

--- Stage 2: Training Random Forest Classifier on Extracted Features ---


Extracting features:   0%|          | 0/246 [00:00<?, ?it/s]

Extracting features:   0%|          | 0/62 [00:00<?, ?it/s]

Extracted feature shapes: X_train: (15692, 1088), X_test: (3924, 1088)

Training Random Forest model...

--- Final Test Set Evaluation (Random Forest) ---
                precision    recall  f1-score   support

Mixed_feelings     0.5564    0.3190    0.4055       232
      Negative     0.6131    0.5942    0.6035       520
      Positive     0.7842    0.8180    0.8007      1582
 not-malayalam     0.8154    0.8408    0.8279       289
 unknown_state     0.7356    0.7571    0.7462      1301

      accuracy                         0.7403      3924
     macro avg     0.7010    0.6658    0.6768      3924
  weighted avg     0.7343    0.7403    0.7352      3924

