# In [None]:  (cell marker left over from the Jupyter notebook export)
import os
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Dense,
    Dropout,
    Bidirectional,
    BatchNormalization,
    GlobalMaxPooling1D,
    Concatenate,
    Input,
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import tensorflow as tf

# Seed the NumPy and TensorFlow RNGs so repeated runs are comparable
np.random.seed(42)
tf.random.set_seed(42)

# ---- Dataset locations and emotion label maps ----
DATA_DIR = "/home/jupyter/old_backup"
# One CSV per split, all living directly under DATA_DIR
TRAIN_CSV, DEV_CSV, TEST_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("train_sent_emo.csv", "dev_sent_emo.csv", "test_sent_emo.csv")
)

# The position of an emotion in CLASSES is its integer label
CLASSES = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
id2label = dict(enumerate(CLASSES))
label2id = {emotion: idx for idx, emotion in id2label.items()}


# ---- Enhanced data loading ----
def load_meld_data():
    """Read the train/dev/test CSVs and attach integer emotion labels.

    Only the ``Utterance`` and ``Emotion`` columns are kept and rows with a
    missing value in either are dropped. ``Emotion`` is mapped through
    ``label2id`` into an integer ``label`` column (rows whose emotion is not
    in the map are dropped) and ``Utterance`` is renamed to ``text``.

    Returns:
        ``(train_df, dev_df, test_df)``, or ``(None, None, None)`` when any of
        the CSV files cannot be found (the error is printed, not raised).
    """
    wanted_columns = ["Utterance", "Emotion"]

    frames = []
    try:
        for csv_path in (TRAIN_CSV, DEV_CSV, TEST_CSV):
            frames.append(pd.read_csv(csv_path)[wanted_columns].dropna())
    except FileNotFoundError as e:
        print(f"Error loading data: {e}")
        return None, None, None

    # Each frame is modified in place, so the unpacked names below see the changes.
    train_df, dev_df, test_df = frames
    for frame in frames:
        frame["label"] = frame["Emotion"].map(label2id)
        # Emotions missing from label2id map to NaN; discard those rows
        frame.dropna(subset=["label"], inplace=True)
        frame.rename(columns={"Utterance": "text"}, inplace=True)
        frame["label"] = frame["label"].astype(int)

    print(f"Loaded - Train: {len(train_df)}, Dev: {len(dev_df)}, Test: {len(test_df)}")

    # Report how many training utterances each emotion has
    print("\nLabel distribution in training data:")
    for emotion, count in train_df["Emotion"].value_counts().items():
        print(f"  {emotion}: {count}")

    return train_df, dev_df, test_df


# ---- Enhanced text preprocessing ----
def enhanced_clean_text(text):
    """Normalise an utterance while keeping emotion-bearing cues as tokens.

    The text is lower-cased, then:

    * runs of ``!!`` / ``??`` / ``...`` become the marker words
      ``multiexclaim`` / ``multiquestion`` / ``ellipsis``;
    * a character repeated three or more times is collapsed to one
      occurrence followed by the marker word ``repeatchar``
      (``"sooo"`` -> ``"so repeatchar"``);
    * every character other than word characters, whitespace and
      ``! ? . , : - ( )`` is replaced by a space;
    * whitespace is collapsed and the result is stripped.

    Args:
        text: The raw utterance; any non-missing value is converted with ``str``.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing (``pd.isna``).
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # Preserve important punctuation patterns for emotions
    text = re.sub(r"!{2,}", " multiexclaim ", text)  # Multiple exclamations
    text = re.sub(r"\?{2,}", " multiquestion ", text)  # Multiple questions
    text = re.sub(r"\.{3,}", " ellipsis ", text)  # Ellipsis

    # Handle repeated characters (e.g., "sooo" -> "so repeatchar").
    # Whitespace and digits are excluded: a run of spaces or a number such as
    # "1000" is not an elongation. The marker is padded with a trailing space
    # so it stays a separate token when more characters follow
    # ("nooooway" -> "no repeatchar way").
    text = re.sub(r"([^\s\d])\1{2,}", r"\1 repeatchar ", text)

    # Preserve emoticons and basic punctuation
    text = re.sub(r"[^\w\s!?.,:\-\(\)]", " ", text)

    # Clean up whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# Load data
train_df, dev_df, test_df = load_meld_data()
if train_df is None:
    print("Data loading failed. Please check your file paths.")
    # Raise SystemExit directly: exit() is a helper meant for the interactive
    # shell, and a failed load should end with a non-zero status.
    raise SystemExit(1)

# Apply enhanced preprocessing
print("Applying enhanced text preprocessing...")
train_df["text_clean"] = train_df["text"].apply(enhanced_clean_text)
dev_df["text_clean"] = dev_df["text"].apply(enhanced_clean_text)
test_df["text_clean"] = test_df["text"].apply(enhanced_clean_text)

# Remove rows whose text became empty after cleaning
train_df = train_df[train_df["text_clean"].str.len() > 0]
dev_df = dev_df[dev_df["text_clean"].str.len() > 0]
test_df = test_df[test_df["text_clean"].str.len() > 0]

print(
    f"After cleaning - Train: {len(train_df)}, Dev: {len(dev_df)}, Test: {len(test_df)}"
)

# ---- Enhanced tokenization ----
vocab_size = 15000  # Upper bound on the number of word ids the tokenizer emits
max_len = 150  # Every sequence is padded/truncated to this many tokens

tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
    lower=True,
    split=" ",
    filters="",  # Don't remove punctuation we want to keep
)

# The vocabulary is fitted on the training split only; dev/test reuse it
print("Building enhanced vocabulary...")
tokenizer.fit_on_texts(train_df["text_clean"])
print(f"Vocabulary size: {len(tokenizer.word_index)}")

# Convert texts to sequences
X_train = tokenizer.texts_to_sequences(train_df["text_clean"])
X_dev = tokenizer.texts_to_sequences(dev_df["text_clean"])
X_test = tokenizer.texts_to_sequences(test_df["text_clean"])

# Pad (and truncate) at the end of each sequence
X_train_pad = pad_sequences(X_train, maxlen=max_len, padding="post", truncating="post")
X_dev_pad = pad_sequences(X_dev, maxlen=max_len, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test, maxlen=max_len, padding="post", truncating="post")

y_train = train_df["label"].values
y_dev = dev_df["label"].values
y_test = test_df["label"].values

print(
    f"Final shapes - Train: {X_train_pad.shape}, Dev: {X_dev_pad.shape}, Test: {X_test_pad.shape}"
)

# ---- Calculate class weights for imbalanced data ----
# Only labels that actually occur in the training split receive a weight
train_classes = np.unique(y_train)
class_weights = compute_class_weight("balanced", classes=train_classes, y=y_train)
class_weight_dict = dict(zip(train_classes, class_weights))
print(f"Class weights: {class_weight_dict}")


# ---- Enhanced Model Architecture ----
def create_advanced_emotion_model(
    vocab_size, embedding_dim=200, max_len=150, num_classes=7
):
    """Build the (uncompiled) bidirectional-LSTM emotion classifier.

    Token ids are embedded and passed through two stacked bidirectional
    LSTMs. The resulting sequence is summarised in two ways -- global max
    pooling over time, and a third bidirectional LSTM that returns only its
    final output -- and the two summaries are concatenated. A 256-128-64
    dense stack with dropout then feeds a softmax output layer.

    Args:
        vocab_size: Number of rows in the embedding table (``input_dim``).
        embedding_dim: Size of each embedding vector.
        max_len: Length of the integer sequences fed to the model.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``Model`` taking ``(batch, max_len)`` inputs and producing
        ``(batch, num_classes)`` softmax outputs. The caller must compile it.
    """

    # Input layer: one fixed-length sequence of token ids per sample
    input_layer = Input(shape=(max_len,))

    # Embedding layer. `input_length` is intentionally not passed: the
    # sequence length is already fixed by the Input layer above, and newer
    # Keras releases deprecate that argument.
    # NOTE(review): mask_zero=True treats id 0 as padding; confirm that the
    # pooling layer further down handles the mask as intended in the
    # installed Keras version.
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=True,
        embeddings_regularizer=l2(0.0001),
    )(input_layer)

    # First Bidirectional LSTM layer (keeps the full sequence)
    lstm1 = Bidirectional(
        LSTM(
            128,
            return_sequences=True,
            dropout=0.3,
            recurrent_dropout=0.2,
            kernel_regularizer=l2(0.0001),
        )
    )(embedding)
    lstm1 = BatchNormalization()(lstm1)

    # Second Bidirectional LSTM layer (keeps the full sequence)
    lstm2 = Bidirectional(
        LSTM(
            64,
            return_sequences=True,
            dropout=0.3,
            recurrent_dropout=0.2,
            kernel_regularizer=l2(0.0001),
        )
    )(lstm1)
    lstm2 = BatchNormalization()(lstm2)

    # Summary 1: global max pooling over the time axis
    global_max_pool = GlobalMaxPooling1D()(lstm2)

    # Summary 2: a further BiLSTM that returns only its last output
    lstm3 = Bidirectional(
        LSTM(32, return_sequences=False, dropout=0.3, recurrent_dropout=0.2)
    )(lstm2)

    # Concatenate the two sequence summaries
    concat_features = Concatenate()([global_max_pool, lstm3])

    # Dense layers with batch normalization and progressively lighter dropout
    dense1 = Dense(256, activation="relu", kernel_regularizer=l2(0.001))(
        concat_features
    )
    dense1 = BatchNormalization()(dense1)
    dense1 = Dropout(0.5)(dense1)

    dense2 = Dense(128, activation="relu", kernel_regularizer=l2(0.001))(dense1)
    dense2 = BatchNormalization()(dense2)
    dense2 = Dropout(0.4)(dense2)

    dense3 = Dense(64, activation="relu", kernel_regularizer=l2(0.001))(dense2)
    dense3 = Dropout(0.3)(dense3)

    # Output layer: one probability per class
    output = Dense(num_classes, activation="softmax")(dense3)

    model = Model(inputs=input_layer, outputs=output)
    return model


# Build the network; the embedding table needs one row per word id plus
# the padding id 0, capped at the tokenizer's num_words limit
actual_vocab_size = min(vocab_size, len(tokenizer.word_index) + 1)
model = create_advanced_emotion_model(
    actual_vocab_size,
    embedding_dim=200,
    max_len=max_len,
    num_classes=len(CLASSES),
)

# Adam with gradient-norm clipping; the rate is lowered later by a callback
initial_learning_rate = 0.001
optimizer = Adam(learning_rate=initial_learning_rate, clipnorm=1.0)

# Labels are integer ids, hence the sparse cross-entropy loss
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

model.summary()

# ---- Training callbacks ----
# Stop once dev accuracy stalls for 8 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=8,
    restore_best_weights=True,
    verbose=1,
)
# Multiply the learning rate by 0.3 when dev loss stalls for 4 epochs
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    min_delta=0.001,
    patience=4,
    factor=0.3,
    min_lr=1e-8,
    verbose=1,
)
callbacks = [early_stopping, reduce_lr]

# ---- Train ----
print("Starting enhanced training...")
history = model.fit(
    X_train_pad,
    y_train,
    validation_data=(X_dev_pad, y_dev),
    epochs=30,  # Upper bound; early stopping usually ends training sooner
    batch_size=16,
    class_weight=class_weight_dict,  # Counteract class imbalance
    callbacks=callbacks,
    verbose=1,
)

# ---- Evaluate on the held-out test split ----
print("\nEvaluating on test set...")
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-sample class probabilities, the winning class and its probability
y_pred_proba = model.predict(X_test_pad, verbose=0)
y_pred_classes = y_pred_proba.argmax(axis=1)
confidence_scores = y_pred_proba.max(axis=1)

print(f"Average prediction confidence: {np.mean(confidence_scores):.4f}")


# ---- Enhanced Classification Report ----
def enhanced_classification_report(y_true, y_pred, class_names):
    """Compute one-vs-rest metrics for every class.

    Class ``i`` is the class whose name sits at position ``i`` of
    ``class_names``; labels in ``y_true`` / ``y_pred`` are compared against
    that index.

    Args:
        y_true: Ground-truth integer labels (any array-like).
        y_pred: Predicted integer labels, same shape as ``y_true``.
        class_names: Class names ordered by label id.

    Returns:
        A dict mapping each class name to a dict with ``precision``,
        ``recall``, ``f1-score``, ``support`` (number of true samples of the
        class) and ``specificity``. A ratio whose denominator is zero is
        reported as ``0.0``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Coerce to arrays so `== i` is element-wise; with plain lists it would
    # collapse to a single False and every metric would silently be zero.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    report = {}
    for i, class_name in enumerate(class_names):
        is_true = y_true == i
        is_pred = y_pred == i

        # One-vs-rest confusion counts for class i
        tp = np.sum(is_true & is_pred)
        fp = np.sum(~is_true & is_pred)
        fn = np.sum(is_true & ~is_pred)
        tn = np.sum(~is_true & ~is_pred)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (
            2 * (precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0

        report[class_name] = {
            "precision": precision,
            "recall": recall,
            "f1-score": f1,
            "support": np.sum(is_true),
            "specificity": specificity,
        }

    return report


# Per-class metrics on the test split, printed as a fixed-width table
report = enhanced_classification_report(y_test, y_pred_classes, CLASSES)
print("\n" + "=" * 80)
print("ENHANCED CLASSIFICATION REPORT")
print("=" * 80)
print(
    f"{'Class':<12} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<8} {'Specificity':<12}"
)
print("-" * 80)

for class_name, metrics in report.items():
    print(
        f"{class_name:<12} {metrics['precision']:<10.4f} {metrics['recall']:<10.4f} "
        f"{metrics['f1-score']:<10.4f} {metrics['support']:<8} {metrics['specificity']:<12.4f}"
    )

# Macro F1 is the plain mean over classes; weighted F1 weights by support
f1_scores = [metrics["f1-score"] for metrics in report.values()]
supports = [metrics["support"] for metrics in report.values()]
macro_f1 = np.mean(f1_scores)
weighted_f1 = np.sum([f1 * n for f1, n in zip(f1_scores, supports)]) / len(y_test)

print(f"\nMacro F1-Score: {macro_f1:.4f}")
print(f"Weighted F1-Score: {weighted_f1:.4f}")
print(f"Overall Accuracy: {test_accuracy:.4f}")

# ---- Error Analysis ----
print("\n" + "=" * 60)
print("ERROR ANALYSIS")
print("=" * 60)

# Count every (true label, predicted label) pair among the misclassified samples
confusion_pairs = Counter(
    (id2label[true_idx], id2label[pred_idx])
    for true_idx, pred_idx in zip(y_test, y_pred_classes)
    if true_idx != pred_idx
)

# Most frequent first; ties keep first-seen order
sorted_confusion = confusion_pairs.most_common()
print("Most frequent confusions:")
for (true_label, pred_label), count in sorted_confusion[:10]:
    print(f"  {true_label} -> {pred_label}: {count} times")

# ---- Save model (optional) ----
# model.save('enhanced_emotion_model.h5')
# print("Model saved as 'enhanced_emotion_model.h5'")

print("\n" + "=" * 60)
print("TRAINING COMPLETED")
print("=" * 60)