# 🚀 REQ-DL-012: Expanded Dataset Retraining
## Domain-Adapted Emotion Detection with 1000+ Samples

**Target**: Achieve 75-85% F1 Score
**Current**: 67% F1 Score
**Expected Improvement**: 8-18 percentage points of F1 (from 67% to 75-85%)

---

## 🔧 Setup and Dependencies

In [None]:
# Clone repository
# `!` hands the line to the shell; `%cd` changes the notebook's own working
# directory, so the relative `data/...` paths used by later cells resolve
# inside the cloned checkout.
!git clone https://github.com/uelkerd/SAMO--DL.git
%cd SAMO--DL
print("✅ Repository cloned and ready!")

In [None]:
# Install dependencies with compatibility fixes
# This cell pins the PyTorch / Transformers versions, then imports them to
# confirm the environment is usable before any training cell runs.
print("📦 Installing dependencies with compatibility fixes...")

# Step 1: Uninstall existing PyTorch to avoid conflicts
!pip uninstall torch torchvision torchaudio -y

# Step 2: Install PyTorch with compatible CUDA version
# (wheels are pulled from the cu118 index of download.pytorch.org)
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Step 3: Install Transformers with compatible version
# (only transformers and datasets are pinned; the remaining packages float)
!pip install transformers==4.30.0 datasets==2.13.0 evaluate scikit-learn pandas numpy matplotlib seaborn

# Step 4: Verify installation
# NOTE(review): if torch was already imported earlier in this session, the
# import below presumably returns the previously loaded module rather than
# the freshly installed one - confirm the printed version matches the pin.
print("🔍 Verifying installation...")
import torch
import transformers
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

# Step 5: Test critical imports
try:
    from transformers import AutoModel, AutoTokenizer
    print("✅ Transformers imports successful")
except Exception as e:
    print(f"❌ Transformers import failed: {e}")
    print("🔄 Restarting runtime and trying again...")
    import os
    # os._exit ends the interpreter process immediately, without normal
    # interpreter cleanup.
    # NOTE(review): this presumably relies on the hosting notebook runtime to
    # start a fresh kernel afterwards; nothing here re-runs the cell - confirm.
    os._exit(0)  # Force restart

print("✅ Dependencies installed and verified!")

## 📊 Create Expanded Dataset

In [None]:
# Create expanded dataset directly in Colab
import json
import random
from typing import List, Dict

def load_current_dataset():
    """Load the current journal dataset.

    Returns:
        The parsed JSON content of ``data/journal_test_dataset.json``. The
        path is relative to the current working directory.

    Raises:
        FileNotFoundError: If the dataset file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII journal text loads the same way
    # regardless of the platform's default locale encoding.
    with open('data/journal_test_dataset.json', 'r', encoding='utf-8') as f:
        return json.load(f)

# Hand-written seed sentences, ten per supported emotion label. Built once at
# definition time instead of on every create_variation() call.
_EMOTION_TEMPLATES = {
    'happy': [
        "I'm feeling really happy today!",
        "I'm so happy about this!",
        "This makes me incredibly happy!",
        "I'm feeling joyful and happy!",
        "I'm really happy with how things are going!",
        "This brings me so much happiness!",
        "I'm feeling happy and content!",
        "I'm really happy about this outcome!",
        "This makes me feel so happy!",
        "I'm feeling happy and grateful!",
    ],
    'sad': [
        "I'm feeling really sad today.",
        "This makes me so sad.",
        "I'm feeling down and sad.",
        "I'm really sad about this situation.",
        "This brings me sadness.",
        "I'm feeling sad and lonely.",
        "I'm really sad about what happened.",
        "This makes me feel so sad.",
        "I'm feeling sad and disappointed.",
        "I'm really sad about this outcome.",
    ],
    'frustrated': [
        "I'm so frustrated with this!",
        "This is really frustrating me.",
        "I'm feeling frustrated and annoyed.",
        "I'm really frustrated about this situation.",
        "This is so frustrating!",
        "I'm feeling frustrated and angry.",
        "I'm really frustrated with how this is going.",
        "This makes me so frustrated.",
        "I'm feeling frustrated and upset.",
        "I'm really frustrated about this outcome.",
    ],
    'anxious': [
        "I'm feeling really anxious about this.",
        "This is making me anxious.",
        "I'm feeling anxious and worried.",
        "I'm really anxious about what might happen.",
        "This gives me anxiety.",
        "I'm feeling anxious and nervous.",
        "I'm really anxious about this situation.",
        "This makes me feel so anxious.",
        "I'm feeling anxious and stressed.",
        "I'm really anxious about the outcome.",
    ],
    'excited': [
        "I'm so excited about this!",
        "This makes me really excited!",
        "I'm feeling excited and enthusiastic!",
        "I'm really excited about what's coming!",
        "This is so exciting!",
        "I'm feeling excited and eager!",
        "I'm really excited about this opportunity!",
        "This makes me feel so excited!",
        "I'm feeling excited and thrilled!",
        "I'm really excited about this outcome!",
    ],
    'calm': [
        "I'm feeling really calm right now.",
        "This brings me a sense of calm.",
        "I'm feeling calm and peaceful.",
        "I'm really calm about this situation.",
        "This makes me feel calm.",
        "I'm feeling calm and relaxed.",
        "I'm really calm about what's happening.",
        "This gives me a calm feeling.",
        "I'm feeling calm and content.",
        "I'm really calm about this outcome.",
    ],
    'content': [
        "I'm feeling really content with this.",
        "This makes me feel content.",
        "I'm feeling content and satisfied.",
        "I'm really content with how things are.",
        "This brings me contentment.",
        "I'm feeling content and happy.",
        "I'm really content with this situation.",
        "This makes me feel so content.",
        "I'm feeling content and peaceful.",
        "I'm really content with this outcome.",
    ],
    'grateful': [
        "I'm feeling really grateful for this.",
        "This makes me so grateful.",
        "I'm feeling grateful and thankful.",
        "I'm really grateful for this opportunity.",
        "This fills me with gratitude.",
        "I'm feeling grateful and blessed.",
        "I'm really grateful for this situation.",
        "This makes me feel so grateful.",
        "I'm feeling grateful and appreciative.",
        "I'm really grateful for this outcome.",
    ],
    'hopeful': [
        "I'm feeling really hopeful about this.",
        "This gives me hope.",
        "I'm feeling hopeful and optimistic.",
        "I'm really hopeful about what's coming.",
        "This brings me hope.",
        "I'm feeling hopeful and positive.",
        "I'm really hopeful about this situation.",
        "This makes me feel so hopeful.",
        "I'm feeling hopeful and confident.",
        "I'm really hopeful about this outcome.",
    ],
    'overwhelmed': [
        "I'm feeling really overwhelmed by this.",
        "This is overwhelming me.",
        "I'm feeling overwhelmed and stressed.",
        "I'm really overwhelmed by this situation.",
        "This is so overwhelming.",
        "I'm feeling overwhelmed and anxious.",
        "I'm really overwhelmed by what's happening.",
        "This makes me feel so overwhelmed.",
        "I'm feeling overwhelmed and exhausted.",
        "I'm really overwhelmed by this outcome.",
    ],
    'proud': [
        "I'm feeling really proud of this.",
        "This makes me so proud.",
        "I'm feeling proud and accomplished.",
        "I'm really proud of what I've done.",
        "This fills me with pride.",
        "I'm feeling proud and satisfied.",
        "I'm really proud of this achievement.",
        "This makes me feel so proud.",
        "I'm feeling proud and confident.",
        "I'm really proud of this outcome.",
    ],
    'tired': [
        "I'm feeling really tired today.",
        "This is making me tired.",
        "I'm feeling tired and exhausted.",
        "I'm really tired from all this work.",
        "This is so tiring.",
        "I'm feeling tired and worn out.",
        "I'm really tired of this situation.",
        "This makes me feel so tired.",
        "I'm feeling tired and drained.",
        "I'm really tired of dealing with this.",
    ],
}

# Follow-up sentences appended after a template to vary the text.
# They live in a plain tuple rather than inside an f-string: a backslash
# (e.g. an escaped quote) inside an f-string replacement field is a
# SyntaxError on Python versions before 3.12.
_CONTEXT_SUFFIXES = (
    "It's been a long day.",
    "Things are going well.",
    "I need to process this.",
    "This is important to me.",
    "I hope this continues.",
    "I wonder what's next.",
    "This feels right.",
    "I'm processing this.",
    "I should reflect on this.",
    "This is meaningful.",
    "I appreciate this moment.",
    "I'm learning from this.",
)


def create_variation(base_sample: Dict, emotion: str) -> Dict:
    """Create a synthetic sample for ``emotion`` from canned templates.

    The text is one random template for the emotion followed by one random
    follow-up sentence. Every follow-up sentence is equally likely.

    Args:
        base_sample: Existing sample the variation is nominally based on.
            It is accepted for interface compatibility but is not read.
        emotion: Emotion label. Labels without templates fall back to the
            single sentence ``"I'm feeling <emotion>."``.

    Returns:
        A dict with keys ``'content'``, ``'emotion'`` and ``'id'``. The id
        ends in a random four-digit number, so ids are not guaranteed to be
        unique across calls.
    """
    templates = _EMOTION_TEMPLATES.get(emotion, [f"I'm feeling {emotion}."])
    template = random.choice(templates)
    suffix = random.choice(_CONTEXT_SUFFIXES)

    return {
        'content': f"{template} {suffix}",
        'emotion': emotion,
        'id': f"expanded_{emotion}_{random.randint(1000, 9999)}",
    }

def create_balanced_dataset(target_size=1000):
    """Create a balanced expanded dataset.

    Loads the current dataset, keeps every existing sample, and tops each
    emotion up to a common per-emotion target with synthetic variations.

    Args:
        target_size: Minimum total number of samples wanted. The per-emotion
            target is ``target_size`` divided by the number of emotions,
            rounded up, so the result reaches at least ``target_size`` while
            staying balanced. An emotion that already has more samples than
            the per-emotion target keeps all of them.

    Returns:
        A list of sample dicts: the original entries plus generated ones.

    Raises:
        ValueError: If the current dataset is empty, since there are then no
            emotion labels to expand.
    """
    print("🔧 Creating balanced expanded dataset...")

    # Load current data
    current_data = load_current_dataset()
    if not current_data:
        raise ValueError("Cannot expand an empty dataset: no emotion labels found.")

    # Group samples by emotion in a single pass. Dict insertion order keeps
    # emotions in first-appearance order.
    samples_by_emotion = {}
    for entry in current_data:
        samples_by_emotion.setdefault(entry['emotion'], []).append(entry)

    print("📊 Current emotion distribution:")
    for emotion in sorted(samples_by_emotion):
        print(f"  {emotion}: {len(samples_by_emotion[emotion])} samples")

    # Ceiling division: flooring would leave the total short of target_size
    # whenever it is not a multiple of the number of emotions.
    num_emotions = len(samples_by_emotion)
    target_per_emotion = -(-target_size // num_emotions)
    print(f"\n🎯 Target: {target_per_emotion} samples per emotion")

    # Create expanded dataset
    expanded_data = []

    for emotion, existing_samples in samples_by_emotion.items():
        current_count = len(existing_samples)

        print(f"\n📝 Expanding '{emotion}' from {current_count} to {target_per_emotion} samples...")

        # Existing samples are always kept
        expanded_data.extend(existing_samples)

        # range() is empty when the emotion already meets the target
        needed_samples = target_per_emotion - current_count
        expanded_data.extend(
            create_variation(random.choice(existing_samples), emotion)
            for _ in range(needed_samples)
        )

    print("\n✅ Expanded dataset created:")
    print(f"  Original samples: {len(current_data)}")
    print(f"  Expanded samples: {len(expanded_data)}")
    print(f"  Target size: {target_size}")

    return expanded_data

# Build the expanded dataset (target: 1000 samples in total)
expanded_data = create_balanced_dataset(target_size=1000)

# Persist it next to the source data so the training cell can reload it
with open('data/expanded_journal_dataset.json', 'w') as f:
    f.write(json.dumps(expanded_data, indent=2))

print("✅ Expanded dataset saved to data/expanded_journal_dataset.json")

# Tally how many samples each emotion ended up with
emotion_counts = {}
for entry in expanded_data:
    emotion = entry['emotion']
    if emotion in emotion_counts:
        emotion_counts[emotion] += 1
    else:
        emotion_counts[emotion] = 1

# Report the distribution in alphabetical order of emotion label
print("\n📊 Expanded Dataset Analysis:")
print("=" * 40)
print("Emotion distribution:")
for emotion in sorted(emotion_counts):
    count = emotion_counts[emotion]
    print(f"  {emotion}: {count} samples")

print(f"\nTotal samples: {len(expanded_data)}")
print(f"Unique emotions: {len(emotion_counts)}")

## 🚀 Training with Expanded Dataset (GPU Optimized)

In [None]:
# Complete training script with expanded dataset and GPU optimizations
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
from torch.cuda.amp import autocast, GradScaler

class ExpandedEmotionDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One item per text
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label at ``idx`` as a dict of tensors."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so that each item carries
        # no leading batch axis of its own.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

class ExpandedEmotionClassifier(nn.Module):
    """Pretrained encoder followed by dropout and one linear layer.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes (width of the logits).
    """

    def __init__(self, model_name="bert-base-uncased", num_labels=12):
        super().__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # The head's input width follows the encoder's configured hidden size
        encoder_width = self.bert.config.hidden_size
        self.classifier = nn.Linear(encoder_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classification uses the encoder's pooled output, with dropout applied
        regularised = self.dropout(encoder_out.pooler_output)
        return self.classifier(regularised)

def prepare_expanded_data(data, test_size=0.2, val_size=0.1):
    """Encode labels and split the samples into train/validation/test sets.

    Args:
        data: Iterable of dicts with ``'content'`` (text) and ``'emotion'``
            (label) keys.
        test_size: Fraction of all samples held out for the test split.
        val_size: Fraction of all samples held out for the validation split.

    Returns:
        ``((X_train, y_train), (X_val, y_val), (X_test, y_test),
        label_encoder)`` where the ``y_*`` values are encoded labels and
        ``label_encoder`` is the fitted ``LabelEncoder``.
    """
    print("🔧 Preparing expanded data...")

    # Pull the text and label columns out of the records
    texts, emotions = [], []
    for record in data:
        texts.append(record['content'])
        emotions.append(record['emotion'])

    # Map emotion names to integer class ids
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(emotions)
    class_names = list(label_encoder.classes_)

    print(f"✅ Label encoder created with {len(class_names)} classes")
    print(f"📊 Classes: {class_names}")

    # First carve off the test split, stratified by label
    X_rest, X_test, y_rest, y_test = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels,
    )

    # val_size is a fraction of the full dataset, so rescale it relative to
    # what remains after the test split was removed
    val_fraction = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest,
        y_rest,
        test_size=val_fraction,
        random_state=42,
        stratify=y_rest,
    )

    print("📊 Data split:")
    for split_name, split_texts in (
        ("Training", X_train),
        ("Validation", X_val),
        ("Test", X_test),
    ):
        print(f"  {split_name}: {len(split_texts)} samples")

    return (X_train, y_train), (X_val, y_val), (X_test, y_test), label_encoder

def train_expanded_model(train_data, val_data, label_encoder, epochs=5, batch_size=16):
    """Train the model with expanded dataset and GPU optimizations.

    Args:
        train_data: ``(texts, labels)`` pair for training.
        val_data: ``(texts, labels)`` pair for validation.
        label_encoder: Fitted encoder; ``len(label_encoder.classes_)`` sets
            the number of output classes.
        epochs: Maximum number of epochs.
        batch_size: Batch size for both data loaders.

    Returns:
        ``(model, training_history, best_f1)``. ``model`` holds the weights
        of the last epoch that ran (the best weights are in the checkpoint
        file). ``training_history`` has one dict per completed epoch,
        including the epoch that triggered early stopping. ``best_f1`` is the
        best validation macro-F1 seen.

    Side effects:
        Writes ``best_expanded_model.pth`` to the working directory whenever
        the validation macro-F1 improves.
    """
    print("🚀 Training with expanded dataset...")

    # Setup
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(f"✅ Using device: {device}")

    # GPU optimizations
    if use_cuda:
        print("🔧 Applying GPU optimizations...")
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
        print(f"📊 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        # memory_allocated() reports memory in use, so label it as such
        print(f"📊 Allocated Memory: {torch.cuda.memory_allocated(0) / 1e9:.1f} GB")
        torch.cuda.empty_cache()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Create datasets
    X_train, y_train = train_data
    X_val, y_val = val_data

    train_dataset = ExpandedEmotionDataset(X_train, y_train, tokenizer)
    val_dataset = ExpandedEmotionDataset(X_val, y_val, tokenizer)

    # Pinned host memory only helps host-to-GPU copies, so skip it on CPU
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=use_cuda)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=use_cuda)

    # Initialize model
    model = ExpandedEmotionClassifier(num_labels=len(label_encoder.classes_))
    model.to(device)

    # Setup training with optimizations
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)
    criterion = nn.CrossEntropyLoss()
    # Mixed precision is a CUDA feature; disable it explicitly on CPU
    scaler = GradScaler(enabled=use_cuda)

    # Training loop
    best_f1 = 0.0
    training_history = []

    for epoch in range(epochs):
        print(f"\n🔄 Epoch {epoch + 1}/{epochs}")

        # Training
        model.train()
        total_loss = 0

        for i, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['labels'].to(device, non_blocking=True)

            optimizer.zero_grad()

            # Mixed precision training
            with autocast(enabled=use_cuda):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

            if i % 50 == 0:
                print(f"  Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}")

        # Validation
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device, non_blocking=True)
                attention_mask = batch['attention_mask'].to(device, non_blocking=True)
                labels = batch['labels'].to(device, non_blocking=True)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # Calculate metrics. Cast to built-in float so the returned history
        # and best_f1 contain only plain Python numbers.
        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        f1_macro = float(f1_score(all_labels, all_preds, average='macro'))
        accuracy = float(accuracy_score(all_labels, all_preds))

        print(f"📊 Epoch {epoch + 1} Results:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss: {avg_val_loss:.4f}")
        print(f"  Val F1 (Macro): {f1_macro:.4f}")
        print(f"  Val Accuracy: {accuracy:.4f}")

        # Record the epoch before any early exit so the history is complete
        training_history.append({
            'epoch': epoch,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'val_f1_macro': f1_macro,
            'val_accuracy': accuracy
        })

        # The plateau scheduler must see the metric every epoch; if it is
        # only stepped on improvements it never observes a plateau and never
        # lowers the learning rate.
        scheduler.step(f1_macro)

        # Early stopping check
        if epoch > 2 and f1_macro < best_f1 * 0.95:
            print("🛑 Early stopping triggered. F1 dropped below 95% of best.")
            break

        # Save best model
        if f1_macro > best_f1:
            best_f1 = f1_macro
            torch.save(model.state_dict(), 'best_expanded_model.pth')
            print(f"💾 New best model saved! F1: {best_f1:.4f}")

    return model, training_history, best_f1

# Reload the expanded dataset from disk
with open('data/expanded_journal_dataset.json', 'r') as f:
    expanded_data = json.loads(f.read())

print(f"📊 Loaded {len(expanded_data)} expanded samples")

# Encode labels and build the train / validation / test splits
# NOTE(review): test_data is produced here but no visible cell evaluates on
# it - confirm whether a held-out evaluation is intended.
train_data, val_data, test_data, label_encoder = prepare_expanded_data(expanded_data)

# Fine-tune with the default epoch count and batch size
model, training_history, best_f1 = train_expanded_model(
    train_data,
    val_data,
    label_encoder,
)

# NOTE(review): the success threshold below is 0.70, while the notebook
# header states a 75-85% target - confirm which one is intended.
print("\n🎉 Training completed!")
print(f"📊 Best F1 Score: {best_f1:.4f}")
print(f"🎯 Target Achieved: {best_f1 >= 0.70}")

## 🧪 Test the New Model

In [None]:
# Test the new model with sample entries
def test_new_model():
    """Test the new model with sample journal entries.

    Rebuilds the classifier, loads the weights saved in
    ``best_expanded_model.pth``, and prints the predicted emotion and the top
    three class probabilities for a fixed list of example sentences.

    Reads the module-level ``label_encoder`` for the number of classes and
    for mapping class ids back to emotion names. Returns nothing.
    """
    print("🧪 Testing new expanded model...")

    # Load best model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ExpandedEmotionClassifier(num_labels=len(label_encoder.classes_))
    # map_location lets a checkpoint saved from a GPU run load on a CPU-only
    # runtime (and vice versa) instead of failing while restoring tensors.
    model.load_state_dict(torch.load('best_expanded_model.pth', map_location=device))
    model.to(device)
    model.eval()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Sample test entries
    test_entries = [
        "I'm feeling really happy today! Everything is going well.",
        "I'm so frustrated with this project. Nothing is working.",
        "I feel anxious about the upcoming presentation.",
        "I'm grateful for all the support I've received.",
        "I'm feeling overwhelmed with all these tasks.",
        "I'm proud of what I've accomplished so far.",
        "I'm feeling sad and lonely today.",
        "I'm excited about the new opportunities ahead.",
        "I feel calm and peaceful right now.",
        "I'm hopeful that things will get better.",
        "I'm tired and need some rest.",
        "I'm content with how things are going."
    ]

    print("\n📊 Testing Results:")
    print("=" * 80)

    for i, text in enumerate(test_entries, 1):
        # Tokenize with the same settings the training dataset uses
        encoding = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )

        # Predict
        with torch.no_grad():
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(outputs, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][predicted_class].item()

        # Get emotion label
        emotion = label_encoder.inverse_transform([predicted_class])[0]

        print(f"\n{i}. Text: {text}")
        print(f"   Predicted: {emotion} (confidence: {confidence:.3f})")

        # Show top 3 predictions: argsort is ascending, so take the last
        # three indices and reverse them to get highest probability first
        all_probs = probabilities[0].cpu().numpy()
        top_indices = np.argsort(all_probs)[-3:][::-1]
        print("   Top 3 predictions:")
        for idx in top_indices:
            prob = all_probs[idx]
            emotion_name = label_encoder.inverse_transform([idx])[0]
            print(f"     - {emotion_name}: {prob:.3f}")

    print("\n✅ Model testing completed!")

# Test the new model
test_new_model()

## 💾 Download Results

In [None]:
# Download the trained model and results
from google.colab import files

print("📥 Downloading results...")

# Download model
files.download('best_expanded_model.pth')

# Save and download results.
# Values are coerced to built-in types before serialising: best_f1 may be a
# NumPy scalar (scikit-learn metrics can return them), and comparing a NumPy
# float yields numpy.bool_, which json.dump rejects with a TypeError.
results = {
    'best_f1': float(best_f1),
    'target_achieved': bool(best_f1 >= 0.70),
    'num_labels': len(label_encoder.classes_),
    'all_emotions': [str(name) for name in label_encoder.classes_],
    'training_history': training_history,
    'expanded_samples': len(expanded_data)
}

with open('expanded_training_results.json', 'w') as f:
    json.dump(results, f, indent=2)

files.download('expanded_training_results.json')

print("✅ Downloads completed!")
print(f"📊 Final F1 Score: {best_f1:.4f}")
print(f"🎯 Target Achieved: {best_f1 >= 0.70}")