# 🚀 FIXED COMBINED TRAINING - JOURNAL + CMU-MOSEI

**Target: 75-85% F1 Score**  
**Current: 67% F1 Score**  
**Strategy: Combine high-quality datasets**

This notebook combines:
- Original 150 high-quality journal samples
- CMU-MOSEI samples for diversity
- Optimized hyperparameters for 75-85% F1

**FIXED**: Journal entries are now read from their `content` field (CMU-MOSEI entries still use `text`).

In [None]:
# Install dependencies
# Shell escape that installs the third-party packages imported by the
# import cell (transformers, torch, scikit-learn, pandas, numpy).
# NOTE(review): versions are unpinned; the TrainingArguments cell passes
# `eval_strategy`, which presumably requires a recent transformers release — confirm.
!pip install transformers torch scikit-learn pandas numpy
# Printed unconditionally: pip's exit status is not checked.
print("✅ All dependencies installed!")

In [None]:
# Clone repository
!git clone https://github.com/uelkerd/SAMO--DL.git
print("📂 Repository cloned successfully!")

In [None]:
# Import libraries
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

print("✅ All libraries imported!")

In [None]:
# FIXED: Load combined dataset with correct field names
from collections import Counter


def load_emotion_samples(path, text_field):
    """Load one JSON dataset and normalise it to text/emotion records.

    Args:
        path: Location of a JSON file holding a list of objects.
        text_field: Key that stores the sample text in each object
            ('content' for the journal data, 'text' for CMU-MOSEI).

    Returns:
        A new list of ``{'text': ..., 'emotion': ...}`` dicts, one per record.

    Raises:
        OSError: The file cannot be opened.
        ValueError: The file is not valid UTF-8 or not valid JSON.
        KeyError: A record lacks ``text_field`` or 'emotion'.
        TypeError: A record does not support key lookup.
    """
    # Explicit UTF-8: the platform default encoding is not guaranteed to be
    # UTF-8, and the samples are free-form text.
    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # Build the complete list before returning so one malformed record cannot
    # leave the caller with a half-loaded dataset.
    return [
        {'text': record[text_field], 'emotion': record['emotion']}
        for record in records
    ]


print("📊 Loading combined dataset...")

combined_samples = []

# (display name, file path, key holding the text) for every source to merge.
data_sources = [
    ('journal', '/content/SAMO--DL/data/journal_test_dataset.json', 'content'),
    ('CMU-MOSEI', '/content/SAMO--DL/data/cmu_mosei_balanced_dataset.json', 'text'),
]

for source_name, source_path, text_field in data_sources:
    # Best effort per source: a missing or malformed file is reported and
    # skipped so the remaining sources (or the fallback cell) can still run.
    try:
        source_samples = load_emotion_samples(source_path, text_field)
    except (OSError, ValueError, KeyError, TypeError) as e:
        print(f"⚠️ Could not load {source_name} data: {e}")
    else:
        combined_samples.extend(source_samples)
        print(f"✅ Loaded {len(source_samples)} {source_name} samples")

print(f"📊 Total combined samples: {len(combined_samples)}")

# Show emotion distribution
if combined_samples:
    emotion_counts = Counter(sample['emotion'] for sample in combined_samples)

    print("📊 Emotion distribution:")
    for emotion, count in sorted(emotion_counts.items()):
        print(f"  {emotion}: {count} samples")
else:
    print("❌ No data loaded! Check file paths.")

In [None]:
# Check if we have data
if not combined_samples:
    print("❌ No data loaded! Creating fallback dataset...")

    # Minimal stand-in corpus: one hand-written sentence per emotion label,
    # kept as (sentence, label) pairs and expanded into sample dicts below.
    fallback_pairs = [
        ("I'm feeling happy today!", "happy"),
        ("I'm so frustrated with this project.", "frustrated"),
        ("I feel anxious about the presentation.", "anxious"),
        ("I'm grateful for all the support.", "grateful"),
        ("I'm feeling overwhelmed with tasks.", "overwhelmed"),
        ("I'm proud of what I accomplished.", "proud"),
        ("I'm feeling sad and lonely.", "sad"),
        ("I'm excited about new opportunities.", "excited"),
        ("I feel calm and peaceful.", "calm"),
        ("I'm hopeful things will get better.", "hopeful"),
        ("I'm tired and need rest.", "tired"),
        ("I'm content with how things are.", "content"),
    ]
    combined_samples = [
        {"text": sentence, "emotion": label}
        for sentence, label in fallback_pairs
    ]
    print(f"✅ Created {len(combined_samples)} fallback samples")

print(f"📊 Final dataset size: {len(combined_samples)} samples")

In [None]:
# Custom dataset class
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of samples; each is passed through ``str``.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus truncation/padding
            keyword arguments; its result must expose 'input_ids' and
            'attention_mask' tensors.
        max_length: Length every sample is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of 1-D tensors."""
        tokenized = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; collapse each
        # tensor to one dimension so the collator can stack items itself.
        item = {
            field: tokenized[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Prepare data
texts = [sample['text'] for sample in combined_samples]
emotions = [sample['emotion'] for sample in combined_samples]

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(emotions)

print(f"🎯 Number of labels: {len(label_encoder.classes_)}")
print(f"📊 Labels: {list(label_encoder.classes_)}")

# Split data
test_fraction = 0.2

# A stratified split raises ValueError when a class has a single sample or
# when either side of the split has fewer rows than there are classes (the
# 12-sample fallback dataset hits both). Detect that up front and degrade to
# a plain random split instead of crashing the notebook.
class_counts = np.bincount(labels)
n_test = int(np.ceil(test_fraction * len(labels)))
n_train = len(labels) - n_test
can_stratify = bool(
    class_counts.min() >= 2
    and n_test >= len(class_counts)
    and n_train >= len(class_counts)
)
if not can_stratify:
    print("⚠️ Too few samples per class for a stratified split; using a random split")

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=test_fraction,
    random_state=42,
    stratify=labels if can_stratify else None,
)

print(f"📈 Training samples: {len(train_texts)}")
print(f"🧪 Test samples: {len(test_labels)}")

In [None]:
# Load model and tokenizer
model_name = "bert-base-uncased"
n_classes = len(label_encoder.classes_)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classifier's output size follows the number of encoded emotion labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="single_label_classification",
    num_labels=n_classes,
)

print(f"✅ Model loaded: {model_name}")
print(f"📊 Number of classes: {n_classes}")

In [None]:
# Create datasets
# Both splits share the tokenizer and the dataset's default max_length.
train_dataset, test_dataset = (
    EmotionDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

print("✅ Datasets created")
train_size, test_size = len(train_dataset), len(test_dataset)
print(f"📈 Train dataset: {train_size} samples")
print(f"🧪 Test dataset: {test_size} samples")

In [None]:
# Define metrics function
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair of (per-class scores, reference labels). The scores
            are reduced to class ids with an argmax over axis 1.

    Returns:
        Dict with 'f1' (weighted average across classes) and 'accuracy'.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    return {
        'f1': f1_score(references, predicted_ids, average='weighted'),
        'accuracy': accuracy_score(references, predicted_ids),
    }

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./emotion_model_combined",
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Warm up over the first 10% of the run. A fixed 500-step warmup can be
    # longer than the whole run here (effective batch = 16 * 2 accumulation
    # steps), which would keep the learning rate from ever reaching its peak.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch. With step-based scheduling
    # every 100 optimizer steps, a short run may never evaluate, and then
    # early stopping and load_best_model_at_end have nothing to act on.
    # The two strategies must match for load_best_model_at_end.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep disk usage bounded; the best checkpoint is retained.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    dataloader_num_workers=2,
    remove_unused_columns=False,
    # The string "none" disables logging integrations; passing None does not
    # (depending on the transformers version it falls back to the default).
    report_to="none",
    learning_rate=2e-5,
    gradient_accumulation_steps=2,
)

print("✅ Training arguments configured")

In [None]:
# Create trainer
# Early stopping is configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Trainer(**trainer_kwargs)

print("✅ Trainer created with early stopping")

In [None]:
# Start training
# Print the run banner (target vs. current baseline) before kicking off.
for banner_line in (
    "🚀 Starting training...",
    "🎯 Target F1 Score: 75-85%",
    "📊 Current Best: 67%",
    "📈 Expected Improvement: 8-18%",
):
    print(banner_line)

trainer.train()

print("✅ Training completed!")

In [None]:
# Evaluate final model
print("📊 Evaluating final model...")
results = trainer.evaluate()

# Pull the score out once and compare it against the 0.75 target.
final_f1 = results['eval_f1']
target_status = '✅ YES!' if final_f1 >= 0.75 else '❌ Not yet'

print(f"🏆 Final F1 Score: {final_f1:.4f} ({final_f1*100:.2f}%)")
print(f"🎯 Target achieved: {target_status}")

# Save model
save_dir = "./emotion_model_final_combined"
trainer.save_model(save_dir)
print(f"💾 Model saved to {save_dir}")

In [None]:
# Test on sample texts
print("🧪 Testing on sample texts...")

# Named `sample_texts` so the held-out `test_texts` split is not overwritten;
# re-running the dataset cell afterwards would otherwise pair these five
# sentences with the real test labels.
sample_texts = [
    "I'm feeling really happy today!",
    "I'm so frustrated with this project.",
    "I feel anxious about the presentation.",
    "I'm grateful for all the support.",
    "I'm feeling overwhelmed with tasks."
]

model.eval()
# Training may have moved the model to an accelerator; inputs must live on
# the same device or the forward pass raises a device-mismatch error.
device = next(model.parameters()).device

with torch.no_grad():
    for i, text in enumerate(sample_texts, 1):
        # max_length mirrors the 128-token default used by EmotionDataset.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
        )
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_class].item()
        # Map the class id back to its emotion name.
        predicted_emotion = label_encoder.classes_[predicted_class]

        print(f"{i}. Text: {text}")
        print(f"   Predicted: {predicted_emotion} (confidence: {confidence:.3f})")
        print()

## 🎉 Training Complete!

**Results Summary:**
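- Final F1 Score: [See the evaluation cell output above]
- Target: 75-85%
- Improvement: [Final F1 Score minus the 67% baseline; the notebook does not print this, so compute it from the evaluation output]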

**Next Steps:**
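1. If F1 < 75%: Try different hyperparameters or more data
2. If F1 >= 75%: The target is met. The score is measured on the same split used for early stopping and best-checkpoint selection, so confirm it on unseen data before treating the model as ready for production.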
3. Download the saved model from `./emotion_model_final_combined`