# Persona-Consistent Chatbot: Setup & EDA

## Environment Setup and Exploratory Data Analysis

This notebook covers:
- Environment setup and package installation
- Dataset downloading and exploration
- Data quality analysis
- Persona trait analysis
- Visualizations

In [None]:
# Install required packages
!pip install -q transformers datasets peft trl accelerate wandb
!pip install -q rouge-score sacrebleu evaluate
!pip install -q matplotlib seaborn pandas numpy plotly

In [None]:
import sys
import os
# Put the parent directory on the import path; the setup-verification cell
# near the end of this notebook imports from `src.*`. The path is relative to
# the kernel's working directory — presumably the notebook's own folder;
# confirm if the notebook is launched from elsewhere.
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import json

## 1. Environment Setup

In [None]:
# Report the installed PyTorch build and every CUDA device visible to it.
import torch

cuda_ready = torch.cuda.is_available()
gpu_total = torch.cuda.device_count()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {gpu_total}")

# Per-device details are only queried when CUDA is usable.
if cuda_ready:
    for gpu_index in range(gpu_total):
        gpu_name = torch.cuda.get_device_name(gpu_index)
        gpu_memory_gb = torch.cuda.get_device_properties(gpu_index).total_memory / 1e9
        print(f"GPU {gpu_index}: {gpu_name}")
        print(f"  Memory: {gpu_memory_gb:.1f} GB")

## 2. Load and Explore Datasets

In [None]:
# Fetch both dialogue corpora and list how many examples each split holds.
print("Loading PersonaChat dataset...")
personachat = load_dataset("bavard/personachat_truecased")

print("\nDataset structure:")
for split_name in personachat:
    print(f"{split_name}: {len(personachat[split_name])} examples")

# Blended Skill Talk gets the same per-split size listing.
print("\nLoading Blended Skill Talk dataset...")
bst = load_dataset("blended_skill_talk")
for split_name in bst:
    print(f"{split_name}: {len(bst[split_name])} examples")

In [None]:
# Peek at the first training row to see which fields an example carries.
print("PersonaChat example:")
first_example = personachat['train'][0]
persona_field = first_example['personality']
history_field = first_example['history']
print(f"Personality traits: {persona_field}")
print(f"History: {history_field}")
print(f"\nFull example keys: {list(first_example)}")

## 3. Data Analysis and Visualizations

In [None]:
# Analyze persona traits distribution.
# Only the `personality` column is needed, so it is read once as a column
# instead of iterating over full examples twice (each such pass materializes
# every field of every row).
personalities = personachat['train']['personality']

# Single pass: collect every trait string and the trait count of each row.
all_traits = []
traits_per_example = []
for persona_traits in personalities:
    all_traits.extend(persona_traits)
    traits_per_example.append(len(persona_traits))

# NOTE(review): the average is taken over dataset rows; if several rows share
# one persona, that persona is counted once per row — confirm this is intended.
print(f"Total unique persona traits: {len(set(all_traits))}")
print(f"Average traits per persona: {np.mean(traits_per_example):.1f}")

In [None]:
# Bar chart of the 20 most frequent persona traits.
from collections import Counter

trait_counts = Counter(all_traits)
top_traits = trait_counts.most_common(20)

fig, ax = plt.subplots(figsize=(12, 6))
traits, counts = zip(*top_traits)  # split (trait, count) pairs into two tuples
bar_positions = range(len(traits))

ax.bar(bar_positions, counts)
# One tick per bar, labelled with the trait text and slanted for readability.
ax.set_xticks(bar_positions)
ax.set_xticklabels(traits, rotation=45, ha='right')
ax.set_title('Top 20 Persona Traits')
ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Analyze conversation lengths.
# The `history` column is read once instead of scanning full examples twice.
histories = list(personachat['train']['history'])

# NOTE(review): consecutive rows of this dataset appear to be successive steps
# of one dialogue, each repeating all earlier turns in `history` — confirm
# against the dataset card. Counting every row would then count early turns
# many times over, so a row is skipped when the next row's history simply
# extends it. Rows that are not extended by their successor are always kept,
# which leaves the statistics as before if rows turn out to be independent.
conversations = []
for row_idx, history in enumerate(histories):
    following = histories[row_idx + 1] if row_idx + 1 < len(histories) else None
    is_partial_snapshot = (
        following is not None
        and len(history) > 0
        and len(following) > len(history)
        and following[:len(history)] == history
    )
    if not is_partial_snapshot:
        conversations.append(history)

conversation_lengths = [len(history) for history in conversations]
response_lengths = [len(turn.split()) for history in conversations for turn in history]

# Left panel: turns per conversation. Right panel: words per turn.
fig, (ax_turns, ax_words) = plt.subplots(1, 2, figsize=(10, 4))

ax_turns.hist(conversation_lengths, bins=30, alpha=0.7)
ax_turns.set_title('Conversation Length Distribution')
ax_turns.set_xlabel('Number of turns')
ax_turns.set_ylabel('Frequency')

ax_words.hist(response_lengths, bins=30, alpha=0.7, color='orange')
ax_words.set_title('Response Length Distribution')
ax_words.set_xlabel('Words per response')
ax_words.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

print(f"Average conversation length: {np.mean(conversation_lengths):.1f} turns")
print(f"Average response length: {np.mean(response_lengths):.1f} words")

## 4. Data Quality Checks

In [None]:
# Check for data quality issues.
train_split = personachat['train']
total_examples = len(train_split)

# Read only the two columns being checked rather than iterating full rows.
empty_personas = sum(1 for persona in train_split['personality'] if not persona)
short_conversations = sum(1 for history in train_split['history'] if len(history) < 2)

# Avoid a ZeroDivisionError on an empty split; both counts are 0 in that case,
# so the reported percentages come out as 0.0%.
denominator = total_examples or 1

# NOTE(review): counts are per dataset row. If rows are partial snapshots of a
# dialogue, "<2 turns" flags the first step of each dialogue rather than
# genuinely short conversations — confirm against the dataset card.
print("Data Quality Report:")
print(f"Examples with empty personas: {empty_personas} ({empty_personas/denominator*100:.1f}%)")
print(f"Short conversations (<2 turns): {short_conversations} ({short_conversations/denominator*100:.1f}%)")
print(f"Total training examples: {total_examples}")

## 5. Sample Conversations

In [None]:
# Print the first three training rows as readable dialogues.
# NOTE(review): consecutive rows may belong to the same dialogue, in which
# case these samples overlap — confirm against the dataset card.
print("Sample Conversations from PersonaChat:")
print("=" * 50)

for sample_number in range(1, 4):
    sample = personachat['train'][sample_number - 1]
    print(f"\nExample {sample_number}:")
    persona_text = ', '.join(sample['personality'])
    print(f"Persona: {persona_text}")
    print("Conversation:")
    # Even positions are labelled "User", odd positions "Bot".
    for position, utterance in enumerate(sample['history']):
        speaker = ("User", "Bot")[position % 2]
        print(f"  {speaker}: {utterance}")
    print("-" * 30)

## 6. Setup Verification

In [None]:
# Verify all components are working.
# Each check records its name when it fails, so the closing line reports what
# actually happened instead of always announcing success.
print("Setup Verification:")
print("=" * 30)

failed_checks = []

# Test imports
try:
    from src.data.loader import DatasetLoader
    from src.utils.config import load_config
    print("✅ Source code imports working")
except ImportError as e:
    print(f"❌ Import error: {e}")
    failed_checks.append("imports")

# Test config loading
# (if the imports above failed, the resulting NameError is caught and reported here too)
try:
    config = load_config('../config/model.yaml')
    print("✅ Configuration loading working")
except Exception as e:
    print(f"❌ Config error: {e}")
    failed_checks.append("config")

# Test dataset loading
try:
    loader = DatasetLoader()
    data = loader.load_personachat()
    print(f"✅ Dataset loading working ({len(data)} examples)")
except Exception as e:
    print(f"❌ Dataset error: {e}")
    failed_checks.append("dataset")

# Only claim success when every check above passed.
if failed_checks:
    print(f"\nSetup incomplete: {len(failed_checks)} check(s) failed ({', '.join(failed_checks)}) ❌")
else:
    print("\nSetup completed successfully! ✅")

## Summary

This notebook has:
- ✅ Set up the environment and installed dependencies
- ✅ Loaded and explored the PersonaChat and BST datasets
- ✅ Analyzed persona traits and conversation patterns
- ✅ Verified data quality and sample conversations
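- ✅ Run the setup verification checks (source imports, config loading, dataset loading) — make sure every check in Section 6 reports ✅ before moving on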

Next: Proceed to `2_baseline_testing.ipynb` to test base models.