# Star Wars Script Data Exploration

This notebook explores the structure of Star Wars script data, starting with "A New Hope" to understand:
1. Script formatting and structure
2. Character dialogue extraction patterns
3. Scene and context information
4. Data cleaning requirements for RAG pipeline


In [None]:
import os
import re
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Set up paths
project_root = Path('..')
data_dir = project_root / 'data' / 'raw'
script_file = data_dir / 'STAR WARS A NEW HOPE.txt'

print(f"Project root: {project_root.absolute()}")
print(f"Data directory: {data_dir.absolute()}")
print(f"Script file exists: {script_file.exists()}")


## 1. Load and Examine Raw Script Structure


In [None]:
# Load the script file
with open(script_file, 'r', encoding='utf-8') as f:
    script_text = f.read()

print(f"Total characters: {len(script_text):,}")
print(f"Total lines: {len(script_text.splitlines()):,}")
print("\n" + "="*50)
print("FIRST 1000 CHARACTERS:")
print("="*50)
print(script_text[:1000])


In [None]:
# Split into lines for analysis
lines = script_text.splitlines()

print(f"Total lines: {len(lines)}")
print("\n" + "="*50)
print("SAMPLE LINES (50-100):")
print("="*50)
for i, line in enumerate(lines[50:100], 50):
    print(f"{i:3}: {line[:80]}{'...' if len(line) > 80 else ''}")


## 2. Identify Character Dialogue Patterns


In [None]:
# Look for character name patterns
# Character names are typically in ALL CAPS followed by dialogue

# Pattern: Character name at start of line (ALL CAPS)
character_pattern = re.compile(r'^([A-Z][A-Z\s\-\']+)\s+(.+)$')

# Find potential character lines
potential_dialogue = []
for i, line in enumerate(lines):
    line = line.strip()
    if line and character_pattern.match(line):
        match = character_pattern.match(line)
        character = match.group(1).strip()
        dialogue = match.group(2).strip()
        
        # Filter out obvious non-character lines
        if (len(character) <= 30 and  # Reasonable character name length
            not character.startswith('INT.') and
            not character.startswith('EXT.') and
            not character.endswith('.') and
            len(dialogue) > 10):  # Reasonable dialogue length
            
            potential_dialogue.append({
                'line_num': i,
                'character': character,
                'dialogue': dialogue
            })

print(f"Found {len(potential_dialogue)} potential dialogue lines")
print("\n" + "="*50)
print("SAMPLE DIALOGUE EXTRACTIONS:")
print("="*50)
for i, item in enumerate(potential_dialogue[:10]):
    print(f"{i+1:2}. {item['character']:15} | {item['dialogue'][:60]}{'...' if len(item['dialogue']) > 60 else ''}")


In [None]:
# Analyze character frequency
character_counts = {}
for item in potential_dialogue:
    char = item['character']
    character_counts[char] = character_counts.get(char, 0) + 1

# Sort by frequency
sorted_characters = sorted(character_counts.items(), key=lambda x: x[1], reverse=True)

print("TOP 20 CHARACTERS BY DIALOGUE COUNT:")
print("="*40)
for char, count in sorted_characters[:20]:
    print(f"{char:20} | {count:3} lines")


## 3. Advanced Dialogue Extraction Function


In [None]:
def extract_dialogue_lines(script_text):
    """Extract character dialogue from Star Wars script text"""
    lines = script_text.splitlines()
    dialogue_data = []
    current_scene = "Unknown"
    
    for i, line in enumerate(lines):
        line = line.strip()
        
        # Track scene changes
        if line.startswith('INT.') or line.startswith('EXT.'):
            current_scene = line[:50]  # Truncate long scene descriptions
            continue
            
        # Skip empty lines
        if not line:
            continue
            
        # Look for character dialogue pattern
        char_match = re.match(r'^([A-Z][A-Z\s\-\']+?)\s+(.+)$', line)
        
        if char_match:
            character = char_match.group(1).strip()
            dialogue = char_match.group(2).strip()
            
            # Filter criteria for valid character lines
            if (len(character) <= 25 and  # Reasonable name length
                len(dialogue) >= 5 and   # Minimum dialogue length
                not character.endswith('.') and  # Not scene direction
                not character.startswith('FADE') and  # Not script direction
                not character.startswith('CUT')):   # Not script direction
                
                dialogue_data.append({
                    'line_number': i + 1,
                    'scene': current_scene,
                    'character': character,
                    'dialogue': dialogue,
                    'movie': 'A New Hope'
                })
    
    return dialogue_data

# Extract dialogue
dialogue_data = extract_dialogue_lines(script_text)
print(f"Extracted {len(dialogue_data)} dialogue lines")

# Convert to DataFrame for easier analysis
df = pd.DataFrame(dialogue_data)
print(f"\nDataFrame shape: {df.shape}")
print("\nFirst 5 rows:")
df.head()


## 4. Character Analysis and Name Normalization


In [None]:
# Character name normalization mapping
character_mapping = {
    'THREEPIO': 'C-3PO',
    'C-3PO': 'C-3PO',
    'SEE-THREEPIO': 'C-3PO',
    'ARTOO': 'R2-D2',
    'R2-D2': 'R2-D2',
    'ARTOO-DETOO': 'R2-D2',
    'LUKE': 'Luke Skywalker',
    'LEIA': 'Princess Leia',
    'PRINCESS LEIA': 'Princess Leia',
    'HAN': 'Han Solo',
    'HAN SOLO': 'Han Solo',
    'BEN': 'Obi-Wan Kenobi',
    'OBI-WAN': 'Obi-Wan Kenobi',
    'VADER': 'Darth Vader',
    'DARTH VADER': 'Darth Vader',
    'CHEWBACCA': 'Chewbacca',
    'CHEWIE': 'Chewbacca'
}

def normalize_character_name(name):
    """Normalize character names to consistent format"""
    name = name.strip().upper()
    return character_mapping.get(name, name.title())

# Apply normalization
df['character_normalized'] = df['character'].apply(normalize_character_name)

# Analyze character distribution
print("CHARACTER DISTRIBUTION (after normalization):")
print("="*45)
character_stats = df['character_normalized'].value_counts()
print(character_stats.head(15))


## 5. Data Quality Analysis and Cleaning


In [None]:
def clean_dialogue_for_rag(df):
    """Clean dialogue data for RAG pipeline"""
    clean_df = df.copy()
    
    # Add word count and character length
    clean_df['word_count'] = clean_df['dialogue'].str.split().str.len()
    clean_df['char_length'] = clean_df['dialogue'].str.len()
    
    # Filter criteria for high-quality dialogue
    quality_mask = (
        (clean_df['word_count'] >= 3) &  # At least 3 words
        (clean_df['word_count'] <= 40) &  # Not too long
        (clean_df['char_length'] >= 10) &  # At least 10 characters
        (~clean_df['dialogue'].str.contains(r'^\(.*\)$', regex=True))  # Not just parenthetical
    )
    
    clean_df = clean_df[quality_mask].copy()
    
    # Clean dialogue text - remove parenthetical directions
    clean_df['dialogue_clean'] = clean_df['dialogue'].apply(lambda x: 
        re.sub(r'\([^)]*\)', '', x)  # Remove parenthetical directions
        .strip()
        .replace('  ', ' ')  # Remove double spaces
    )
    
    # Remove lines where cleaning left very little content
    clean_df = clean_df[clean_df['dialogue_clean'].str.len() > 5]
    
    # Focus on characters with sufficient dialogue (5+ lines)
    char_counts = clean_df['character_normalized'].value_counts()
    main_chars = char_counts[char_counts >= 5].index.tolist()
    clean_df = clean_df[clean_df['character_normalized'].isin(main_chars)]
    
    return clean_df

# Create clean dataset
clean_df = clean_dialogue_for_rag(df)

print(f"Original dataset: {len(df)} lines")
print(f"Clean dataset: {len(clean_df)} lines")
print(f"Reduction: {(1 - len(clean_df)/len(df))*100:.1f}%")

print("\nCLEAN DATASET - TOP 10 CHARACTERS:")
print("="*40)
print(clean_df['character_normalized'].value_counts().head(10))


In [None]:
# Sample dialogue from main characters
main_characters = ['Luke Skywalker', 'Princess Leia', 'Han Solo', 'C-3PO', 'Darth Vader', 'Obi-Wan Kenobi']

print("SAMPLE DIALOGUE FROM MAIN CHARACTERS:")
print("="*50)

for char in main_characters:
    char_dialogue = clean_df[clean_df['character_normalized'] == char]
    if len(char_dialogue) > 0:
        print(f"\n{char.upper()} ({len(char_dialogue)} lines):")
        print("-" * (len(char) + 20))
        # Show 3 sample lines
        for i, (_, row) in enumerate(char_dialogue.head(3).iterrows()):
            dialogue_text = row['dialogue_clean']
            print(f"{i+1}. {dialogue_text[:80]}{'...' if len(dialogue_text) > 80 else ''}")
            
print(f"\nTotal characters with 5+ lines: {len(clean_df['character_normalized'].unique())}")


In [None]:
# Save clean dataset
output_dir = project_root / 'data' / 'processed'
output_dir.mkdir(exist_ok=True)

output_file = output_dir / 'a_new_hope_dialogue.csv'
clean_df.to_csv(output_file, index=False)

print(f"Clean dataset saved to: {output_file}")
print(f"Final dataset shape: {clean_df.shape}")

# Display final structure
print("\nFINAL DATASET COLUMNS:")
print("="*30)
for col in clean_df.columns:
    print(f"- {col}")
    
print(f"\nDataset ready for embedding and RAG pipeline!")
print(f"Total dialogue lines: {len(clean_df)}")
print(f"Unique characters: {len(clean_df['character_normalized'].unique())}")


## Summary and Next Steps

### Data Quality Assessment:
- Successfully extracted character dialogue from A New Hope script
- Identified main characters and normalized their names
- Cleaned data by removing stage directions and filtering low-quality lines
- Created a structured dataset ready for embedding

### Key Findings:
- Script format is consistent and parseable
- Main characters have sufficient dialogue for RAG
- Data quality is good after cleaning

### Ready for Next Steps:
1. ✅ **Data exploration complete**
2. 🔄 **Next**: Set up embedding model (sentence-transformers)
3. 🔄 **Next**: Create PostgreSQL + pgvector database schema
4. 🔄 **Next**: Build retrieval system prototype
5. 🔄 **Next**: Test character-specific responses
