# Phase 2: Stability Analysis Exploration

This notebook explores the stability analysis results and investigates issues.

## Key Questions
1. Why does Claude show 0.0 median cosine in summary but valid values in raw data?
2. Is the "lexically unstable but semantically stable" claim validated?
3. What's the token count vs variability correlation (S4)?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import ast

# Input/output locations, all relative to the notebook's own directory
TABLES_DIR = Path('../results/phase2/tables')
DATA_DIR = Path('../data')
PHASE1_DIR = Path('../results/phase1_downloaded')

# Plot appearance shared by every figure below
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## 1. Load Data

In [None]:
# Read the five Phase 2 result tables produced by the stability analysis
run_stability = pd.read_csv(TABLES_DIR / 'run_stability.csv')
prompt_sensitivity = pd.read_csv(TABLES_DIR / 'prompt_sensitivity.csv')
model_sensitivity = pd.read_csv(TABLES_DIR / 'model_sensitivity.csv')
retention = pd.read_csv(TABLES_DIR / 'retention.csv')
uncertainty = pd.read_csv(TABLES_DIR / 'uncertainty_dispersion.csv')

# Report the size of each table (the uncertainty table is counted in venues)
for table_label, table_df, row_unit in (
    ("Run stability", run_stability, "rows"),
    ("Prompt sensitivity", prompt_sensitivity, "rows"),
    ("Model sensitivity", model_sensitivity, "rows"),
    ("Retention", retention, "rows"),
    ("Uncertainty", uncertainty, "venues"),
):
    print(f"{table_label}: {len(table_df)} {row_unit}")

## 2. Investigate Claude Issue

The summary showed Claude with 0.0 median cosine, but raw data looks valid. Let's investigate.

In [None]:
# Isolate the Claude rows and summarise their cosine similarity
is_claude = run_stability['model_key'] == 'claude'
claude_data = run_stability[is_claude]
print(f"Claude rows: {len(claude_data)}")

claude_cosine_stats = claude_data['cosine_similarity'].describe()
print("\nCosine similarity stats:")
print(claude_cosine_stats)

In [None]:
# Check for zeros: rows whose cosine similarity is exactly 0
zero_cosine = claude_data[claude_data['cosine_similarity'] == 0]

if len(claude_data) == 0:
    # Without this guard the percentage below divides by zero whenever the
    # 'claude' filter matched nothing (e.g. a differently spelled model key).
    print("No Claude rows found in run_stability; check the 'model_key' values.")
else:
    print(f"Claude rows with cosine = 0: {len(zero_cosine)} ({100*len(zero_cosine)/len(claude_data):.1f}%)")

    # Check distribution: counts of rows in 10 equal-width cosine bins
    print(f"\nCosine value distribution:")
    print(claude_data['cosine_similarity'].value_counts(bins=10).sort_index())

In [None]:
# Show the first few zero-cosine rows (if any) together with their tag counts and Jaccard score
zero_cosine_columns = ['venue_id', 'prompt_type', 'n_tags_run1', 'n_tags_run2', 'jaccard_norm_eval']
if len(zero_cosine) > 0:
    zero_cosine_sample = zero_cosine[zero_cosine_columns].head(10)
    print("Sample of zero-cosine venues:")
    print(zero_cosine_sample)

In [None]:
# Per-model descriptive statistics of run-to-run cosine similarity
cosine_stat_names = ['count', 'mean', 'median', 'std', 'min', 'max']
cosine_by_model = run_stability.groupby('model_key')['cosine_similarity']

print("Cosine similarity by model:")
print(cosine_by_model.agg(cosine_stat_names))

## 3. Key Claim: Lexically Unstable but Semantically Stable

In [None]:
# Semantic gap = cosine similarity minus Jaccard; stored as a new column on run_stability
run_stability['semantic_gap'] = run_stability['cosine_similarity'].sub(
    run_stability['jaccard_norm_eval']
)

print("Semantic Gap (Cosine - Jaccard):")
print(run_stability['semantic_gap'].describe())

# Share of rows where cosine exceeds Jaccard
positive_gap_share = (run_stability['semantic_gap'] > 0).mean()
print(f"\n% with positive gap (cosine > jaccard): {100*positive_gap_share:.1f}%")

In [None]:
# Two panels: Jaccard-vs-cosine scatter (left) and semantic-gap histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes[0], axes[1]

# Draw each model's points and histogram in a single pass; the two axes are
# independent, so every axis still receives its artists in the same model order.
for model in run_stability['model_key'].unique():
    subset = run_stability[run_stability['model_key'] == model]
    ax1.scatter(subset['jaccard_norm_eval'], subset['cosine_similarity'],
                alpha=0.3, label=model, s=10)
    ax2.hist(subset['semantic_gap'], bins=30, alpha=0.5, label=model)

# Scatter panel: y=x reference line, labels, unit-square limits
ax1.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='y=x')
ax1.set(
    xlabel='Jaccard (Surface)',
    ylabel='Cosine (Semantic)',
    title='Surface vs Semantic Stability',
)
ax1.legend()
ax1.set(xlim=(0, 1), ylim=(0, 1))

# Histogram panel: vertical marker at a gap of zero
ax2.axvline(0, color='k', linestyle='--', alpha=0.5)
ax2.set(
    xlabel='Semantic Gap (Cosine - Jaccard)',
    ylabel='Count',
    title='Distribution of Semantic Gap',
)
ax2.legend()

plt.tight_layout()
plt.show()

In [None]:
# Median and mean of each stability metric for every (model, prompt) pair, rounded to 3 decimals
summary_metrics = ['cosine_similarity', 'jaccard_norm_eval', 'semantic_gap', 'mmc']
summary = (
    run_stability
    .groupby(['model_key', 'prompt_type'])
    .agg({metric: ['median', 'mean'] for metric in summary_metrics})
    .round(3)
)
print(summary)

## 4. S4: Sparsity Analysis (Token Count vs Variability)

In [None]:
# Read the venue table (it carries the review text used for the token counts below)
venues_path = DATA_DIR / 'study1_venues_20250117.csv'
venues_df = pd.read_csv(venues_path)

venue_columns = venues_df.columns.tolist()
print(f"Venues: {len(venues_df)}")
print(f"Columns: {venue_columns}")

In [None]:
# Parse google_reviews and compute token counts
def count_tokens(reviews_str):
    """Count total whitespace-separated words across all reviews for a venue.

    Args:
        reviews_str: String holding a Python literal; expected to be a list
            of review dicts, each with a 'text' entry.

    Returns:
        Total word count over every review dict whose 'text' is a string.
        Returns 0 when the input is not a string (e.g. NaN for a missing
        cell), cannot be parsed as a literal, or does not parse to a
        list/tuple.
    """
    # Missing cells arrive as float NaN; there is nothing to parse.
    if not isinstance(reviews_str, str):
        return 0
    try:
        reviews = ast.literal_eval(reviews_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only parse failures are treated as "no evidence"; unrelated errors
        # (and KeyboardInterrupt) are no longer swallowed by a bare except.
        return 0
    if not isinstance(reviews, (list, tuple)):
        return 0
    # Skip individual malformed reviews (not a dict, or non-string 'text')
    # instead of letting one bad entry zero out the whole venue.
    return sum(
        len(review['text'].split())
        for review in reviews
        if isinstance(review, dict) and isinstance(review.get('text'), str)
    )

def count_reviews(reviews_str):
    """Count number of reviews for a venue.

    Args:
        reviews_str: String holding a Python literal; expected to be a list
            of reviews.

    Returns:
        Number of entries in the parsed list/tuple. Returns 0 when the input
        is not a string (e.g. NaN for a missing cell), cannot be parsed as a
        literal, or parses to something other than a list/tuple.
    """
    # Missing cells arrive as float NaN; there is nothing to parse.
    if not isinstance(reviews_str, str):
        return 0
    try:
        reviews = ast.literal_eval(reviews_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only parse failures map to 0; a bare except would also hide
        # unrelated errors and KeyboardInterrupt.
        return 0
    # A parsed string or dict also has a len(), but it is not a review count.
    if not isinstance(reviews, (list, tuple)):
        return 0
    return len(reviews)

# Derive per-venue evidence measures from the raw google_reviews column
google_reviews = venues_df['google_reviews']
venues_df['total_tokens'] = google_reviews.apply(count_tokens)
venues_df['n_reviews'] = google_reviews.apply(count_reviews)

# Descriptive statistics for both measures
for stats_heading, stats_column in (
    ("Token count stats:", 'total_tokens'),
    ("\nReview count stats:", 'n_reviews'),
):
    print(stats_heading)
    print(venues_df[stats_column].describe())

In [None]:
# Attach each venue's evidence measures to its uncertainty row.
# Left join: every uncertainty row is kept; rows without a matching venue id get NaN.
venue_evidence = venues_df[['id', 'total_tokens', 'n_reviews']]
sparsity_df = pd.merge(
    uncertainty,
    venue_evidence,
    how='left',
    left_on='venue_id',
    right_on='id',
)

preview_columns = ['venue_id', 'total_tokens', 'n_reviews', 'mean_pairwise_distance']
print(f"Merged: {len(sparsity_df)} venues")
print(sparsity_df[preview_columns].head())

In [None]:
# Compute correlation: tokens vs variability.
# Series.corr defaults to Pearson and ignores pairs where either value is NaN.
corr_tokens = sparsity_df['total_tokens'].corr(sparsity_df['mean_pairwise_distance'])
corr_reviews = sparsity_df['n_reviews'].corr(sparsity_df['mean_pairwise_distance'])

print(f"Correlation (tokens vs variability): {corr_tokens:.3f}")
print(f"Correlation (n_reviews vs variability): {corr_reviews:.3f}")
# The arrow was mis-encoded (UTF-8 bytes read as cp1252); restored here.
print("\nExpected: NEGATIVE (more evidence → less variability)")

In [None]:
# Plot: Tokens vs Variability
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: raw scatter of evidence amount against variability
ax1 = axes[0]
ax1.scatter(sparsity_df['total_tokens'], sparsity_df['mean_pairwise_distance'], alpha=0.5)
ax1.set_xlabel('Total Tokens (Evidence Amount)')
ax1.set_ylabel('Mean Pairwise Distance (Variability)')
ax1.set_title(f'S4: Sparsity vs Variability (r={corr_tokens:.3f})')

# Right panel: variability distribution per token bucket
ax2 = axes[1]
# Bin by token count. right=False makes every bin left-closed ([0, 100),
# [100, 200), ...), so a venue with exactly 0 tokens lands in '<100' instead
# of falling outside the first bin and being dropped, and the bin edges
# agree with the '<100' / '100-200' labels.
sparsity_df['token_bucket'] = pd.cut(sparsity_df['total_tokens'],
                                      bins=[0, 100, 200, 300, 500, 1000, float('inf')],
                                      labels=['<100', '100-200', '200-300', '300-500', '500-1000', '>1000'],
                                      right=False)
sparsity_df.boxplot(column='mean_pairwise_distance', by='token_bucket', ax=ax2)
ax2.set_xlabel('Token Bucket')
ax2.set_ylabel('Mean Pairwise Distance (Variability)')
ax2.set_title('Variability by Evidence Amount')
# Clear the figure-level title that the grouped boxplot adds
plt.suptitle('')

plt.tight_layout()
plt.show()

## 5. Run Stability Distribution by Model

In [None]:
# Box plots of run stability grouped by model, one panel per metric
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (column, panel title, y-axis label) for each panel, left to right
stability_panels = [
    ('cosine_similarity', 'Cosine Similarity (Semantic)', 'Cosine Similarity'),
    ('jaccard_norm_eval', 'Jaccard (Surface)', 'Jaccard'),
    ('mmc', 'Mean Max Cosine (Paraphrase)', 'MMC'),
]

for panel_ax, (metric, panel_title, y_label) in zip(axes, stability_panels):
    run_stability.boxplot(column=metric, by='model_key', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Model')
    panel_ax.set_ylabel(y_label)
    # Clear the figure-level title after each grouped boxplot call
    plt.suptitle('')

plt.tight_layout()
plt.show()

## 6. Prompt Sensitivity

In [None]:
# Cosine and Jaccard statistics for every (model, prompt pair) combination, rounded to 3 decimals
prompt_agg_spec = {
    'cosine_similarity': ['median', 'mean', 'std'],
    'jaccard_norm_eval': ['median', 'mean'],
}
prompt_groups = prompt_sensitivity.groupby(['model_key', 'prompt1', 'prompt2'])
prompt_summary = prompt_groups.agg(prompt_agg_spec).round(3)
print(prompt_summary)

In [None]:
# One heatmap per model: median cosine similarity between each pair of prompts
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# axes.flat walks the 2x2 grid row by row, so models fill it left-to-right, top-to-bottom
for ax, model in zip(axes.flat, ['claude', 'gemini', 'grok', 'openai']):
    model_data = prompt_sensitivity[prompt_sensitivity['model_key'] == model]

    # prompt1 x prompt2 matrix of median cosine similarity
    pivot = pd.pivot_table(
        model_data,
        values='cosine_similarity',
        index='prompt1',
        columns='prompt2',
        aggfunc='median',
    )

    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='RdYlGn',
                vmin=0.8, vmax=1.0, ax=ax)
    ax.set_title(f'{model.upper()}: Prompt Sensitivity')

plt.tight_layout()
plt.show()

## 7. Model Sensitivity

In [None]:
# Cosine and Jaccard statistics for every (prompt, model pair) combination, rounded to 3 decimals
model_agg_spec = {
    'cosine_similarity': ['median', 'mean', 'std'],
    'jaccard_norm_eval': ['median', 'mean'],
}
model_groups = model_sensitivity.groupby(['prompt_type', 'model1', 'model2'])
model_summary = model_groups.agg(model_agg_spec).round(3)
print(model_summary)

In [None]:
# One heatmap per prompt type: median cosine similarity between each pair of models
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, prompt in zip(axes, ['minimal', 'anti_hallucination', 'short_phrase']):
    prompt_data = model_sensitivity[model_sensitivity['prompt_type'] == prompt]

    # model1 x model2 matrix of median cosine similarity
    pivot = pd.pivot_table(
        prompt_data,
        values='cosine_similarity',
        index='model1',
        columns='model2',
        aggfunc='median',
    )

    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='RdYlGn',
                vmin=0.6, vmax=1.0, ax=ax)
    ax.set_title(f'{prompt}: Model Sensitivity')

plt.tight_layout()
plt.show()

## 8. Retention Analysis

In [None]:
# Median and mean of each retention metric per (model, prompt) pair, rounded to 3 decimals
retention_metrics = ['retention_cosine', 'retention_random', 'delta_retention', 'z_score_random']
retention_summary = (
    retention
    .groupby(['model_key', 'prompt_type'])
    .agg({metric: ['median', 'mean'] for metric in retention_metrics})
    .round(3)
)
print(retention_summary)

In [None]:
# Grouped bars: median actual retention next to the median random baseline,
# one pair of bars per (model, prompt) row of retention_summary
fig, ax = plt.subplots(figsize=(10, 6))

width = 0.35
x = np.arange(len(retention_summary))

actual = retention_summary[('retention_cosine', 'median')].values
random = retention_summary[('retention_random', 'median')].values
labels = [f"{m}\n{p}" for (m, p) in retention_summary.index]

# Each series is shifted half a bar width to either side of the tick position
for bar_offset, bar_heights, bar_label, bar_color in (
    (-width/2, actual, 'Actual Retention', 'steelblue'),
    (width/2, random, 'Random Baseline', 'lightcoral'),
):
    ax.bar(x + bar_offset, bar_heights, width, label=bar_label, color=bar_color)

ax.set(ylabel='Cosine Similarity', title='Retention: Gentags vs Random Baseline')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.legend()
ax.set_ylim(0, 0.8)

plt.tight_layout()
plt.show()

## 9. Summary Table (Paper-Ready)

In [None]:
# Paper-ready table: one row per model, metrics formatted as strings
def _median_with_quartiles(values):
    """Format a series as 'median [Q1, Q3]' with three decimals."""
    lower, upper = values.quantile(0.25), values.quantile(0.75)
    return f"{values.median():.3f} [{lower:.3f}, {upper:.3f}]"


def _median_only(values):
    """Format a series' median with three decimals."""
    return f"{values.median():.3f}"


paper_columns = {
    'cosine_similarity': 'Cosine (median [IQR])',
    'jaccard_norm_eval': 'Jaccard (median [IQR])',
    'mmc': 'MMC (median [IQR])',
    'semantic_gap': 'Gap'
}
paper_formatters = {
    'cosine_similarity': _median_with_quartiles,
    'jaccard_norm_eval': _median_with_quartiles,
    'mmc': _median_with_quartiles,
    'semantic_gap': _median_only,
}
paper_summary = (
    run_stability
    .groupby('model_key')
    .agg(paper_formatters)
    .rename(columns=paper_columns)
)

banner_rule = "=" * 80
print("\n" + banner_rule)
print("PAPER-READY SUMMARY: Run Stability by Model")
print(banner_rule)
print(paper_summary.to_string())