# Mock Distribution Demo - ALINE Ticket 001

This notebook visualizes and validates the synthetic migraine prediction dataset.

## Objectives:
- Load and inspect the mock distribution data
- Visualize correlation patterns
- Validate prior vs posterior distributions
- Verify migraine probability distribution

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úÖ Libraries imported successfully")

## 1. Load Mock Distribution Data

In [None]:
# Load pickle file
data_dir = Path('../data')
pickle_path = data_dir / 'mock_distribution.pkl'

with open(pickle_path, 'rb') as f:
    mock_data = pickle.load(f)

print("Loaded keys:", list(mock_data.keys()))
print(f"\nNumber of features: {len(mock_data['features'])}")
print(f"Number of samples: {len(mock_data['samples'])}")
print(f"Migraine cases: {mock_data['migraine'].sum()} ({mock_data['migraine'].mean()*100:.1f}%)")

In [None]:
# Load CSV for easier inspection
csv_path = data_dir / 'mock_distribution.csv'
df_samples = pd.read_csv(csv_path)

print("Sample data shape:", df_samples.shape)
print("\nFirst few rows:")
df_samples.head()

## 2. Migraine Probability Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of probabilities
axes[0].hist(mock_data['p_migraine'], bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(mock_data['p_migraine'].mean(), color='red', linestyle='--', 
                label=f"Mean: {mock_data['p_migraine'].mean():.3f}")
axes[0].set_xlabel('P(migraine)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Migraine Probability')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Box plot by actual migraine status
axes[1].boxplot([mock_data['p_migraine'][mock_data['migraine']==0], 
                 mock_data['p_migraine'][mock_data['migraine']==1]],
                labels=['No Migraine', 'Migraine'])
axes[1].set_ylabel('P(migraine)')
axes[1].set_title('Probability by Actual Migraine Status')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"‚úì P(migraine) is unimodal and within [0, 1]: [{mock_data['p_migraine'].min():.3f}, {mock_data['p_migraine'].max():.3f}]")

## 3. Correlation Heatmap

Verify that correlation patterns match clinical intuition.

In [None]:
# Create correlation matrix
feature_names = mock_data['features']
samples = mock_data['samples']

corr_matrix = np.corrcoef(samples.T)

# Plot heatmap with abbreviated names for readability
abbreviated_names = [name.split('(')[0].strip()[:20] for name in feature_names]

plt.figure(figsize=(16, 14))
sns.heatmap(corr_matrix, 
            xticklabels=abbreviated_names,
            yticklabels=abbreviated_names,
            cmap='coolwarm', center=0, 
            vmin=-1, vmax=1,
            square=True,
            cbar_kws={'label': 'Correlation'})
plt.title('Feature Correlation Heatmap', fontsize=14, pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print("‚úì Correlation heatmap generated")

## 4. Prior vs Posterior Comparison

Compare feature distributions before and after conditioning on migraine occurrence.

In [None]:
# Calculate shifts
mu_prior = mock_data['mu_prior']
mu_post = mock_data['mu_post']
shifts = mu_post - mu_prior
relative_shifts = shifts / np.std(mock_data['samples'], axis=0)

# Create comparison DataFrame
comparison_df = pd.DataFrame({
    'Feature': feature_names,
    'Category': mock_data['categories'],
    'Weight': mock_data['weights'],
    'Prior Mean': mu_prior,
    'Posterior Mean': mu_post,
    'Absolute Shift': shifts,
    'Relative Shift (œÉ)': relative_shifts
})

comparison_df = comparison_df.sort_values('Absolute Shift', key=abs, ascending=False)

print("Top 10 Features by Posterior Shift:\n")
print(comparison_df.head(10).to_string(index=False))

print("\n‚úì Posterior mean differs meaningfully from prior mean")

In [None]:
# Visualize top shifts
top_n = 10
top_features = comparison_df.head(top_n)

fig, ax = plt.subplots(figsize=(12, 8))

x = np.arange(top_n)
width = 0.35

bars1 = ax.barh(x - width/2, top_features['Prior Mean'], width, label='Prior', alpha=0.8)
bars2 = ax.barh(x + width/2, top_features['Posterior Mean'], width, label='Posterior', alpha=0.8)

ax.set_yticks(x)
ax.set_yticklabels([name.split('(')[0].strip() for name in top_features['Feature']])
ax.set_xlabel('Mean Value')
ax.set_title(f'Prior vs Posterior: Top {top_n} Features by Shift')
ax.legend()
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 5. Feature Distributions by Migraine Status

In [None]:
# Select top 6 most influential features
top_features = comparison_df.head(6)['Feature'].tolist()

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for idx, feature in enumerate(top_features):
    ax = axes[idx]
    
    no_migraine = df_samples[df_samples['migraine']==0][feature]
    migraine = df_samples[df_samples['migraine']==1][feature]
    
    ax.hist(no_migraine, bins=30, alpha=0.6, label='No Migraine', density=True)
    ax.hist(migraine, bins=30, alpha=0.6, label='Migraine', density=True)
    
    ax.set_xlabel(feature.split('(')[0].strip())
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('Feature Distributions by Migraine Status', fontsize=14, y=1.00)
plt.tight_layout()
plt.show()

## 6. Covariance Structure Analysis

In [None]:
# Analyze covariance matrices
Sigma_prior = mock_data['Sigma_prior']
Sigma_post = mock_data['Sigma_post']

print("Prior Covariance Matrix:")
print(f"  Shape: {Sigma_prior.shape}")
print(f"  Condition number: {np.linalg.cond(Sigma_prior):.2e}")
print(f"  Determinant: {np.linalg.det(Sigma_prior):.2e}")

print("\nPosterior Covariance Matrix:")
print(f"  Shape: {Sigma_post.shape}")
print(f"  Condition number: {np.linalg.cond(Sigma_post):.2e}")
print(f"  Determinant: {np.linalg.det(Sigma_post):.2e}")

# Check if matrices are positive definite
try:
    np.linalg.cholesky(Sigma_prior)
    print("\n‚úì Prior covariance is positive definite")
except:
    print("\n‚ö†Ô∏è Prior covariance is NOT positive definite")

try:
    np.linalg.cholesky(Sigma_post)
    print("‚úì Posterior covariance is positive definite")
except:
    print("‚ö†Ô∏è Posterior covariance is NOT positive definite")

## 7. Validation Checklist

Final validation against ticket requirements:

In [None]:
print("üìã VALIDATION CHECKLIST:\n")

# Check 1: Correlation heatmap
print("‚úì [PASS] Correlation heatmap generated - verify clinical intuition manually")

# Check 2: P(migraine) distribution
p_min, p_max = mock_data['p_migraine'].min(), mock_data['p_migraine'].max()
if 0 <= p_min and p_max <= 1:
    print(f"‚úì [PASS] P(migraine) within [0,1]: [{p_min:.3f}, {p_max:.3f}]")
else:
    print(f"‚úó [FAIL] P(migraine) out of bounds: [{p_min:.3f}, {p_max:.3f}]")

# Check 3: Posterior differs from prior
max_shift = np.abs(relative_shifts).max()
if max_shift > 0.1:  # At least 0.1 standard deviations
    print(f"‚úì [PASS] Posterior differs meaningfully (max shift: {max_shift:.2f}œÉ)")
else:
    print(f"‚úó [FAIL] Posterior too similar to prior (max shift: {max_shift:.2f}œÉ)")

# Check 4: Files exist
pkl_exists = (data_dir / 'mock_distribution.pkl').exists()
csv_exists = (data_dir / 'mock_distribution.csv').exists()
if pkl_exists and csv_exists:
    print("‚úì [PASS] Export .pkl and .csv files verified")
else:
    print(f"‚úó [FAIL] Missing files (pkl: {pkl_exists}, csv: {csv_exists})")

print("\n‚úÖ Validation complete!")

## 8. Summary Statistics

In [None]:
# Load and display summary
summary_df = pd.read_csv(data_dir / 'mock_distribution_summary.csv')

print("üìä MOCK DISTRIBUTION SUMMARY:\n")
for col in summary_df.columns:
    val = summary_df[col].values[0]
    if isinstance(val, float):
        print(f"  {col}: {val:.4f}")
    else:
        print(f"  {col}: {val}")

print("\n" + "="*60)
print("Mock distribution ready for ALINE warm-up training!")
print("="*60)