# Data Exploration for Causal Ad Impact Analysis

This notebook explores a simulated ad clickstream dataset (generated with `simulate_ad_data`) to understand:
- Data quality and completeness
- Treatment assignment patterns
- Outcome distributions
- Potential confounders

In [None]:
import sys
from pathlib import Path

sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.data.data_loader import simulate_ad_data, load_benchmark_dataset
from src.data.preprocessing import balance_check

# Plot appearance: apply the matplotlib 'seaborn-v0_8' style sheet first,
# then set seaborn's "husl" palette (order kept as in the original setup).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Settings passed to simulate_ad_data, named so they are easy to find and tune.
N_SAMPLES = 15000
SEED = 42

# Build the dataset, report its dimensions, and preview the first rows
# (the bare last expression gives the notebook's rich table display).
df = simulate_ad_data(n_samples=N_SAMPLES, seed=SEED)
print(f"Dataset shape: {df.shape}")
df.head()

## Data Quality Assessment

In [None]:
# Headline numbers: size, completeness, arm sizes, and per-arm conversion.
n_records = len(df)

print("=== DATASET OVERVIEW ===")
print(f"Total records: {n_records:,}")
print(f"Features: {df.shape[1]}")
print(f"Missing values: {df.isnull().sum().sum()}")

# Size and share of each arm; value_counts is indexed by the treatment value.
print("\n=== TREATMENT ASSIGNMENT ===")
treatment_dist = df['treatment'].value_counts()
for arm_label, arm_value in (("Control", 0), ("Treatment", 1)):
    arm_count = treatment_dist[arm_value]
    print(f"{arm_label} ({arm_value}): {arm_count:,} ({arm_count/n_records*100:.1f}%)")

# Count, mean and standard deviation of the conversion outcome per arm.
print("\n=== OUTCOME DISTRIBUTION ===")
conversion_by_treatment = df.groupby('treatment')['conversion'].agg(['count', 'mean', 'std'])
print(conversion_by_treatment)

# Naive treatment effect: raw difference in mean conversion between the arms,
# with no adjustment for how treatment was assigned.
treated_rows = df['treatment'] == 1
control_rows = df['treatment'] == 0
naive_effect = df.loc[treated_rows, 'conversion'].mean() - df.loc[control_rows, 'conversion'].mean()
print(f"\nNaive treatment effect: {naive_effect:.4f}")

# NOTE(review): only the first row's true_effect is reported, which presumes
# the column holds one constant value for every row - confirm against
# simulate_ad_data.
first_row_true_effect = df['true_effect'].iloc[0]
print(f"True treatment effect: {first_row_true_effect:.4f}")

## Exploratory Visualizations

In [None]:
# Six-panel overview: covariate distributions by arm, conversion by age group,
# the propensity score distribution, and pairwise correlations.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Ad Clickstream Data Exploration', fontsize=16)

# (treatment value, legend label, colour) for the overlaid histograms.
arm_styles = [(0, 'Control', 'blue'), (1, 'Treatment', 'red')]

# 1-2. Age and income distributions by treatment (same recipe for both panels)
for ax, column, axis_label in ((axes[0, 0], 'age', 'Age'), (axes[0, 1], 'income', 'Income')):
    for arm, arm_label, arm_color in arm_styles:
        df.loc[df['treatment'] == arm, column].hist(
            alpha=0.7, bins=30, ax=ax, label=arm_label, color=arm_color
        )
    ax.set_title(f'{axis_label} Distribution by Treatment Group')
    ax.set_xlabel(axis_label)
    ax.legend()

# 3. Website visits by treatment: one bar cluster per arm, one bar per
# distinct website_visits value.
website_visits_summary = df.groupby(['treatment', 'website_visits']).size().unstack(fill_value=0)
website_visits_summary.plot(kind='bar', ax=axes[0,2], alpha=0.8)
axes[0,2].set_title('Website Visits by Treatment')
axes[0,2].set_xlabel('Treatment Group')

# 4. Conversion rate by age groups
# The top bin is open-ended (np.inf) so ages above 100 land in '50+' instead of
# becoming NaN. This adds the 'age_group' column to df, which later cells use.
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 50, np.inf], labels=['18-25', '26-35', '36-50', '50+'])
# observed=False is passed explicitly: it keeps every age group in the result
# (empty ones as NaN) and avoids pandas' FutureWarning about the default for
# categorical groupers changing.
conv_by_age = df.groupby(['age_group', 'treatment'], observed=False)['conversion'].mean().unstack()
conv_by_age.plot(kind='bar', ax=axes[1,0], alpha=0.8)
axes[1,0].set_title('Conversion Rate by Age Group')
axes[1,0].set_ylabel('Conversion Rate')

# 5. Propensity score distribution (from the dataset's 'true_propensity' column)
df['true_propensity'].hist(bins=30, ax=axes[1,1], alpha=0.7, color='green')
axes[1,1].set_title('True Propensity Score Distribution')
axes[1,1].set_xlabel('Propensity Score')

# 6. Correlation heatmap over the numeric covariates, treatment and outcome
corr_cols = ['age', 'income', 'website_visits', 'past_purchases', 'treatment', 'conversion']
corr_matrix = df[corr_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[1,2])
axes[1,2].set_title('Feature Correlation Matrix')

plt.tight_layout()
plt.show()

## Covariate Balance Analysis

In [None]:
# |SMD| cut-off drawn on the chart and used for bar colours, plus the number
# of covariates shown.
SMD_THRESHOLD = 0.1
TOP_N_VARIABLES = 8

# Check covariate balance between the treatment arms.
balance_stats = balance_check(df, 'treatment')

print("=== COVARIATE BALANCE ASSESSMENT ===")
print(f"Variables with significant imbalance (|SMD| > {SMD_THRESHOLD}):")
# NOTE(review): the 'imbalanced' flag is produced by balance_check; presumably
# it uses the same 0.1 cut-off as SMD_THRESHOLD - confirm.
imbalanced = balance_stats[balance_stats['imbalanced'] == True]
for _, row in imbalanced.iterrows():
    print(f"  {row['variable']}: SMD = {row['standardized_mean_diff']:.3f}")

print(f"\nTotal imbalanced variables: {len(imbalanced)} out of {len(balance_stats)}")

# Visualize balance
# Rank by absolute SMD so the chart shows the most imbalanced covariates,
# not just the first rows balance_check happens to return. The stable sort
# keeps the original order among ties; NaN SMDs sort last.
top_balance = balance_stats.sort_values(
    'standardized_mean_diff',
    key=lambda smd_col: smd_col.abs(),
    ascending=False,
    kind='stable',
).head(TOP_N_VARIABLES)

plt.figure(figsize=(12, 6))
variables = top_balance['variable']  # Top variables by |SMD|
smds = top_balance['standardized_mean_diff']

# Red bars exceed the threshold in either direction; green bars are within it.
colors = ['red' if abs(smd) > SMD_THRESHOLD else 'green' for smd in smds]
bars = plt.barh(variables, smds, color=colors, alpha=0.7)

plt.axvline(x=SMD_THRESHOLD, color='red', linestyle='--', alpha=0.7, label='Imbalance Threshold')
plt.axvline(x=-SMD_THRESHOLD, color='red', linestyle='--', alpha=0.7)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.5)

plt.xlabel('Standardized Mean Difference')
plt.title('Covariate Balance Before Matching')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Treatment Assignment Mechanism

In [None]:
# Analyze treatment assignment patterns: share of treated users within slices
# of age, income and website visits.
# Relies on df['age_group'], which the exploratory visualization cell creates.
print("=== TREATMENT ASSIGNMENT ANALYSIS ===")

# Treatment probability by characteristics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# By age groups
# observed=False is explicit because age_group is categorical: it keeps every
# group in the result and avoids pandas' FutureWarning about the default.
treat_by_age = df.groupby('age_group', observed=False)['treatment'].mean()
treat_by_age.plot(kind='bar', ax=axes[0], alpha=0.8, color='skyblue')
axes[0].set_title('Treatment Probability by Age Group')
axes[0].set_ylabel('Treatment Probability')
axes[0].tick_params(axis='x', rotation=45)

# By income quartiles (adds the 'income_quartile' column to df)
df['income_quartile'] = pd.qcut(df['income'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
treat_by_income = df.groupby('income_quartile', observed=False)['treatment'].mean()
treat_by_income.plot(kind='bar', ax=axes[1], alpha=0.8, color='lightcoral')
axes[1].set_title('Treatment Probability by Income Quartile')
axes[1].set_ylabel('Treatment Probability')
axes[1].tick_params(axis='x', rotation=45)

# By website visits
treat_by_visits = df.groupby('website_visits')['treatment'].mean()
treat_by_visits.plot(ax=axes[2], marker='o', alpha=0.8, color='green')
axes[2].set_title('Treatment Probability by Website Visits')
axes[2].set_xlabel('Website Visits')
axes[2].set_ylabel('Treatment Probability')

plt.tight_layout()
plt.show()

# Correlation between age-group order (1 = youngest) and treatment share.
# Series.corr aligns on index labels, so the rank series must carry
# treat_by_age's own index; a default 0..n-1 index shares no labels with the
# age-group labels and would always yield NaN.
age_rank = pd.Series(np.arange(1, len(treat_by_age) + 1), index=treat_by_age.index)
age_trend_corr = treat_by_age.corr(age_rank)

# Summary statistics
print("\nTreatment assignment is clearly non-random!")
print("Key patterns observed:")
print(f"- Older users more likely to receive treatment: {age_trend_corr:.3f} correlation")
print("- Higher income users more likely to receive treatment")
print("- Users with more website visits more likely to receive treatment")

## Key Insights for Causal Analysis

In [None]:
# Final summary. The figures quoted below are taken from the data rather than
# hard-coded, so the text cannot contradict what the notebook computed.
# Relies on naive_effect and imbalanced from earlier cells.

# NOTE(review): uses the first row's true_effect, presuming the column is
# constant across rows - confirm against simulate_ad_data.
reported_true_effect = df['true_effect'].iloc[0]
if naive_effect > reported_true_effect:
    bias_verb = "overstates"
elif naive_effect < reported_true_effect:
    bias_verb = "understates"
else:
    bias_verb = "matches"

# Recounted here because columns derived in earlier cells (e.g. binned age)
# can introduce NaNs that the raw data did not have.
total_missing = int(df.isnull().sum().sum())

print("=== KEY INSIGHTS FOR CAUSAL ANALYSIS ===")
print()
print("1. SELECTION BIAS DETECTED:")
print("   - Treatment assignment is biased toward older, higher-income users")
print(f"   - Naive estimate ({naive_effect:.4f}) likely {bias_verb} true effect ({reported_true_effect:.4f})")
print()
print("2. CONFOUNDING VARIABLES IDENTIFIED:")
print("   - Age, income, website_visits, past_purchases all affect both treatment and outcome")
print(f"   - {len(imbalanced)} variables show significant imbalance")
print()
print("3. DATA QUALITY:")
if total_missing == 0:
    print("   - No missing values")
else:
    print(f"   - {total_missing:,} missing values detected")
print(f"   - Sufficient sample size ({len(df):,} observations)")
print(f"   - Good treatment/control balance ({df['treatment'].mean():.1%} treated)")
print()
print("4. RECOMMENDED APPROACH:")
print("   - Propensity Score Matching to address selection bias")
print("   - Doubly Robust Estimation for robustness")
print("   - Causal Forest for heterogeneous effects")
print("   - Comprehensive robustness checks")

# Save preprocessed data for next notebooks
# Create the target directory first so the save works on a fresh checkout.
output_path = Path('../data/processed/explored_ad_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("\n✅ Preprocessed data saved for next analysis steps")