# Data Preprocessing for Causal Analysis

This notebook handles:
- Data cleaning and validation
- Feature engineering for causal inference
- Confounder selection
- Data preparation for modeling

In [None]:
import sys
from pathlib import Path

sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from src.data.data_loader import simulate_ad_data
from src.data.preprocessing import preprocess, create_treatment_control_split, balance_check
from src.data.feature_engineering import engineer_features, select_confounders

# Simulate the raw ad dataset; the seed is fixed so every run sees the same rows.
df_raw = simulate_ad_data(n_samples=12000, seed=42)

raw_shape = df_raw.shape
raw_columns = list(df_raw.columns)
print(f"Raw data shape: {raw_shape}")
print(f"Columns: {raw_columns}")

# Quick health check before any cleaning: nulls, exact duplicate rows,
# and how many rows fall into each treatment arm.
print("\n=== DATA QUALITY OVERVIEW ===")
total_missing = df_raw.isna().sum().sum()
print(f"Missing values: {total_missing}")
duplicate_rows = df_raw.duplicated().sum()
print(f"Duplicate rows: {duplicate_rows}")
treatment_counts = df_raw['treatment'].value_counts().to_dict()
print(f"Treatment distribution: {treatment_counts}")

## Step 1: Data Cleaning

In [None]:
# Run the project's cleaning routine on the raw frame (df_raw is left untouched).
df_clean = preprocess(df_raw)

rows_removed = len(df_raw) - len(df_clean)
print("=== CLEANING RESULTS ===")
print(f"Original shape: {df_raw.shape}")
print(f"Cleaned shape: {df_clean.shape}")
print(f"Records removed: {rows_removed}")

# 2x2 figure: top row overlays before/after histograms,
# bottom row shows box plots for spotting outliers.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Data Cleaning: Before vs After', fontsize=16)

histogram_panels = (
    (axes[0, 0], 'age', 'Age Distribution'),
    (axes[0, 1], 'income', 'Income Distribution'),
)
stages = (
    (df_raw, 'Before', 'red'),
    (df_clean, 'After', 'blue'),
)
for ax, column, title in histogram_panels:
    for frame, label, color in stages:
        ax.hist(frame[column], bins=30, alpha=0.7, label=label, color=color)
    ax.set_title(title)
    ax.legend()

boxplot_panels = (
    (axes[1, 0], df_raw, 'Before Cleaning'),
    (axes[1, 1], df_clean, 'After Cleaning'),
)
for ax, frame, title in boxplot_panels:
    frame[['age', 'income']].boxplot(ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## Step 2: Feature Engineering

In [None]:
# Derive additional features from the cleaned frame using the project helper.
df_features = engineer_features(df_clean)

print("=== FEATURE ENGINEERING RESULTS ===")
print(f"Original features: {df_clean.shape[1]}")
print(f"Engineered features: {df_features.shape[1]}")
print(f"New features added: {df_features.shape[1] - df_clean.shape[1]}")

# Columns that exist after engineering but were not in the cleaned frame.
new_features = [col for col in df_features.columns if col not in df_clean.columns]
print(f"\nNew features: {new_features}")

# Plot up to four of the new features: histogram for numeric columns,
# bar chart of value counts for everything else.
if new_features:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()
    features_to_plot = new_features[:4]

    for ax, feature in zip(axes, features_to_plot):
        column = df_features[feature]
        # Use pandas' dtype predicates rather than comparing against
        # 'int64'/'float64' so int32, float32 and nullable numeric dtypes are
        # also drawn as histograms. Booleans stay on the bar-chart path, as before.
        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            column.hist(bins=30, ax=ax, alpha=0.7)
            ax.set_title(f'{feature} Distribution')
        else:
            column.value_counts().plot(kind='bar', ax=ax, alpha=0.7)
            ax.set_title(f'{feature} Counts')
            ax.tick_params(axis='x', rotation=45)

    # Hide panels that received no feature so the figure has no empty axes.
    for ax in axes[len(features_to_plot):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## Step 3: Confounder Selection

In [None]:
# Ask the project helper which columns to treat as confounders.
confounders = select_confounders(df_features)

print("=== CONFOUNDER SELECTION ===")
print(f"Total available features: {len(df_features.columns)}")
print(f"Selected confounders: {len(confounders)}")
print("\nConfounding variables:")
for i, conf in enumerate(confounders, 1):
    print(f"  {i:2d}. {conf}")

# Fit one random forest per target to see how strongly each confounder
# predicts (a) treatment assignment and (b) the conversion outcome.
# random_state is fixed so the importances are reproducible.
rf_treatment = RandomForestClassifier(n_estimators=100, random_state=42)
rf_treatment.fit(df_features[confounders], df_features['treatment'])

rf_outcome = RandomForestClassifier(n_estimators=100, random_state=42)
rf_outcome.fit(df_features[confounders], df_features['conversion'])

# One row per confounder with both importance scores side by side.
importance_df = pd.DataFrame({
    'feature': confounders,
    'treatment_importance': rf_treatment.feature_importances_,
    'outcome_importance': rf_outcome.feature_importances_
})

# Plot the ten most important features for each model.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

top_treatment = importance_df.nlargest(10, 'treatment_importance')
axes[0].barh(top_treatment['feature'], top_treatment['treatment_importance'], alpha=0.8, color='red')
# barh draws the first row at the bottom; flip so the most important feature is on top.
axes[0].invert_yaxis()
axes[0].set_title('Top 10 Features for Treatment Prediction')
axes[0].set_xlabel('Feature Importance')

top_outcome = importance_df.nlargest(10, 'outcome_importance')
axes[1].barh(top_outcome['feature'], top_outcome['outcome_importance'], alpha=0.8, color='blue')
axes[1].invert_yaxis()
axes[1].set_title('Top 10 Features for Outcome Prediction')
axes[1].set_xlabel('Feature Importance')

plt.tight_layout()
plt.show()

# Rank by the *smaller* of the two importances: a variable only confounds if it
# matters for both treatment and outcome. A plain sum would let a feature that
# predicts just one of them dominate the list.
importance_df['combined_importance'] = importance_df[
    ['treatment_importance', 'outcome_importance']
].min(axis=1)
top_confounders = importance_df.nlargest(5, 'combined_importance')
print("\nStrongest confounders (affect both treatment and outcome):")
for _, row in top_confounders.iterrows():
    print(f"  {row['feature']}: Treatment={row['treatment_importance']:.3f}, Outcome={row['outcome_importance']:.3f}")

## Step 4: Balance Assessment

In [None]:
# Compute covariate balance between treatment arms with the project helper.
# The code below relies on the result having 'variable',
# 'standardized_mean_diff' and 'imbalanced' columns.
balance_stats = balance_check(df_features, 'treatment')

# |SMD| above this value is drawn as imbalanced in the plot below.
SMD_THRESHOLD = 0.1

n_assessed = len(balance_stats)
imbalanced_vars = balance_stats[balance_stats['imbalanced'] == True]

print("=== COVARIATE BALANCE ASSESSMENT ===")
print(f"Variables assessed: {n_assessed}")
print(f"Imbalanced variables: {len(imbalanced_vars)}")
if n_assessed > 0:
    print(f"Balance rate: {(n_assessed - len(imbalanced_vars)) / n_assessed * 100:.1f}%")
else:
    # Avoid a ZeroDivisionError when the balance check returned no rows.
    print("Balance rate: n/a (no variables assessed)")

# Rank every variable once by absolute SMD (largest first); the printout and
# the plot then take their own slice of the same ranking.
smd_order = balance_stats['standardized_mean_diff'].abs().sort_values(ascending=False).index
ranked_balance = balance_stats.reindex(smd_order)

worst_imbalances = ranked_balance.head(10)
print("\nWorst imbalances (top 10):")
for _, row in worst_imbalances.iterrows():
    status = "⚠️" if row['imbalanced'] else "✅"
    print(f"  {status} {row['variable']}: SMD = {row['standardized_mean_diff']:.3f}")

# Visualize up to 15 variables. These are taken from the full ranking rather
# than from the 10-row printout frame, which previously capped the plot at 10.
plot_rows = ranked_balance.head(15)
variables = plot_rows['variable']
smds = plot_rows['standardized_mean_diff']
colors = ['red' if abs(smd) > SMD_THRESHOLD else 'green' for smd in smds]

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(variables, smds, color=colors, alpha=0.7)
# barh draws the first row at the bottom; flip so the worst imbalance is on top.
ax.invert_yaxis()
ax.axvline(x=SMD_THRESHOLD, color='red', linestyle='--', alpha=0.7, label='Imbalance Threshold (+)')
ax.axvline(x=-SMD_THRESHOLD, color='red', linestyle='--', alpha=0.7, label='Imbalance Threshold (-)')
ax.axvline(x=0, color='black', linestyle='-', alpha=0.5)

ax.set_xlabel('Standardized Mean Difference')
# Report the number of bars actually drawn instead of a hard-coded 15.
ax.set_title(f'Covariate Balance Before Matching (Top {len(plot_rows)} Variables)')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Step 5: Data Preparation for Modeling

In [None]:
# Split the engineered frame into treatment and control groups via the project helper.
treatment_group, control_group = create_treatment_control_split(df_features)

# Compute every reported statistic once so the printout and the summary
# table below cannot drift apart.
n_total = len(df_features)
n_treated = len(treatment_group)
n_control = len(control_group)
n_features = len(df_features.columns)
n_confounders = len(confounders)

overall_rate = df_features['conversion'].mean()
treated_rate = treatment_group['conversion'].mean()
control_rate = control_group['conversion'].mean()
naive_effect = treated_rate - control_rate
# Average the 'true_effect' column instead of reading row 0: the result is the
# same when the column is constant, and is the average effect if it varies by row.
# NOTE(review): assumes 'true_effect' holds the simulated ground-truth effect — confirm.
true_effect = df_features['true_effect'].mean()

print("=== FINAL DATASET SUMMARY ===")
print(f"Total observations: {n_total:,}")
print(f"Treatment group: {n_treated:,} ({n_treated/n_total*100:.1f}%)")
print(f"Control group: {n_control:,} ({n_control/n_total*100:.1f}%)")
print(f"Total features: {n_features}")
print(f"Confounding variables: {n_confounders}")

print("\n=== OUTCOME STATISTICS ===")
print(f"Overall conversion rate: {overall_rate:.3f}")
print(f"Treatment conversion rate: {treated_rate:.3f}")
print(f"Control conversion rate: {control_rate:.3f}")
print(f"Naive treatment effect: {naive_effect:.4f}")
print(f"True treatment effect: {true_effect:.4f}")

# Persist the processed frame and the confounder list. Create the target
# directory first so a fresh checkout does not fail on the write.
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

df_features.to_csv(output_dir / 'preprocessed_ad_data.csv', index=False)
pd.Series(confounders).to_csv(output_dir / 'confounders.csv', index=False, header=['confounder'])

print("\n✅ Preprocessed data saved to '../data/processed/'")
print("✅ Ready for causal inference modeling")

# Summary table of the same figures, pre-formatted as strings for display.
summary_stats = pd.DataFrame({
    'Variable': ['Total Observations', 'Treatment Group', 'Control Group', 'Features', 'Confounders',
                 'Conversion Rate (Overall)', 'Conversion Rate (Treatment)', 'Conversion Rate (Control)',
                 'Naive Effect', 'True Effect'],
    'Value': [f"{n_total:,}", f"{n_treated:,}", f"{n_control:,}",
              f"{n_features}", f"{n_confounders}",
              f"{overall_rate:.3f}", f"{treated_rate:.3f}",
              f"{control_rate:.3f}",
              f"{naive_effect:.4f}",
              f"{true_effect:.4f}"]
})

print("\n=== PREPROCESSING SUMMARY ===")
print(summary_stats.to_string(index=False))