# CMS Open Payments Data Exploration & Analysis

**Project:** AAI-540 Machine Learning Operations - Final Team Project  
**Dataset:** CMS Open Payments Program Year 2024 General Payments  
**Purpose:** Exploratory Data Analysis for Payment Patterns and Statistical Insights

---

## Table of Contents
1. [Environment Setup & Configuration](#setup)
2. [Data Loading from Datalake](#loading)
3. [Data Quality Assessment](#quality)
4. [Univariate Analysis](#univariate)
5. [Bivariate & Multivariate Analysis](#multivariate)
6. [Temporal Analysis](#temporal)
7. [Geographic Analysis](#geographic)
8. [Feature Engineering](#features)
9. [Outlier Detection](#outliers)
10. [Advanced Visualizations](#advanced)
11. [Key Findings & Insights](#findings)

---

## 1. Environment Setup & Configuration

Setting up the environment with necessary libraries and AWS integration.

In [None]:
# Install required packages
!pip install -r ../requirements.txt --quiet
!pip install boto3 sagemaker awswrangler pyathena --quiet

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
import sys
from scipy import stats

# AWS libraries
import boto3
import sagemaker
import awswrangler as wr
from pyathena import connect

# Make the parent directory importable so the optional custom modules resolve
parent_dir = Path.cwd().parent
if not any(entry == str(parent_dir) for entry in sys.path):
    sys.path.insert(0, str(parent_dir))

# Plot appearance: suppress warnings and apply a shared style and palette
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")

# pandas display limits and float formatting
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully")

In [None]:
# Optional project helpers: record in `use_custom_utils` whether they could be
# imported, so the flag is available either way.
try:
    from config import CONFIG
    from utils import CMSDataLoader, PaymentVisualizer, FeatureEngineer
except ImportError:
    use_custom_utils = False
    print("Custom utilities not found - using standard libraries only")
else:
    use_custom_utils = True
    print("Custom utilities imported")

In [None]:
# Restore AWS configuration from datalake setup notebook
%store -r bucket
%store -r region
%store -r database_name
%store -r table_name_parquet
%store -r s3_parquet_path
%store -r s3_athena_staging

# If variables not restored, set defaults matching datalake setup
try:
    # Test if variables exist
    test_vars = [bucket, region, database_name, table_name_parquet]
    
    print(f"AWS Configuration:")
    print(f"  Region: {region}")
    print(f"  S3 Bucket: {bucket}")
    print(f"  Database: {database_name}")
    print(f"  Table: {table_name_parquet}")
    print(f"  Parquet Path: {s3_parquet_path}")
    print(f"  Athena Staging: {s3_athena_staging}")
    
except NameError as e:
    print(f"Variables not found in store. Setting up from AWS configuration...")
    
    # Initialize AWS session
    boto_session = boto3.Session()
    region = boto_session.region_name
    
    # Get account information
    sts_client = boto3.client('sts')
    account_id = sts_client.get_caller_identity().get('Account')
    
    # Set configuration matching datalake setup
    bucket = f"cmsopenpaymentsystems{account_id}"
    database_name = "cms_open_payments"
    table_name_parquet = "general_payments_parquet"
    
    # Define S3 paths
    cms_data_prefix = "cms-open-payments"
    parquet_data_prefix = f"{cms_data_prefix}/parquet"
    s3_parquet_path = f"s3://{bucket}/{parquet_data_prefix}"
    s3_athena_staging = f"s3://{bucket}/athena/staging"
    
    print(f"\nAWS Configuration (manual setup):")
    print(f"  Region: {region}")
    print(f"  Account ID: {account_id}")
    print(f"  S3 Bucket: {bucket}")
    print(f"  Database: {database_name}")
    print(f"  Table: {table_name_parquet}")
    print(f"  Parquet Path: {s3_parquet_path}")
    print(f"  Athena Staging: {s3_athena_staging}")
    print(f"\nNote: Please run the datalake setup notebook (01_setup_cms_datalake.ipynb) first for full setup.")

## 2. Data Loading from Datalake

Load CMS Open Payments data from AWS Athena using optimized Parquet format.

In [None]:
# Initialize Athena connection
# Opens a PyAthena connection for the configured region, passing
# `s3_athena_staging` as the S3 staging directory.
# NOTE(review): the loading cells visible in this notebook query through
# awswrangler (`wr.athena.read_sql_query`) and never reference `athena_conn`
# - confirm whether this connection is still needed.
athena_conn = connect(
    region_name=region,
    s3_staging_dir=s3_athena_staging
)

print("Athena connection established")

In [None]:
# Option 1: read every row of the Parquet-backed table (complete analysis).
# This can take several minutes and a lot of memory, so it is off by default.

load_full_dataset = False  # Set to True to load full dataset

if not load_full_dataset:
    print("Skipping full dataset load - will use sample queries instead")
else:
    print("Loading full dataset from Parquet...")
    print("Note: This may take several minutes")

    full_query = f"SELECT * FROM {database_name}.{table_name_parquet}"
    df = wr.athena.read_sql_query(
        sql=full_query,
        database=database_name,
        ctas_approach=False,
    )

    row_total, column_total = df.shape
    print(f"Full dataset loaded: {row_total:,} rows, {column_total} columns")

In [None]:
# Option 2: Load sample dataset for faster EDA
sample_size = 100000  # Adjust based on your needs

# NOTE: LIMIT without ORDER BY returns an arbitrary subset of rows, not a
# random sample - treat statistics from it as indicative only.
sample_query = f"""
SELECT *
FROM {database_name}.{table_name_parquet}
LIMIT {sample_size}
"""

# Only query the sample when the full dataset was not requested. Otherwise a
# "Run All" would overwrite the full `df` loaded by Option 1 with the sample.
if globals().get('load_full_dataset', False):
    print("Full dataset requested - skipping sample load")
else:
    print(f"Loading sample dataset ({sample_size:,} rows)...")

    df = wr.athena.read_sql_query(
        sql=sample_query,
        database=database_name,
        ctas_approach=False
    )

    print(f"Sample dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")
    print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

In [None]:
# Show the first rows, the frame's shape and (at most) the first 20 column names
max_listed_columns = 20
all_column_names = list(df.columns)

print("Dataset Preview:")
display(df.head(3))

print("\nDataset Info:")
print(f"  Shape: {df.shape}")
print(f"  Columns: {len(all_column_names)}")
print("\nColumn Names:")
for position, column_name in enumerate(all_column_names[:max_listed_columns], 1):
    print(f"  {position}. {column_name}")

hidden_column_count = len(all_column_names) - max_listed_columns
if hidden_column_count > 0:
    print(f"  ... ({hidden_column_count} more columns)")

In [None]:
# Data preprocessing: parse date-like columns and derive calendar features
print("Preprocessing data...")

# Every column whose name contains "date" (case-insensitive) is parsed as a
# timestamp; values that cannot be parsed become NaT (errors='coerce').
date_columns = [col for col in df.columns if 'date' in col.lower()]
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Numeric columns are collected before the derived features below are added
numeric_cols = list(df.select_dtypes(include=['number']).columns)
print(f"  Date columns: {len(date_columns)}")
print(f"  Numeric columns: {len(numeric_cols)}")

# Calendar features derived from the payment date, when that column exists
payment_date_col = 'Date_of_Payment'
if payment_date_col in df.columns:
    payment_dt = df[payment_date_col].dt
    temporal_features = {
        'Payment_Year': payment_dt.year,
        'Payment_Month': payment_dt.month,
        'Payment_Quarter': payment_dt.quarter,
        'Payment_DayOfWeek': payment_dt.dayofweek,
        'Payment_Week': payment_dt.isocalendar().week,
    }
    for feature_name, feature_values in temporal_features.items():
        df[feature_name] = feature_values
    print("Temporal features created")

print("Preprocessing complete")

## 3. Data Quality Assessment

Comprehensive assessment of data quality including completeness, validity, and consistency.

In [None]:
# Basic statistics: size, memory footprint, dtype mix, missing cells, duplicates
section_rule = "=" * 70
print(section_rule)
print("DATASET BASIC STATISTICS")
print(section_rule)

row_total, column_total = df.shape

print("\nDimensions:")
print(f"  Total Rows: {row_total:,}")
print(f"  Total Columns: {column_total}")

print("\nMemory Usage:")
memory_mb = df.memory_usage(deep=True).sum() / (1024**2)
print(f"  Total: {memory_mb:.2f} MB")
print(f"  Per Row: {memory_mb / row_total * 1024:.2f} KB")

print("\nColumn Types:")
for type_label, type_selector in (
    ("Numeric", 'number'),
    ("Object/String", 'object'),
    ("DateTime", 'datetime'),
):
    matching_columns = df.select_dtypes(include=[type_selector]).columns
    print(f"  {type_label}: {len(matching_columns)}")

print("\nMissing Values:")
total_cells = row_total * column_total
missing_cells = df.isnull().sum().sum()
print(f"  Total Cells: {total_cells:,}")
print(f"  Missing Cells: {missing_cells:,}")
print(f"  Missing Percentage: {(missing_cells/total_cells)*100:.2f}%")

print("\nDuplicate Rows:")
duplicates = df.duplicated().sum()
print(f"  Count: {duplicates:,}")
print(f"  Percentage: {(duplicates/row_total)*100:.2f}%")

In [None]:
# Per-column missing-value counts and percentages, highest percentage first
print("Missing Values by Column (Top 20):")

null_counts = df.isnull().sum()
missing_stats = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts.values,
    'Missing_Percent': (null_counts.values / len(df) * 100),
})

# Keep only the columns that actually have gaps
has_gaps = missing_stats['Missing_Count'] > 0
missing_stats = missing_stats[has_gaps].sort_values('Missing_Percent', ascending=False)

display(missing_stats.head(20))

In [None]:
# Horizontal bar chart of the 20 columns with the most missing values
if len(missing_stats) == 0:
    print("No missing values detected")
else:
    def _missing_color(pct):
        # Red above 50%, orange above 20%, blue otherwise
        if pct > 50:
            return '#e74c3c'
        if pct > 20:
            return '#f39c12'
        return '#3498db'

    fig, ax = plt.subplots(figsize=(12, 8))

    top_missing = missing_stats.head(20)
    bar_positions = range(len(top_missing))
    colors = [_missing_color(pct) for pct in top_missing['Missing_Percent']]

    ax.barh(bar_positions, top_missing['Missing_Percent'],
            color=colors, edgecolor='black', alpha=0.7)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_missing['Column'])
    ax.set_xlabel('Missing Values (%)', fontsize=11, fontweight='bold')
    ax.set_title('Top 20 Columns by Missing Values', fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Write the percentage just to the right of each bar
    for bar_position, pct in enumerate(top_missing['Missing_Percent']):
        ax.text(pct + 1, bar_position, f"{pct:.1f}%", va='center', fontsize=9)

    plt.tight_layout()
    plt.show()

In [None]:
# Count columns per dtype, then list up to three example columns for each dtype
print("\nData Types Summary:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

print("\n Sample of each data type:")
for dtype in df.dtypes.unique():
    example_columns = df.select_dtypes(include=[dtype]).columns[:3]
    joined_examples = ', '.join(example_columns)
    print(f"  {dtype}: {joined_examples}")

## 4. Univariate Analysis

Analyzing individual variables to understand distributions, central tendencies, and variability.

In [None]:
# Descriptive statistics, percentiles and shape measures for the payment amount
payment_col = 'Total_Amount_of_Payment_USDollars'

if payment_col in df.columns:
    section_rule = "=" * 70
    print(section_rule)
    print("PAYMENT AMOUNT STATISTICS")
    print(section_rule)

    payment_series = df[payment_col]
    payment_stats = payment_series.describe()

    print("\nBasic Statistics:")
    print(f"  Count: {payment_stats['count']:,.0f}")
    for stat_label, stat_key in (
        ("Mean", 'mean'),
        ("Median", '50%'),
        ("Std Dev", 'std'),
        ("Min", 'min'),
        ("Max", 'max'),
    ):
        print(f"  {stat_label}: ${payment_stats[stat_key]:,.2f}")

    print("\nQuartiles:")
    for quartile in (25, 50, 75):
        print(f"  {quartile}th percentile: ${payment_stats[f'{quartile}%']:,.2f}")

    print("\nAdditional Percentiles:")
    for p in [90, 95, 99, 99.9]:
        val = payment_series.quantile(p/100)
        print(f"  {p}th percentile: ${val:,.2f}")

    # Skewness and Kurtosis
    skewness = payment_series.skew()
    kurtosis = payment_series.kurtosis()
    print("\nDistribution Shape:")
    print(f"  Skewness: {skewness:.2f}")
    print(f"  Kurtosis: {kurtosis:.2f}")

In [None]:
# 2x2 figure: histogram, log10 histogram, box plot and violin plot of payments
if payment_col in df.columns:
    def _decorate(axis, title, ylabel, xlabel=None):
        # Shared label/title/grid styling for the four panels
        if xlabel is not None:
            axis.set_xlabel(xlabel, fontsize=11, fontweight='bold')
        axis.set_ylabel(ylabel, fontsize=11, fontweight='bold')
        axis.set_title(title, fontsize=12, fontweight='bold')
        axis.grid(alpha=0.3)

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (hist_axis, log_axis), (box_axis, violin_axis) = axes

    amount_series = df[payment_col]
    observed_amounts = amount_series.dropna()

    # Histogram
    hist_axis.hist(observed_amounts, bins=50, color='steelblue',
                   edgecolor='black', alpha=0.7)
    _decorate(hist_axis, 'Payment Amount Distribution', 'Frequency',
              xlabel='Payment Amount ($)')

    # Log-scale histogram (strictly positive amounts only)
    log_payments = np.log10(amount_series[amount_series > 0])
    log_axis.hist(log_payments, bins=50, color='coral',
                  edgecolor='black', alpha=0.7)
    _decorate(log_axis, 'Payment Amount Distribution (Log Scale)', 'Frequency',
              xlabel='Log10(Payment Amount)')

    # Box plot
    box_axis.boxplot(observed_amounts, vert=True, patch_artist=True,
                     boxprops=dict(facecolor='lightgreen', alpha=0.7),
                     medianprops=dict(color='red', linewidth=2))
    _decorate(box_axis, 'Payment Amount Box Plot', 'Payment Amount ($)')

    # Violin plot
    parts = violin_axis.violinplot([observed_amounts], vert=True,
                                   showmeans=True, showmedians=True)
    for violin_body in parts['bodies']:
        violin_body.set_facecolor('plum')
        violin_body.set_alpha(0.7)
    _decorate(violin_axis, 'Payment Amount Violin Plot', 'Payment Amount ($)')

    plt.tight_layout()
    plt.show()

In [None]:
# Top values, cardinality and missingness for selected categorical columns
section_rule = "=" * 70
print(section_rule)
print("CATEGORICAL VARIABLES ANALYSIS")
print(section_rule)

categorical_cols = [
    'Change_Type',
    'Covered_Recipient_Type',
    'Form_of_Payment_or_Transfer_of_Value',
    'Nature_of_Payment_or_Transfer_of_Value'
]

for col in categorical_cols:
    # Columns absent from this extract are skipped silently
    if col not in df.columns:
        continue

    column_values = df[col]
    print(f"\n{col}:")
    value_counts = column_values.value_counts().head(10)
    print(value_counts)
    print(f"  Unique values: {column_values.nunique()}")

    null_total = column_values.isnull().sum()
    print(f"  Missing values: {null_total} ({null_total/len(df)*100:.1f}%)")

In [None]:
# One horizontal bar chart (top 15 values) per available categorical column
categorical_cols_viz = [
    'Covered_Recipient_Type',
    'Form_of_Payment_or_Transfer_of_Value',
    'Nature_of_Payment_or_Transfer_of_Value'
]

available_cols = [col for col in categorical_cols_viz if col in df.columns]

if available_cols:
    # Lay the panels out two per row (a single panel gets a 1x1 grid)
    n_cols = min(len(available_cols), 2)
    n_rows = (len(available_cols) + 1) // 2

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6*n_rows))
    # plt.subplots returns a bare Axes for a 1x1 grid; normalise to a sequence
    axes = [axes] if n_rows * n_cols == 1 else axes.flatten()

    for panel, col in zip(axes, available_cols):
        value_counts = df[col].value_counts().head(15)
        bar_positions = range(len(value_counts))

        panel.barh(bar_positions, value_counts.values,
                   color=sns.color_palette('viridis', len(value_counts)),
                   edgecolor='black', alpha=0.7)
        panel.set_yticks(bar_positions)
        panel.set_yticklabels(value_counts.index, fontsize=9)
        panel.set_xlabel('Count', fontsize=11, fontweight='bold')
        panel.set_title(f'{col}\n(Top 15)', fontsize=12, fontweight='bold')
        panel.grid(axis='x', alpha=0.3)

    # Hide extra subplots
    for unused_panel in axes[len(available_cols):]:
        unused_panel.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Bivariate & Multivariate Analysis

Explore relationships between multiple variables.

In [None]:
# Payment summary statistics per recipient type, largest total first
recipient_type_col = 'Covered_Recipient_Type'

if recipient_type_col in df.columns and payment_col in df.columns:
    section_rule = "=" * 70
    print(section_rule)
    print("PAYMENT STATISTICS BY RECIPIENT TYPE")
    print(section_rule)

    # Aggregation name -> display label (dict order is the column order)
    type_stat_labels = {
        'count': 'Count',
        'sum': 'Total ($)',
        'mean': 'Mean ($)',
        'median': 'Median ($)',
        'std': 'Std Dev ($)',
        'min': 'Min ($)',
        'max': 'Max ($)',
    }

    type_stats = (
        df.groupby(recipient_type_col)[payment_col]
        .agg(list(type_stat_labels))
        .round(2)
        .rename(columns=type_stat_labels)
        .sort_values('Total ($)', ascending=False)
    )

    display(type_stats)

In [None]:
# Visualize payment by recipient type: per-type box plot (left) and total
# payment amount per type (right).
if recipient_type_col in df.columns and payment_col in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Box plot
    df.boxplot(column=payment_col, by=recipient_type_col, ax=axes[0],
               patch_artist=True, grid=True)
    # pandas adds a figure-level "Boxplot grouped by ..." suptitle when `by`
    # is used; it would sit above both panels, so clear it and rely on the
    # per-axes titles instead.
    fig.suptitle('')
    axes[0].set_xlabel('Recipient Type', fontsize=11, fontweight='bold')
    axes[0].set_ylabel('Payment Amount ($)', fontsize=11, fontweight='bold')
    axes[0].set_title('Payment Distribution by Recipient Type', fontsize=12, fontweight='bold')
    plt.sca(axes[0])
    plt.xticks(rotation=45, ha='right')

    # Bar plot - total amounts
    total_by_type = df.groupby(recipient_type_col)[payment_col].sum().sort_values(ascending=False)
    colors = sns.color_palette('rocket', len(total_by_type))
    axes[1].bar(range(len(total_by_type)), total_by_type.values,
                color=colors, edgecolor='black', alpha=0.7)
    axes[1].set_xticks(range(len(total_by_type)))
    axes[1].set_xticklabels(total_by_type.index, rotation=45, ha='right')
    axes[1].set_xlabel('Recipient Type', fontsize=11, fontweight='bold')
    axes[1].set_ylabel('Total Payment Amount ($)', fontsize=11, fontweight='bold')
    axes[1].set_title('Total Payments by Recipient Type', fontsize=12, fontweight='bold')
    axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Payment summary per nature-of-payment category, top 15 by total amount
nature_col = 'Nature_of_Payment_or_Transfer_of_Value'

if nature_col in df.columns and payment_col in df.columns:
    print("\n" + "=" * 70)
    print("TOP 15 PAYMENT NATURES BY TOTAL AMOUNT")
    print("=" * 70)

    # Aggregation name -> display label (dict order is the column order)
    nature_stat_labels = {
        'count': 'Count',
        'sum': 'Total ($)',
        'mean': 'Mean ($)',
        'median': 'Median ($)',
    }

    nature_stats = (
        df.groupby(nature_col)[payment_col]
        .agg(list(nature_stat_labels))
        .round(2)
        .rename(columns=nature_stat_labels)
        .sort_values('Total ($)', ascending=False)
        .head(15)
    )

    display(nature_stats)

## 6. Temporal Analysis

Analyze payment patterns over time.

In [None]:
# Payment count, total, mean and median for each calendar month
if 'Payment_Month' in df.columns and payment_col in df.columns:
    section_rule = "=" * 70
    print(section_rule)
    print("MONTHLY PAYMENT STATISTICS")
    print(section_rule)

    # Aggregation name -> display label (dict order is the column order)
    monthly_stat_labels = {
        'count': 'Count',
        'sum': 'Total ($)',
        'mean': 'Mean ($)',
        'median': 'Median ($)',
    }

    monthly_stats = (
        df.groupby('Payment_Month')[payment_col]
        .agg(list(monthly_stat_labels))
        .round(2)
        .rename(columns=monthly_stat_labels)
        .rename_axis('Month')
    )

    display(monthly_stats)

In [None]:
# Two stacked panels: payments per month (line) and total amount per month (bars)
if 'Payment_Month' in df.columns and payment_col in df.columns:
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))
    count_axis, total_axis = axes
    month_ticks = range(1, 13)
    payments_by_month = df.groupby('Payment_Month')

    # Monthly payment count
    monthly_counts = payments_by_month.size()
    count_axis.plot(monthly_counts.index, monthly_counts.values,
                    marker='o', linewidth=2, markersize=8, color='steelblue')
    count_axis.set_title('Monthly Payment Count', fontsize=12, fontweight='bold')
    count_axis.set_xlabel('Month', fontsize=11, fontweight='bold')
    count_axis.set_ylabel('Number of Payments', fontsize=11, fontweight='bold')
    count_axis.set_xticks(month_ticks)
    count_axis.grid(alpha=0.3)

    # Monthly payment total
    monthly_totals = payments_by_month[payment_col].sum()
    total_axis.bar(monthly_totals.index, monthly_totals.values,
                   color=sns.color_palette('viridis', 12), edgecolor='black', alpha=0.7)
    total_axis.set_title('Monthly Total Payment Amount', fontsize=12, fontweight='bold')
    total_axis.set_xlabel('Month', fontsize=11, fontweight='bold')
    total_axis.set_ylabel('Total Payment Amount ($)', fontsize=11, fontweight='bold')
    total_axis.set_xticks(month_ticks)
    total_axis.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Per-quarter payment statistics plus a bar chart of the quarterly totals
if 'Payment_Quarter' in df.columns and payment_col in df.columns:
    print("\n" + "=" * 70)
    print("QUARTERLY PAYMENT STATISTICS")
    print("=" * 70)

    # Aggregation name -> display label (dict order is the column order)
    quarterly_stat_labels = {
        'count': 'Count',
        'sum': 'Total ($)',
        'mean': 'Mean ($)',
        'median': 'Median ($)',
    }

    quarterly_stats = (
        df.groupby('Payment_Quarter')[payment_col]
        .agg(list(quarterly_stat_labels))
        .round(2)
        .rename(columns=quarterly_stat_labels)
        .rename_axis('Quarter')
    )

    display(quarterly_stats)

    # Visualize
    quarter_colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']
    fig, ax = plt.subplots(figsize=(10, 6))
    quarterly_stats['Total ($)'].plot(kind='bar', ax=ax, color=quarter_colors,
                                      edgecolor='black', alpha=0.7)
    ax.set_title('Quarterly Total Payment Amount', fontsize=12, fontweight='bold')
    ax.set_xlabel('Quarter', fontsize=11, fontweight='bold')
    ax.set_ylabel('Total Payment Amount ($)', fontsize=11, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

## 7. Geographic Analysis {#geographic}

Analyzing payment distributions across geographic regions.

In [None]:
# Payment summary per recipient state, top 20 by total amount
state_col = 'Recipient_State'

if state_col in df.columns and payment_col in df.columns:
    section_rule = "=" * 70
    print(section_rule)
    print("TOP 20 STATES BY PAYMENT METRICS")
    print(section_rule)

    # Aggregation name -> display label (dict order is the column order)
    state_stat_labels = {
        'count': 'Count',
        'sum': 'Total ($)',
        'mean': 'Mean ($)',
        'median': 'Median ($)',
    }

    state_stats = (
        df.groupby(state_col)[payment_col]
        .agg(list(state_stat_labels))
        .round(2)
        .rename(columns=state_stat_labels)
        .sort_values('Total ($)', ascending=False)
        .head(20)
        .rename_axis('State')
    )

    display(state_stats)

In [None]:
# Two horizontal bar charts: top 20 states by payment count and by total amount
if state_col in df.columns and payment_col in df.columns:
    def _draw_state_bars(axis, ranked, palette_name, xlabel, title):
        # Draw `ranked` (a Series indexed by state) as horizontal bars on `axis`
        positions = range(len(ranked))
        axis.barh(positions, ranked.values,
                  color=sns.color_palette(palette_name, len(ranked)),
                  edgecolor='black', alpha=0.7)
        axis.set_yticks(positions)
        axis.set_yticklabels(ranked.index)
        axis.set_xlabel(xlabel, fontsize=11, fontweight='bold')
        axis.set_title(title, fontsize=12, fontweight='bold')
        axis.grid(axis='x', alpha=0.3)

    fig, axes = plt.subplots(2, 1, figsize=(14, 12))

    # Top 20 states by count
    top_states_count = df[state_col].value_counts().head(20)
    _draw_state_bars(axes[0], top_states_count, 'rocket',
                     'Number of Payments', 'Top 20 States by Payment Count')

    # Top 20 states by total amount
    state_totals = df.groupby(state_col)[payment_col].sum()
    top_states_amount = state_totals.sort_values(ascending=False).head(20)
    _draw_state_bars(axes[1], top_states_amount, 'mako',
                     'Total Payment Amount ($)', 'Top 20 States by Total Payment Amount')

    plt.tight_layout()
    plt.show()

In [None]:
# Interactive US choropleth coloured by each state's total payment amount
if state_col in df.columns and payment_col in df.columns:
    # One row per state with payment count, total and average (named aggregation)
    state_summary = (
        df.groupby(state_col)[payment_col]
        .agg(Payment_Count='count', Total_Amount='sum', Avg_Amount='mean')
        .round(2)
        .reset_index()
    )

    # Number formats applied to the hover tooltip fields
    hover_formats = {
        'Payment_Count': ':,',
        'Total_Amount': ':$,.0f',
        'Avg_Amount': ':$,.2f',
    }

    # Create choropleth
    fig = px.choropleth(
        state_summary,
        locations=state_col,
        locationmode='USA-states',
        color='Total_Amount',
        hover_name=state_col,
        hover_data=hover_formats,
        color_continuous_scale='Viridis',
        scope='usa',
        title='Total Payment Amount by State',
    )

    fig.update_layout(geo=dict(bgcolor='rgba(0,0,0,0)'), height=600)

    fig.show()

## 8. Feature Engineering

Create new features for predictive modeling.

In [None]:
# Create aggregated payment features per recipient.
print("=" * 70)
print("FEATURE ENGINEERING: AGGREGATED RECIPIENT METRICS")
print("=" * 70)

# Identify recipient ID column.
# "id" must be a whole underscore-separated word of the column name. A plain
# substring test would also accept names that merely contain the letters
# "id", e.g. "Covered_Recipient_Middle_Name" ("m-id-dle").
recipient_id_cols = [
    col for col in df.columns
    if 'recipient' in col.lower() and 'id' in col.lower().split('_')
]
print(f"\nAvailable recipient ID columns: {recipient_id_cols}")

if len(recipient_id_cols) > 0 and payment_col in df.columns:
    recipient_id_col = recipient_id_cols[0]  # Use first available ID column
    print(f"Using: {recipient_id_col}")

    # Create aggregated features
    agg_features = df.groupby(recipient_id_col).agg({
        payment_col: ['count', 'sum', 'mean', 'median', 'std', 'min', 'max']
    }).round(2)

    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>"
    agg_features.columns = ['_'.join(col).strip() for col in agg_features.columns.values]
    agg_features = agg_features.reset_index()

    print(f"\nAggregated features created: {agg_features.shape}")
    display(agg_features.head(10))
elif len(recipient_id_cols) > 0:
    # Mirrors the guard used by the other analysis cells: nothing to aggregate
    # without the payment amount column.
    print(f"⚠ Payment column '{payment_col}' not found for aggregation")
else:
    print("⚠ No recipient ID column found for aggregation")

## 9. Outlier Detection

Identify and analyze outlier payments.

In [None]:
# Outlier detection with three methods: IQR fences, z-score and 99th percentile.
# Leaves `iqr_outliers`, `z_outliers`, `percentile_outliers` and
# `percentile_99` defined for the cells that follow.
if payment_col in df.columns:
    print("=" * 70)
    print("OUTLIER DETECTION ANALYSIS")
    print("=" * 70)

    # All thresholds are derived from the non-missing amounts only
    amounts = df[payment_col].dropna()

    # Method 1: IQR (Interquartile Range)
    Q1 = amounts.quantile(0.25)
    Q3 = amounts.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    iqr_outliers = df[(df[payment_col] < lower_bound) | (df[payment_col] > upper_bound)]

    print(f"\nIQR Method:")
    print(f"  Q1 (25th percentile): ${Q1:,.2f}")
    print(f"  Q3 (75th percentile): ${Q3:,.2f}")
    print(f"  IQR: ${IQR:,.2f}")
    print(f"  Lower Bound: ${lower_bound:,.2f}")
    print(f"  Upper Bound: ${upper_bound:,.2f}")
    print(f"  Outliers Detected: {len(iqr_outliers):,} ({len(iqr_outliers)/len(df)*100:.2f}%)")

    # Method 2: Z-Score
    # Score the observed amounts only. Filling missing values with 0 instead
    # would shift the mean and standard deviation and so change which rows
    # are flagged.
    z_scores = np.abs(stats.zscore(amounts.to_numpy(dtype=float)))
    z_threshold = 3
    # Map the per-amount flags back onto the full frame by position (dropna
    # keeps row order); rows with a missing amount are never flagged.
    has_amount = df[payment_col].notna().to_numpy()
    z_flags = np.zeros(len(df), dtype=bool)
    z_flags[has_amount] = z_scores > z_threshold
    z_outliers = df[z_flags]

    print(f"\nZ-Score Method (threshold={z_threshold}):")
    print(f"  Outliers Detected: {len(z_outliers):,} ({len(z_outliers)/len(df)*100:.2f}%)")

    # Method 3: Percentile-based
    percentile_99 = amounts.quantile(0.99)
    percentile_outliers = df[df[payment_col] > percentile_99]

    print(f"\nPercentile Method (99th percentile):")
    print(f"  Threshold: ${percentile_99:,.2f}")
    print(f"  Outliers Detected: {len(percentile_outliers):,} ({len(percentile_outliers)/len(df)*100:.2f}%)")

In [None]:
# Summarise the IQR outliers and list the ten largest payments among them
if payment_col in df.columns and len(iqr_outliers) > 0:
    print("\n" + "=" * 70)
    print("OUTLIER CHARACTERISTICS")
    print("=" * 70)

    outlier_amounts = iqr_outliers[payment_col]

    print("\nPayment Amount Statistics for Outliers:")
    print(f"  Count: {len(iqr_outliers):,}")
    for stat_label, stat_value in (
        ("Mean", outlier_amounts.mean()),
        ("Median", outlier_amounts.median()),
        ("Min", outlier_amounts.min()),
        ("Max", outlier_amounts.max()),
    ):
        print(f"  {stat_label}: ${stat_value:,.2f}")

    # Top outliers, restricted to the context columns present in the frame
    print("\nTop 10 Outliers by Payment Amount:")
    context_columns = [
        col for col in [payment_col, recipient_type_col, state_col, nature_col]
        if col in iqr_outliers.columns
    ]
    top_outliers = iqr_outliers.nlargest(10, payment_col)[context_columns]
    display(top_outliers)

## 10. Advanced Visualizations

Create sophisticated visualizations for deeper insights.

In [None]:
# Interactive scatter: payments per recipient vs. total amount, by recipient type
if (
    payment_col in df.columns
    and recipient_type_col in df.columns
    and len(recipient_id_cols) > 0
):
    # One row per (recipient, recipient type) with payment count and total
    scatter_data = (
        df.groupby([recipient_id_cols[0], recipient_type_col])[payment_col]
        .agg(Payment_Count='count', Total_Amount='sum')
        .reset_index()
    )
    scatter_data.columns = ['Recipient_ID', 'Recipient_Type', 'Payment_Count', 'Total_Amount']

    axis_labels = {
        'Payment_Count': 'Number of Payments',
        'Total_Amount': 'Total Payment Amount ($)',
    }

    fig = px.scatter(
        scatter_data.head(1000),  # Limit for performance
        x='Payment_Count',
        y='Total_Amount',
        color='Recipient_Type',
        size='Total_Amount',
        hover_data=['Recipient_ID'],
        title='Payment Frequency vs Total Amount by Recipient Type',
        labels=axis_labels,
        height=600,
    )

    fig.show()

## 11. Key Findings & Insights

Summarize the main findings from the analysis above.

In [None]:
# Generate summary report.
# Each section prints only when the columns it reads are present, matching the
# guards used by the analysis cells above.
print("=" * 70)
print("KEY FINDINGS SUMMARY")
print("=" * 70)

has_payment_col = payment_col in df.columns

print("\n1. DATASET OVERVIEW")
print(f"   - Total Records: {df.shape[0]:,}")
print(f"   - Total Columns: {df.shape[1]}")
print(f"   - Data Completeness: {(1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1]))*100:.1f}%")

if has_payment_col:
    print("\n2. PAYMENT STATISTICS")
    print(f"   - Total Payment Amount: ${df[payment_col].sum():,.2f}")
    print(f"   - Average Payment: ${df[payment_col].mean():,.2f}")
    print(f"   - Median Payment: ${df[payment_col].median():,.2f}")
    print(f"   - Payment Range: ${df[payment_col].min():,.2f} - ${df[payment_col].max():,.2f}")

if recipient_type_col in df.columns:
    print("\n3. RECIPIENT DISTRIBUTION")
    type_counts = df[recipient_type_col].value_counts()
    for rtype, count in type_counts.items():
        pct = (count / len(df)) * 100
        print(f"   - {rtype}: {count:,} ({pct:.1f}%)")

# Sections 4 and 5 aggregate payment amounts, so they need the payment column
# as well as their grouping column. Each total is computed once and skipped
# when empty, because idxmax/idxmin raise on an empty Series.
if 'Payment_Month' in df.columns and has_payment_col:
    month_totals = df.groupby('Payment_Month')[payment_col].sum()
    if not month_totals.empty:
        print("\n4. TEMPORAL PATTERNS")
        month_with_most = month_totals.idxmax()
        month_with_least = month_totals.idxmin()
        print(f"   - Highest payment month: {month_with_most}")
        print(f"   - Lowest payment month: {month_with_least}")

if state_col in df.columns and has_payment_col:
    totals_by_state = df.groupby(state_col)[payment_col].sum()
    if not totals_by_state.empty:
        print("\n5. GEOGRAPHIC DISTRIBUTION")
        print(f"   - Number of states: {df[state_col].nunique()}")
        top_state = totals_by_state.idxmax()
        top_state_amount = totals_by_state.max()
        print(f"   - Top state by total amount: {top_state} (${top_state_amount:,.2f})")

print("\n6. DATA QUALITY OBSERVATIONS")
high_missing = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False).head(3)
print(f"   - Columns with highest missingness:")
for col, pct in high_missing.items():
    if pct > 0:
        print(f"     • {col}: {pct:.1f}%")

print("\n7. OUTLIER ANALYSIS")
if has_payment_col:
    # `iqr_outliers` and `percentile_99` come from the outlier detection cell
    print(f"   - IQR outliers: {len(iqr_outliers):,} ({len(iqr_outliers)/len(df)*100:.2f}%)")
    print(f"   - 99th percentile threshold: ${percentile_99:,.2f}")