# CMS Open Payments Data Exploration & Analysis

**Project:** AAI-540 Machine Learning Operations - Final Team Project  
**Dataset:** CMS Open Payments Program Year 2024 General Payments  
**Purpose:** Exploratory Data Analysis for Anomaly Detection and Risk Scoring

---

## Table of Contents
1. [Environment Setup & Configuration](#setup)
2. [Data Loading from Datalake](#loading)
3. [Data Cleaning & Preprocessing](#cleaning)
4. [Data Quality Assessment](#quality)
5. [Exploratory Data Analysis](#eda)
6. [Correlation Analysis](#correlation)
7. [Geographic Analysis](#geographic)

---

## Project Objectives

- Explore and understand CMS Open Payments data patterns
- Identify unusual payment patterns and outliers through visualization
- Assess data quality and completeness
- Analyze payment distributions across recipients, geography, and time
- Prepare insights for feature engineering and model development

## 1. Environment Setup & Configuration

Setting up the environment with necessary libraries and AWS integration.

In [None]:
# Install required packages
!pip install -r ../requirements.txt --quiet
!pip install boto3 sagemaker awswrangler pyathena --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
import sys
from scipy import stats
import boto3
import sagemaker
import awswrangler as wr
from pyathena import connect
from sagemaker.session import Session
from datetime import datetime
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler

# Put the directory above the notebook's working directory on the import path
# so project-local packages can be imported from here.
parent_dir = Path.cwd().parent
if not any(entry == str(parent_dir) for entry in sys.path):
    sys.path.insert(0, str(parent_dir))

# Silence library warnings and apply one plotting theme to the whole notebook.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")

# Display limits for wide/long frames and two-decimal float rendering.
for option_name, option_value in {
    'display.max_columns': 50,
    'display.max_rows': 100,
    'display.float_format': '{:.2f}'.format,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Optional project-local plotting helpers. The notebook still runs without
# them: plotting cells check `use_visualizer` before touching `visualizer`.
try:
    from utils.visualizations import PaymentVisualizer
    visualizer = PaymentVisualizer()
    use_visualizer = True
except ImportError as import_error:
    # Bind the name even on failure and say why charts will be missing,
    # instead of skipping every visualization silently.
    visualizer = None
    use_visualizer = False
    print(f"PaymentVisualizer unavailable ({import_error}); visualization cells will be skipped.")

In [None]:
%store -r bucket
%store -r region
%store -r database_name
%store -r table_name_parquet
%store -r s3_parquet_path
%store -r s3_athena_staging

# Use the values restored by %store when every one of them is present;
# otherwise rebuild the whole configuration from the project defaults below.
try:
    # Referencing an unbound name raises NameError. All six restored values are
    # listed (both S3 paths included) so that a partially restored store also
    # triggers the fallback instead of failing later in another cell.
    test_vars = [bucket, region, database_name, table_name_parquet,
                 s3_parquet_path, s3_athena_staging]
    print(f"Region: {region} | Bucket: {bucket} | Database: {database_name}")
except NameError:
    # Region and account come from the ambient AWS credentials/configuration.
    boto_session = boto3.Session()
    region = boto_session.region_name
    sts_client = boto3.client('sts')
    account_id = sts_client.get_caller_identity().get('Account')
    
    # Project defaults for the bucket, Athena catalog names and S3 layout.
    bucket = "cmsopenpaymentsystemslight"
    database_name = "cms_open_payments_light"
    table_name_parquet = "general_payments_parquet"
    cms_data_prefix = "cms-open-payments_light"
    parquet_data_prefix = f"{cms_data_prefix}/parquet"
    s3_parquet_path = f"s3://{bucket}/{parquet_data_prefix}"
    s3_athena_staging = f"s3://{bucket}/athena/staging"
    
    print(f"Region: {region} | Bucket: {bucket} | Database: {database_name}")

In [None]:
# Names this notebook expects to exist already, with a human-readable label.
required_from_nb01 = {
    'bucket': 'S3 bucket name',
    'region': 'AWS region',
    'database_name': 'Athena database name',
    'table_name_parquet': 'Parquet table name',
    's3_parquet_path': 'S3 parquet path',
    's3_athena_staging': 'Athena staging path'
}

# A plain namespace lookup answers "is this name defined?" without
# evaluating strings as code.
missing_vars = [
    (var_name, description)
    for var_name, description in required_from_nb01.items()
    if var_name not in globals()
]

if missing_vars:
    print(f"WARNING: {len(missing_vars)} prerequisites missing. Run notebook 01 first.")
    # List what is missing so the user knows which values to restore.
    for var_name, description in missing_vars:
        print(f"  - {var_name}: {description}")
else:
    print("Prerequisites validated.")

## 2. Data Loading from Datalake

In [None]:
# Open a PyAthena connection in the configured region, with query results
# staged under `s3_athena_staging`.
# NOTE(review): the loading cells visible in this notebook query through
# awswrangler (`wr.athena`) and do not reference `athena_conn` - confirm it
# is still needed.
athena_conn = connect(
    region_name=region,
    s3_staging_dir=s3_athena_staging
)

In [None]:
# Flag controlling whether the whole Athena table is read into memory.
load_full_dataset = True

if load_full_dataset:
    # NOTE(review): the following cell assigns `df` again from a LIMIT query,
    # so the frame loaded here is replaced - confirm both queries are wanted.
    df = wr.athena.read_sql_query(
        sql=f"SELECT * FROM {database_name}.{table_name_parquet}",
        database=database_name,
        ctas_approach=False
    )
    print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

In [None]:
# Row cap for the sample query: 1,000,000 when the full-load flag is set,
# 100,000 otherwise.
sample_size = 1_000_000 if load_full_dataset else 100_000

# NOTE(review): LIMIT is used without ORDER BY, so presumably the rows returned
# are an arbitrary subset rather than a reproducible sample - confirm.
sample_query = f"""
SELECT *
FROM {database_name}.{table_name_parquet}
LIMIT {sample_size}
"""

# This assignment runs unconditionally and rebinds `df`, including when a
# full dataset was loaded in the previous cell.
df = wr.athena.read_sql_query(
    sql=sample_query,
    database=database_name,
    ctas_approach=False
)

print(f"Sample loaded: {df.shape[0]:,} rows, {df.shape[1]} columns | Memory: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

In [None]:
# Preview the first three rows and report the loaded dimensions.
display(df.head(3))
print(f"Shape: {df.shape} | Columns: {df.shape[1]}")

## 3. Data Cleaning & Preprocessing

Prepare data for anomaly detection models by cleaning, standardizing, and selecting relevant features.

### 3.1 Initial Data Assessment

In [None]:
# Snapshot the raw dimensions so the cleaning summary can report what changed.
initial_shape = df.shape
initial_columns = initial_shape[1]

print(f"Shape: {initial_shape} | Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"\nData Types:\n{df.dtypes.value_counts()}")

# Count nulls per column once; both the absolute and the percentage figures
# are derived from the same series.
null_counts = df.isnull().sum()
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percent': (null_counts / len(df) * 100).round(2)
})

# Keep only columns that actually have gaps, worst first.
has_missing = missing_summary['Missing_Count'] > 0
missing_summary = missing_summary.loc[has_missing].sort_values(
    by='Missing_Percent', ascending=False
)

if not missing_summary.empty:
    print(f"\nMissing Values: {len(missing_summary)}/{len(df.columns)} columns")
    display(missing_summary.head(10))

print(f"\nDuplicates: {df.duplicated().sum():,}")

### 3.2 Feature Selection for Anomaly Detection

In [None]:
# Payment amount, payment count and payment date.
CORE_PAYMENT_FEATURES = [
    'total_amount_of_payment_usdollars',
    'number_of_payments_included_in_total_amount',
    'date_of_payment',
]

# Descriptive categorical attributes of each payment.
CATEGORICAL_FEATURES = [
    'covered_recipient_type',
    'nature_of_payment_or_transfer_of_value',
    'form_of_payment_or_transfer_of_value',
    'physician_specialty',
    'recipient_state',
]

# Who received the payment and who made it.
IDENTIFIER_FEATURES = [
    'covered_recipient_profile_id',
    'covered_recipient_npi',
    'applicable_manufacturer_or_applicable_gpo_making_payment_name',
]

# Indicator columns, converted to 0/1 during type conversion.
RISK_INDICATOR_FEATURES = [
    'physician_ownership_indicator',
    'third_party_payment_recipient_indicator',
    'product_indicator',
]

all_selected_features = [
    *CORE_PAYMENT_FEATURES,
    *CATEGORICAL_FEATURES,
    *IDENTIFIER_FEATURES,
    *RISK_INDICATOR_FEATURES,
]

# Split the wish-list into columns the loaded frame has and columns it lacks.
available_features, missing_features = [], []
for feature in all_selected_features:
    if feature in df.columns:
        available_features.append(feature)
    else:
        missing_features.append(feature)

print(f"Features: {len(available_features)}/{len(all_selected_features)} available")
if missing_features:
    print(f"Missing: {missing_features}")

# Work on an independent copy so cleaning never touches the loaded frame.
df_selected = df.loc[:, available_features].copy()
print(f"Selected: {df_selected.shape}")

### 3.3 Data Type Conversion and Validation

In [None]:
# Payment amount: coerce to numeric (unparseable values become NaN) and, when
# any negative amounts exist, replace every amount with its absolute value.
amount_field = 'total_amount_of_payment_usdollars'
if amount_field in df_selected.columns:
    amounts = pd.to_numeric(df_selected[amount_field], errors='coerce')
    negative_count = (amounts < 0).sum()
    if negative_count > 0:
        amounts = amounts.abs()
    df_selected[amount_field] = amounts

# Payment date: unparseable values become NaT.
if 'date_of_payment' in df_selected.columns:
    df_selected['date_of_payment'] = pd.to_datetime(df_selected['date_of_payment'], errors='coerce')

# Payment count: missing or unparseable counts default to 1.
count_field = 'number_of_payments_included_in_total_amount'
if count_field in df_selected.columns:
    payment_counts = pd.to_numeric(df_selected[count_field], errors='coerce')
    df_selected[count_field] = payment_counts.fillna(1).astype('int64')

# Indicator columns stored as text become 0/1; any value outside the mapping
# (including missing) is treated as 0.
indicator_mapping = {'Yes': 1, 'Y': 1, 'No': 0, 'N': 0, 'Unknown': 0}
for col in RISK_INDICATOR_FEATURES:
    if col not in df_selected.columns or df_selected[col].dtype != 'object':
        continue
    df_selected[col] = df_selected[col].map(indicator_mapping).fillna(0).astype('int64')

# State codes: upper-case, then strip surrounding whitespace.
if 'recipient_state' in df_selected.columns:
    df_selected['recipient_state'] = df_selected['recipient_state'].str.upper().str.strip()

print("Data types converted")

### 3.4 Handle Missing Values

In [None]:
# Impute missing values column by column and report how many cells were filled.
#
# Every fill is written back with `df_selected[col] = ...`. Calling
# `fillna(..., inplace=True)` on `df_selected[col]` acts on a temporary object
# and is not guaranteed to update the frame (it does nothing under pandas
# Copy-on-Write), and warnings are silenced in this notebook, so a failed
# fill would go unnoticed.
missing_before = df_selected.isnull().sum().sum()

# Payment amount: fill with the column median.
if 'total_amount_of_payment_usdollars' in df_selected.columns:
    payment_median = df_selected['total_amount_of_payment_usdollars'].median()
    missing_amt = df_selected['total_amount_of_payment_usdollars'].isnull().sum()
    if missing_amt > 0:
        df_selected['total_amount_of_payment_usdollars'] = (
            df_selected['total_amount_of_payment_usdollars'].fillna(payment_median)
        )

# Payment date: take the previous row's date, then the next row's for any
# leading gaps. `ffill()` / `bfill()` replace the deprecated `fillna(method=...)`.
if 'date_of_payment' in df_selected.columns:
    missing_dates = df_selected['date_of_payment'].isnull().sum()
    if missing_dates > 0:
        df_selected['date_of_payment'] = df_selected['date_of_payment'].ffill().bfill()

# Text columns: the two core categoricals get their most frequent value,
# everything else the literal "Unknown".
# NOTE(review): identifier columns pass through these generic loops as well
# (text ids become "Unknown", numeric ids the column median), so the later
# filter on missing `covered_recipient_profile_id` cannot match any row -
# confirm this is intended.
for col in df_selected.select_dtypes(include=['object']).columns:
    missing_count = df_selected[col].isnull().sum()
    if missing_count > 0:
        if col in ['covered_recipient_type', 'nature_of_payment_or_transfer_of_value']:
            mode_values = df_selected[col].mode()
            mode_val = mode_values[0] if len(mode_values) > 0 else "Unknown"
            df_selected[col] = df_selected[col].fillna(mode_val)
        else:
            df_selected[col] = df_selected[col].fillna("Unknown")

# Numeric columns: count-like columns get 0, all others the column median.
for col in df_selected.select_dtypes(include=[np.number]).columns:
    missing_count = df_selected[col].isnull().sum()
    if missing_count > 0:
        if 'count' in col.lower() or 'number' in col.lower():
            df_selected[col] = df_selected[col].fillna(0)
        else:
            df_selected[col] = df_selected[col].fillna(df_selected[col].median())

missing_after = df_selected.isnull().sum().sum()
print(f"Missing values resolved: {missing_before - missing_after:,}")

### 3.5 Remove Invalid Records

In [None]:
records_before = len(df_selected)

# Exact duplicate rows: keep the first occurrence of each.
duplicate_mask = df_selected.duplicated()
duplicates = duplicate_mask.sum()
if duplicates > 0:
    df_selected = df_selected.loc[~duplicate_mask]

# If any zero-amount payments exist, keep only strictly positive amounts.
amount_field = 'total_amount_of_payment_usdollars'
if amount_field in df_selected.columns:
    zero_payments = df_selected[amount_field].eq(0).sum()
    if zero_payments > 0:
        df_selected = df_selected.loc[df_selected[amount_field].gt(0)]

# Rows still lacking a payment date.
if 'date_of_payment' in df_selected.columns:
    date_is_null = df_selected['date_of_payment'].isna()
    invalid_dates = date_is_null.sum()
    if invalid_dates > 0:
        df_selected = df_selected.loc[~date_is_null]

# Rows still lacking a recipient profile id.
if 'covered_recipient_profile_id' in df_selected.columns:
    id_is_null = df_selected['covered_recipient_profile_id'].isna()
    missing_id = id_is_null.sum()
    if missing_id > 0:
        df_selected = df_selected.loc[~id_is_null]

df_selected = df_selected.reset_index(drop=True)
records_after = len(df_selected)
records_removed = records_before - records_after

print(f"Records: {records_after:,} | Removed: {records_removed:,} ({records_removed/records_before*100:.2f}%) | Retention: {records_after/records_before*100:.2f}%")

# From here on `df` refers to the cleaned, feature-selected data.
df = df_selected.copy()

### 3.6 Data Cleaning Summary

In [None]:
# Before/after metrics for the cleaning stage, as (label, value) pairs.
summary_rows = [
    ('Initial Records', f"{initial_shape[0]:,}"),
    ('Final Records', f"{df.shape[0]:,}"),
    ('Records Removed', f"{initial_shape[0] - df.shape[0]:,}"),
    ('Initial Columns', initial_columns),
    ('Final Columns', df.shape[1]),
    ('Missing Values', f"{df.isnull().sum().sum():,}"),
    ('Duplicates', f"{df.duplicated().sum():,}"),
    ('Data Completeness (%)', f"{(1 - df.isnull().sum().sum()/(df.shape[0]*df.shape[1]))*100:.2f}"),
]
summary_data = {
    'Metric': [label for label, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

display(pd.DataFrame(summary_data))

# How many features of each group are present in the cleaned frame.
feature_groups = {
    'Core Payment': CORE_PAYMENT_FEATURES,
    'Categorical': CATEGORICAL_FEATURES,
    'Risk Indicators': RISK_INDICATOR_FEATURES,
    'Identifiers': IDENTIFIER_FEATURES,
}
feature_data = {
    'Category': list(feature_groups),
    'Count': [
        sum(1 for f in group if f in df.columns)
        for group in feature_groups.values()
    ],
}

display(pd.DataFrame(feature_data))

# Headline statistics of the payment amount, formatted as currency strings.
if 'total_amount_of_payment_usdollars' in df.columns:
    payment_stats = df['total_amount_of_payment_usdollars']
    reducers = {
        'Total': payment_stats.sum,
        'Mean': payment_stats.mean,
        'Median': payment_stats.median,
        'Min': payment_stats.min,
        'Max': payment_stats.max,
    }
    payment_data = {
        'Statistic': list(reducers),
        'Amount ($)': [f"{reducer():,.2f}" for reducer in reducers.values()],
    }
    display(pd.DataFrame(payment_data))

display(df.head(3))

In [None]:
# Derive calendar features from the payment date.
payment_date_col = 'date_of_payment'
if payment_date_col in df.columns:
    calendar_parts = {
        'payment_year': 'year',
        'payment_month': 'month',
        'payment_quarter': 'quarter',
        'payment_dayofweek': 'dayofweek',
    }
    for new_col, accessor_name in calendar_parts.items():
        df[new_col] = getattr(df[payment_date_col].dt, accessor_name)
    # Day-of-week values of 5 and above are flagged as weekend (1), others 0.
    df['is_weekend'] = df[payment_date_col].dt.dayofweek.ge(5).astype('int64')

In [None]:
def _pick_column(columns, preferred, keywords):
    """Return the column to use for a role, or None when nothing matches.

    Args:
        columns: Available column names (e.g. ``df.columns``).
        preferred: Exact names to try first, in priority order.
        keywords: Lower-case substrings; used only when no preferred name is
            present. The first column whose lower-cased name contains every
            keyword is returned.
    """
    for candidate in preferred:
        if candidate in columns:
            return candidate
    for column in columns:
        lowered = column.lower()
        if all(keyword in lowered for keyword in keywords):
            return column
    return None


# Each role is resolved the same way: exact candidates first, then a
# case-insensitive keyword search over the frame's columns.
payment_col_options = ['total_amount_of_payment_usdollars', 'total_amount', 'amount', 'payment_amount']
payment_col = _pick_column(df.columns, payment_col_options, ['amount'])

recipient_type_col_options = ['covered_recipient_type', 'recipient_type', 'recipienttype']
recipient_type_col = _pick_column(df.columns, recipient_type_col_options, ['recipient', 'type'])

# All columns whose name mentions both "recipient" and "id".
recipient_id_cols = [col for col in df.columns if 'recipient' in col.lower() and 'id' in col.lower()]

state_col_options = ['recipient_state', 'state']
state_col = _pick_column(df.columns, state_col_options, ['state', 'recipient'])

# 'recipient_city' is listed first because the other column names used in this
# notebook are lower-case; the original mixed-case spelling is kept as a
# fallback for schemas that preserve it.
city_col_options = ['recipient_city', 'recipient_City', 'city']
city_col = _pick_column(df.columns, city_col_options, ['city', 'recipient'])

print(f"Payment: {payment_col} | Recipient Type: {recipient_type_col} | State: {state_col}")

## 4. Data Quality Assessment

In [None]:
# Whole-frame quality figures: cell count, null cells and duplicate rows.
total_cells = df.shape[0] * df.shape[1]
missing_cells = df.isnull().sum().sum()
duplicates = df.duplicated().sum()

# Report dimensions, column counts per dtype family, and the missing and
# duplicate shares as percentages.
print(f"Dimensions: {df.shape[0]:,} rows x {df.shape[1]} columns")
print(f"Data Types: Numeric={len(df.select_dtypes(include=['number']).columns)} | Object={len(df.select_dtypes(include=['object']).columns)} | DateTime={len(df.select_dtypes(include=['datetime']).columns)}")
print(f"Missing: {missing_cells:,}/{total_cells:,} ({(missing_cells/total_cells)*100:.2f}%)")
print(f"Duplicates: {duplicates:,} ({(duplicates/df.shape[0])*100:.2f}%)")

In [None]:
# Per-column null counts, computed once and reused for the percentage.
null_counts = df.isnull().sum().values
missing_stats = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percent': null_counts / len(df) * 100
})

# Only columns with at least one gap, ordered from most to least incomplete.
has_missing = missing_stats['Missing_Count'] > 0
missing_stats = missing_stats.loc[has_missing].sort_values(
    by='Missing_Percent', ascending=False
)

display(missing_stats.head(20))

In [None]:
# Plot missing values only when some column has gaps and the optional
# visualizer was imported.
if len(missing_stats) > 0 and use_visualizer:
    fig = visualizer.plot_missing_values(df, top_n=20)
    # Show the plot only when the helper returned a truthy figure.
    if fig:
        plt.show()

In [None]:
# Number of columns per dtype in the current frame.
print(df.dtypes.value_counts())

## 5. Exploratory Data Analysis

Analyze distributions and patterns in cleaned data.

In [None]:
# Descriptive statistics of the payment amount; skipped when no payment column
# was detected.
if payment_col and payment_col in df.columns:
    payment_stats = df[payment_col].describe()
    
    print(f"Count: {payment_stats['count']:,.0f}")
    print(f"Mean: ${payment_stats['mean']:,.2f} | Median: ${payment_stats['50%']:,.2f} | Std: ${payment_stats['std']:,.2f}")
    print(f"Min: ${payment_stats['min']:,.2f} | Max: ${payment_stats['max']:,.2f}")
    print(f"Quartiles: 25%=${payment_stats['25%']:,.2f} | 50%=${payment_stats['50%']:,.2f} | 75%=${payment_stats['75%']:,.2f}")
    
    # Upper-tail percentiles, printed on a single line.
    for p in [90, 95, 99]:
        print(f"{p}th percentile: ${df[payment_col].quantile(p/100):,.2f}", end=' | ')
    print()
    
    # Shape of the distribution.
    print(f"Skewness: {df[payment_col].skew():.2f} | Kurtosis: {df[payment_col].kurtosis():.2f}")

In [None]:
# Detailed payment-distribution plot; needs a detected payment column and the
# optional visualizer.
if payment_col and payment_col in df.columns and use_visualizer:
    visualizer.plot_payment_distribution_detailed(df, payment_col=payment_col)
    plt.show()

In [None]:
# Categorical columns to profile; any that are absent from `df` are skipped.
categorical_cols = [
    'change_type',
    'covered_recipient_type',
    'form_of_payment_or_transfer_of_value',
    'nature_of_payment_or_transfer_of_value',
]

for col in categorical_cols:
    if col not in df.columns:
        continue
    # Ten most frequent values, preceded by cardinality and missing share.
    value_counts = df[col].value_counts().head(10)
    print(f"\n{col}: {df[col].nunique()} unique | Missing: {df[col].isnull().sum()} ({df[col].isnull().sum()/len(df)*100:.1f}%)")
    print(value_counts)

In [None]:
# Categorical columns to chart (top 15 categories each).
categorical_cols_viz = [
    'covered_recipient_type',
    'form_of_payment_or_transfer_of_value',
    'nature_of_payment_or_transfer_of_value',
]

for col in categorical_cols_viz:
    # Nothing to draw without the column or without the optional visualizer.
    if col not in df.columns or not use_visualizer:
        continue
    visualizer.plot_category_distribution(df, col, top_n=15)
    plt.show()

### 5.1 Bivariate Analysis

Explore relationships between variables.

In [None]:
# Payment amount statistics per recipient type, largest total first.
recipient_type_col = 'covered_recipient_type'

if recipient_type_col in df.columns and payment_col in df.columns:
    type_stats = (
        df.groupby(recipient_type_col)[payment_col]
        .agg(['count', 'sum', 'mean', 'median', 'std', 'min', 'max'])
        .round(2)
        .set_axis(
            ['Count', 'Total ($)', 'Mean ($)', 'Median ($)', 'Std Dev ($)', 'Min ($)', 'Max ($)'],
            axis='columns',
        )
        .sort_values('Total ($)', ascending=False)
    )

    display(type_stats)

In [None]:
# Payment-by-recipient-type plot; needs both columns and the optional visualizer.
if recipient_type_col and recipient_type_col in df.columns and payment_col and payment_col in df.columns and use_visualizer:
    visualizer.plot_bivariate_comparison(df, group_col=recipient_type_col, amount_col=payment_col)
    plt.show()

In [None]:
# Nature-of-payment plot (top 15 requested). The column names are passed
# literally here rather than via the detected `payment_col`, so a KeyError is
# caught and reported instead of stopping the notebook.
if use_visualizer:
    try:
        visualizer.plot_payment_nature_by_total(
            df,
            nature_col='nature_of_payment_or_transfer_of_value',
            amount_col='total_amount_of_payment_usdollars',
            top_n=15
        )
    except KeyError as e:
        print(f"Error: {e}")

### 5.2 Temporal Analysis

Analyze payment patterns over time.

In [None]:
# Correlation heatmap over the payment amount plus whichever of the derived
# numeric columns exist in `df`.
if payment_col and payment_col in df.columns:
    potential_numeric = [
        'number_of_payments_included_in_total_amount',
        'payment_month',
        'payment_quarter',
        'payment_dayofweek',
    ]
    numeric_features = [payment_col] + [
        name for name in potential_numeric if name in df.columns
    ]

    # Drawn only when more than two features are present.
    if use_visualizer and len(numeric_features) > 2:
        visualizer.plot_correlation_heatmap(df, numeric_features)
        plt.show()

### Correlation Analysis

Explore correlations between numeric features.

In [None]:
# Payment count, total, mean and median per calendar month.
if 'payment_month' in df.columns and payment_col in df.columns:
    monthly_stats = (
        df.groupby('payment_month')[payment_col]
        .agg(['count', 'sum', 'mean', 'median'])
        .round(2)
        .set_axis(['Count', 'Total ($)', 'Mean ($)', 'Median ($)'], axis='columns')
        .rename_axis('Month')
    )
    display(monthly_stats)

### Monthly Payment Statistics

In [None]:
# Payments-over-time plot; needs the date column, a detected payment column
# and the optional visualizer.
if 'date_of_payment' in df.columns and payment_col and payment_col in df.columns and use_visualizer:
    visualizer.plot_temporal_trends(df, date_col='date_of_payment', amount_col=payment_col)
    plt.show()

### Temporal Trends Overview

Visualize overall temporal patterns in payments.

In [None]:
# Month-level trend plot; needs the derived month column, a detected payment
# column and the optional visualizer.
if 'payment_month' in df.columns and payment_col and payment_col in df.columns and use_visualizer:
    visualizer.plot_monthly_trends(df, payment_col=payment_col, month_col='payment_month')
    plt.show()

### Quarterly Payment Statistics

In [None]:
# Payment count, total, mean and median per quarter, followed by the
# matching plot when the optional visualizer is available.
if 'payment_quarter' in df.columns and payment_col and payment_col in df.columns:
    quarterly_stats = (
        df.groupby('payment_quarter')[payment_col]
        .agg(['count', 'sum', 'mean', 'median'])
        .round(2)
        .set_axis(['Count', 'Total ($)', 'Mean ($)', 'Median ($)'], axis='columns')
        .rename_axis('Quarter')
    )
    display(quarterly_stats)

    if use_visualizer:
        visualizer.plot_quarterly_trends(df, quarter_col='payment_quarter', amount_col=payment_col)
        plt.show()

## 7. Geographic Analysis

Analyzing payment distributions across geographic regions.

In [None]:
# Top 20 states by total payment amount, with count, mean and median.
state_col = 'recipient_state'

if state_col in df.columns and payment_col in df.columns:
    state_stats = (
        df.groupby(state_col)[payment_col]
        .agg(['count', 'sum', 'mean', 'median'])
        .round(2)
        .set_axis(['Count', 'Total ($)', 'Mean ($)', 'Median ($)'], axis='columns')
        .sort_values('Total ($)', ascending=False)
        .head(20)
        .rename_axis('State')
    )
    display(state_stats)

### State-Level Payment Statistics

In [None]:
# Geographic plots; this whole cell is skipped without the state column or the
# optional visualizer.
if state_col and state_col in df.columns and use_visualizer:
    visualizer.plot_geographic_distribution(df, state_col=state_col)
    plt.show()
    
    if payment_col and payment_col in df.columns:
        # Per-state aggregates of the payment amount.
        state_summary = df.groupby(state_col)[payment_col].agg(['count', 'sum', 'mean', 'median']).round(2)
        
        # Horizontal bar chart of the ten states with the largest totals.
        plt.figure(figsize=(12, 6))
        top_states = state_summary['sum'].nlargest(10)
        sns.barplot(x=top_states.values, y=top_states.index)
        plt.title('Top 10 States by Total Payment Amount')
        plt.xlabel('Total Payment Amount ($)')
        plt.tight_layout()
        plt.show()

### Geographic Distribution Overview

In [None]:
# State comparison plot (top 20 requested); needs both columns and the
# optional visualizer.
if state_col and state_col in df.columns and payment_col and payment_col in df.columns and use_visualizer:
    visualizer.plot_state_comparison(df, state_col=state_col, payment_col=payment_col, top_n=20)
    plt.show()

### Interactive Geographic Maps

In [None]:
# Per-recipient aggregates of the payment amount.
# Candidate id columns are found by substring match on "recipient" and "id".
recipient_id_cols = [col for col in df.columns if 'recipient' in col.lower() and 'id' in col.lower()]

# Same guard as the other analysis cells: without a detected payment column
# the aggregation below would raise, so it is skipped with a message instead.
if len(recipient_id_cols) > 0 and payment_col and payment_col in df.columns:
    # The first match is used as the grouping key.
    recipient_id_col = recipient_id_cols[0]

    agg_features = df.groupby(recipient_id_col).agg({
        payment_col: ['count', 'sum', 'mean', 'median', 'std', 'min', 'max']
    }).round(2)

    # Flatten the two-level (column, statistic) header into "<column>_<stat>".
    agg_features.columns = ['_'.join(col).strip() for col in agg_features.columns.values]
    agg_features = agg_features.reset_index()

    print(f"Aggregated features: {agg_features.shape}")
    display(agg_features.head(10))
else:
    print("Recipient aggregation skipped: recipient id or payment column not available.")