# Training Data Analysis

This notebook contains various data analysis and exploration tools for the ClearShield training pipeline.

**Purpose**: Analyze and validate training data at different pipeline stages

**Sections**:
1. Cleaned Data Analysis
2. Member Transaction Distribution
3. Fraud Statistics Analysis
4. Column Statistics

## 0. Setup and Configuration

In [None]:
import sys
import os
import pandas as pd
from pathlib import Path
from glob import glob

# Add project root to path so the `config` package can be imported.
# The root is taken to be the parent of the current working directory, so this
# cell expects the notebook to be started from a directory directly below it.
PROJECT_ROOT = Path.cwd().parent
# Notebook cells get re-run; only insert when the root is not already the
# first entry so repeated runs do not pile up duplicate sys.path entries.
if sys.path[:1] != [str(PROJECT_ROOT)]:
    sys.path.insert(0, str(PROJECT_ROOT))

# Import centralized configuration (must come after the sys.path change above)
from config.pipeline_config import get_train_config

# Get configuration; `config` is used by every analysis cell below
config = get_train_config()

print(f"Project Root: {config.PROJECT_ROOT}")
print(f"Data Root: {config.DATA_ROOT}")

---

## 1. Cleaned Data Analysis

Analyze the cleaned data files after Stage 1 preprocessing.

### 1.1 File Overview: Size, Rows, and Date Span

In [None]:
# Cleaned-file overview: one table row per CSV (rows, unique members, date
# span, fraud rate) followed by totals across all files.
CLEANED_DIR = str(config.get_path('cleaned'))
csv_files = sorted([f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv')])

print(f"Analyzing {len(csv_files)} file(s) in: {CLEANED_DIR}\n")

# Load all dataframes, keyed by filename
dfs = {}
for filename in csv_files:
    dfs[filename] = pd.read_csv(os.path.join(CLEANED_DIR, filename))

if not dfs:
    # With no files the stats table would have no columns at all, and the
    # summary below would fail on df_stats['Rows']; report it instead.
    print("⚠ No cleaned files found. Please run Stage 1 first.")
else:
    # Collect per-file stats and the cross-file totals in a single pass
    stats = []
    all_members = set()
    total_fraud = 0
    for filename, df in dfs.items():
        # Unparseable dates become NaT and are ignored by min()/max()
        df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')
        min_date = df['Post Date'].min()
        max_date = df['Post Date'].max()

        # A row counts as fraud when the indicator is neither missing nor ''
        indicator = df['Fraud Adjustment Indicator']
        fraud_count = int((indicator.notna() & (indicator != '')).sum())
        total_fraud += fraud_count
        all_members.update(df['Member ID'].dropna())

        stats.append({
            'File': filename,
            'Rows': len(df),
            'Members': df['Member ID'].nunique(),
            'Date From': min_date.strftime('%m/%d/%Y') if pd.notna(min_date) else 'N/A',
            'Date To': max_date.strftime('%m/%d/%Y') if pd.notna(max_date) else 'N/A',
            'Days': (max_date - min_date).days if pd.notna(min_date) else 0,
            'Fraud %': round(fraud_count / len(df) * 100, 4) if len(df) > 0 else 0
        })

    # Display table
    df_stats = pd.DataFrame(stats)
    display(df_stats)

    # Summary; the overall rate falls back to 0 when every file is empty,
    # matching the per-file 'Fraud %' convention above
    total_rows = int(df_stats['Rows'].sum())
    overall_fraud_pct = round(total_fraud / total_rows * 100, 4) if total_rows > 0 else 0

    print(f"\n{'='*60}")
    print("SUMMARY STATISTICS")
    print(f"{'='*60}")
    print(f"Total Rows: {total_rows:,}")
    print(f"Total Unique Members: {len(all_members):,}")
    print(f"Total Fraud Indicators: {total_fraud:,}")
    print(f"Overall Fraud %: {overall_fraud_pct}%")
    print(f"{'='*60}")

### 1.2 File Overlap Detection

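Check for duplicate records across different CSV files. Two records are treated as the same when their Account ID, Member ID, Post Date, Post Time, and Amount all match.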

In [None]:
# Pairwise overlap between cleaned CSV files. Each row is reduced to a string
# key built from the columns below; keys are held in a set per file, so
# duplicate rows inside one file count once and percentages are relative to
# the number of distinct keys in that file.
from itertools import combinations

CLEANED_DIR = str(config.get_path('cleaned'))
csv_files = sorted([f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv')])

print(f"Loading {len(csv_files)} file(s) for overlap detection...\n")

# Columns that together identify a record
key_columns = ['Account ID', 'Member ID', 'Post Date', 'Post Time', 'Amount']

# Load files and create row IDs
file_rows = {}
for filename in csv_files:
    df = pd.read_csv(os.path.join(CLEANED_DIR, filename))
    row_key = df[key_columns[0]].astype(str)
    for col in key_columns[1:]:
        row_key = row_key + '|' + df[col].astype(str)
    row_ids = set(row_key)
    file_rows[filename] = row_ids

# Calculate pairwise overlaps (each unordered pair once, in sorted file order)
results = []
for file1, file2 in combinations(csv_files, 2):
    rows1 = file_rows[file1]
    rows2 = file_rows[file2]
    overlap = len(rows1 & rows2)
    # A file with no rows cannot overlap anything; report 0% instead of
    # dividing by zero.
    pct1 = overlap / len(rows1) * 100 if rows1 else 0.0
    pct2 = overlap / len(rows2) * 100 if rows2 else 0.0

    results.append({
        'File 1': file1,
        'File 2': file2,
        'Overlap Rows': overlap,
        '% of File 1': round(pct1, 2),
        '% of File 2': round(pct2, 2)
    })

df_results = pd.DataFrame(results)

if df_results.empty:
    # No pairs were formed, i.e. fewer than two files were available
    print("✓ Only one file found or no overlaps detected")
else:
    print("Overlap Analysis Results:")
    display(df_results)

    # Highlight significant overlaps
    significant = df_results[df_results['Overlap Rows'] > 0]
    if not significant.empty:
        print(f"\n⚠ Warning: Found {len(significant)} file pair(s) with overlapping records")
    else:
        print("\n✓ No overlapping records found between files")

---

## 2. Member Transaction Distribution

Analyze transaction count distribution across members after Stage 3.

In [None]:
# Configuration
BY_MEMBER_DIR = str(config.get_path('by_member_temp'))
threshold = config.fraud_matching['min_history_length']

print(f"Analyzing member files in: {BY_MEMBER_DIR}")
print(f"Minimum history threshold: {threshold}\n")

# Get all member files and count transactions
member_files = glob(os.path.join(BY_MEMBER_DIR, 'member_*.csv'))

if not member_files:
    print("⚠ No member files found. Please run Stage 3 first.")
else:
    import statistics

    # One entry per member file: the number of data rows in that file
    counts = [len(pd.read_csv(f)) for f in member_files]

    # Calculate statistics (total_count >= 1 here, so the ratios are safe)
    total_count = len(counts)
    above_n = sum(1 for c in counts if c >= threshold)
    below_n = total_count - above_n
    above_ratio = (above_n / total_count) * 100
    below_ratio = (below_n / total_count) * 100

    # Print results
    print(f"{'='*60}")
    print("MEMBER TRANSACTION DISTRIBUTION")
    print(f"{'='*60}")
    print(f"Threshold: {threshold} transactions")
    print(f"Total Members: {total_count:,}")
    print(f"\nMembers >= {threshold} txns: {above_n:,} ({above_ratio:.2f}%)")
    print(f"Members < {threshold} txns:  {below_n:,} ({below_ratio:.2f}%)")
    print(f"{'='*60}")

    # Distribution visualization
    import matplotlib.pyplot as plt
    import seaborn as sns  # not referenced in this cell; import kept as in the original

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Histogram
    axes[0].hist(counts, bins=50, edgecolor='black', alpha=0.7)
    axes[0].axvline(threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold ({threshold})')
    axes[0].set_xlabel('Number of Transactions')
    axes[0].set_ylabel('Number of Members')
    axes[0].set_title('Transaction Count Distribution')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Box plot
    axes[1].boxplot(counts, vert=True)
    axes[1].axhline(threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold ({threshold})')
    axes[1].set_ylabel('Number of Transactions')
    axes[1].set_title('Transaction Count Box Plot')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Additional statistics
    print("\nDistribution Statistics:")
    print(f"  Mean: {sum(counts) / len(counts):.2f}")
    # statistics.median averages the two middle values for an even number of
    # members; indexing sorted(counts)[n // 2] would report the upper one.
    print(f"  Median: {statistics.median(counts)}")
    print(f"  Min: {min(counts)}")
    print(f"  Max: {max(counts)}")

---

## 3. Fraud Statistics Analysis

Analyze fraud matching results after Stage 3 completion.

In [None]:
# Read member summary written by the fraud-matching stage. The code below
# reads the columns Category, Total_Transactions, Fraud_Adjustments and Matched.
summary_path = config.get_path('by_member') / 'member_summary.csv'

if summary_path.exists():
    summary_df = pd.read_csv(summary_path)

    print(f"{'='*60}")
    print("FRAUD MATCHING RESULTS")
    print(f"{'='*60}")
    print(f"Total Members Processed: {len(summary_df):,}")
    print("\nCategory Breakdown:")
    print(summary_df['Category'].value_counts())

    # Detailed statistics
    print(f"\n{'='*60}")
    print("FRAUD ADJUSTMENT STATISTICS")
    print(f"{'='*60}")

    # Categories absent from the summary are skipped silently
    for category in ['matched', 'unmatched', 'no_fraud']:
        cat_df = summary_df[summary_df['Category'] == category]
        if not cat_df.empty:
            print(f"\n{category.upper()}:")
            print(f"  Members: {len(cat_df):,}")
            print(f"  Total Transactions: {cat_df['Total_Transactions'].sum():,}")
            print(f"  Fraud Adjustments: {cat_df['Fraud_Adjustments'].sum():,}")
            print(f"  Matched: {cat_df['Matched'].sum():,}")

    # Visualization
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Category distribution
    category_counts = summary_df['Category'].value_counts()
    axes[0].pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%', startangle=90)
    axes[0].set_title('Member Distribution by Category')

    # Match rate, only for members with at least one fraud adjustment (this
    # also keeps the division below away from zero). The explicit copy makes
    # match_data an independent frame, so adding a column neither triggers
    # SettingWithCopyWarning nor depends on view-vs-copy behavior.
    match_data = summary_df[summary_df['Fraud_Adjustments'] > 0].copy()
    if not match_data.empty:
        match_data['Match_Rate'] = match_data['Matched'] / match_data['Fraud_Adjustments'] * 100
        axes[1].hist(match_data['Match_Rate'], bins=20, edgecolor='black', alpha=0.7)
        axes[1].set_xlabel('Match Rate (%)')
        axes[1].set_ylabel('Number of Members')
        axes[1].set_title('Fraud Adjustment Match Rate Distribution')
        axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

else:
    print("⚠ Member summary not found. Please run Stage 3 first.")

---

## 4. Column Statistics

Analyze column distributions and data quality. Only the first cleaned file (in alphabetical order) is examined.

In [None]:
# Data-quality report for the first cleaned CSV (alphabetical order):
# missing values per column, top values of selected categorical columns,
# and summary statistics of the Amount column.
CLEANED_DIR = str(config.get_path('cleaned'))
csv_files = sorted(name for name in os.listdir(CLEANED_DIR) if name.endswith('.csv'))

if not csv_files:
    print("⚠ No cleaned files found. Please run Stage 1 first.")
else:
    banner = '=' * 60
    target_file = csv_files[0]

    # Only the first file is inspected
    df = pd.read_csv(os.path.join(CLEANED_DIR, target_file))

    print(banner)
    print("DATA QUALITY ANALYSIS")
    print(banner)
    print(f"\nAnalyzing: {target_file}")
    print(f"Total Rows: {len(df):,}\n")

    # Missing values: count and percentage per column, shown only for
    # columns that actually have gaps
    print("Missing Values:")
    missing = df.isnull().sum()
    missing_pct = missing.div(len(df)).mul(100).round(2)
    missing_df = pd.DataFrame({
        'Column': missing.index,
        'Missing': missing.to_numpy(),
        'Percent': missing_pct.to_numpy(),
    })
    has_gaps = missing_df['Missing'] > 0
    display(missing_df[has_gaps])

    # Categorical columns: ten most frequent values of each column that exists
    print(f"\n{banner}")
    print("CATEGORICAL COLUMNS")
    print(banner)
    categorical_cols = ['Account Type', 'Action Type', 'Source Type']

    for col in [c for c in categorical_cols if c in df.columns]:
        print(f"\n{col}:")
        print(df[col].value_counts().head(10))

    # Numerical columns
    print(f"\n{banner}")
    print("NUMERICAL STATISTICS")
    print(banner)
    print(df['Amount'].describe())

---

## Summary

This notebook provides comprehensive analysis tools for the ClearShield training data pipeline. 

Use these analyses to:
- Validate data quality
- Understand data distributions
- Monitor fraud matching performance
- Detect potential issues early