# Exploratory Data Analysis - Milestone 1
## Predictive Transaction Intelligence for BFSI

**Objective**: Analyze cleaned transaction data to identify patterns, anomalies, and key insights for fraud detection.

**Dataset**: `data/processed/transactions_processed.csv`

## 1. Setup and Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Silence library warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')

# Plot defaults applied to the figures created after this point
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Folder the figures are written to; created (with parents) if it is missing
fig_dir = Path('../docs/figs')
fig_dir.mkdir(parents=True, exist_ok=True)

print("Libraries loaded successfully!")
print(f"Figure output directory: {fig_dir}")

In [None]:
# Read the processed transactions into a DataFrame
data_path = '../data/processed/transactions_processed.csv'
df = pd.read_csv(data_path)

# Parse the timestamp column into datetime values
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Report the size and the column names of what was loaded
n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows} rows, {n_cols} columns")
print(f"\nColumns: {df.columns.tolist()}")

## 2. Data Overview

In [None]:
# Display first few rows
# df.head() is the cell's final expression, so the notebook renders its
# result (the first five rows, pandas' default) as the cell output.
print("Sample Data:")
df.head()

In [None]:
# Data info
# df.info() prints its own summary: one line per column with the dtype and
# the number of non-null values.
print("Data Types and Non-Null Counts:")
df.info()

In [None]:
# Statistical summary
# df.describe() is the cell's final expression, so the resulting summary
# table is rendered as the cell output.
print("Statistical Summary:")
df.describe()

## 3. Fraud Analysis

In [None]:
# Fraud statistics
# `is_fraud` is looked up by label below: 0 = legitimate, 1 = fraudulent.
# value_counts() omits labels that never occur and orders its rows by
# frequency, so both series are reindexed onto [0, 1]: an absent class then
# reports 0 instead of raising KeyError, and the row order is always
# [legitimate, fraudulent] for any code that reads `.values` positionally.
class_labels = [0, 1]
fraud_counts = df['is_fraud'].value_counts().reindex(class_labels, fill_value=0)
fraud_pct = (
    df['is_fraud'].value_counts(normalize=True).reindex(class_labels, fill_value=0.0) * 100
)

print("="*60)
print("FRAUD DISTRIBUTION ANALYSIS")
print("="*60)
print(f"\nTotal Transactions: {len(df):,}")
print(f"Legitimate Transactions: {fraud_counts.loc[0]:,} ({fraud_pct.loc[0]:.2f}%)")
print(f"Fraudulent Transactions: {fraud_counts.loc[1]:,} ({fraud_pct.loc[1]:.2f}%)")
print(f"\nFraud Rate: {fraud_pct.loc[1]:.2f}%")
print("="*60)

In [None]:
# Transaction-amount statistics, split by fraud status
print("\nTransaction Amount Statistics by Fraud Status:")
print("="*60)

# One row per is_fraud value, one column per statistic of transaction_amount
amount_by_class = df.groupby('is_fraud')['transaction_amount']
fraud_stats = amount_by_class.agg(['count', 'mean', 'median', 'std', 'min', 'max'])

# groupby returns its keys sorted, so with 0/1 labels the rows are
# legitimate first, fraudulent second
fraud_stats.index = ['Legitimate', 'Fraudulent']
print(fraud_stats)

print("\nKey Insights:")
legit_mean = fraud_stats.at['Legitimate', 'mean']
fraud_mean = fraud_stats.at['Fraudulent', 'mean']
print(f"- Average legitimate transaction: ${legit_mean:,.2f}")
print(f"- Average fraudulent transaction: ${fraud_mean:,.2f}")
# Ratio of the two class means
mean_ratio = fraud_mean / legit_mean
print(f"- Fraud transactions are {mean_ratio:.2f}x larger on average")

## 4. Visualizations

### Figure 1: Fraud vs Legitimate Transaction Count

In [None]:
# Figure 1: Fraud count bar chart
fig, ax = plt.subplots(figsize=(10, 6))

# The bar labels are fixed as [Legitimate, Fraudulent], but value_counts()
# orders its rows by frequency and omits absent classes. Select the rows by
# label (0 = legitimate, 1 = fraudulent) so each bar is paired with the
# right count and colour; a missing class is drawn as 0.
class_order = [0, 1]
class_counts = fraud_counts.reindex(class_order, fill_value=0)
class_pcts = fraud_pct.reindex(class_order, fill_value=0.0)

colors = ['#2ecc71', '#e74c3c']  # Green for legit, red for fraud
bars = ax.bar(['Legitimate', 'Fraudulent'], class_counts.values,
              color=colors, alpha=0.8, edgecolor='black')

# Add value labels (count and share) centred on top of each bar
for bar, count, pct in zip(bars, class_counts.values, class_pcts.values):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{count:,}\n({pct:.2f}%)',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_ylabel('Number of Transactions', fontsize=12, fontweight='bold')
ax.set_xlabel('Transaction Type', fontsize=12, fontweight='bold')
ax.set_title('Distribution of Legitimate vs Fraudulent Transactions', fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(fig_dir / 'fig1_fraud_count.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Figure 1 saved: fig1_fraud_count.png")

### Figure 2: Transaction Amount Distribution by Fraud Status

In [None]:
# Figure 2: Boxplots of transaction amounts (linear and log scale)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Copy of the data with a readable class label for the x axis
df_plot = df.assign(fraud_label=df['is_fraud'].map({0: 'Legitimate', 1: 'Fraudulent'}))

# Pin the category order explicitly. Without `order`, the boxes (and the
# palette colours assigned to them) follow the order in which the labels
# first appear in the data, so green/red could end up on the wrong class.
label_order = ['Legitimate', 'Fraudulent']
palette = ['#2ecc71', '#e74c3c']  # green = legitimate, red = fraudulent

# Same plot on both axes; only the y scale, y label and title differ
panels = [
    (axes[0], False, 'Transaction Amount ($)',
     'Transaction Amount Distribution (Linear Scale)'),
    (axes[1], True, 'Transaction Amount ($) - Log Scale',
     'Transaction Amount Distribution (Log Scale)'),
]
for ax, use_log_scale, y_label, title in panels:
    sns.boxplot(data=df_plot, x='fraud_label', y='transaction_amount',
                order=label_order, palette=palette, ax=ax)
    if use_log_scale:
        # NOTE(review): a log axis cannot show non-positive amounts —
        # confirm the processed data contains none.
        ax.set_yscale('log')
    ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    ax.set_xlabel('Transaction Type', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(fig_dir / 'fig2_box_amount.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Figure 2 saved: fig2_box_amount.png")

### Figure 3: Transaction Activity Heatmap (Hour vs Weekday)

In [None]:
# Figure 3: Transaction counts per weekday (rows) and hour (columns)
# Counts transaction_id values in each (weekday, hour) cell; cells with no
# transactions are filled with 0.
pivot_table = df.pivot_table(
    index='weekday',
    columns='hour',
    values='transaction_id',
    aggfunc='count',
    fill_value=0,
)

# Weekday numbers index into this list, so 0 is Monday and 6 is Sunday
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_labels = [weekday_names[day] for day in pivot_table.index]
pivot_table.index = day_labels

fig, ax = plt.subplots(figsize=(16, 8))
heatmap_options = {
    'annot': False,
    'fmt': 'd',
    'cmap': 'YlOrRd',
    'cbar_kws': {'label': 'Number of Transactions'},
    'ax': ax,
}
sns.heatmap(pivot_table, **heatmap_options)

ax.set_xlabel('Hour of Day', fontsize=12, fontweight='bold')
ax.set_ylabel('Day of Week', fontsize=12, fontweight='bold')
ax.set_title('Transaction Activity Heatmap: Weekday vs Hour', fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig(fig_dir / 'fig3_heatmap_time.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Figure 3 saved: fig3_heatmap_time.png")

### Figure 4: Top 5 Channels by Fraud Rate

In [None]:
# Figure 4: Fraud rate by channel
# Per channel: number of frauds (sum), number of transactions (count) and
# fraud rate (mean of is_fraud), rounded to 4 decimals.
channel_analysis = (
    df.groupby('channel')['is_fraud']
    .agg(['sum', 'count', 'mean'])
    .round(4)
)
channel_analysis.columns = ['fraud_count', 'total_transactions', 'fraud_rate']
channel_analysis['fraud_percentage'] = channel_analysis['fraud_rate'].mul(100)
channel_analysis = channel_analysis.sort_values(by='fraud_rate', ascending=False)

print("\nFraud Rate by Channel:")
print(channel_analysis)

# Keep the five riskiest channels (or all of them when there are fewer)
top_n = min(5, len(channel_analysis))
top_channels = channel_analysis.head(top_n)

fig, ax = plt.subplots(figsize=(12, 6))

x_positions = range(len(top_channels))
bars = ax.bar(x_positions, top_channels['fraud_percentage'],
              color='#e74c3c', alpha=0.7, edgecolor='black')

# Label each bar with its fraud rate and the fraud/total counts behind it
for x_pos, channel_row in zip(x_positions, top_channels.itertuples(index=False)):
    bar_label = (
        f"{channel_row.fraud_percentage:.2f}%\n"
        f"({int(channel_row.fraud_count)}/{int(channel_row.total_transactions)})"
    )
    ax.text(x_pos, channel_row.fraud_percentage, bar_label,
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_xticks(x_positions)
ax.set_xticklabels(top_channels.index, fontsize=11, fontweight='bold')
ax.set_ylabel('Fraud Rate (%)', fontsize=12, fontweight='bold')
ax.set_xlabel('Channel', fontsize=12, fontweight='bold')
ax.set_title(f'Top {top_n} Channels by Fraud Rate', fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(fig_dir / 'fig4_channel_fraud.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Figure 4 saved: fig4_channel_fraud.png")

### Figure 5: High-Risk Customer Segments (Account Age Bucket)

In [None]:
# Figure 5: Fraud rate by account age bucket
# Per bucket: number of frauds (sum), number of transactions (count) and
# fraud rate (mean of is_fraud), rounded to 4 decimals.
segment_analysis = df.groupby('account_age_bucket').agg({
    'is_fraud': ['sum', 'count', 'mean']
}).round(4)

segment_analysis.columns = ['fraud_count', 'total_transactions', 'fraud_rate']
segment_analysis['fraud_percentage'] = segment_analysis['fraud_rate'] * 100
segment_analysis = segment_analysis.sort_values('fraud_rate', ascending=False)

print("\nFraud Rate by Account Age Bucket:")
print(segment_analysis)

# Define custom order for better visualization
age_order = ['new', 'recent', 'established', 'old']
known_buckets = [bucket for bucket in age_order if bucket in segment_analysis.index]
# Buckets that are not listed in age_order are appended after the known ones
# (in descending fraud-rate order) rather than dropped, so the chart covers
# every segment that appears in the printed table.
other_buckets = [bucket for bucket in segment_analysis.index if bucket not in age_order]
segment_plot = segment_analysis.reindex(known_buckets + other_buckets)

fig, ax = plt.subplots(figsize=(12, 6))

# One colour per bar, sampled from the reversed red-yellow-green colormap
colors = plt.cm.RdYlGn_r(np.linspace(0.3, 0.8, len(segment_plot)))
x_positions = range(len(segment_plot))
bars = ax.bar(x_positions, segment_plot['fraud_percentage'],
              color=colors, alpha=0.8, edgecolor='black')

# Label each bar with its fraud rate and the fraud/total counts behind it
for x_pos, (bucket, row) in zip(x_positions, segment_plot.iterrows()):
    ax.text(x_pos, row['fraud_percentage'],
            f"{row['fraud_percentage']:.2f}%\n({int(row['fraud_count'])}/{int(row['total_transactions'])})",
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_xticks(x_positions)
# str() keeps the tick labels working when a bucket label is not a string
ax.set_xticklabels([str(bucket).title() for bucket in segment_plot.index],
                   fontsize=11, fontweight='bold')
ax.set_ylabel('Fraud Rate (%)', fontsize=12, fontweight='bold')
ax.set_xlabel('Account Age Bucket', fontsize=12, fontweight='bold')
ax.set_title('High-Risk Customer Segments by Account Age', fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(fig_dir / 'fig5_segment_risk.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Figure 5 saved: fig5_segment_risk.png")

### Bonus Figure 6: KYC Verification Impact on Fraud

In [None]:
# Bonus: fraud rate split by KYC verification status
# Per status: number of frauds (sum), number of transactions (count) and
# fraud rate (mean of is_fraud), rounded to 4 decimals.
kyc_analysis = (
    df.groupby('kyc_verified')['is_fraud']
    .agg(['sum', 'count', 'mean'])
    .round(4)
)
kyc_analysis.columns = ['fraud_count', 'total_transactions', 'fraud_rate']
kyc_analysis['fraud_percentage'] = kyc_analysis['fraud_rate'].mul(100)
# Rows are relabelled by position: first group -> 'Not Verified',
# second group -> 'Verified'
kyc_analysis.index = ['Not Verified', 'Verified']

print("\nFraud Rate by KYC Status:")
print(kyc_analysis)

fig, ax = plt.subplots(figsize=(10, 6))

colors = ['#e74c3c', '#2ecc71']
bars = ax.bar(kyc_analysis.index, kyc_analysis['fraud_percentage'],
              color=colors, alpha=0.7, edgecolor='black')

# Label each bar with its fraud rate and the fraud/total counts behind it
for x_pos, kyc_row in enumerate(kyc_analysis.itertuples(index=False)):
    bar_label = (
        f"{kyc_row.fraud_percentage:.2f}%\n"
        f"({int(kyc_row.fraud_count)}/{int(kyc_row.total_transactions)})"
    )
    ax.text(x_pos, kyc_row.fraud_percentage, bar_label,
            ha='center', va='bottom', fontsize=11, fontweight='bold')

ax.set_ylabel('Fraud Rate (%)', fontsize=12, fontweight='bold')
ax.set_xlabel('KYC Verification Status', fontsize=12, fontweight='bold')
ax.set_title('Impact of KYC Verification on Fraud Rate', fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(fig_dir / 'fig6_kyc_impact.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Bonus Figure 6 saved: fig6_kyc_impact.png")

## 5. Key Findings Summary

In [None]:
# Print a recap of the headline numbers computed in the cells above
print("="*80)
print("KEY EDA FINDINGS")
print("="*80)

print("\n1. CLASS IMBALANCE:")
print(f"   - Fraud rate: {fraud_pct[1]:.2f}% (highly imbalanced dataset)")
print("   - This will require special handling in modeling (SMOTE, class weights)")

print("\n2. TRANSACTION AMOUNTS:")
print(f"   - Fraudulent transactions are {fraud_mean/legit_mean:.2f}x larger on average")
# NOTE(review): the $50k threshold is hard-coded in the message, not derived
# from the data in this notebook.
print("   - High-value transactions (>$50k) should be monitored closely")

print("\n3. CHANNEL RISK:")
# channel_analysis is sorted by fraud rate, highest first
highest_risk_channel = channel_analysis.index[0]
highest_risk_rate = channel_analysis.iloc[0]['fraud_percentage']
print(f"   - Highest risk channel: {highest_risk_channel} ({highest_risk_rate:.2f}% fraud rate)")

print("\n4. ACCOUNT AGE PATTERNS:")
# segment_analysis is sorted by fraud rate, highest first
highest_risk_segment = segment_analysis.index[0]
highest_segment_rate = segment_analysis.iloc[0]['fraud_percentage']
print(f"   - Highest risk segment: {highest_risk_segment} accounts ({highest_segment_rate:.2f}% fraud rate)")

print("\n5. KYC EFFECTIVENESS:")
# Difference in fraud rate, in percentage points: unverified minus verified
kyc_reduction = kyc_analysis.loc['Not Verified', 'fraud_percentage'] - kyc_analysis.loc['Verified', 'fraud_percentage']
if kyc_reduction < 0:
    # Verified accounts have the higher rate; saying "reduces by -X" would
    # state the opposite of what the data shows.
    print(f"   - KYC-verified accounts show a {-kyc_reduction:.2f} percentage point higher fraud rate")
else:
    print(f"   - KYC verification reduces fraud rate by {kyc_reduction:.2f} percentage points")

print("\n" + "="*80)
print("All visualizations saved to: docs/figs/")
print("="*80)

## 6. Export and Conclusion

This analysis has identified several key risk factors for fraud detection:
- Transaction amount (fraudulent and legitimate transactions differ in average size)
- Channel type (certain channels are riskier than others)
- Account age (newer accounts may have different risk profiles)
- KYC verification status (unverified accounts show higher fraud rates)
- Time patterns (certain hours/days may show elevated activity)

These insights will inform feature engineering and model development in Milestone 2.