# Credit Risk Data Exploration

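This notebook demonstrates comprehensive data exploration techniques for credit risk datasets. It runs on a generated sample dataset, so the figures and insights below illustrate the workflow rather than describe a real portfolio.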

## Objectives:
- Load and examine credit risk datasets
- Perform data quality assessment
- Conduct exploratory data analysis (EDA)
- Identify patterns and relationships
- Generate insights for model development

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
sys.path.append('../src')

from data_loader import CreditDataLoader
from visualization import RiskVisualizer

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

# Configure pandas display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Data Loading and Initial Inspection

In [None]:
# Build the loader with '../data/' as its data path
loader = CreditDataLoader(data_path='../data/')

# This walkthrough runs on generated sample data.
# To explore a real dataset instead, swap in one of the loader calls below:
# df = loader.load_give_me_credit('../data/cs-training.csv')
# df = loader.load_home_credit('../data/application_train.csv')

# Request 5,000 sample rows, seeded with random_state=42
df = loader.get_sample_data(n_samples=5000, random_state=42)

column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {column_names}")

In [None]:
# Peek at the top of the frame
print("First 5 rows:")
display(df.head(5))

# Per-column dtype and non-null summary (info() prints directly)
print("\nDataset info:")
df.info()

In [None]:
# Summary statistics as produced by DataFrame.describe() with default settings
print("Descriptive statistics:")
summary_stats = df.describe()
display(summary_stats)

## 2. Data Quality Assessment

In [None]:
# Run the loader's quality check, passing 'default' as the second argument
# (presumably the target column name -- confirm against CreditDataLoader)
quality_report = loader.basic_data_quality_check(df, 'default')

print("=== DATA QUALITY REPORT ===")
print(f"Dataset shape: {quality_report['shape']}")
print(f"Total missing values: {quality_report['missing_values']}")
print(f"Duplicate rows: {quality_report['duplicate_rows']}")
print(f"Default rate: {quality_report['target_rate']:.2%}")

# Per-class counts with each class's share of the overall total
target_counts = quality_report['target_distribution']
n_total = sum(target_counts.values())

print("\nTarget distribution:")
for label, n_rows in target_counts.items():
    print(f"  {label}: {n_rows} ({n_rows/n_total:.1%})")

In [None]:
# Count nulls per column and express them as a percentage of all rows
missing_data = df.isna().sum()
missing_pct = missing_data.div(len(df)).mul(100)

# Side-by-side table, worst-affected columns first
missing_df = pd.concat(
    [missing_data, missing_pct],
    axis=1,
    keys=['Missing Count', 'Missing %'],
).sort_values('Missing %', ascending=False)

# Only show the columns that actually have gaps
print("Missing values by column:")
has_missing = missing_df['Missing Count'] > 0
display(missing_df[has_missing])

## 3. Target Variable Analysis

In [None]:
# Target variable distribution
#
# value_counts() orders its result by frequency, not by class label. The labels
# and colours below are positional, so pin the order to [0, 1] explicitly;
# otherwise they would be swapped whenever defaults outnumber non-defaults.
target_counts = df['default'].value_counts().reindex([0, 1], fill_value=0)
target_labels = ['No Default', 'Default']
target_colors = ['skyblue', 'salmon']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Count plot
target_counts.plot(kind='bar', ax=ax1, color=target_colors)
ax1.set_title('Default Distribution (Counts)', fontsize=14, fontweight='bold')
ax1.set_xlabel('Default Status')
ax1.set_ylabel('Count')
ax1.set_xticklabels(target_labels, rotation=0)

# Pie chart
target_counts.plot(kind='pie', ax=ax2, autopct='%1.1f%%',
                   labels=target_labels,
                   colors=target_colors)
ax2.set_title('Default Distribution (%)', fontsize=14, fontweight='bold')
ax2.set_ylabel('')

plt.tight_layout()
plt.show()

# Ratio of non-defaults to defaults; undefined when there are no defaults at all
n_non_default = target_counts.loc[0]
n_default = target_counts.loc[1]
if n_default:
    print(f"Class imbalance ratio: {n_non_default / n_default:.1f}:1")
else:
    print("Class imbalance ratio: undefined (no defaults in the data)")

## 4. Feature Distribution Analysis

In [None]:
# Numerical features distribution
numerical_cols = df.select_dtypes(include=[np.number]).columns.drop('default')

# Size the grid from the number of features instead of assuming nine panels,
# so no feature is silently skipped. Three panels per row, 5 inches per row
# (which reproduces the previous 18x15 figure for up to nine features).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

# Split once rather than re-filtering the frame for every feature
non_default_rows = df[df['default'] == 0]
default_rows = df[df['default'] == 1]

for ax, col in zip(axes, numerical_cols):
    # Overlaid histograms: one per default status
    non_default_rows[col].hist(bins=30, alpha=0.7, label='No Default',
                               color='skyblue', ax=ax)
    default_rows[col].hist(bins=30, alpha=0.7, label='Default',
                           color='salmon', ax=ax)
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide any panels the grid has left over
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 5. Feature Relationships and Correlations

In [None]:
# Instantiate the project's plotting helper
viz = RiskVisualizer()

# Draw the correlation heatmap for the frame and keep whatever the helper returns
heatmap_size = (12, 10)
correlation_fig = viz.plot_correlation_heatmap(df, figsize=heatmap_size)
plt.show()

In [None]:
# Correlation with target variable
#
# numeric_only=True restricts the matrix to numeric columns. Without it,
# pandas >= 2.0 raises on any non-numeric column, including the categorical
# bin/category columns that later cells add to df, so re-running this cell
# would fail.
target_corr = (
    df.corr(numeric_only=True)['default']
      .drop('default')  # Remove self-correlation
      .sort_values(key=abs, ascending=False)
)

print("Correlation with default (sorted by absolute value):")
for feature, corr in target_corr.items():
    print(f"{feature:25s}: {corr:6.3f}")

In [None]:
# Visualize top correlations with target
top_corr_features = target_corr.head(6)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, (feature, corr) in zip(axes, top_corr_features.items()):
    # Box plot by default status
    df.boxplot(column=feature, by='default', ax=ax)
    ax.set_title(f'{feature}\n(Correlation: {corr:.3f})', fontweight='bold')
    ax.set_xlabel('Default Status')

# Hide panels left empty when fewer than six features are available
for ax in axes[len(top_corr_features):]:
    ax.set_visible(False)

# DataFrame.boxplot(by=...) stamps a "Boxplot grouped by default" suptitle on
# the figure on every call; clear it so it does not sit on top of the panel titles
fig.suptitle('')

plt.tight_layout()
plt.show()

## 6. Risk Segmentation Analysis

In [None]:
# Analyze default rates by risk segments
key_features = ['credit_score', 'debt_to_income', 'age', 'income']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    if feature not in df.columns:
        # Nothing to draw for this feature: hide the panel instead of leaving it blank
        ax.set_visible(False)
        continue

    # Quantile bins (up to 10; duplicate edges are merged). The bins are kept
    # in a local Series rather than written back to df, so the shared frame
    # does not pick up interval-typed helper columns that distort feature
    # counts and break numeric-only operations on re-run.
    feature_bins = pd.qcut(df[feature], q=10, precision=2, duplicates='drop')

    # Row count and default rate per bin. observed=True is explicit so the
    # result does not depend on the pandas version's default.
    bin_stats = (
        df['default']
        .groupby(feature_bins, observed=True)
        .agg(['count', 'mean'])
        .rename(columns={'mean': 'default_rate'})
    )

    # Plot default rates
    x_pos = range(len(bin_stats))
    bars = ax.bar(x_pos, bin_stats['default_rate'],
                  color='lightcoral', alpha=0.7)
    ax.set_title(f'Default Rate by {feature}', fontweight='bold')
    ax.set_ylabel('Default Rate')
    ax.set_xlabel(f'{feature} Bins')
    ax.set_xticks(x_pos)
    ax.set_xticklabels([f'Q{i+1}' for i in x_pos], rotation=45)
    ax.grid(True, alpha=0.3)

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.1%}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 7. Advanced Analytics

In [None]:
# Create risk score based on key features
def calculate_risk_score(row):
    """Additive demo risk score: sum of four rule-based components.

    ``row`` is indexed by the keys 'credit_score', 'debt_to_income', 'age'
    and 'num_late_payments'. Returns an integer; higher means riskier.
    """
    # Credit score: below 600 is penalised most, 600-699 less, 700+ not at all
    credit_score = row['credit_score']
    if credit_score < 600:
        credit_points = 40
    elif credit_score < 700:
        credit_points = 20
    else:
        credit_points = 0

    # Debt-to-income: above 0.4 is penalised most, above 0.3 less
    dti = row['debt_to_income']
    if dti > 0.4:
        dti_points = 30
    elif dti > 0.3:
        dti_points = 15
    else:
        dti_points = 0

    # Age: flat penalty outside the 25-65 range
    age = row['age']
    age_points = 10 if (age < 25 or age > 65) else 0

    # Late payments: more than two is penalised most, one or two less
    late_payments = row['num_late_payments']
    if late_payments > 2:
        late_points = 20
    elif late_payments > 0:
        late_points = 10
    else:
        late_points = 0

    return credit_points + dti_points + age_points + late_points

# Calculate risk scores (row-wise application of calculate_risk_score)
df['risk_score'] = df.apply(calculate_risk_score, axis=1)

# Analyze risk score performance
# pd.cut with bins=5 splits the observed score range into five equal-width bands
risk_labels = ['Low', 'Low-Med', 'Medium', 'Med-High', 'High']
risk_bins = pd.cut(df['risk_score'], bins=5, labels=risk_labels)
df['risk_category'] = risk_bins

# observed=False is explicit: every band stays in the table even when it has
# no rows. Later cells look up the 'Low' and 'High' rows by label, which would
# raise KeyError if an empty band were dropped (the default for categorical
# groupers differs between pandas versions).
risk_analysis = (
    df.groupby('risk_category', observed=False)['default']
      .agg(['count', 'sum', 'mean'])
      .round(3)
)
risk_analysis.columns = ['Total', 'Defaults', 'Default_Rate']
risk_analysis['Default_Rate_Pct'] = (risk_analysis['Default_Rate'] * 100).round(1)

print("Risk Score Performance:")
display(risk_analysis)

In [None]:
# Two side-by-side bar charts over the risk categories:
# default rate on the left, row volume on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

panel_specs = [
    # (axis, risk_analysis column, bar colour, title, y-axis label)
    (ax1, 'Default_Rate', 'lightcoral', 'Default Rate by Risk Category', 'Default Rate'),
    (ax2, 'Total', 'lightblue', 'Volume by Risk Category', 'Count'),
]

for panel_ax, column, bar_color, title, y_label in panel_specs:
    risk_analysis[column].plot(kind='bar', ax=panel_ax, color=bar_color, alpha=0.7)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_xlabel('Risk Category')
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Key Insights and Recommendations

In [None]:
print("=== KEY INSIGHTS FROM DATA EXPLORATION ===")
print()

# Dataset characteristics
# Count only input features: exclude the target and the helper columns that
# earlier cells may have added to df (quantile bins, risk score, risk category).
derived_cols = {'risk_score', 'risk_category'} | {f'{name}_bin' for name in key_features}
n_features = len(df.columns.difference(derived_cols | {'default'}))

# Look classes up by label; a missing class counts as zero instead of raising
class_counts = df['default'].value_counts()
n_non_default = class_counts.get(0, 0)
n_default = class_counts.get(1, 0)

print("1. Dataset Overview:")
print(f"   - Total observations: {len(df):,}")
print(f"   - Features: {n_features}")
print(f"   - Default rate: {df['default'].mean():.2%}")
if n_default:
    print(f"   - Class imbalance: {n_non_default/n_default:.1f}:1")
else:
    print("   - Class imbalance: undefined (no defaults in the data)")
print()

# Feature insights
print("2. Most Predictive Features (by correlation):")
for rank, (feature, corr) in enumerate(target_corr.head(5).items(), start=1):
    print(f"   {rank}. {feature}: {corr:.3f}")
print()

# Risk segmentation insights
print("3. Risk Segmentation Performance:")
low_risk_rate = risk_analysis.loc['Low', 'Default_Rate']
high_risk_rate = risk_analysis.loc['High', 'Default_Rate']
print(f"   - Low risk default rate: {low_risk_rate:.2%}")
print(f"   - High risk default rate: {high_risk_rate:.2%}")
# The ratio is only meaningful with a positive low-risk rate (this also screens out NaN)
if low_risk_rate > 0:
    print(f"   - Risk discrimination: {high_risk_rate/low_risk_rate:.1f}x higher")
else:
    print("   - Risk discrimination: n/a (low risk default rate is zero or undefined)")
print()

print("4. Recommendations for Model Development:")
print("   - Address class imbalance using techniques like SMOTE or class weighting")
print("   - Focus feature engineering on top predictive variables")
print("   - Consider ensemble methods for better performance")
print("   - Implement proper cross-validation for model selection")
print("   - Monitor model calibration for probability accuracy")

## Next Steps

Based on this exploration, the next notebooks will cover:

1. **Feature Engineering** (`02_feature_engineering.ipynb`):
   - Advanced feature transformations
   - Interaction terms creation
   - Missing value imputation strategies

2. **Model Development** (`03_model_development.ipynb`):
   - Multiple algorithm comparison
   - Hyperparameter optimization
   - Model validation and selection

3. **Risk Analysis** (`04_risk_analysis.ipynb`):
   - Portfolio risk calculations
   - Regulatory capital estimation
   - Stress testing scenarios