In [None]:
# Import libraries
import pandas as pd
import plotly.express as px
from scipy.stats import zscore, ttest_ind
from sklearn.ensemble import IsolationForest

# Reusable Plotting Function
def save_plot(fig, filename):
    """Export a figure to HTML and PNG under ``reports/figures/`` and display it.

    Parameters
    ----------
    fig : object
        Figure exposing ``write_html``, ``write_image`` and ``show``
        (the notebook passes Plotly figures).
    filename : str
        Base name, without extension, used for both exported files.
    """
    import os  # local import so the helper does not depend on the import cell

    # Create the target folder first so the export works on a fresh checkout.
    os.makedirs('reports/figures', exist_ok=True)
    # The HTML exporter is `write_html`; `write_to_html` is not a figure method.
    fig.write_html(f'reports/figures/{filename}.html')
    fig.write_image(f'reports/figures/{filename}.png')
    fig.show()

# Load the raw loan records into the working DataFrame
raw_csv_path = 'data/raw/loan_data.csv'
df = pd.read_csv(raw_csv_path)


In [None]:
# --- 0. Safe Column Access and Sampling ---
print("=== Sampling and Validation ===")
# Every column the later cells read. 'ID' is listed because the outlier and
# anomaly cells use it in their summary table and hover data.
required_cols = ['ID', 'Credit_Score', 'dtir1', 'loan_amount', 'LTV', 'Status', 'year', 'Region', 'loan_purpose', 'Gender']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    # Fail fast, before any analysis runs, naming every absent column at once.
    raise KeyError(f"Missing columns: {missing_cols}")
if len(df) > 50000:
    # A fixed seed keeps the subsample identical across runs.
    df = df.sample(50000, random_state=42)
    print("Sampled 50,000 rows for performance")

# --- 1. Data Overview ---
# Orientation pass: dimensions, schema, a peek at the rows, and the Status split.
print("\n=== Data Overview ===")
frame_shape = df.shape
print("Shape:", frame_shape)
column_names = df.columns.tolist()
print("\nColumns:", column_names)
preview_rows = df.head()
print("\nFirst 5 rows:\n", preview_rows)
column_dtypes = df.dtypes
print("\nData Types:\n", column_dtypes)
status_share = df['Status'].value_counts(normalize=True)
print("\nDefault Rate:\n", status_share)


In [None]:
# --- 2. Missing Values ---
print("\n=== Missing Values ===")
# Null count per column, and the same count as a percentage of all rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
# Report only the columns that actually have gaps.
print("Missing Values:\n", missing.loc[missing.gt(0)])
print("\nMissing Percentages:\n", missing_pct.loc[missing_pct.gt(0)])


In [None]:
# --- 3. Interactive Visualizations ---
print("\n=== Interactive Visualizations ===")
# Credit score histogram: one facet per Region, bars coloured by Status,
# with a marginal box plot on each facet.
fig = (
    px.histogram(
        df,
        x='Credit_Score',
        color='Status',
        facet_col='Region',
        nbins=30,
        marginal='box',
        title='Credit Score Distribution by Region',
    )
    .update_layout(title_font_size=20, bargap=0.1, showlegend=True)
    .update_xaxes(title_text='Credit Score', title_font_size=14)
    .update_yaxes(title_text='Count', title_font_size=14)
    .add_annotation(text='Lower scores correlate with higher defaults', x=600, y=5000, showarrow=True)
)
save_plot(fig, 'credit_score_dist')

# DTI Boxplot
# Notched boxes of dtir1 for each Status value; only suspected outliers are drawn as points.
status_colors = {0: '#1f77b4', 1: '#ff7f0e'}
fig_box = px.box(
    df,
    x='Status',
    y='dtir1',
    color='Status',
    color_discrete_map=status_colors,
    notched=True,
    points='suspectedoutliers',
    title='DTI by Default Status',
).update_layout(title_font_size=20, showlegend=True)
save_plot(fig_box, 'dti_by_default')


In [None]:
# --- 4. Statistical Test ---
print("\n=== T-Test: DTI by Default Status ===")
# Compare dtir1 between Status == 1 and Status == 0 rows, ignoring rows where
# dtir1 is missing.
dti_default = df.loc[df['Status'] == 1, 'dtir1'].dropna()
dti_non_default = df.loc[df['Status'] == 0, 'dtir1'].dropna()
# Welch's t-test (equal_var=False): nothing here guarantees the two groups share
# a variance or a sample size, which the pooled-variance default assumes.
t_stat, p_value = ttest_ind(dti_default, dti_non_default, equal_var=False)
print(f"T-Statistic: {t_stat:.2f}, P-Value: {p_value:.4f}")
if pd.isna(p_value):
    # A group with too few rows gives a NaN p-value; NaN < 0.05 is False, so
    # without this guard the output would claim "No significant difference".
    print("Interpretation: Test not computable (too few observations in a group)")
else:
    print("Interpretation: Significant difference" if p_value < 0.05 else "No significant difference")


In [None]:
# --- 5. Outlier and Anomaly Detection ---
print("\n=== Outlier Detection ===")
# nan_policy='omit' stops a single missing loan_amount from turning every
# z-score into NaN (the default policy propagates NaN through the mean and std),
# which would silently report zero outliers. Results are unchanged when the
# column has no missing values.
df['loan_amount_z'] = zscore(df['loan_amount'], nan_policy='omit')
# Outliers: rows more than 3 standard deviations from the mean loan amount.
outliers = df[df['loan_amount_z'].abs() > 3]
print("Number of Outliers:", len(outliers))
print("\nOutlier Summary:\n", outliers[['ID', 'loan_amount', 'Credit_Score', 'Status']].head())
outlier_default_rate = outliers['Status'].mean()
print(f"Default Rate for Outliers: {outlier_default_rate:.2%}")

# Scatter Plot for Outliers
# All rows coloured by Status, with the outlier rows re-drawn on top as red crosses.
fig = px.scatter(df, x='loan_amount', y='Credit_Score', color='Status',
                 title='Loan Amount vs. Credit Score with Outliers',
                 hover_data=['ID', 'dtir1'])
fig.add_scatter(x=outliers['loan_amount'], y=outliers['Credit_Score'],
                mode='markers', marker=dict(size=10, color='red', symbol='x'),
                name='Outliers')
fig.update_layout(showlegend=True, title_font_size=20)
save_plot(fig, 'outliers')

# Anomaly Detection
print("\n=== Anomaly Detection ===")
features = ['loan_amount', 'Credit_Score', 'dtir1']
iso_forest = IsolationForest(contamination=0.05, random_state=42)
# Fit only on rows where all three features are present.
complete_rows = df[features].dropna()
# fit_predict returns one label per fitted row. Keying the labels by the fitted
# rows' index lets pandas align them back onto df; assigning the bare array
# raises a length-mismatch ValueError whenever dropna removed a row.
# Rows that were not scored get NaN in 'anomaly'.
df['anomaly'] = pd.Series(iso_forest.fit_predict(complete_rows), index=complete_rows.index)
# The label -1 marks the rows the model flags as anomalous.
anomalies = df[df['anomaly'] == -1]
print("Number of Anomalies:", len(anomalies))
print("Anomaly Default Rate:", anomalies['Status'].mean())
# Plot only the rows that received a label.
scored = df.dropna(subset=['anomaly'])
fig = px.scatter(scored, x='loan_amount', y='Credit_Score', color='anomaly',
                 title='Anomalies in Loan Amount vs. Credit Score',
                 hover_data=['ID', 'dtir1'])
save_plot(fig, 'anomalies')


In [None]:
# --- 6. Temporal Analysis ---
print("\n=== Temporal Analysis ===")
# Mean of Status within each year, flattened to a two-column frame for plotting.
status_by_year = df.groupby('year')['Status']
yearly_defaults = status_by_year.mean().reset_index()
print("Yearly Default Rates:\n", yearly_defaults)
fig = px.line(
    yearly_defaults,
    x='year',
    y='Status',
    markers=True,
    line_shape='spline',
    title='Default Rate by Year',
).update_layout(title_font_size=20, xaxis_title='Year', yaxis_title='Default Rate', showlegend=False)
save_plot(fig, 'yearly_defaults')


In [None]:
# --- 7. Feature Interaction Heatmaps ---
print("\n=== Feature Interaction Heatmaps ===")
# Keep only Status == 1 rows and the four numeric features being compared.
heatmap_features = ['Credit_Score', 'dtir1', 'loan_amount', 'LTV']
defaulted = df.loc[df['Status'] == 1, heatmap_features]
print("Defaulted Loans Shape:", defaulted.shape)
corr_matrix = defaulted.corr()
fig = px.imshow(
    corr_matrix,
    text_auto='.2f',
    color_continuous_scale='Viridis',
    title='Correlation Heatmap (Defaulted Loans)',
).update_layout(title_font_size=20, width=600, height=600)
save_plot(fig, 'corr_defaulted')

# Pair Plot
# Scatter matrix over the four key features, coloured by Status.
pair_dimensions = ['Credit_Score', 'dtir1', 'loan_amount', 'LTV']
fig = px.scatter_matrix(
    df,
    dimensions=pair_dimensions,
    color='Status',
    title='Pair Plot of Key Features',
).update_layout(width=800, height=800, title_font_size=20)
save_plot(fig, 'pair_plot')


In [None]:
# --- 8. Categorical Analysis ---
print("\n=== Categorical Analysis ===")
# Mean of Status for each loan purpose, as a flat frame for the bar chart.
status_by_purpose = df.groupby('loan_purpose')['Status']
purpose_defaults = status_by_purpose.mean().reset_index()
fig = px.bar(
    purpose_defaults,
    x='loan_purpose',
    y='Status',
    color='Status',
    text_auto='.2%',
    title='Default Rates by Loan Purpose',
).update_layout(xaxis_title='Loan Purpose', yaxis_title='Default Rate', title_font_size=20)
save_plot(fig, 'purpose_defaults')


In [None]:
# --- 9. Fairness Analysis ---
print("\n=== Fairness Analysis ===")
# Mean of Status within each Gender value.
gender_defaults = df.groupby('Gender')['Status'].mean()
print("Default Rates by Gender:\n", gender_defaults)
# Ratio of the 'Male' group's rate to the 'Female' group's rate.
male_rate = gender_defaults['Male']
female_rate = gender_defaults['Female']
disparate_impact = male_rate / female_rate
print(f"Disparate Impact (Male/Female): {disparate_impact:.2f}")
print("Note: Values far from 1.0 may indicate bias")


In [None]:
# --- 10. Export Summary ---
print("\n=== Summary Statistics ===")
# Descriptive statistics for the four key numeric features, saved and then shown.
summary_columns = ['Credit_Score', 'dtir1', 'loan_amount', 'LTV']
summary = df[summary_columns].describe()
summary.to_csv('reports/summary_stats.csv')
print(summary)


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set style for visualizations
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so try the new name first and fall back to
# the old one on older installs.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

# Read the dataset
# NOTE(review): this path differs from the 'data/raw/loan_data.csv' read in the
# first cell, and it rebinds `df` — confirm which working directory is intended.
df = pd.read_csv('../../data/raw/loan_data.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
display(df.head())
print("\nDataset Info:")
df.info()
