**Course**: Advanced Econometrics and Financial Risk Management  
**Assignment**: Capstone Project - Credit Risk Modeling  
**Date**: March 2025  
**Author**: Vanessa Quintero

---

## Learning Objectives 

1. Set up a professional data science environment for credit risk modeling
2. Understand the project structure used by financial institutions
3. Configure analysis parameters following industry standards
4. Prepare the foundation for mortgage portfolio analysis

---


In [1]:

# Import fundamental libraries for data manipulation
import pandas as pd
import numpy as np
import os
import sys
from datetime import datetime, timedelta
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("Basic libraries imported successfully")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")




Basic libraries imported successfully
Pandas version: 2.2.2
NumPy version: 1.26.4


In [2]:

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: whitegrid style sheet, then the HUSL colour palette,
# then figure and font sizing. The three steps are applied in this order.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
    'axes.titlesize': 12,
})

# Report the plotting library versions in use.
print(
    "Visualization libraries configured",
    f"Matplotlib version: {plt.matplotlib.__version__}",
    f"Seaborn version: {sns.__version__}",
    sep="\n",
)


Visualization libraries configured
Matplotlib version: 3.9.2
Seaborn version: 0.13.2


In [3]:


# Import statistical analysis libraries
from scipy import stats
from scipy.stats import chi2_contingency, ks_2samp
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Status banner for the statistical imports above.
print(
    "Statistical libraries imported",
    "Available: statistical tests, econometric modeling, VIF analysis",
    sep="\n",
)



Statistical libraries imported
Available: statistical tests, econometric modeling, VIF analysis


In [4]:

# Import machine learning libraries for credit risk modeling
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, brier_score_loss
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Status banner for the machine learning imports above.
print(
    "Machine learning libraries imported",
    "Models: Logistic Regression, Random Forest, Gradient Boosting",
    "Metrics: ROC-AUC, Precision-Recall, Brier Score",
    sep="\n",
)


Machine learning libraries imported
Models: Logistic Regression, Random Forest, Gradient Boosting
Metrics: ROC-AUC, Precision-Recall, Brier Score


In [5]:

# Seed NumPy's global random number generator so that stochastic steps
# drawing from it repeat across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print(
    f"Random seed set to {RANDOM_SEED}",
    "All random operations will produce consistent results",
    sep="\n",
)



Random seed set to 42
All random operations will produce consistent results


In [6]:

# Root folder of the project; every directory below is relative to it.
PROJECT_FOLDER = "mortgage-credit-risk-modeling"

# Sub-directories of the project, grouped by their parent folder.
# The resulting order is the order used for listing and creation.
PROJECT_DIRECTORIES = [
    'notebooks',
    *(f'data/{leaf}' for leaf in ('raw', 'processed', 'synthetic')),
    *(f'models/{leaf}' for leaf in ('saved_models', 'model_artifacts')),
    *(f'results/{leaf}' for leaf in ('figures', 'tables', 'reports')),
    'src',
    'docs',
    'tests',
]

# Show the planned layout as a numbered list (numbers right-aligned to width 2).
print("Project Directory Structure:")
print(f"Main folder: {PROJECT_FOLDER}/")
position = 0
for subfolder in PROJECT_DIRECTORIES:
    position += 1
    print(f"{position:2}. {PROJECT_FOLDER}/{subfolder}")

Project Directory Structure:
Main folder: mortgage-credit-risk-modeling/
 1. mortgage-credit-risk-modeling/notebooks
 2. mortgage-credit-risk-modeling/data/raw
 3. mortgage-credit-risk-modeling/data/processed
 4. mortgage-credit-risk-modeling/data/synthetic
 5. mortgage-credit-risk-modeling/models/saved_models
 6. mortgage-credit-risk-modeling/models/model_artifacts
 7. mortgage-credit-risk-modeling/results/figures
 8. mortgage-credit-risk-modeling/results/tables
 9. mortgage-credit-risk-modeling/results/reports
10. mortgage-credit-risk-modeling/src
11. mortgage-credit-risk-modeling/docs
12. mortgage-credit-risk-modeling/tests


In [7]:

# Create all project directories inside the project folder.
#
# The cell is idempotent: re-running it creates nothing new and reports every
# directory as already existing. It keeps two counters for the summary line:
#   created_count  - directories that were absent before this run
#   existing_count - directories that were already present
created_count = 0
existing_count = 0

print(f"Creating directories inside '{PROJECT_FOLDER}' folder...")

for directory in PROJECT_DIRECTORIES:
    # Create full path inside project folder
    full_path = os.path.join(PROJECT_FOLDER, directory)

    # Record the state *before* creating, so the counters stay accurate.
    # isdir (rather than exists) ensures that a regular file sitting at this
    # location is not mistaken for an existing directory.
    already_present = os.path.isdir(full_path)

    # exist_ok=True avoids the check-then-create race: if the directory
    # appears between the check above and this call, no error is raised.
    # If the path exists but is NOT a directory, FileExistsError is raised,
    # which surfaces the problem here instead of in a later notebook.
    os.makedirs(full_path, exist_ok=True)

    if already_present:
        print(f"Exists:  {full_path}")
        existing_count += 1
    else:
        print(f"Created: {full_path}")
        created_count += 1

print(f"Summary: {created_count} created, {existing_count} already existed")


Creating directories inside 'mortgage-credit-risk-modeling' folder...
Created: mortgage-credit-risk-modeling/notebooks
Created: mortgage-credit-risk-modeling/data/raw
Created: mortgage-credit-risk-modeling/data/processed
Created: mortgage-credit-risk-modeling/data/synthetic
Created: mortgage-credit-risk-modeling/models/saved_models
Created: mortgage-credit-risk-modeling/models/model_artifacts
Created: mortgage-credit-risk-modeling/results/figures
Created: mortgage-credit-risk-modeling/results/tables
Created: mortgage-credit-risk-modeling/results/reports
Created: mortgage-credit-risk-modeling/src
Created: mortgage-credit-risk-modeling/docs
Created: mortgage-credit-risk-modeling/tests
Summary: 12 created, 0 already existed


In [8]:

# Project metadata (descriptive only; printed below for the record).
PROJECT_INFO = {
    'name': 'Mortgage Credit Risk Modeling',
    'version': '1.0',
    'author': 'Graduate Student - Economics Program',
    'course': 'Advanced Econometrics and Financial Risk Management',
    'date': 'March 2025',
    'data_period': '2010Q1 to 2024Q4 (15 years)'
}

# Analysis parameters following industry standards
ANALYSIS_CONFIG = {
    'sample_size': 100000,
    'observation_months': 60,
    'random_seed': 42,           # keep in sync with RANDOM_SEED set earlier
    'test_size': 0.20,
    'validation_size': 0.20,
    'cv_folds': 5,
    'start_date': '2010-01-01',  # Post-financial crisis lending standards
    'end_date': '2024-12-31',    # Current data
    'analysis_years': 15         # Industry standard timeframe
}

print("PROJECT CONFIGURATION")
print(f"Project Folder: {PROJECT_FOLDER}")
print("\nProject Information:")
for key, value in PROJECT_INFO.items():
    print(f"  {key.replace('_', ' ').title()}: {value}")

print("\nAnalysis Parameters:")
for key, value in ANALYSIS_CONFIG.items():
    # Large integers (e.g. the sample size) get thousands separators.
    if isinstance(value, int) and value >= 1000:
        print(f"  {key.replace('_', ' ').title()}: {value:,}")
    else:
        print(f"  {key.replace('_', ' ').title()}: {value}")

# Build the coverage line from the configuration instead of repeating the
# numbers, so it cannot drift if the dates or year count are edited above.
# The dates are ISO 'YYYY-MM-DD' strings, so the first four characters are the year.
coverage_start_year = ANALYSIS_CONFIG['start_date'][:4]
coverage_end_year = ANALYSIS_CONFIG['end_date'][:4]
print(
    f"\nData Coverage: {ANALYSIS_CONFIG['analysis_years']} years "
    f"({coverage_start_year}-{coverage_end_year}) - Industry Standard"
)
print("This period captures:")
print("  - Post-crisis recovery (2010-2012)")
print("  - Low interest rate environment (2013-2015)")
print("  - Economic expansion (2016-2019)")
print("  - COVID-19 pandemic impact (2020-2021)")
print("  - Interest rate normalization (2022-2024)")
print("\nThis timeframe mirrors what major banks use for model development")

PROJECT CONFIGURATION
Project Folder: mortgage-credit-risk-modeling

Project Information:
  Name: Mortgage Credit Risk Modeling
  Version: 1.0
  Author: Graduate Student - Economics Program
  Course: Advanced Econometrics and Financial Risk Management
  Date: March 2025
  Data Period: 2010Q1 to 2024Q4 (15 years)

Analysis Parameters:
  Sample Size: 100,000
  Observation Months: 60
  Random Seed: 42
  Test Size: 0.2
  Validation Size: 0.2
  Cv Folds: 5
  Start Date: 2010-01-01
  End Date: 2024-12-31
  Analysis Years: 15

Data Coverage: 15 years (2010-2024) - Industry Standard
This period captures:
  - Post-crisis recovery (2010-2012)
  - Low interest rate environment (2013-2015)
  - Economic expansion (2016-2019)
  - COVID-19 pandemic impact (2020-2021)
  - Interest rate normalization (2022-2024)

This timeframe mirrors what major banks use for model development


In [9]:


# Core credit risk settings: default definition (days), LGD bounds,
# classification threshold, confidence level and horizon (months).
RISK_PARAMETERS = dict(
    default_definition_days=90,
    loss_given_default_floor=0.0,
    loss_given_default_ceiling=1.0,
    probability_threshold=0.5,
    confidence_level=0.95,
    time_horizon_months=12,
)

# Macroeconomic stress scenarios, ordered from mildest to most severe.
# All rates and changes are expressed in percent.
STRESS_SCENARIOS = {
    'baseline': dict(
        name='Baseline Economic Scenario',
        unemployment_rate=4.0,
        gdp_growth_rate=2.5,
        house_price_change=3.0,
        description='Normal economic conditions with steady growth',
    ),
    'adverse': dict(
        name='Adverse Economic Scenario',
        unemployment_rate=7.5,
        gdp_growth_rate=-1.5,
        house_price_change=-5.0,
        description='Moderate economic downturn with job losses',
    ),
    'severely_adverse': dict(
        name='Severely Adverse Scenario',
        unemployment_rate=10.0,
        gdp_growth_rate=-3.0,
        house_price_change=-15.0,
        description='Severe economic recession similar to 2008-2009',
    ),
}

# Print the risk settings with human-readable labels.
print("RISK MODELING PARAMETERS")
print("Credit Risk Standards:")
for param_name, param_value in RISK_PARAMETERS.items():
    print("  " + param_name.replace('_', ' ').title() + f": {param_value}")

# Print one summary per scenario; house prices always carry an explicit sign.
print("\nEconomic Stress Testing Scenarios:")
for details in STRESS_SCENARIOS.values():
    summary_lines = [
        f"\n{details['name']}:",
        f"  Description: {details['description']}",
        f"  Unemployment: {details['unemployment_rate']}%",
        f"  GDP Growth: {details['gdp_growth_rate']}%",
        f"  House Prices: {details['house_price_change']:+.1f}%",
    ]
    print("\n".join(summary_lines))




RISK MODELING PARAMETERS
Credit Risk Standards:
  Default Definition Days: 90
  Loss Given Default Floor: 0.0
  Loss Given Default Ceiling: 1.0
  Probability Threshold: 0.5
  Confidence Level: 0.95
  Time Horizon Months: 12

Economic Stress Testing Scenarios:

Baseline Economic Scenario:
  Description: Normal economic conditions with steady growth
  Unemployment: 4.0%
  GDP Growth: 2.5%
  House Prices: +3.0%

Adverse Economic Scenario:
  Description: Moderate economic downturn with job losses
  Unemployment: 7.5%
  GDP Growth: -1.5%
  House Prices: -5.0%

Severely Adverse Scenario:
  Description: Severe economic recession similar to 2008-2009
  Unemployment: 10.0%
  GDP Growth: -3.0%
  House Prices: -15.0%


In [10]:

# Import the os module 
import os
from datetime import datetime  # Also importing datetime since it's used at the end

# Define standardized file paths inside project folder.
# Convention: values ending in '/' are directories and are verified below;
# all other values are file paths, which are only listed, not checked.
FILE_PATHS = {
    'raw_data': f'{PROJECT_FOLDER}/data/raw/',
    'processed_data': f'{PROJECT_FOLDER}/data/processed/',
    'synthetic_data': f'{PROJECT_FOLDER}/data/synthetic/',
    'acquisition_file': f'{PROJECT_FOLDER}/data/raw/acquisition_data.csv',
    'performance_file': f'{PROJECT_FOLDER}/data/raw/performance_data.csv',
    'master_dataset': f'{PROJECT_FOLDER}/data/processed/master_dataset.csv',
    'models': f'{PROJECT_FOLDER}/models/saved_models/',
    'model_reports': f'{PROJECT_FOLDER}/models/model_artifacts/',
    'figures': f'{PROJECT_FOLDER}/results/figures/',
    'tables': f'{PROJECT_FOLDER}/results/tables/',
    'final_report': f'{PROJECT_FOLDER}/results/reports/',
    'notebooks': f'{PROJECT_FOLDER}/notebooks/'
}

# Verify setup completion
print("FILE PATH CONFIGURATION")
all_paths_exist = True

for path_name, path_value in FILE_PATHS.items():
    if path_value.endswith('/'):
        # Use isdir rather than exists: a regular file that happens to sit at
        # this location must not be accepted as a valid project directory.
        exists = os.path.isdir(path_value)
        status = "EXISTS" if exists else "MISSING"
        print(f"  {path_name.replace('_', ' ').title()}: {path_value} - {status}")
        if not exists:
            all_paths_exist = False
    else:
        print(f"  {path_name.replace('_', ' ').title()}: {path_value}")

print("\nSETUP VERIFICATION")
if all_paths_exist:
    print("All directories created successfully")
    print("Configuration parameters defined")
    print("Analysis environment ready")
    print(f"Will analyze {ANALYSIS_CONFIG['sample_size']:,} mortgage loans")
    print(f"{ANALYSIS_CONFIG['observation_months']} months of performance data")
    print(f"{len(STRESS_SCENARIOS)} economic scenarios configured")
else:
    print("Some directories missing - please rerun directory creation")

# Completion time is shown on a 12-hour clock with AM/PM.
print(f"\nNotebook 01 completed at {datetime.now().strftime('%I:%M %p')}")
print("Next step: Proceed to Notebook 02 - Data Understanding and Exploration")

FILE PATH CONFIGURATION
  Raw Data: mortgage-credit-risk-modeling/data/raw/ - EXISTS
  Processed Data: mortgage-credit-risk-modeling/data/processed/ - EXISTS
  Synthetic Data: mortgage-credit-risk-modeling/data/synthetic/ - EXISTS
  Acquisition File: mortgage-credit-risk-modeling/data/raw/acquisition_data.csv
  Performance File: mortgage-credit-risk-modeling/data/raw/performance_data.csv
  Master Dataset: mortgage-credit-risk-modeling/data/processed/master_dataset.csv
  Models: mortgage-credit-risk-modeling/models/saved_models/ - EXISTS
  Model Reports: mortgage-credit-risk-modeling/models/model_artifacts/ - EXISTS
  Figures: mortgage-credit-risk-modeling/results/figures/ - EXISTS
  Tables: mortgage-credit-risk-modeling/results/tables/ - EXISTS
  Final Report: mortgage-credit-risk-modeling/results/reports/ - EXISTS
  Notebooks: mortgage-credit-risk-modeling/notebooks/ - EXISTS

SETUP VERIFICATION
All directories created successfully
Configuration parameters defined
Analysis environment