In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score




In [7]:
df = pd.read_csv("../data/cs-training.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [8]:
df.drop('Unnamed: 0',axis=1,inplace=True)

df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [9]:
X=df.drop('SeriousDlqin2yrs',axis=1)
y=df['SeriousDlqin2yrs']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [10]:
imputer = SimpleImputer(strategy='median',add_indicator=True)
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)



In [11]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=190,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_imputed, y_train)

# 7. Evaluate
y_pred_rf = rf_model.predict_proba(X_test_imputed)[:, 1]
auc_rf = roc_auc_score(y_test, y_pred_rf)

auc_rf

0.8626256023619326

In [12]:
print("="*60)
print("CHECKING FOR OVERFITTING")
print("="*60)

# 1. Get predictions on BOTH train and test
y_pred_train_rf = rf_model.predict_proba(X_train_imputed)[:, 1]  # or X_train_scaled if you scaled
y_pred_test_rf = rf_model.predict_proba(X_test_imputed)[:, 1]

# 2. Calculate AUC for both
train_auc_rf = roc_auc_score(y_train, y_pred_train_rf)
test_auc_rf = roc_auc_score(y_test, y_pred_test_rf)

# 3. Compare
print(f"Train AUC: {train_auc_rf:.4f}")
print(f"Test AUC:  {test_auc_rf:.4f}")
print(f"Gap:       {abs(train_auc_rf - test_auc_rf):.4f}")

# 4. Interpretation
gap = abs(train_auc_rf - test_auc_rf)

if gap < 0.02:
    print("\n‚úÖ EXCELLENT - No overfitting!")
    print("Model generalizes very well.")
elif gap < 0.05:
    print("\n‚úÖ GOOD - Minimal overfitting")
    print("This is acceptable for Random Forest.")
elif gap < 0.10:
    print("\n‚ö†Ô∏è  MODERATE overfitting")
    print("Consider: reduce max_depth, increase min_samples_split")
else:
    print("\n‚ùå SEVERE overfitting!")
    print("Model memorizing training data. Need to regularize.")

CHECKING FOR OVERFITTING
Train AUC: 0.8814
Test AUC:  0.8626
Gap:       0.0187

‚úÖ EXCELLENT - No overfitting!
Model generalizes very well.


In [13]:
from sklearn.metrics import classification_report, confusion_matrix

print("\n" + "="*60)
print("DETAILED PERFORMANCE METRICS")
print("="*60)

# Get class predictions
y_pred_rf = rf_model.predict(X_test_imputed)

# Classification report
print("\nRandom Forest - Classification Report:")
print(classification_report(y_test, y_pred_rf, 
                          target_names=['No Default', 'Default']))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:")
print(cm)

tn, fp, fn, tp = cm.ravel()
print(f"\nTrue Negatives:  {tn:,}")
print(f"False Positives: {fp:,}")
print(f"False Negatives: {fn:,}")
print(f"True Positives:  {tp:,}")

# Calculate recall for defaulters
recall_default = tp / (tp + fn)
print(f"\nüìä Recall for Defaulters: {recall_default:.2%}")
print(f"   (Catching {recall_default:.0%} of people who will default)")


DETAILED PERFORMANCE METRICS

Random Forest - Classification Report:
              precision    recall  f1-score   support

  No Default       0.98      0.81      0.89     28044
     Default       0.22      0.76      0.34      1956

    accuracy                           0.80     30000
   macro avg       0.60      0.79      0.61     30000
weighted avg       0.93      0.80      0.85     30000


Confusion Matrix:
[[22653  5391]
 [  462  1494]]

True Negatives:  22,653
False Positives: 5,391
False Negatives: 462
True Positives:  1,494

üìä Recall for Defaulters: 76.38%
   (Catching 76% of people who will default)


In [14]:
# Check if you have these variables
print("Checking what we have...")

# The trained model
print(f"‚úì Model: {type(rf_model)}")

# The feature columns
print(f"‚úì Features ({len(X.columns)}): {X.columns.tolist()}")

# Did you scale?
try:
    print(f"‚úì Scaler: {type(scaler)}")
    scaled = True
except:
    print("‚úó No scaler (didn't scale data)")
    scaled = False

Checking what we have...
‚úì Model: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
‚úì Features (10): ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
‚úó No scaler (didn't scale data)


In [25]:

import joblib
# These are the features a user will provide
original_features = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents'
]

# Save them
features_path = '../models/feature_names.pkl'
joblib.dump(original_features, features_path)

print(f"‚úì Original feature names saved: {features_path}")
print(f"  Features: {len(original_features)}")
for i, name in enumerate(original_features, 1):
    print(f"    {i}. {name}")

‚úì Original feature names saved: ../models/feature_names.pkl
  Features: 10
    1. RevolvingUtilizationOfUnsecuredLines
    2. age
    3. NumberOfTime30-59DaysPastDueNotWorse
    4. DebtRatio
    5. MonthlyIncome
    6. NumberOfOpenCreditLinesAndLoans
    7. NumberOfTimes90DaysLate
    8. NumberRealEstateLoansOrLines
    9. NumberOfTime60-89DaysPastDueNotWorse
    10. NumberOfDependents


In [26]:
import os
# Save Random Forest model
model_path = '../models/random_forest_model.pkl'
joblib.dump(rf_model, model_path)

print(f"‚úì Model saved: {model_path}")

# Check file size
file_size = os.path.getsize(model_path) / (1024 * 1024)
print(f"  Size: {file_size:.2f} MB")

‚úì Model saved: ../models/random_forest_model.pkl
  Size: 3.28 MB


In [27]:
# Get the feature names the model was trained on
print("Model expects these features:")
print(f"Number of features: {rf_model.n_features_in_}")

# If you have X_train from training
print("\nFeatures used during training:")
print(X_train.columns.tolist())

# OR check X (before splitting)
print("\nAll columns in X:")
print(X.columns.tolist())

Model expects these features:
Number of features: 12

Features used during training:
['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

All columns in X:
['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']


In [28]:
# Check the full dataframe before you created X
print("All columns in df:")
print(df.columns.tolist())
print(f"\nTotal columns in df: {len(df.columns)}")

# Look for columns with 'missing' in the name
missing_indicator_cols = [col for col in df.columns if 'missing' in col.lower()]
print(f"\nMissing indicator columns: {missing_indicator_cols}")

All columns in df:
['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

Total columns in df: 11

Missing indicator columns: []


In [29]:
import joblib

# Save the fitted imputer (CRITICAL!)
imputer_path = '../models/imputer.pkl'
joblib.dump(imputer, imputer_path)

print(f"‚úì Imputer saved: {imputer_path}")

‚úì Imputer saved: ../models/imputer.pkl


In [31]:
import pandas as pd
import numpy as np

print("\n" + "="*60)
print("TESTING COMPLETE PIPELINE")
print("="*60)

# Load everything
loaded_model = joblib.load('../models/random_forest_model.pkl')
loaded_imputer = joblib.load('../models/imputer.pkl')
loaded_features = joblib.load('../models/feature_names.pkl')

print("‚úì Model loaded")
print("‚úì Imputer loaded")
print(f"‚úì Features loaded: {len(loaded_features)}")

# Create test data (10 features - what user provides)
test_data = {
    'RevolvingUtilizationOfUnsecuredLines': 0.5,
    'age': 35,
    'NumberOfTime30-59DaysPastDueNotWorse': 0,
    'DebtRatio': 0.3,
    'MonthlyIncome': 5000,
    'NumberOfOpenCreditLinesAndLoans': 8,
    'NumberOfTimes90DaysLate': 0,
    'NumberRealEstateLoansOrLines': 1,
    'NumberOfTime60-89DaysPastDueNotWorse': 0,
    'NumberOfDependents': 2
}

# Convert to DataFrame (10 features)
test_df = pd.DataFrame([test_data])[loaded_features]
print(f"\n1. User input: {test_df.shape} (10 features)")

# Apply imputer (adds 2 indicator columns)
test_imputed = loaded_imputer.transform(test_df)
print(f"2. After imputer: {test_imputed.shape} (12 features)")

# Predict
prob = loaded_model.predict_proba(test_imputed)[0][1]
score = int((1 - prob) * 100)

print(f"\n3. ‚úì Prediction works!")
print(f"   Default Probability: {prob:.2%}")
print(f"   Credit Score: {score}")

if score >= 80:
    tier = "Excellent"
elif score >= 70:
    tier = "Good"
elif score >= 60:
    tier = "Fair"
else:
    tier = "Poor"
    
print(f"   Risk Tier: {tier}")

print("\n" + "="*60)
print("‚úÖ COMPLETE PIPELINE WORKS!")
print("="*60)
print("\nFiles you need for FastAPI:")
print("  1. models/random_forest_model.pkl")
print("  2. models/imputer.pkl  ‚Üê NEW!")
print("  3. models/feature_names.pkl")



TESTING COMPLETE PIPELINE
‚úì Model loaded
‚úì Imputer loaded
‚úì Features loaded: 10

1. User input: (1, 10) (10 features)
2. After imputer: (1, 12) (12 features)

3. ‚úì Prediction works!
   Default Probability: 31.95%
   Credit Score: 68
   Risk Tier: Fair

‚úÖ COMPLETE PIPELINE WORKS!

Files you need for FastAPI:
  1. models/random_forest_model.pkl
  2. models/imputer.pkl  ‚Üê NEW!
  3. models/feature_names.pkl
