In [1]:
from sklearn.ensemble import RandomForestClassifier
from loan_default_model import preprocess, train_and_evaluate_model, make_predictions
import pandas as pd
import joblib
import pandas as pd
import altair as alt
import numpy as np
from sklearn.ensemble import StackingClassifier


In [10]:
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


# Load the raw credit-risk records, discard every row that has a missing
# value, and pass the remainder through the project's `preprocess` helper.
data = pd.read_csv('credit_risk_dataset.csv').dropna()
preprocessed_data = preprocess(data)

# Base learners of the stacked ensemble; every one is seeded with 42.
base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
]

# A default LogisticRegression is the final (meta) estimator of the stack.
model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
)

# `train_and_evaluate_model` comes from loan_default_model; whatever it
# returns is kept as `trained_model` and is read again further down the notebook.
trained_model = train_and_evaluate_model(model, preprocessed_data)

# Persist the result; binding the return value to `_` stops the cell from
# echoing it as its output.
_ = joblib.dump(trained_model, 'model.joblib')

Confusion Matrix:
[[4404   39]
 [ 320  965]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      4443
         1.0       0.96      0.75      0.84      1285

    accuracy                           0.94      5728
   macro avg       0.95      0.87      0.90      5728
weighted avg       0.94      0.94      0.93      5728



['model.joblib']

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import pandas as pd

# Base learners shared by the 'Stacking' entry below (all seeded with 42).
stacking_base_estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
]

# Candidate classifiers, keyed by the label used in the printed reports.
# Insertion order is the evaluation order, and it also decides which model is
# reported when two models reach the same score.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0),
    'LightGBM': LGBMClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Stacking': StackingClassifier(
        estimators=stacking_base_estimators,
        final_estimator=LogisticRegression(),
    ),
}

# Rebuild the modelling table from the CSV (rows with missing values removed).
data = pd.read_csv('credit_risk_dataset.csv').dropna()
preprocessed_data = preprocess(data)

# 'loan_status' is the target; every other column is used as a feature.
y = preprocessed_data['loan_status']
X = preprocessed_data.drop(columns='loan_status')

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Running winner: name of the best model so far and its weighted-average F1.
best_model_name, best_weighted_avg_f1 = None, 0

# Fit each candidate on the training split and score it on the held-out split.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Human-readable report for this model.
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred))
    print("\n")

    # Same report as a dict, to pull out the single ranking metric.
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted_avg_f1 = report['weighted avg']['f1-score']

    # Strict '>' keeps the earlier model when two scores are equal.
    if weighted_avg_f1 > best_weighted_avg_f1:
        best_model_name, best_weighted_avg_f1 = model_name, weighted_avg_f1

print(f"Best Model: {best_model_name} with Weighted Avg F1-Score: {best_weighted_avg_f1}")

Model: Random Forest
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      4443
         1.0       0.97      0.72      0.83      1285

    accuracy                           0.93      5728
   macro avg       0.95      0.86      0.89      5728
weighted avg       0.94      0.93      0.93      5728



Model: Gradient Boosting
              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95      4443
         1.0       0.94      0.71      0.81      1285

    accuracy                           0.92      5728
   macro avg       0.93      0.85      0.88      5728
weighted avg       0.92      0.92      0.92      5728



Model: XGBoost
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      4443
         1.0       0.97      0.74      0.84      1285

    accuracy                           0.94      5728
   macro avg       0.95      0.87      0.90      5728
weighted 

In [8]:

# Write the trained model to disk wrapped in a one-key dict, then read the
# file straight back and unwrap it.
# NOTE(review): joblib.load unpickles the file - only load artifacts you trust.
MODEL_BUNDLE_PATH = 'loan_default_model.pkl'
joblib.dump({'model': trained_model}, MODEL_BUNDLE_PATH)
loaded_dict = joblib.load(MODEL_BUNDLE_PATH)
loaded_model = loaded_dict['model']

# One hand-written applicant record to score. Keys and their order are kept
# exactly as they were, in case `make_predictions` depends on them.
# NOTE(review): the record contains 'loan_status', which is used as the target
# column elsewhere in this notebook - confirm `make_predictions` ignores or
# drops it rather than feeding it to the model.
# NOTE(review): person_emp_length of 123.0 for a 22-year-old looks implausible;
# confirm this value is intended.
features_dict = {
    "person_age": 22,
    "person_income": 59000,
    "person_home_ownership": "RENT",
    "person_emp_length": 123.0,
    "loan_grade": "D",
    "loan_amnt": 35000,
    "loan_int_rate": 16.02,
    "loan_status": 1,
    "loan_percent_income": 0.59,
    "cb_person_default_on_file": "Y",
    "cb_person_cred_hist_length": 3,
    "loan_intent_EDUCATION": 0.0,
    "loan_intent_HOMEIMPROVEMENT": 0.0,
    "loan_intent_MEDICAL": 1.0,
    "loan_intent_PERSONAL": 0.0,
    "loan_intent_VENTURE": 0.0,
    "DTI": 3.778785,
}

# Score the record with the reloaded model via the project helper, then print
# whatever the helper returned.
result = make_predictions(loaded_model, features_dict)
print(result)

The loan has a 98.08% chance of defaulting
[[0.01919406 0.98080594]]
