# Credit risk modelling project - A machine learning model
Using a credit risk dataset that contains information about individuals who have taken out loans, I develop a machine learning model that predicts the probability of loan default from the borrowers' financial and demographic characteristics.

In [None]:
# Installing and importing necessary libraries and packages

In [None]:
pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [None]:
# Read the raw loan records from the CSV file in the working directory
df = pd.read_csv('credit_risk_dataset.csv')

# Peek at the top rows to confirm the file parsed as expected
print(df.head())

# --- Exploratory Data Analysis (EDA) ---
# Count of missing entries in each column
print(df.isna().sum())

# Descriptive statistics of the dataset
print(df.describe())

# Class balance of the target: one bar per distinct loan_status value
ax = sns.countplot(data=df, x='loan_status')
ax.set_title('Distribution of Loan Status')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns
# Keep only the numeric columns; the non-numeric ones are left out of the correlation
df_numeric = df.select_dtypes(include=[np.number])

# Pairwise correlation between every pair of numeric columns
corr_matrix = df_numeric.corr()

# Draw the matrix as an annotated heatmap on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Preprocessing
# One-hot encode the categorical columns. drop_first=True leaves out the first level of each
# categorical variable, so the remaining dummy columns are not perfectly collinear
# (perfect collinearity can make the model coefficients unstable).
df = pd.get_dummies(df, drop_first=True)

# Separate the target from the inputs.
# 'loan_status' is the label the model learns to predict (default = 1, no default = 0),
# so it must not appear among the features.
target_column = 'loan_status'
y = df[target_column]               # target: what the model predicts
X = df.drop(columns=[target_column])  # features: every remaining column

# Hold out 30% of the rows for testing and train on the other 70%.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Standardize the features.
# The scaler learns its statistics from the training rows only and then applies the same
# transformation to both sets, so train and test end up on one common scale.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# LogisticRegression is already imported in the notebook's import cell, so only the imputer
# is imported here (the previous re-imports were redundant and train_test_split was unused).
from sklearn.impute import SimpleImputer

# Impute missing values in X_train and X_test.
# Both arrays were standardized in the preceding cell, so the per-feature means used as
# fill values are computed on the scaled training data.
imputer = SimpleImputer(strategy='mean')   # replace each missing value with its feature's mean
X_train = imputer.fit_transform(X_train)   # learn the means from the training data and fill X_train
X_test = imputer.transform(X_test)         # fill X_test with the means learned from X_train

# Fit the logistic regression model: it relates the input features to the binary target
# through a logistic function.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)   # train on the training data



In [None]:
# Predictions
# Predicted class label ('loan_status') for every row of the test features
y_pred = log_reg.predict(X_test)

# Model Evaluation
# Text report comparing the true labels (y_test) with the predicted labels (y_pred)
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

### Interpretation of the Classification Report:

1. **Class 0 (Non-Default) Metrics**:
   - **Precision**: 0.88
     - This means 88% of the loans predicted as non-defaults were actually non-defaults.
   - **Recall**: 0.95
     - The model correctly identified 95% of all actual non-defaults.
   - **F1-Score**: 0.92
     - The harmonic mean of precision and recall, showing a strong performance in predicting non-defaults.

2. **Class 1 (Default) Metrics**:
   - **Precision**: 0.77
     - This means 77% of the loans predicted as defaults were actual defaults.
   - **Recall**: 0.56
     - The model correctly identified 56% of all actual defaults, indicating some defaults were missed.
   - **F1-Score**: 0.65
     - This lower F1-score suggests the model is less effective in predicting defaults compared to non-defaults.

3. **Overall Metrics**:
   - **Accuracy**: 0.86
     - The overall accuracy of the model is 86%, meaning it correctly classified 86% of all loans.
   - **Macro Average** (average of precision, recall, and F1-score for both classes):
     - **Precision**: 0.83, **Recall**: 0.75, **F1-Score**: 0.78
     - This provides a balanced view of performance across classes.
   - **Weighted Average** (weighted by support):
     - **Precision**: 0.86, **Recall**: 0.86, **F1-Score**: 0.86
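     - Because each class is weighted by its support, these figures are dominated by the larger non-default class and therefore sit close to the overall accuracy; they should be read alongside the per-class metrics given the class imbalance.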

In [None]:
# Confusion Matrix
# Rows are the actual classes and columns the predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)

# Annotated heatmap; fmt='d' prints the cell counts as plain integers
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

### Interpretation of the Confusion Matrix:

1. **True Negatives (TN)**: 7,247
   - The model correctly predicted 7,247 instances as non-defaults (actual non-defaults predicted as non-defaults).

2. **False Positives (FP)**: 366
   - The model incorrectly predicted 366 instances as defaults when they were actually non-defaults (false alarms).

3. **False Negatives (FN)**: 956
   - The model incorrectly predicted 956 instances as non-defaults when they were actually defaults (missed defaults).

4. **True Positives (TP)**: 1,206
   - The model correctly predicted 1,206 instances as defaults (actual defaults predicted as defaults).

### Key Insights:

- **High True Negatives**: The model is effective at correctly identifying non-defaults, as shown by the large number of true negatives (7,247).
  
- **Challenges with False Negatives**: There are 956 false negatives, indicating that the model missed a significant number of actual defaults. This suggests the model could be underestimating the risk of default for some loans.

- **Moderate True Positives and False Positives**: The model correctly identified a moderate number of defaults (1,206) and raised comparatively few false alarms (366). Precision for the default class is therefore reasonable, but there is still room for improvement in how many actual defaults the model catches.

In [None]:
# Receiver Operating Characteristic (ROC) curve and Area Under the Curve (AUC)

# Second column of predict_proba: the predicted probability of the class listed second in
# log_reg.classes_ (presumably the default class, 1, for a 0/1 target)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# False/true positive rates at each candidate threshold, and the area under that curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal: reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()


- ROC Curve: Visualizes the performance of the model across all classification thresholds, showing how well the model separates the classes.
- AUC: Provides a single metric that captures the model's overall ability to discriminate between positive and negative classes. A higher AUC indicates better performance, with values closer to 1.0 showing strong discrimination.

### Interpretation of the ROC Curve:

1. **Shape of the ROC Curve**:
   - The curve shows a good balance between True Positive Rate (TPR) and False Positive Rate (FPR).
   - The curve rises quickly towards the top-left corner, indicating that the model is effective at distinguishing between positive and negative classes at various thresholds.

2. **AUC (Area Under the Curve)**:
   - The AUC value of **0.87** suggests that the model has strong discriminatory power.
   - An AUC of 0.87 means that, given one randomly chosen positive instance (default) and one randomly chosen negative instance (non-default), there is an 87% chance that the model assigns the higher predicted default probability to the actual default.
   - Values close to 1 indicate excellent model performance, while values closer to 0.5 suggest a model with no discriminatory power.

3. **Diagonal Reference Line**:
   - The dashed line (diagonal from (0,0) to (1,1)) represents the performance of a random classifier with AUC = 0.5.
   - The model's ROC curve is well above this line, confirming that it performs significantly better than random guessing.

In [None]:
# Save the Trained Model and Scaler:
import joblib

# Output file name -> fitted object written to that file
# NOTE(review): the fitted `imputer` from the training cell is not persisted here; confirm
# whether downstream consumers of these files ever need to handle missing values.
artifacts = {
    'trained_model.pkl': log_reg,   # the trained logistic regression model
    'scaler.pkl': scaler,           # the scaler used for transforming the data
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("Model and scaler have been saved successfully.")


# Applying the newly developed machine learning model to predict the probability of default for a new customer

## 1. Version 1: Simple version - modify the input of new customer's information to obtain the probability of default

In [None]:
# Define the function

import pandas as pd

def predict_default_probability(
    age, income, home_ownership, emp_length, loan_intent, loan_grade,
    loan_amnt, loan_int_rate, loan_percent_income, cb_default, cred_hist_length,
    model, scaler, feature_columns):
    """
    Predicts the probability of default for a new customer.

    The categorical arguments are matched case-insensitively and with surrounding
    whitespace ignored (e.g. ' mortgage ' is treated as 'MORTGAGE'). A categorical value
    that matches none of the dummy columns present in `feature_columns` is encoded as all
    zeros, i.e. as the baseline level of that variable.

    Parameters:
    - age (int): Age of the customer
    - income (float): Income of the customer
    - home_ownership (str): Home ownership status (RENT, MORTGAGE, OWN, OTHER)
    - emp_length (int): Employment length in years
    - loan_intent (str): Purpose of the loan (PERSONAL, EDUCATION, MEDICAL, HOMEIMPROVEMENT, DEBTCONSOLIDATION, VENTURE)
    - loan_grade (str): Loan grade. B, C, D, E, F and G map to dummy columns; any other
      value (presumably the baseline grade 'A' - confirm against the dataset) is encoded
      as all zeros
    - loan_amnt (float): Amount of the loan
    - loan_int_rate (float): Interest rate of the loan
    - loan_percent_income (float): Share of income dedicated to loan payments, on the same
      scale as the training column 'loan_percent_income'
    - cb_default (str): Credit bureau default on file ('Y' or 'N')
    - cred_hist_length (int): Length of credit history
    - model (sklearn model): Trained classifier exposing `predict_proba`
    - scaler (sklearn scaler): Fitted scaler exposing `transform`
    - feature_columns (sequence of str): Feature column names of the training set, in
      training order (a list or a pandas Index such as `X.columns`)

    Returns:
    - float: Predicted probability of default (second column of `predict_proba`)
    """

    def _level(value):
        # Normalise a categorical label so that case and stray spaces cannot silently
        # turn a valid category into the all-zero baseline encoding.
        return str(value).strip().upper()

    home_ownership = _level(home_ownership)
    loan_intent = _level(loan_intent)
    loan_grade = _level(loan_grade)
    cb_default = _level(cb_default)

    # Numeric features plus the single binary flag
    new_customer = {
        'person_age': age,
        'person_income': income,
        'person_emp_length': emp_length,
        'loan_amnt': loan_amnt,
        'loan_int_rate': loan_int_rate,
        'loan_percent_income': loan_percent_income,
        'cb_person_default_on_file_Y': int(cb_default == 'Y'),
        'cb_person_cred_hist_length': cred_hist_length,
    }

    # One-hot columns: column prefix -> (chosen level, levels that may have a dummy column)
    dummy_levels = {
        'person_home_ownership': (home_ownership, ('RENT', 'MORTGAGE', 'OWN', 'OTHER')),
        'loan_intent': (loan_intent, ('PERSONAL', 'EDUCATION', 'MEDICAL', 'HOMEIMPROVEMENT',
                                      'DEBTCONSOLIDATION', 'VENTURE')),
        'loan_grade': (loan_grade, ('B', 'C', 'D', 'E', 'F', 'G')),
    }
    for prefix, (chosen, levels) in dummy_levels.items():
        for level in levels:
            new_customer[f'{prefix}_{level}'] = int(chosen == level)

    # Align with the training layout in one step: keep only the training columns, in
    # training order, and add any column the customer record lacks with the value 0.
    new_customer_df = pd.DataFrame([new_customer]).reindex(
        columns=list(feature_columns), fill_value=0
    )

    # Apply the same scaling as during training
    new_customer_scaled = scaler.transform(new_customer_df)

    # Row 0 is the single customer; column 1 is the probability of default
    probability_of_default = model.predict_proba(new_customer_scaled)[0][1]

    return probability_of_default


**To use this function, pass in the details of the new customer:**

In [None]:
# Example usage: to estimate the default probability, simply modify the values of 'age', 'income',
# 'home_ownership', 'emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate',
# 'loan_percent_income', 'cb_default' and 'cred_hist_length' below, then run the cell.

# Details of the new customer - every value here can be modified
example_customer = dict(
    age=35,
    income=60000,
    home_ownership='MORTGAGE',
    emp_length=6,
    loan_intent='PERSONAL',
    loan_grade='C',
    loan_amnt=10000,
    loan_int_rate=14.5,
    loan_percent_income=0.2,
    cb_default='N',
    cred_hist_length=5,
)

# Score the customer with the fitted model, the fitted scaler and the training feature names
probability = predict_default_probability(
    **example_customer,
    model=log_reg,
    scaler=scaler,
    feature_columns=X.columns,
)

print(f"The predicted probability of default is: {probability:.2f}")


## 2. Version 2: Interactive Function version - Enter each detail of the new customer when asked to calculate the probability of default

Here’s an enhanced version of the function that will interactively prompt you to enter each piece of customer information. The function will guide you through entering details such as age, income, home ownership, and other relevant factors. After gathering all the inputs, it will calculate and display the probability of default.

In [None]:
import pandas as pd

def predict_default_probability_interactive(model, scaler, feature_columns):
    """
    Interactively collects customer information and predicts the probability of default.

    The customer's details are read one by one with `input()`. Text answers are stripped
    and upper-cased before being encoded, so they are matched case-insensitively. The
    result is printed and also returned.

    Parameters:
    - model (sklearn model): Trained classifier exposing `predict_proba`
    - scaler (sklearn scaler): Fitted scaler exposing `transform`
    - feature_columns (sequence of str): Feature column names of the training set, in
      training order (a list or a pandas Index such as `X.columns`)

    Returns:
    - float: Predicted probability of default

    Raises:
    - ValueError: If a numeric answer cannot be parsed by `int()` / `float()`
    """

    def _ask_category(prompt):
        # Read a text answer and normalise it to the upper-case form used in the column names
        return input(prompt).strip().upper()

    # Collect inputs from the user (the order of the questions is fixed)
    age = int(input("Enter the customer's age: "))
    income = float(input("Enter the customer's income: "))
    home_ownership = _ask_category(
        "Enter the customer's home ownership status (RENT, MORTGAGE, OWN, OTHER): ")
    emp_length = int(input("Enter the customer's employment length (in years): "))
    loan_intent = _ask_category(
        "Enter the customer's loan intent (PERSONAL, EDUCATION, MEDICAL, HOMEIMPROVEMENT, DEBTCONSOLIDATION, VENTURE): ")
    # NOTE(review): the prompt lists grades B-G only; any other answer (presumably the
    # baseline grade 'A') is encoded as all zeros - confirm and consider listing it.
    loan_grade = _ask_category("Enter the customer's loan grade (B, C, D, E, F, G): ")
    loan_amnt = float(input("Enter the loan amount: "))
    loan_int_rate = float(input("Enter the loan interest rate (as a percentage): "))
    # NOTE(review): the notebook's own example call passes 0.2 for this field, which suggests
    # a fraction rather than a 0-100 percentage is expected - confirm against the dataset.
    loan_percent_income = float(input("Enter the percentage of income dedicated to loan payments: "))
    cb_default = _ask_category("Is there a credit bureau default on file? (Y/N): ")
    cred_hist_length = int(input("Enter the length of the customer's credit history (in years): "))

    # Numeric features plus the single binary flag
    new_customer = {
        'person_age': age,
        'person_income': income,
        'person_emp_length': emp_length,
        'loan_amnt': loan_amnt,
        'loan_int_rate': loan_int_rate,
        'loan_percent_income': loan_percent_income,
        'cb_person_default_on_file_Y': int(cb_default == 'Y'),
        'cb_person_cred_hist_length': cred_hist_length,
    }

    # One-hot columns: column prefix -> (chosen level, levels that may have a dummy column)
    dummy_levels = {
        'person_home_ownership': (home_ownership, ('RENT', 'MORTGAGE', 'OWN', 'OTHER')),
        'loan_intent': (loan_intent, ('PERSONAL', 'EDUCATION', 'MEDICAL', 'HOMEIMPROVEMENT',
                                      'DEBTCONSOLIDATION', 'VENTURE')),
        'loan_grade': (loan_grade, ('B', 'C', 'D', 'E', 'F', 'G')),
    }
    for prefix, (chosen, levels) in dummy_levels.items():
        for level in levels:
            new_customer[f'{prefix}_{level}'] = int(chosen == level)

    # Align with the training layout: keep only the training columns, in training order,
    # and add any column the customer record lacks with the value 0.
    new_customer_df = pd.DataFrame([new_customer]).reindex(
        columns=list(feature_columns), fill_value=0
    )

    # Apply the same scaling as during training
    new_customer_scaled = scaler.transform(new_customer_df)

    # Row 0 is the single customer; column 1 is the probability of default
    probability_of_default = model.predict_proba(new_customer_scaled)[0][1]

    print(f"\nThe predicted probability of default for the customer is: {probability_of_default:.2f}")

    # Return the value as the docstring promises, so callers can reuse it programmatically
    return probability_of_default
    


**How to Use the Interactive Function:** 
- Run the function in your Python environment.
- The function will prompt you to enter each piece of customer information step by step.
- After entering all the details, the function will calculate and display the probability of default.

In [None]:
# Start the interactive prompt with the trained model, the fitted scaler and the feature columns.
# The trailing semicolon stops the notebook from echoing any return value below the printed result.
predict_default_probability_interactive(model=log_reg, scaler=scaler, feature_columns=X.columns);