In [37]:
# Mount Google Drive at /content/drive so that files stored under MyDrive
# (the dataset read in the next cell) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import pandas as pd

# Location of the loan dataset on the mounted Drive. Kept in one named constant
# so the path can be changed in a single place when the file moves.
DATA_PATH = '/content/drive/MyDrive/BUS 458 Final/loan_data_analysis_final.csv'

# Raw loan-application records; `df` is the frame the later cells read from.
df = pd.read_csv(DATA_PATH)

In [39]:
# List the column names of the loaded dataset so features and the target can be
# chosen. As the last expression in the cell, the Index is shown as the output.
df.columns

Index(['User ID', 'applications', 'Reason', 'Granted_Loan_Amount',
       'Requested_Loan_Amount', 'FICO_score', 'Fico_Score_group',
       'Employment_Status', 'Employment_Sector', 'Monthly_Gross_Income',
       'Monthly_Housing_Payment', 'Ever_Bankrupt_or_Foreclose', 'Lender',
       'Approved', 'bounty'],
      dtype='object')

In [42]:
# One-hot encode the categorical columns of X for inspection only.
# get_dummies swaps each listed column for indicator columns; drop_first=True
# omits the first level of every feature, which becomes the implicit baseline.
dummy_cols_for_X = categorical_features
X_transformed_for_display = pd.get_dummies(
    data=X,
    columns=dummy_cols_for_X,
    drop_first=True,
)

# Print a caption followed by the table: first the head of the encoded frame,
# then its descriptive statistics (used to pick input bounds for the app).
for caption, table in (
    ("DataFrame head after creating dummy variables (X_transformed_for_display):",
     X_transformed_for_display.head()),
    ("\nDescriptive statistics of X_transformed_for_display:",
     X_transformed_for_display.describe()),
):
    print(caption)
    print(table)

DataFrame head after creating dummy variables (X_transformed_for_display):
   applications  Granted_Loan_Amount  Requested_Loan_Amount  FICO_score  \
0             1               100000               123000.0       669.0   
1             1                70000                79000.0       594.0   
2             1                10000                11000.0       596.0   
3             1               100000               120000.0       642.0   
4             1                30000                34000.0         NaN   

   Monthly_Gross_Income  Monthly_Housing_Payment  bounty  \
0                5024.0                      927       0   
1                5764.0                     1177       0   
2                4017.0                     1487       0   
3                3129.0                      904       0   
4                   NaN                     1620       0   

   Reason_credit_card_refinancing  Reason_debt_conslidation  \
0                           False                 

In [51]:
# Find bounds for data entry in app
# Prints count/mean/std/min/quartiles/max for the numeric columns of df; the
# min and max values are what the app's input ranges are based on.
# NOTE(review): the printed minimum of Monthly_Gross_Income is negative, which
# looks like a data-quality issue -- confirm before using it as an input bound.
print(df.describe())

       applications  Granted_Loan_Amount  Requested_Loan_Amount    FICO_score  \
count      100000.0         1.000000e+05           1.000000e+05  85000.000000   
mean            1.0         6.478465e+04           7.450309e+04    629.338153   
std             0.0         1.965837e+05           2.264856e+05     88.684201   
min             1.0         5.000000e+03           5.000000e+03    300.000000   
25%             1.0         2.000000e+04           2.400000e+04    572.000000   
50%             1.0         4.000000e+04           4.400000e+04    634.000000   
75%             1.0         7.000000e+04           8.400000e+04    693.000000   
max             1.0         2.000000e+06           2.500000e+06    850.000000   

       Monthly_Gross_Income  Monthly_Housing_Payment  \
count          85000.000000            100000.000000   
mean            5808.783776              1888.900720   
std             2988.185274              3431.924282   
min           -17702.000000               300.

In [43]:
from sklearn.tree import DecisionTreeClassifier

# Build a depth-4 tree with a fixed seed and fit it on the training split in a
# single expression; fit() returns the estimator itself, so the name is bound
# to the trained model.
decision_tree_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=4,
).fit(X_train, y_train)

print("Decision Tree model with max_depth=4 trained successfully.")

Decision Tree model with max_depth=4 trained successfully.


In [44]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Hard class predictions and positive-class probabilities on the held-out split.
y_pred_dt_new = decision_tree_model.predict(X_test)
y_pred_proba_dt_new = decision_tree_model.predict_proba(X_test)[:, 1]

# The four label-based metrics share the same (y_true, y_pred) call shape, so
# they are computed in one pass; ROC AUC needs the probability scores instead.
accuracy_dt_new, precision_dt_new, recall_dt_new, f1_dt_new = (
    scorer(y_test, y_pred_dt_new)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_dt_new = roc_auc_score(y_test, y_pred_proba_dt_new)

# Headline metrics, one per line.
print(
    "\n--- Decision Tree Model Performance (max_depth=4, leakage-free data) ---",
    f"Accuracy: {accuracy_dt_new:.4f}",
    f"Precision: {precision_dt_new:.4f}",
    f"Recall: {recall_dt_new:.4f}",
    f"F1-Score: {f1_dt_new:.4f}",
    f"ROC AUC Score: {roc_auc_dt_new:.4f}",
    sep="\n",
)

# Confusion matrix (rows = true class, columns = predicted class).
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_dt_new), sep="\n")

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:", classification_report(y_test, y_pred_dt_new), sep="\n")


--- Decision Tree Model Performance (max_depth=4, leakage-free data) ---
Accuracy: 0.8901
Precision: 0.4242
Recall: 0.0257
F1-Score: 0.0485
ROC AUC Score: 0.7539

Confusion Matrix:
[[17745    76]
 [ 2123    56]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     17821
           1       0.42      0.03      0.05      2179

    accuracy                           0.89     20000
   macro avg       0.66      0.51      0.50     20000
weighted avg       0.84      0.89      0.84     20000



In [52]:
import pickle

# Serialize the fitted tree to disk so the app can load it without retraining.
model_path = "final_exam_model.pkl"
with open(model_path, "wb") as model_file:
    model_file.write(pickle.dumps(decision_tree_model))

print("Model saved as final_exam_model.pkl")

Model saved as final_exam_model.pkl


In [50]:
# Column labels of the training matrix, in the order the tree saw them.
feature_names = X_train.columns

# Importance score per feature as reported by the fitted tree.
feature_importances = decision_tree_model.feature_importances_

# A feature with importance > 0 is kept; the boolean mask preserves column order.
used_features = feature_names[feature_importances > 0]

# Report the retained feature names as a plain Python list.
print("Features used in the decision tree:", used_features.tolist(), sep="\n")

Features used in the decision tree:
['Lender_B', 'Lender_C', 'Fico_Score_group_good', 'Fico_Score_group_very_good', 'Granted_Loan_Amount', 'FICO_score', 'Monthly_Gross_Income', 'housing_to_income_ratio']


In [None]:
import pickle

import numpy as np
import pandas as pd  # Ensure pandas is imported
from sklearn.preprocessing import StandardScaler


def _save_pickle(obj, path, label):
    """Pickle `obj` to `path` and print '<label> saved as <path>'."""
    with open(path, "wb") as file:
        pickle.dump(obj, file)
    print(f"{label} saved as {path}")


def _ratio_and_cap(numerator, denominator):
    """Divide two Series and make the result finite.

    Returns a tuple (ratio, cap): `cap` is the largest finite value of the raw
    ratio, +/-inf entries are replaced by `cap`, and remaining NaN (e.g. 0/0)
    become 0. The cap is returned so it can be persisted and reused when
    scoring new rows.
    """
    raw_ratio = numerator / denominator
    cap = raw_ratio.replace([np.inf, -np.inf], np.nan).max()
    return raw_ratio.replace([np.inf, -np.inf], cap).fillna(0), cap


# --- Save the trained Decision Tree Model ---
# The current model is decision_tree_model
_save_pickle(decision_tree_model, "final_exam_model.pkl", "Model")

# --- Save the StandardScaler used for numerical features ---
# The scaler is re-fit here on a re-created numerical frame rather than pickling
# the object that was used when the model was trained.
# NOTE(review): this assumes the training-time scaler was fit on exactly this
# frame (same rows, same imputation, same column order) -- confirm, or pickle
# the original scaler object instead.
initial_numerical_features = ['applications', 'Granted_Loan_Amount', 'Requested_Loan_Amount', 'FICO_score', 'Monthly_Gross_Income', 'Monthly_Housing_Payment']
X_numerical_for_scaler = X[initial_numerical_features].copy()

# Impute missing values with median for FICO_score and Monthly_Gross_Income
median_fico = X_numerical_for_scaler['FICO_score'].median()
median_income = X_numerical_for_scaler['Monthly_Gross_Income'].median()
X_numerical_for_scaler['FICO_score'] = X_numerical_for_scaler['FICO_score'].fillna(median_fico)
X_numerical_for_scaler['Monthly_Gross_Income'] = X_numerical_for_scaler['Monthly_Gross_Income'].fillna(median_income)

# Re-create engineered features (column order matters: the scaler is fit on it)
X_numerical_for_scaler['granted_requested_ratio'], max_finite_grr = _ratio_and_cap(
    X_numerical_for_scaler['Granted_Loan_Amount'],
    X_numerical_for_scaler['Requested_Loan_Amount'],
)
X_numerical_for_scaler['housing_to_income_ratio'], max_finite_hir = _ratio_and_cap(
    X_numerical_for_scaler['Monthly_Housing_Payment'],
    X_numerical_for_scaler['Monthly_Gross_Income'],
)

# Now fit the scaler on this prepared X_numerical
scaler = StandardScaler()
scaler.fit(X_numerical_for_scaler)
_save_pickle(scaler, "scaler.pkl", "StandardScaler")

# --- Save the feature columns for consistent input ordering in the app ---
_save_pickle(X_train.columns.tolist(), "feature_columns.pkl", "Feature columns")

# --- Save the values used for imputation ---
# Besides the two medians, the finite caps that replace +/-inf ratios are stored
# too. Without them a consumer cannot reproduce the ratio clean-up done above
# (e.g. an income of 0 yields an infinite housing_to_income_ratio). The extra
# keys are additive, so readers that only use the median keys are unaffected.
imputation_medians = {
    'FICO_score': median_fico,
    'Monthly_Gross_Income': median_income,
    'granted_requested_ratio_cap': float(max_finite_grr),
    'housing_to_income_ratio_cap': float(max_finite_hir),
}
_save_pickle(imputation_medians, "imputation_medians.pkl", "Imputation medians")

# --- Get unique values for categorical features for Streamlit select boxes ---
# Using the original 'df' for unique category values before one-hot encoding.
# Missing values are dropped first: unique() would otherwise return NaN as an
# option, and get_dummies does not create an indicator for NaN by default, so a
# "nan" choice could never match a trained feature.
categorical_option_columns = [
    'Reason',
    'Employment_Status',
    'Lender',
    'Fico_Score_group',
    'Employment_Sector',
    'Ever_Bankrupt_or_Foreclose',  # This is 0/1, will be mapped to Yes/No or similar
]
categorical_options = {
    column: df[column].dropna().unique().tolist()
    for column in categorical_option_columns
}
_save_pickle(categorical_options, "categorical_options.pkl", "Categorical options")

In [None]:
%%writefile hmeqapp.py

# -*- coding: utf-8 -*-
import streamlit as st
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler # Required to unpickle the scaler

# Load the trained model, scaler, feature columns, imputation medians, and categorical options
with open("final_exam_model.pkl", "rb") as file:
    model = pickle.load(file)

with open("scaler.pkl", "rb") as file:
    scaler = pickle.load(file)

with open("feature_columns.pkl", "rb") as file:
    feature_columns = pickle.load(file)

with open("imputation_medians.pkl", "rb") as file:
    imputation_medians = pickle.load(file)

with open("categorical_options.pkl", "rb") as file:
    categorical_options = pickle.load(file)

# Title for the app
st.markdown(
    "<h1 style='text-align: center; background-color: #f0f2f6; padding: 10px; color: #31333F;'><b>Loan Approval Prediction</b></h1>",
    unsafe_allow_html=True
)

st.header("Enter Applicant's Details")

# Input fields for numerical values
st.subheader("Numerical Features")
applications = st.number_input("Applications (APPLICATIONS)", min_value=1, max_value=10, value=1)
granted_loan_amount = st.slider("Granted Loan Amount", min_value=5000, max_value=2000000, value=50000, step=1000)
requested_loan_amount = st.slider("Requested Loan Amount", min_value=5000, max_value=2500000, value=60000, step=1000)
fico_score = st.slider("FICO Score", min_value=300, max_value=850, value=650, step=1)
monthly_gross_income = st.slider("Monthly Gross Income", min_value=0, max_value=20000, value=5000, step=100)
monthly_housing_payment = st.slider("Monthly Housing Payment", min_value=300, max_value=50000, value=1500, step=100)

# Input fields for categorical values
st.subheader("Categorical Features")
reason = st.selectbox("Reason for Loan", categorical_options['Reason'])
employment_status = st.selectbox("Employment Status", categorical_options['Employment_Status'])
lender = st.selectbox("Lender", categorical_options['Lender'])
fico_score_group = st.selectbox("Fico Score Group", categorical_options['Fico_Score_group'])
employment_sector = st.selectbox("Employment Sector", categorical_options['Employment_Sector'])
ever_bankrupt_or_foreclose = st.selectbox("Ever Bankrupt or Foreclose", [0, 1], format_func=lambda x: "Yes" if x == 1 else "No")

# Create a button to make predictions
if st.button("Predict Loan Approval"):
    # Create a DataFrame from current inputs (before preprocessing)
    input_df = pd.DataFrame({
        'applications': [applications],
        'Granted_Loan_Amount': [granted_loan_amount],
        'Requested_Loan_Amount': [requested_loan_amount],
        'FICO_score': [fico_score],
        'Monthly_Gross_Income': [monthly_gross_income],
        'Monthly_Housing_Payment': [monthly_housing_payment],
        'Reason': [reason],
        'Employment_Status': [employment_status],
        'Lender': [lender],
        'Fico_Score_group': [fico_score_group],
        'Employment_Sector': [employment_sector],
        'Ever_Bankrupt_or_Foreclose': [ever_bankrupt_or_foreclose]
    })

    # --- Preprocessing identical to training data ---

    # 1. Impute missing numerical values (if any in original input, though sliders prevent this here)
    # We use saved medians, though for Streamlit inputs, this step might be redundant if all inputs are provided
    input_df['FICO_score'] = input_df['FICO_score'].fillna(imputation_medians['FICO_score'])
    input_df['Monthly_Gross_Income'] = input_df['Monthly_Gross_Income'].fillna(imputation_medians['Monthly_Gross_Income'])

    # 2. Feature Engineering: Create ratio features
    input_df['granted_requested_ratio'] = input_df['Granted_Loan_Amount'] / input_df['Requested_Loan_Amount']
    input_df['housing_to_income_ratio'] = input_df['Monthly_Housing_Payment'] / input_df['Monthly_Gross_Income']

    # Handle potential inf/-inf from division by zero, fillna(0) for engineered features
    input_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    input_df['granted_requested_ratio'] = input_df['granted_requested_ratio'].fillna(0)
    input_df['housing_to_income_ratio'] = input_df['housing_to_income_ratio'].fillna(0)

    # Separate numerical and categorical for consistent processing
    numerical_cols_for_scaling = [
        'applications', 'Granted_Loan_Amount', 'Requested_Loan_Amount',
        'FICO_score', 'Monthly_Gross_Income', 'Monthly_Housing_Payment',
        'granted_requested_ratio', 'housing_to_income_ratio'
    ]
    categorical_cols_for_ohe = [
        'Reason', 'Employment_Status', 'Lender', 'Fico_Score_group',
        'Employment_Sector', 'Ever_Bankrupt_or_Foreclose'
    ]

    input_numerical = input_df[numerical_cols_for_scaling]
    input_categorical = input_df[categorical_cols_for_ohe]

    # 3. Scale numerical features
    input_numerical_scaled = scaler.transform(input_numerical)
    input_numerical_scaled_df = pd.DataFrame(
        input_numerical_scaled, columns=numerical_cols_for_scaling, index=input_df.index
    )

    # 4. One-hot encode categorical features
    input_categorical_ohe = pd.get_dummies(input_categorical, drop_first=True)

    # 5. Concatenate all preprocessed features
    final_input = pd.concat([input_categorical_ohe, input_numerical_scaled_df], axis=1)

    # Ensure the order and presence of columns matches training data
    # This is crucial! Any missing columns (due to a category not present in input_categorical_ohe)
    # must be added with a value of 0. Extra columns should be dropped.
    final_input = final_input.reindex(columns=feature_columns, fill_value=0)

    # Make prediction
    prediction = model.predict(final_input)
    prediction_proba = model.predict_proba(final_input)[:, 1]

    st.subheader("Prediction Results")
    if prediction[0] == 1:
        st.success(f"Loan Approval: YES (Probability: {prediction_proba[0]:.2f})")
        st.balloons()
    else:
        st.error(f"Loan Approval: NO (Probability: {prediction_proba[0]:.2f})")

    st.write("Note: A probability closer to 1 indicates higher likelihood of approval.")