In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

In [21]:
df = pd.read_csv('data/loan_data.csv')

In [41]:
X = df.drop(['credit.policy', 'not.fully.paid'], axis=1)
y = df['credit.policy']

In [42]:
categorical_cols = X.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    le = LabelEncoder()
    for col in categorical_cols:
        X[col] = le.fit_transform(X[col])
    joblib.dump(le, 'label_encoder.pkl')  # Save the LabelEncoder
else:
    le = None  # No categorical variables

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [45]:
print("Training Models...")

Training Models...


In [46]:
lr_params = {'C': [0.01, 0.1, 1], 'max_iter': [1000]}
lr = GridSearchCV(LogisticRegression(), lr_params, cv=5, scoring='accuracy')
lr.fit(X_train_scaled, y_train)

In [47]:
rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
rf.fit(X_train, y_train)

In [None]:
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.1]}
xgb = GridSearchCV(XGBClassifier(random_state=42), xgb_params, cv=5, scoring='accuracy')
xgb.fit(X_train, y_train)

Training Random Forest...


In [48]:
def evaluate_model(model, X_test, y_test, name, scaled=False):
    X_eval = X_test_scaled if scaled else X_test
    y_pred = model.predict(X_eval)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} - Best Params: {model.best_params_}, Test Accuracy: {acc:.4f}")
    return acc

lr_acc = evaluate_model(lr, X_test, y_test, "Logistic Regression", scaled=True)
rf_acc = evaluate_model(rf, X_test, y_test, "Random Forest")
xgb_acc = evaluate_model(xgb, X_test, y_test, "XGBoost")

Logistic Regression - Best Params: {'C': 0.1, 'max_iter': 1000}, Test Accuracy: 0.9061
Random Forest - Best Params: {'max_depth': 20, 'n_estimators': 200}, Test Accuracy: 0.9875
XGBoost - Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}, Test Accuracy: 0.9890


In [51]:
models = {'Logistic Regression': (lr, True), 'Random Forest': (rf, False), 'XGBoost': (xgb, False)}
best_model_name = max([(lr_acc, 'Logistic Regression'), (rf_acc, 'Random Forest'), (xgb_acc, 'XGBoost')], 
                     key=lambda x: x[0])[1]
best_model, needs_scaling = models[best_model_name]
joblib.dump(best_model, 'best_loan_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print(f"\nBest Model: {best_model_name}")


Best Model: XGBoost


In [52]:
best_model = joblib.load('best_loan_model.pkl')
scaler = joblib.load('scaler.pkl')
try:
    le = joblib.load('label_encoder.pkl')
except FileNotFoundError:
    le = None
    print("Warning: label_encoder.pkl not found. Assuming no categorical variables.")

In [53]:
def predict_loan_approval(features, model, scaler, label_encoder=None, needs_scaling=False):
    try:
        # Convert input features to DataFrame
        features_df = pd.DataFrame([features])
        
        # Handle categorical variables if present
        if label_encoder and 'purpose' in features_df.columns:  # Adjust 'purpose' to your categorical column
            features_df['purpose'] = label_encoder.transform(features_df['purpose'])
        
        # Ensure all required columns are present and in correct order
        required_columns = X.columns.tolist()  # Use training columns dynamically
        for col in required_columns:
            if col not in features_df.columns:
                raise ValueError(f"Missing required feature: {col}")
        
        features_df = features_df[required_columns]  # Reorder columns
        
        # Scale features if needed
        if needs_scaling:
            features_scaled = scaler.transform(features_df)
            probability = model.predict_proba(features_scaled)[0]
        else:
            probability = model.predict_proba(features_df)[0]
        
        # Return probability of approval (class 1) as percentage
        approval_prob = probability[1] * 100
        return approval_prob
    
    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return None

In [54]:
sample_features = {
    'int.rate': 0.12,
    'installment': 300,
    'log.annual.inc': 11.0,
    'dti': 15.0,
    'fico': 700,
    'revol.util': 30.0,
    'inq.last.6mths': 2,
    'purpose': 'major_purchase'  # Include if in your dataset
}

prediction = predict_loan_approval(
    sample_features, 
    best_model, 
    scaler, 
    label_encoder=le,
    needs_scaling=(best_model_name == 'Logistic Regression')
)

if prediction is not None:
    print(f"\nLoan Approval Probability: {prediction:.2f}%")
else:
    print("Prediction failed.")

Error in prediction: Missing required feature: days.with.cr.line
Prediction failed.
