In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import shap
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load dataset with proper delimiter
data = pd.read_csv('credit_data.csv', sep=';')
data.head()

Unnamed: 0,YOB,NKID,DEP,PHON,SINC,AES,DAINC,RES,DHVAL,DMORT,DOUTM,DOUTL,DOUTHP,DOUTCC,BAD
0,19.0,4.0,0.0,1,0.0,R,0.0,O,14464.0,4.0,0.0,0.0,0.0,0.0,0.0
1,41.0,2.0,0.0,1,0.0,P,36000.0,O,0.0,0.0,280.0,664.0,0.0,80.0,0.0
2,66.0,0.0,0.0,1,0.0,N,30000.0,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,51.0,2.0,0.0,1,0.0,P,464.0,O,24928.0,8464.0,584.0,320.0,0.0,60.0,0.0
4,65.0,0.0,0.0,1,0.0,P,15000.0,P,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Rename columns for clarity
data.rename(columns={
    'YOB': 'year_of_birth',
    'NKID': 'num_kids',
    'DEP': 'num_dependents',
    'PHON': 'has_phone',
    'SINC': 'annual_income',
    'AES': 'employment_status',
    'DAINC': 'declared_income',
    'RES': 'residence_status',
    'DHVAL': 'home_value',
    'DMORT': 'mortgage_due',
    'DOUTM': 'outstanding_monthly',
    'DOUTL': 'outstanding_loans',
    'DOUTHP': 'outstanding_home_loan',
    'DOUTCC': 'outstanding_credit_card',
    'BAD': 'loan_status'
}, inplace=True)

# Convert all columns to numeric where possible
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')


data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)


In [4]:
# Basic preprocessing
data = data.dropna()

In [5]:
# Feature Engineering with domain insights (safe operations)
data['dti_ratio'] = data['outstanding_loans'] / data['annual_income'].replace(0, np.nan)
data['emi_income_ratio'] = data['outstanding_monthly'] / data['annual_income'].replace(0, np.nan)
data['home_equity'] = data['home_value'] - data['mortgage_due']

# Replace inf/-inf with NaN
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with any NaN values created
data.dropna(inplace=True)

In [6]:
# Set features and target
X = data.drop('loan_status', axis=1)
y = data['loan_status']

In [7]:
# Encode categorical features
X = pd.get_dummies(X, drop_first=True)

In [8]:
# Save column order for future single user predictions
column_order = X.columns

print("Data shape after cleaning:", data.shape)


Data shape after cleaning: (317, 18)


In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Handling class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [12]:
# Initialize classifiers
clf1 = LogisticRegression(solver='liblinear', random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf3 = SVC(kernel='rbf', probability=True, random_state=42)
clf4 = KNeighborsClassifier(n_neighbors=5)

In [13]:
# Ensemble using soft voting
ensemble = VotingClassifier(estimators=[
    ('lr', clf1),
    ('rf', clf2),
    ('svm', clf3),
    ('knn', clf4)
], voting='soft')

In [14]:
# Fit ensemble
ensemble.fit(X_train_resampled, y_train_resampled)

In [15]:
# Evaluate on test set
y_pred = ensemble.predict(X_test_scaled)
y_proba = ensemble.predict_proba(X_test_scaled)[:, 1]

In [16]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.62      0.70        50
         1.0       0.27      0.50      0.35        14

    accuracy                           0.59        64
   macro avg       0.54      0.56      0.53        64
weighted avg       0.70      0.59      0.63        64

Accuracy: 0.59375
ROC AUC: 0.6371428571428571


In [17]:
def predict_single_user(input_dict):
    user_df = pd.DataFrame([input_dict])

    # Feature Engineering
    user_df['annual_income'] = pd.to_numeric(user_df['annual_income'], errors='coerce').replace(0, np.nan)
    user_df['dti_ratio'] = user_df['outstanding_loans'] / user_df['annual_income']
    user_df['emi_income_ratio'] = user_df['outstanding_monthly'] / user_df['annual_income']
    user_df['home_equity'] = user_df['home_value'] - user_df['mortgage_due']

    # Data Cleaning
    user_df = user_df.apply(pd.to_numeric, errors='coerce')
    user_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    user_df.fillna(0, inplace=True)

    # One-hot encoding and column alignment
    user_df = pd.get_dummies(user_df)
    user_df = user_df.reindex(columns=column_order, fill_value=0)

    # Feature Scaling
    user_scaled = scaler.transform(user_df)

    # Prediction and Probability
    prediction = ensemble.predict(user_scaled)[0]
    probability = ensemble.predict_proba(user_scaled)[0, 1]

    # Risk Category Logic
    if probability < 0.45:
        risk_level = "Low Risk"
        recommendation = "Eligible for approval with minimal conditions."
    elif 0.45 <= probability <= 0.55:
        risk_level = "Moderate Risk"
        recommendation = "Needs additional checks or documentation."
    else:
        risk_level = "High Risk"
        recommendation = "High chance of default. Proceed with caution."

    # Output
    print("\n Single User Prediction")
    print(f"Loan Status Prediction: {'Default' if prediction == 1 else 'No Default'}")
    print(f"Probability of Default: {probability:.2f}")
    print(f"Risk Level: {risk_level}")
    print(f"Recommendation: {recommendation}")

In [18]:
sample_input = {
    'annual_income': 1200000,
    'outstanding_loans': 50000,
    'outstanding_monthly': 4000,
    'home_value': 9000000,
    'mortgage_due': 1500000,
    'has_phone': 1,
    'num_kids': 1,
    'employment_status': 1
}
predict_single_user(sample_input)


 Single User Prediction
Loan Status Prediction: No Default
Probability of Default: 0.11
Risk Level: Low Risk
Recommendation: Eligible for approval with minimal conditions.
