In [5]:

# Load dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load data
df = pd.read_csv("/content/credit_risk_dataset.csv")
print(df.head())
# Handle categorical variables
cat_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Fill missing values
df = df.fillna(df.median())

# Features and target
X = df.drop('loan_status', axis=1)
y = df['loan_status']


   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y                           3  


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [7]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict probabilities
y_probs = model.predict_proba(X_test)[:, 1]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
import numpy as np

cost_fp = 500
cost_fn = 5000

thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0
min_cost = float('inf')

for t in thresholds:
    y_pred = (y_probs >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    total_cost = (fp * cost_fp) + (fn * cost_fn)
    if total_cost < min_cost:
        min_cost = total_cost
        best_threshold = t

print("Best Threshold:", best_threshold)
print("Minimum Business Cost:", min_cost)


Best Threshold: 0.14
Minimum Business Cost: 2978000


In [9]:
y_final = (y_probs >= best_threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, y_final).ravel()
print("Confusion Matrix:")
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")


Confusion Matrix:
TN=4447, FP=3166, FN=279, TP=1883
