In [2]:
#Import necessary libraries
import pandas as pd
from pathlib import Path
import scipy.stats as st
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Show every column when a DataFrame is rendered, then read the cleaned
# training data and preview its first rows.
pd.set_option("display.max_columns", None)

csv_path = Path("../Resources/train_cleaned.csv")
df = pd.read_csv(filepath_or_buffer=csv_path)

df.head()

Unnamed: 0,ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,Credit_Score,Credit_History_Age_Months
0,0x1608,19114.12,1824.843333,3,4,3,4,3,8,4.0,Good,809.98,22.537593,No,49.574949,178.344067,244.565317,Good,271
1,0x160f,34847.84,3037.986667,2,4,6,1,7,1,2.0,Good,605.03,38.550848,No,18.816215,40.391238,484.591214,Good,320
2,0x1612,34847.84,3037.986667,2,4,6,1,3,1,2.0,Good,605.03,34.977895,No,18.816215,130.11542,444.867032,Good,323
3,0x1613,34847.84,3037.986667,2,4,6,1,3,0,2.0,Good,605.03,33.38101,No,18.816215,43.47719,481.505262,Good,324
4,0x1615,34847.84,3037.986667,2,4,6,1,3,4,2.0,Good,605.03,32.933856,No,18.816215,218.904344,356.078109,Good,326


In [5]:
# Relative frequency of each Credit_Score class (shows how imbalanced the target is)
df.loc[:, "Credit_Score"].value_counts(normalize=True)

Credit_Score
Standard    0.528165
Poor        0.311190
Good        0.160645
Name: proportion, dtype: float64

In [6]:
# Step 1: Encode the target variable (string class labels -> integer codes)
# NOTE: this overwrites df['Credit_Score'] in place. Re-running this cell
# without first re-running the load cell would fit the encoder on the integer
# codes, so le.classes_ would no longer hold the original label names.
le = LabelEncoder()
df['Credit_Score'] = le.fit_transform(df['Credit_Score'])

# Step 2: Define features (X) and target (y)
# The target and the record ID are removed, as are Credit_Mix and
# Payment_of_Min_Amount (string-valued in the preview, so they cannot go
# through StandardScaler as-is).
# 'Unnamed: 0' looks like a row counter written into the CSV (0, 1, 2, ... in
# the preview) rather than a real attribute, so it must not be used as a
# feature. errors='ignore' keeps the cell working if the column is absent,
# e.g. when the CSV is read with index_col=0.
X = (
    df.drop(columns=['Credit_Score', 'ID', 'Credit_Mix', 'Payment_of_Min_Amount'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df['Credit_Score']

# Step 3: Split the data into training and testing sets
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Standardize the numerical features
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train the multinomial logistic regression model
# class_weight='balanced' compensates for the imbalanced classes.
# multi_class is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial model
# when the target has more than two classes.
model = LogisticRegression(
    class_weight='balanced',
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Step 6: Make predictions
y_pred = model.predict(X_test_scaled)

# Step 7: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# Rows are true classes, columns are predicted classes, in encoded label order.
conf_matrix = confusion_matrix(y_test, y_pred)
# le.classes_ maps the integer codes back to the original label names.
class_report = classification_report(y_test, y_pred, target_names=le.classes_)

# Display results
print(accuracy)
print(conf_matrix)
print(class_report)



0.5850773430391265
[[ 879   32  148]
 [ 278 1480  294]
 [1102  882 1499]]
              precision    recall  f1-score   support

        Good       0.39      0.83      0.53      1059
        Poor       0.62      0.72      0.67      2052
    Standard       0.77      0.43      0.55      3483

    accuracy                           0.59      6594
   macro avg       0.59      0.66      0.58      6594
weighted avg       0.66      0.59      0.58      6594

