In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

In [2]:
# Read the team-generated credit risk dataset from disk.
df = pd.read_csv('./data/credit_risk_dataset.csv')

# Exclude the free-text statement column from the modelling frame: the model
# consumes numeric/categorical values only. Per the original author's note,
# the 'sentiment' and 'risk_category' columns were already derived from this
# text (via GenAI) and are used in its place.
# `df` itself is left untouched so the raw data stays available.
df_clean = df.drop('customer_financial_statement', axis=1)

In [3]:
# Columns that hold text labels and must be converted to integer codes.
categorical_cols = [
    'employment_type', 'education_level', 'marital_status',
    'region', 'sentiment', 'financial_stress_level', 'risk_category'
]

# LabelEncoder assigns codes by the *sorted* order of the distinct labels
# (alphabetical for strings), NOT by severity: e.g. "High Risk" sorts before
# "Low Risk" and therefore receives the smaller code. Always consult
# `mapping_dict` to see which number stands for which label.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent — consider switching.
le = LabelEncoder()
mapping_dict = {}  # column name -> {original label: integer code}

for col in categorical_cols:
    # Fit on the raw column from `df` rather than on `df_clean[col]`.
    # `df_clean[col]` is overwritten below, so reading from it would make a
    # second run of this cell encode the already-encoded integers and replace
    # the label mapping with a meaningless int -> int one. `df` is never
    # mutated, which keeps this cell safe to re-run.
    df_clean[col] = le.fit_transform(df[col])
    # classes_ is sorted and a label's code is its position in classes_.
    mapping_dict[col] = {label: code for code, label in enumerate(le.classes_)}

print("Data encoded successfully.")

# Separate the target column (y) from the remaining feature columns (X).
y = df_clean['default_risk']
X = df_clean.drop(columns=['default_risk'])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")

Data encoded successfully.
Training Features Shape: (1200, 15)
Testing Features Shape: (300, 15)
