In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import pickle
import os

In [None]:
# Read the scored survival dataset from disk (adjust the relative path if your
# data lives elsewhere).
df = pd.read_csv(filepath_or_buffer="data/scored_survival_dataset.csv")

In [None]:
# Risk-band cut-offs taken from the distribution of survival probabilities:
# bottom quarter -> High risk, middle half -> Medium, top quarter -> Low.
survival_scores = df["survival_probability"]
q1 = survival_scores.quantile(q=0.25)
q3 = survival_scores.quantile(q=0.75)

def label_credit_risk_quantile(p, lower_threshold=None, upper_threshold=None):
    """Map a survival probability to a credit-risk label.

    Parameters
    ----------
    p : float
        Survival probability to classify.
    lower_threshold : float, optional
        Probabilities at or below this cut-off are labelled "High".
        Defaults to the notebook-level ``q1``.
    upper_threshold : float, optional
        Probabilities above ``lower_threshold`` and at or below this cut-off
        are labelled "Medium"; anything larger is "Low".
        Defaults to the notebook-level ``q3``.

    Returns
    -------
    str
        One of "High", "Medium" or "Low".

    Raises
    ------
    ValueError
        If ``p`` is missing (NaN / None / NA).
    """
    # NaN fails every ``<=`` comparison, so without this guard a missing
    # probability would fall through to the final branch and be labelled
    # "Low" risk. Fail loudly instead of fabricating a label.
    if pd.isna(p):
        raise ValueError(
            "survival probability is missing; cannot assign a credit-risk label"
        )

    # Resolve the defaults at call time so the function keeps following the
    # notebook-level quantiles when called with a single argument.
    if lower_threshold is None:
        lower_threshold = q1
    if upper_threshold is None:
        upper_threshold = q3

    if p <= lower_threshold:
        return "High"
    if p <= upper_threshold:
        return "Medium"
    return "Low"

# Derive the risk label for every row and store it as a new column.
risk_labels = df["survival_probability"].apply(label_credit_risk_quantile)
df["Credit_Risk"] = risk_labels

In [None]:
# Target is the derived risk label; features are every remaining column except
# the label itself and the survival probability it was computed from.
y = df["Credit_Risk"]
X = df.drop(["Credit_Risk", "survival_probability"], axis=1)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

In [None]:
# Identify the categorical columns that must be one-hot encoded.
# Selecting only ``object`` would skip columns stored with pandas' dedicated
# string dtype or as ``category``; those would then reach the classifier
# un-encoded via the transformer's ``remainder="passthrough"``.
cat_cols = X.select_dtypes(include=["object", "string", "category"]).columns.tolist()

In [None]:
# Feature preprocessing: one-hot encode the columns listed in cat_cols
# (categories unseen during fitting are ignored rather than raising) and pass
# every other column through unchanged.
preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
    remainder="passthrough",
)

In [None]:
# Chain preprocessing and a 100-tree random forest (fixed seed) into a single
# estimator so the same transformations are applied at fit and predict time.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42, n_estimators=100)),
    ]
)

# Train on the training split
clf.fit(X_train, y_train)

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) as a pickle file,
# creating the output directory first if it does not exist yet.
os.makedirs("models", exist_ok=True)

with open("models/classification_model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Score the held-out test split and print the confusion matrix followed by the
# per-class precision / recall / F1 report.
y_pred = clf.predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))