# Credit Limit — **Classification** (Colab)
Classifies customers into **Low/Med/High** credit-limit tiers (tertiles of `Credit_Limit`) using **Logistic Regression**, **Random Forest**, and **Gradient Boosting**.

**Train/Test split = 80% / 20%**. We report **both Train and Test** metrics.

## 0) Imports & Setup

In [None]:
import os, joblib, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_recall_fscore_support, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Create the output folders up front so the later savefig / to_csv /
# joblib.dump calls have somewhere to write. exist_ok=True makes re-running
# this cell harmless.
os.makedirs('figures', exist_ok=True)
os.makedirs('artifacts', exist_ok=True)

## 1) Load data & labels

In [None]:
# Location of the input CSV, relative to the current working directory.
csv_path = 'Credit_Prediction (3).csv'

# Parse the file into the DataFrame that the following cells work from.
df = pd.read_csv(filepath_or_buffer=csv_path)

# Keep the preview as the cell's last expression so the notebook renders it.
df.head(n=5)

### Fix: Drop fully-empty columns
We drop any column that is entirely missing (e.g., `Unnamed: 19`) to avoid imputation warnings, and keep only the first of any duplicate-named columns.

In [None]:
# Step 1: keep only columns that contain at least one non-missing value.
has_data = df.notna().any(axis=0).to_numpy()
df = df.loc[:, has_data]

# Step 2: when a column label occurs more than once, keep its first occurrence.
first_occurrence = ~df.columns.duplicated()
df = df.loc[:, first_occurrence]

Create tertile labels from `Credit_Limit` (three roughly balanced classes).

In [None]:
# Tier names in ascending order of Credit_Limit. Defined once and reused both
# for binning here and for ordering plots/metrics in later cells.
labels = np.array(['Low','Med','High'])

# A row without a target cannot be binned: qcut would emit a NaN label, which
# then breaks the stratified split and model fitting. Drop such rows before
# deriving X and y so the two stay row-aligned.
df_labeled = df.dropna(subset=['Credit_Limit'])

# Target: tertile of Credit_Limit (q=3 quantile bins), named Low/Med/High.
y = pd.qcut(df_labeled['Credit_Limit'], q=3, labels=labels.tolist())

# Features: every remaining column except the raw target itself.
# NOTE(review): if the CSV has columns computed from Credit_Limit they would
# leak the target into X — confirm against the dataset schema.
X = df_labeled.drop(columns=['Credit_Limit'])

## 2) Preprocessing & **80/20** Split

In [None]:
# Split the feature columns by dtype: numeric ones vs. everything else.
num = X.select_dtypes(include=[np.number]).columns.tolist()
numeric_names = set(num)
cat = [col for col in X.columns if col not in numeric_names]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
pre = ColumnTransformer(transformers=[
    ('num', numeric_branch, num),
    ('cat', categorical_branch, cat),
])

# 80/20 hold-out split, stratified on the tier label and seeded for
# reproducibility.
Xtr, Xte, ytr, yte = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)
print('Train size:', Xtr.shape, ' Test size:', Xte.shape)

## 3) Train & Compare (Train vs Test)

In [None]:
from sklearn.base import clone

# Candidate classifiers, keyed by the name shown in the comparison table.
models = {
    'LogisticRegression': LogisticRegression(max_iter=200),
    'RandomForestClassifier': RandomForestClassifier(
        n_estimators=300, max_depth=20, min_samples_split=5,
        min_samples_leaf=2, max_features='sqrt', random_state=42, n_jobs=-1),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
}

rows = []                      # one metrics dict per candidate
best_f1 = float('-inf')        # any real score beats this
best_name = best_pipe = best_pred = best_prob = None

for name, est in models.items():
    # Pipeline.fit fits its steps in place, so wrapping the shared `pre` (and
    # `est`) directly would make every pipeline hold the same mutable objects.
    # Cloning gives each candidate — and the pipeline saved later — its own
    # independent preprocessing and model state.
    pipe = Pipeline([('preprocess', clone(pre)), ('model', clone(est))]).fit(Xtr, ytr)

    # Test metrics
    pred_te = pipe.predict(Xte)
    # Class probabilities are optional: only collected when the model offers them.
    prob_te = pipe.predict_proba(Xte) if hasattr(pipe.named_steps['model'], 'predict_proba') else None
    acc_te = accuracy_score(yte, pred_te)
    f1m_te = f1_score(yte, pred_te, average='macro')

    # Train metrics (reported next to test metrics to expose over-fitting)
    pred_tr = pipe.predict(Xtr)
    acc_tr = accuracy_score(ytr, pred_tr)
    f1m_tr = f1_score(ytr, pred_tr, average='macro')

    rows.append({'model':name,
                 'Accuracy_train':acc_tr, 'F1_macro_train':f1m_tr,
                 'Accuracy_test':acc_te,  'F1_macro_test':f1m_te})

    # Keep the first candidate with the strictly highest test macro-F1.
    # NOTE(review): the winner is selected on the same test split it is later
    # reported on, so its test scores are optimistic; consider selecting via
    # cross-validation on the training split instead.
    if f1m_te > best_f1:
        best_f1, best_name, best_pipe, best_pred, best_prob = f1m_te, name, pipe, pred_te, prob_te

# Comparison table, best test macro-F1 first, rounded for display and export.
cls_compare = pd.DataFrame(rows).sort_values('F1_macro_test', ascending=False).round(4)
display(cls_compare)
cls_compare.to_csv('artifacts/model_compare_classification_train_test.csv', index=False)

## 4) Plots & Artifacts (Test)

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_fscore_support

# --- Confusion matrix of the winning model on the test split --------------
# `labels` fixes the row/column order of the matrix (Low, Med, High).
cm = confusion_matrix(yte, best_pred, labels=labels)
plt.figure()
plt.imshow(cm, aspect='auto')
plt.title(f'Confusion Matrix — {best_name} (Test)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(range(len(labels)), labels)
plt.yticks(range(len(labels)), labels)
# Write each count into its cell (x = predicted column, y = actual row).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, str(count), ha='center', va='center')
plt.tight_layout()
plt.savefig('figures/cls_confusion_matrix_test.png')
plt.show()

# --- One-vs-rest ROC curves (only when probabilities were collected) ------
if best_prob is not None:
    # predict_proba columns are ordered like the fitted classifier's
    # `classes_`, which is not guaranteed to match the Low/Med/High order of
    # `labels`. Look each class's probability column up by name so a curve is
    # never computed with another class's scores.
    fitted_classes = best_pipe.named_steps['model'].classes_
    prob_col = {cls: col for col, cls in enumerate(fitted_classes)}

    # Indicator matrix whose columns follow `labels`.
    Yb = label_binarize(yte, classes=labels)
    plt.figure(); aucs=[]
    for i, k in enumerate(labels):
        fpr, tpr, _ = roc_curve(Yb[:, i], best_prob[:, prob_col[k]])
        a = auc(fpr, tpr); aucs.append(a)
        plt.plot(fpr, tpr, label=f'{k} (AUC={a:.3f})')
    plt.plot([0,1],[0,1],'--')  # chance diagonal for reference
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(f'ROC (OvR) — {best_name} (Test)')
    plt.legend()
    plt.tight_layout()
    plt.savefig('figures/cls_roc_best_test.png')
    plt.show()

# --- Per-class precision / recall / F1 bars -------------------------------
# zero_division=0 reports 0 (instead of warning) for a class never predicted.
prec, rec, f1, sup = precision_recall_fscore_support(yte, best_pred, labels=labels, zero_division=0)
plt.figure(); x=np.arange(len(labels)); w=0.25
plt.bar(x-w, prec, w, label='Precision')
plt.bar(x, rec, w, label='Recall')
plt.bar(x+w, f1, w, label='F1')
plt.xticks(x, labels)
plt.ylabel('Score')
plt.title(f'Per-Class PRF — {best_name} (Test)')
plt.legend()
plt.tight_layout()
plt.savefig('figures/cls_prf_bars_test.png')
plt.show()

# --- Persist the fitted winning pipeline (preprocessing + model) ----------
joblib.dump(best_pipe, 'artifacts/best_classification_pipeline.joblib')

## 5) Final Summary (Train & Test of Winner)

In [None]:
# The comparison table is sorted best-first, so its top row is the winner.
# rename() returns a new frame, leaving cls_compare itself untouched.
summary = cls_compare.head(1).rename(columns={'model': 'BestModel'})
display(summary)

# Export the single-row summary alongside the other artifacts.
summary.to_csv('artifacts/final_classification_summary_train_test.csv', index=False)

## 6) **Quick Test — Predictions & Accuracy**
We preview predictions on a few test rows and print overall **test Accuracy & Macro-F1** for the selected model.

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Draw up to 10 distinct test-row positions with a fixed seed so the preview
# is reproducible between runs.
rng = np.random.RandomState(42)
k = min(10, Xte.shape[0])
idx = rng.choice(Xte.shape[0], size=k, replace=False)

sample_X = Xte.iloc[idx]
pred_demo = best_pipe.predict(sample_X)

# Probabilities are optional. Use the same capability check as the training
# loop rather than a blanket `except Exception`, so a genuine failure inside
# predict_proba is raised instead of being silently hidden.
if hasattr(best_pipe.named_steps['model'], 'predict_proba'):
    prob_demo = best_pipe.predict_proba(sample_X)
    conf = prob_demo.max(axis=1)  # probability of the predicted (top) class
else:
    prob_demo, conf = None, None

# Side-by-side view of true vs. predicted tier for the sampled rows.
demo = pd.DataFrame({'Actual': yte.iloc[idx].values, 'Predicted': pred_demo})
if conf is not None:
    demo['Confidence'] = conf
display(demo)

# Overall scores of the selected model on the full test split.
pred_test = best_pipe.predict(Xte)
acc = accuracy_score(yte, pred_test)
f1m = f1_score(yte, pred_test, average='macro')
print(f'Best Model: {best_name} | Test Accuracy={acc:.4f} | Macro-F1={f1m:.4f}')
