# Credit Limit — **Regression** (Colab)
Compares **Linear Regression**, **Random Forest Regressor**, and **Gradient Boosting Regressor**.

**Train/Test split = 80% / 20%**. We report **both Train and Test** metrics for each model.

## 0) Imports & Setup

In [None]:
import os, joblib, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Make sure the output folders for plots and saved artifacts exist.
for out_dir in ('figures', 'artifacts'):
    os.makedirs(out_dir, exist_ok=True)

## 1) Load data

In [None]:
# Colab alternative (disabled): upload the CSV interactively and use the
# uploaded file's name as the path.
# from google.colab import files
# uploaded = files.upload(); csv_path = list(uploaded.keys())[0]
# Path of the CSV file read below.
csv_path = 'Credit_Prediction (3).csv'
df = pd.read_csv(csv_path)
# Trailing expression: shows the first rows as the cell output.
df.head()

### Fix: Drop fully-empty columns
We drop any column that is entirely missing (e.g., `Unnamed: 19`) to avoid imputation warnings, and we keep only the first of any columns that share the same name.

In [None]:
# Remove every column whose values are all missing.
df = df.dropna(axis='columns', how='all')
# When several columns share a name, keep only the first occurrence.
df = df.loc[:, ~df.columns.duplicated(keep='first')]

## 2) Quick EDA

In [None]:
# Dataset dimensions as a one-row table.
n_rows, n_cols = df.shape
display(pd.DataFrame({'rows': [n_rows], 'columns': [n_cols]}))

# Names of all numeric columns (used for the correlation table below).
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Histogram of the regression target, saved to disk and shown inline.
plt.figure()
df['Credit_Limit'].hist(bins=30)
plt.title('Distribution of Credit_Limit')
plt.tight_layout()
plt.savefig('figures/target_hist.png')
plt.show()

# Correlation of each numeric column with the target, ordered by absolute
# value (strongest first); the target's self-correlation is removed.
target_corr = df[num_cols].corr(numeric_only=True)['Credit_Limit']
corr = target_corr.drop('Credit_Limit').sort_values(key=lambda s: s.abs(), ascending=False)
display(corr.head(12).to_frame('corr_with_target'))

## 3) Preprocessing & **80/20** Split

In [None]:
# Separate the target from the feature columns.
y = df['Credit_Limit']
X = df.drop(columns=['Credit_Limit'])

# Numeric features are detected by dtype; every remaining column is
# handled as categorical.
num = X.select_dtypes(include=[np.number]).columns.tolist()
cat = [c for c in X.columns if c not in num]

# Numeric branch: median imputation, then standardisation.
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: most-frequent imputation, then one-hot encoding
# configured with handle_unknown='ignore'.
categorical_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
pre = ColumnTransformer([
    ('num', numeric_branch, num),
    ('cat', categorical_branch, cat),
])

# 80/20 train/test split with a fixed seed.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.20, random_state=42)
print('Train size:', Xtr.shape, ' Test size:', Xte.shape)

## 4) Train & Compare (Train vs Test)

In [None]:
# Local import so this cell stays self-contained: clone() gives every pipeline
# its own unfitted copy of the shared preprocessor definition.
from sklearn.base import clone

# Candidate regressors, keyed by the name shown in the comparison table.
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(
        n_estimators=300, max_depth=20, min_samples_split=5, min_samples_leaf=2,
        max_features='sqrt', random_state=42, n_jobs=-1),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
}

# NOTE(review): the winner is chosen by its test-set R², so the test metrics
# reported for it are optimistically biased. Consider selecting on a
# validation split or with cross-validation instead.
rows = []
best_r2 = -np.inf  # any finite R² beats this sentinel
best_name = best_pipe = best_pred = None
for name, est in models.items():
    # Pipeline does not copy its steps; without clone() all three pipelines
    # (and the one persisted later) would share and re-fit the same `pre` object.
    pipe = Pipeline([('preprocess', clone(pre)), ('model', est)]).fit(Xtr, ytr)

    # Test predictions & metrics
    pred_te = pipe.predict(Xte)
    r2_te = r2_score(yte, pred_te)
    mae_te = mean_absolute_error(yte, pred_te)
    rmse_te = float(np.sqrt(mean_squared_error(yte, pred_te)))

    # Train predictions & metrics (to gauge over/underfitting)
    pred_tr = pipe.predict(Xtr)
    r2_tr = r2_score(ytr, pred_tr)
    mae_tr = mean_absolute_error(ytr, pred_tr)
    rmse_tr = float(np.sqrt(mean_squared_error(ytr, pred_tr)))

    rows.append({'model': name,
                 'R2_train': r2_tr, 'MAE_train': mae_tr, 'RMSE_train': rmse_tr,
                 'R2_test': r2_te, 'MAE_test': mae_te, 'RMSE_test': rmse_te})

    # Remember the pipeline with the highest test R² together with its test predictions.
    if r2_te > best_r2:
        best_r2, best_name, best_pipe, best_pred = r2_te, name, pipe, pred_te

# Comparison table, best test R² first, rounded to 4 decimals; also saved as CSV.
reg_compare = pd.DataFrame(rows).sort_values('R2_test', ascending=False).round(4)
display(reg_compare)
reg_compare.to_csv('artifacts/model_compare_regression_train_test.csv', index=False)

## 5) Diagnostics & Artifacts (Test)

In [None]:
# --- Predicted vs actual scatter on the test set ---
plt.figure()
plt.scatter(yte, best_pred, s=8)
# Diagonal y = x spanning the combined range of actual and predicted values.
lo = float(min(yte.min(), best_pred.min()))
hi = float(max(yte.max(), best_pred.max()))
plt.plot([lo, hi], [lo, hi])
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'Pred vs Actual — {best_name} (Test)')
plt.tight_layout()
plt.savefig('figures/reg_pred_vs_actual_test.png')
plt.show()

# --- Histogram of test residuals (actual minus predicted) ---
resid = yte - best_pred
plt.figure()
plt.hist(resid, bins=40)
plt.xlabel('Residual')
plt.ylabel('Count')
plt.title('Residuals — Best Regression (Test)')
plt.tight_layout()
plt.savefig('figures/reg_residuals_test.png')
plt.show()

# --- Calibration: mean actual vs mean predicted per predicted-value decile ---
# duplicates='drop' merges repeated bin edges, so fewer than 10 bins may result.
dec = pd.qcut(pd.Series(best_pred, index=yte.index), 10, labels=False, duplicates='drop')
cal = pd.DataFrame({'decile': dec, 'actual': yte, 'pred': best_pred}).groupby('decile').mean()
plt.figure()
# Label both curves so the two lines can be told apart in the saved figure.
plt.plot(cal.index, cal['actual'], label='Actual')
plt.plot(cal.index, cal['pred'], label='Predicted')
plt.xticks(cal.index)
plt.xlabel('Predicted Decile')
plt.ylabel('Mean Credit_Limit')
plt.title('Calibration by Decile — Test')
plt.legend()
plt.tight_layout()
plt.savefig('figures/reg_calibration_test.png')
plt.show()

# Persist the fitted best pipeline (preprocessing + model).
joblib.dump(best_pipe, 'artifacts/best_regression_pipeline.joblib')

## 6) Final Summary (Train & Test of Winner)

In [None]:
# First row of the comparison table, with the 'model' column relabelled
# 'BestModel'; displayed and written to CSV.
summary = reg_compare.head(1).rename(columns={'model': 'BestModel'})
display(summary)
summary.to_csv('artifacts/final_regression_summary_train_test.csv', index=False)

## 7) **Quick Test — Predictions & Accuracy**
Below we:
1) show a small sample of **Actual vs Predicted** on the test set, and
2) print the final **test metrics** (R², MAE, RMSE) for the selected model.

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Readable preview: up to 10 test rows drawn without replacement with a fixed seed.
rng = np.random.RandomState(42)
k = min(10, Xte.shape[0])
idx = rng.choice(Xte.shape[0], size=k, replace=False)
pred_demo = best_pipe.predict(Xte.iloc[idx])
demo = pd.DataFrame({
    'Actual': yte.iloc[idx].values,
    'Predicted': pred_demo,
})
display(demo.round(2))

# Test-set metrics of the selected pipeline, computed over the full test split.
pred_test = best_pipe.predict(Xte)
r2 = r2_score(yte, pred_test)
mae = mean_absolute_error(yte, pred_test)
mse_test = mean_squared_error(yte, pred_test)
rmse = float(np.sqrt(mse_test))
print(f'Best Model: {best_name} | Test R²={r2:.4f} | MAE={mae:.2f} | RMSE={rmse:.2f}')
