### 1: Imports and Load Final Assets

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import joblib

# --- Core Modeling Imports ---
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score

# --- Pipeline & Imbalance (Finding 4) ---
# We MUST use the pipeline from imblearn to correctly handle SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# --- Model Algorithm ---
# We'll use XGBoost as our powerful baseline, as defined in requirements.txt
from xgboost import XGBClassifier

# --- Setup ---
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and convergence warnings from the modeling libraries.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported for Phase 4.")

# --- Define File Paths ---
# The directories default to the original local checkout, but can be overridden
# with the CREDIT_RISK_PROCESSED_DIR / CREDIT_RISK_MODELS_DIR environment
# variables so the notebook can run on another machine without editing this cell.
FULL_PATH_TO_PROCESSED = os.environ.get(
    'CREDIT_RISK_PROCESSED_DIR',
    '/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed',
)
FULL_PATH_TO_MODELS = os.environ.get(
    'CREDIT_RISK_MODELS_DIR',
    '/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/models',
)

X_path = os.path.join(FULL_PATH_TO_PROCESSED, 'X_model_input.csv')
y_path = os.path.join(FULL_PATH_TO_PROCESSED, 'y_target.csv') # We load the original y
PREPROCESSOR_path = os.path.join(FULL_PATH_TO_MODELS, 'preprocessor.joblib')

print(f"Loading X data from: {X_path}")
print(f"Loading y data from: {y_path}")
print(f"Loading preprocessor from: {PREPROCESSOR_path}")

try:
    # Load the model-input feature table, indexed by PROSPECTID
    X = pd.read_csv(X_path, index_col='PROSPECTID')

    # Load the target; squeeze('columns') turns the single-column frame into a Series
    y = pd.read_csv(y_path, index_col='PROSPECTID').squeeze('columns')

    # Load our saved preprocessor from Phase 3.
    # NOTE(review): joblib.load unpickles arbitrary Python objects, so only
    # load artifacts that were produced by this project.
    preprocessor = joblib.load(PREPROCESSOR_path)

    print("\n--- Assets Loaded Successfully ---")
    print(f"X (model input) shape: {X.shape}")
    print(f"y (target) shape: {y.shape}")
    print("\nPreprocessor object:")
    print(preprocessor)

except FileNotFoundError as e:
    print("\n[ERROR] Files not found. Please ensure your paths are correct.")
    print(e)
    # Re-raise so execution stops here: every later cell needs X, y and
    # preprocessor, and would otherwise fail with a misleading NameError.
    raise

Libraries imported for Phase 4.
Loading X data from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/X_model_input.csv
Loading y data from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/y_target.csv
Loading preprocessor from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/models/preprocessor.joblib

--- Assets Loaded Successfully ---
X (model input) shape: (51336, 38)
y (target) shape: (51336,)

Preprocessor object:
ColumnTransformer(remainder='passthrough',
                  transformers=[('ordinal',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OrdinalEncoder(categories=[['OTHERS',
                                                                       

### 2: Encode Target Variable (y)

In [2]:
print("\n--- Encoding Target Variable (y) ---")

# Fit a LabelEncoder on the string target and convert it to integer codes.
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

# Report the label <-> code correspondence so predictions can be decoded later.
print("Target variable encoded.")
original_classes = target_encoder.classes_
encoded_classes = np.unique(y_encoded)
print(f"Original classes: {original_classes}")
print(f"Encoded classes: {encoded_classes}")
print(f"Mapping: {dict(zip(original_classes, encoded_classes))}")

# Persist the fitted encoder next to the other model artifacts; downstream
# consumers need it to turn integer predictions back into the original labels.
ENCODER_SAVE_PATH = os.path.join(FULL_PATH_TO_MODELS, 'target_encoder.joblib')
joblib.dump(target_encoder, ENCODER_SAVE_PATH)
print(f"\nTarget encoder saved to: {ENCODER_SAVE_PATH}")


--- Encoding Target Variable (y) ---
Target variable encoded.
Original classes: ['P1' 'P2' 'P3' 'P4']
Encoded classes: [0 1 2 3]
Mapping: {'P1': 0, 'P2': 1, 'P3': 2, 'P4': 3}

Target encoder saved to: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/models/target_encoder.joblib


### 3: Train-Test Split (Finding 4)

In [3]:
print("\n--- Splitting Data (Stratified) ---")

# We split the feature table X and the integer-encoded y (0,1,2,3).
# stratify=y_encoded keeps each class at the same proportion in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded  # Per Finding 4
)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Verify stratification worked.
# minlength keeps both count vectors the same length even if a class were
# missing from one of the splits, so they can be compared element-wise.
n_classes = len(target_encoder.classes_)
train_counts = np.bincount(y_train, minlength=n_classes)
test_counts = np.bincount(y_test, minlength=n_classes)
train_dist = train_counts / len(y_train)
test_dist = test_counts / len(y_test)

print(f"\nTrain set distribution (P1, P2, P3, P4): {train_dist}")
print(f"Test set distribution (P1, P2, P3, P4): {test_dist}")

# Actually check the claim before printing the confirmation: class shares in
# the two splits must agree to within one percentage point.
assert np.allclose(train_dist, test_dist, atol=0.01), (
    "Train/test class distributions differ more than expected for a stratified split"
)
print("=> CONFIRMED: Distributions are nearly identical.")


--- Splitting Data (Stratified) ---
X_train shape: (41068, 38), y_train shape: (41068,)
X_test shape: (10268, 38), y_test shape: (10268,)

Train set distribution (P1, P2, P3, P4): [0.11303204 0.62722801 0.14514951 0.11459044]
Test set distribution (P1, P2, P3, P4): [0.11306973 0.62719127 0.14520841 0.11453058]
=> CONFIRMED: Distributions are nearly identical.


### 4: Build Full Modeling Pipeline (Finding 4)

In [4]:
print("\n--- Building Full Modeling Pipeline (with SMOTE) ---")

# --- 1. Define SMOTE (Finding 4) ---
# k_neighbors=4 is the number of nearest minority-class neighbours used to
# synthesise each new sample. random_state=42 ensures reproducibility.
# NOTE(review): SMOTE interpolates between neighbours, so ordinal-encoded
# categorical columns can receive fractional values — confirm this is acceptable.
smote = SMOTE(random_state=42, k_neighbors=4)

# --- 2. Define Model ---
# We use XGBoost as our powerful baseline.
# objective='multi:softprob' is the multiclass objective XGBClassifier selects
# by default: the booster outputs per-class probabilities, predict() still
# returns the most probable class, and predict_proba() is usable by
# downstream consumers. Training is the same as with 'multi:softmax'.
# num_class=4 explicitly tells it we have 4 target classes (0,1,2,3).
# Categorical encoding is left to the preprocessor step, so XGBoost's own
# enable_categorical option is intentionally not used.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1  # Use all available CPU cores
)

# --- 3. Create the Full Pipeline ---
# The imblearn Pipeline (not sklearn's) is required because it accepts a
# sampler step: SMOTE resamples during fit only and is skipped at predict time.
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('model', xgb_model)
])

print("Full model pipeline created successfully:")
print(model_pipeline)


--- Building Full Modeling Pipeline (with SMOTE) ---
Full model pipeline created successfully:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinal',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OrdinalEncoder(categories=[['OTHERS',
                                                                                               'SSC',
                                                                                               '12TH',
                                                                                               'UNDER '
                                                                       

### 5: Cross-Validate the Pipeline

In [5]:
print("\n--- Running 5-Fold Stratified Cross-Validation ---")
print("This may take a few minutes...")

# Shuffled, stratified 5-fold splitter; the fixed seed keeps the fold
# assignment repeatable between runs.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score the complete pipeline with macro-averaged F1. The raw training split is
# passed in, so preprocessing and SMOTE are re-fitted inside every fold.
cv_scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=cv_strategy,
    n_jobs=-1,  # run the folds in parallel on all available cores
)

mean_f1_macro = cv_scores.mean()
std_f1_macro = cv_scores.std()

print("\n--- Cross-Validation Results ---")
print(f"F1 Macro Scores for each fold: {cv_scores}")
print(f"Mean F1 Macro Score: {mean_f1_macro:.4f}")
print(f"Std Dev of F1 Macro Score: {std_f1_macro:.4f}")

print("\n=> This 'Mean F1 Macro Score' is our robust baseline for model performance.")


--- Running 5-Fold Stratified Cross-Validation ---
This may take a few minutes...

--- Cross-Validation Results ---
F1 Macro Scores for each fold: [0.59445626 0.59295003 0.58830453 0.59718419 0.59341853]
Mean F1 Macro Score: 0.5933
Std Dev of F1 Macro Score: 0.0029

=> This 'Mean F1 Macro Score' is our robust baseline for model performance.


### 6: Train Final Model & Evaluate on Test Set

In [7]:
print("\n--- Training Final Model on Full Train Set ---")
# Fitting runs every pipeline step in order on the training split:
# preprocessor -> SMOTE resampling -> XGBoost model.
model_pipeline.fit(X_train, y_train)
print("Final model trained successfully.")

# Predict integer class codes for the held-out test split.
y_pred = model_pipeline.predict(X_test)

# Map the integer codes (0,1,2,3) back to the original labels (P1..P4) with
# the fitted target encoder so the report is human-readable.
y_test_labels, y_pred_labels = (
    target_encoder.inverse_transform(codes) for codes in (y_test, y_pred)
)

print("\n--- Final Test Set Evaluation Report ---")

# `labels` fixes the row order of the report to the encoder's class order;
# zero_division=0 reports 0 instead of warning when a class is never predicted.
report = classification_report(
    y_true=y_test_labels,
    y_pred=y_pred_labels,
    labels=target_encoder.classes_,
    zero_division=0,
)

print(report)


--- Training Final Model on Full Train Set ---
Final model trained successfully.

--- Final Test Set Evaluation Report ---
              precision    recall  f1-score   support

          P1       0.68      0.67      0.67      1161
          P2       0.78      0.89      0.83      6440
          P3       0.40      0.19      0.25      1491
          P4       0.67      0.59      0.63      1176

    accuracy                           0.73     10268
   macro avg       0.63      0.59      0.60     10268
weighted avg       0.70      0.73      0.71     10268



### 7: Save the Final Baseline Model

In [8]:
print("\n--- Saving Final Baseline Model ---")

# --- 1. Define Save Path ---
# joblib is already imported in the notebook's first (imports) cell.
MODEL_SAVE_PATH = os.path.join(FULL_PATH_TO_MODELS, 'baseline_model.joblib')

# --- 2. Save the Full Pipeline ---
# Persist the fitted pipeline object (preprocessor + SMOTE + XGBoost) as a
# single artifact, so it can be reloaded and used for prediction directly.
joblib.dump(model_pipeline, MODEL_SAVE_PATH)

print(f"Final baseline model pipeline (with SMOTE+XGB) saved to: {MODEL_SAVE_PATH}")
print("\n--- Phase 4 is 100% complete and all assets are saved. ---")


--- Saving Final Baseline Model ---
Final baseline model pipeline (with SMOTE+XGB) saved to: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/models/baseline_model.joblib

--- Phase 4 is 100% complete and all assets are saved. ---


## Phase 4: Modeling Pipeline & Baseline

Our main goal in this phase was to build our first complete, end-to-end model pipeline, integrating our `preprocessor` from Phase 3 with a baseline `XGBClassifier` model. This is where we implemented our core strategy for **Finding 4 (Class Imbalance)**.

### 1. Key Actions Performed

* **Asset Loading:** We loaded our final 38-feature dataset (`X_model_input.csv`) and our saved `preprocessor.joblib`.
* **Target Encoding:** We converted our target variable `y` from strings ('P1', 'P2', 'P3', 'P4') into numerical labels (0, 1, 2, 3) using `LabelEncoder`. We **saved this encoder** (`target_encoder.joblib`) so our future Streamlit app can decode the model's predictions.
* **Stratified Split (Finding 4):** We performed our `train_test_split` using `stratify=y_encoded`. This was a non-negotiable step to ensure each class keeps the same proportion in both the train and test sets as it has in the full, imbalanced dataset.
* **`ImbPipeline` with `SMOTE` (Finding 4):** We used the `imbalanced-learn` `Pipeline` object to chain our three critical steps:
    1.  **`preprocessor`**: Applies all our scaling/encoding rules.
    2.  **`SMOTE`**: Resamples the data to fix the class imbalance. Using it inside the pipeline ensures it *only* runs on training data, preventing data leakage.
    3.  **`model`**: A baseline `XGBClassifier` to make the predictions.

### 2. Key Results & Evaluation

* **Robust Baseline Score:** Our 5-fold cross-validation on the training data yielded a very stable and reliable **`f1-macro` score of 0.5933** (with a tiny standard deviation of 0.0029).
* **Final Test Report (Qualified Success):**
    * The pipeline performed consistently on the unseen test set, achieving an **`f1-macro` of 0.60**.
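    * **Success (Finding 4):** The model learned to identify the crucial minority classes, achieving good F1-scores for **P1 (0.67)** and **P4 (0.63)**. This is consistent with our `SMOTE` strategy working, although we did not train a comparison model without `SMOTE` in this phase, so its exact contribution has not yet been measured.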
    * **New Challenge:** The report clearly identified our new primary challenge: the model struggles to distinguish **P3 (subprime)**, which had a very low F1-score of **0.25** (precision 0.40, recall 0.19), meaning most true P3 cases are being assigned to other classes.

### 3. Final Output

* **Final Baseline Model:** We saved our fully trained baseline pipeline (preprocessor + SMOTE + XGB model) as `baseline_model.joblib`. This model is our benchmark to beat in Phase 5.