In [None]:
"""
Python script for training a model version
"""
# Core
import os
import json
import logging
import pickle
import sys

# Third-party
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_score
from sklearn.calibration import calibration_curve
from sklearn import metrics
import utils.credit as utils

# Bedrock
import bdrk
from bedrock_client.bedrock.analyzer.model_analyzer import ModelAnalyzer
from bedrock_client.bedrock.analyzer import ModelTypes
from bedrock_client.bedrock.api import BedrockApi
from bedrock_client.bedrock.metrics.service import ModelMonitoringService

In [None]:
# Run configuration for Bedrock.
# On the platform these values are usually declared in the Bedrock HCL file.
DATA_DIR_LOCAL = "data/creditdata"  # folder holding the train/test CSV files
SEED = 3                            # seed handed to the training helper
TH = 0.5                            # probability cut-off used for hard predictions
LR_REGULARIZER = 0.1                # only referenced by the commented-out logistic regression option
RF_N_ESTIMATORS = 100
CB_ITERATIONS = 100                 # passed as `iterations` to the CatBoost training helper

# Snapshot of the settings above, logged as the run's parameters.
ENV_PARAMS = dict(
    DATA_DIR_LOCAL=DATA_DIR_LOCAL,
    SEED=SEED,
    TH=TH,
    LR_REGULARIZER=LR_REGULARIZER,
    RF_N_ESTIMATORS=RF_N_ESTIMATORS,
    CB_ITERATIONS=CB_ITERATIONS,
)

# Artefact locations
OUTPUT_MODEL_PATH = "/artefact/model.pkl"
FEATURE_COLS_PATH = "/artefact/feature_cols.pkl"

In [None]:
# Identifiers handed to the Bedrock client when the run is logged from this notebook.
# They are not required when the training pipeline is run via the UI.
ENVIRONMENT_ID = "sandbox-aws-production"
PIPELINE_ID = "rest-credit-scoring"
PROJECT_ID = "victor-sandbox"

# Step 1: Train the model

In [None]:
# Columns left out of the feature set (as might be determined through feature selection)
drop_cols = ["ID"]

# --- Data ETL ---
# Load the train and test CSVs into pandas dataframes
#   x_<name> : features
#   y_<name> : labels
train_csv = os.path.join(DATA_DIR_LOCAL, "creditdata_train_v2.csv")
x_train, y_train = utils.load_dataset(train_csv, drop_columns=drop_cols)
test_csv = os.path.join(DATA_DIR_LOCAL, "creditdata_test_v2.csv")
x_test, y_test = utils.load_dataset(test_csv, drop_columns=drop_cols)


# --- Candidate Binary Classification Algos ---
# Exactly one of the three options below should be active at a time.

# MODEL 1: LOGISTIC REGRESSION
# Use best parameters from a model selection and threshold tuning process
# model = utils.train_log_reg_model(x_train, y_train, seed=SEED, C=LR_REGULARIZER, upsample=True, verbose=True)
# model_name = "logreg_model"
# model_type = ModelTypes.LINEAR

# MODEL 2: RANDOM FOREST
# Uses default threshold of 0.5 and model parameters
# model = utils.train_rf_model(x_train, y_train, seed=SEED, upsample=True, verbose=True)
# model_name = "randomforest_model"
# model_type = ModelTypes.TREE

# MODEL 3: CATBOOST (active)
# Uses default threshold of 0.5 and model parameters
# NOTE(review): the test split is passed as the evaluation set during training and
# is also what the reported metrics are computed on -- confirm this is intended.
model = utils.train_catboost_model(
    x_train,
    y_train,
    seed=SEED,
    upsample=True,
    verbose=True,
    iterations=CB_ITERATIONS,
    eval_data=x_test,
    eval_labels=y_test,
)
model_name = "catboost_model"
model_type = ModelTypes.TREE

# Step 2: Logging Metrics via Bedrock Client Library
- Logging through the client library bypasses the need to write Bedrock HCL or to run Git-linked training pipelines
- The logged metrics will appear as a "pipeline" in the "Training" tab of the Bedrock workspace

In [None]:
# Logging for the Bedrock client: INFO-level records are written to stdout.
# Adjust this cell to customise the logging behaviour.
_logger = logging.getLogger(bdrk.utils.vars.Constants.MAIN_LOG)
_logger.setLevel(logging.INFO)
if len(_logger.handlers) == 0:
    # Attach a handler only when none is registered yet, so that re-running
    # this cell does not add a second one.
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    _logger.addHandler(stdout_handler)

# A personal access token can be obtained from https://bedrock.basis-ai.com/setting/token/
# The final cell of this notebook reads it from the environment:
# os.environ["BEDROCK_API_TOKEN"]

In [None]:
def _validation_evals(model):
    """Return the per-iteration "validation" metrics recorded on `model`, or None.

    Looks for an `evals_result_` attribute first on `model[1]` (the element the
    notebook originally read it from) and then on `model` itself. Returns None
    when neither exposes a "validation" entry, e.g. for estimators that do not
    record per-iteration evaluation results.
    """
    candidates = []
    try:
        candidates.append(model[1])
    except (TypeError, IndexError, KeyError):
        # `model` is not indexable, or has no element at position 1
        pass
    candidates.append(model)

    for candidate in candidates:
        evals = getattr(candidate, "evals_result_", None)
        if evals and "validation" in evals:
            return evals["validation"]
    return None


def compute_log_notebook_metrics(model, x_train, 
                                x_test, y_test, 
                                best_th=0.5,
                                model_name="tree_model", 
                                model_type=ModelTypes.TREE,
                                access_token=None,
                                project_id=None,
                                pipeline_id=None,
                                environment_id=None,
                                env_params=None):
    """Evaluate `model` on the test set and log the results as a Bedrock run.

    Inside a single `bdrk.start_run(...)` context this function:
      1. logs `env_params` as the run's parameters (skipped when None),
      2. scores `x_test`, thresholds the positive-class probability at
         `best_th` and prints accuracy / precision / recall / F1 / ROC AUC /
         average precision,
      3. logs per-iteration validation metrics when the model exposes them,
      4. logs the binary-classifier charts and the summary metrics,
      5. pickles the model to `OUTPUT_MODEL_PATH` and logs that file.

    Args:
        model: Fitted classifier exposing `predict_proba`; column 1 of its
            output is used as the positive-class probability.
        x_train: Training features. Currently unused; kept so existing
            callers keep working.
        x_test: Test features passed to `model.predict_proba`.
        y_test: Test labels; must support `.astype(int).tolist()`.
        best_th: Probabilities strictly greater than this are predicted as 1.
        model_name: Currently unused; kept for interface compatibility.
        model_type: Currently unused; kept for interface compatibility.
        access_token: Forwarded to `bdrk.init`.
        project_id: Forwarded to `bdrk.init`.
        pipeline_id: Forwarded to `bdrk.start_run`.
        environment_id: Forwarded to `bdrk.start_run`.
        env_params: Dict of parameters to log with the run, or None.

    Returns:
        None. Results are printed, logged through `bdrk` and written to
        `OUTPUT_MODEL_PATH`.
    """
    bdrk.init(access_token=access_token, project_id=project_id)
    with bdrk.start_run(pipeline_id=pipeline_id, environment_id=environment_id):

        # Nothing to log when the caller did not supply parameters
        if env_params is not None:
            bdrk.log_params(env_params)

        # Compute and log test metrics
        test_prob = model.predict_proba(x_test)[:, 1]
        test_pred = np.where(test_prob > best_th, 1, 0)

        acc = metrics.accuracy_score(y_test, test_pred)
        precision = metrics.precision_score(y_test, test_pred)
        recall = metrics.recall_score(y_test, test_pred)
        f1_score = metrics.f1_score(y_test, test_pred)
        roc_auc = metrics.roc_auc_score(y_test, test_prob)
        avg_prc = metrics.average_precision_score(y_test, test_prob)
        print("Evaluation\n"
              f"  Accuracy          = {acc:.4f}\n"
              f"  Precision         = {precision:.4f}\n"
              f"  Recall            = {recall:.4f}\n"
              f"  F1 score          = {f1_score:.4f}\n"
              f"  ROC AUC           = {roc_auc:.4f}\n"
              f"  Average precision = {avg_prc:.4f}")

        # --- Bedrock-native Integrations ---
        # Bedrock Logger: captures model metrics

        # Optional: log metrics for each training step. Only models that
        # recorded per-iteration evaluation results have them, so this is
        # skipped for the others instead of failing the whole run.
        evals = _validation_evals(model)
        if evals is not None:
            auc_history = evals.get("AUC", [])
            for iteration, logloss in enumerate(evals.get("Logloss", [])):
                # NOTE: the values come from the "validation" entry of
                # evals_result_; the "Training ..." keys are the original names.
                step_metrics = {"Training Logloss": logloss}
                if iteration < len(auc_history):
                    step_metrics["Training AUC"] = auc_history[iteration]
                bdrk.log_metrics(metrics=step_metrics, x=iteration)

        # Log into charts, the binary classifier from the predicted data
        bdrk.log_binary_classifier_metrics(actual=y_test.astype(int).tolist(),
                                           probability=test_prob.flatten().tolist())

        # Log final key-value pairs
        bdrk.log_metric("Accuracy", acc)
        bdrk.log_metric("Precision", precision)
        bdrk.log_metric("Recall", recall)
        bdrk.log_metric("F1 score", f1_score)
        bdrk.log_metric("ROC AUC", roc_auc)
        bdrk.log_metric("Avg precision", avg_prc)

        # Alternative: Log dict of key-value pairs
        # e.g. bdrk.log_metrics(metrics={"Accuracy": acc})

        # Save the model to disk, then log the saved file
        with open(OUTPUT_MODEL_PATH, "wb") as model_file:
            pickle.dump(model, model_file)
        bdrk.log_model(OUTPUT_MODEL_PATH)


In [None]:
# Log the Run
compute_log_notebook_metrics(model=model, 
                             x_train=x_train, 
                             x_test=x_test, 
                             y_test=y_test, 
                             best_th=TH,
                             model_name=model_name, 
                             model_type=model_type,
                             access_token=os.environ["BEDROCK_API_TOKEN"],
                             project_id=PROJECT_ID,
                             pipeline_id=PIPELINE_ID,
                             environment_id=ENVIRONMENT_ID,
                             env_params=ENV_PARAMS)