In [1]:
import os

In [2]:
%pwd

'/root/pacmann/loan-default-project/notebooks'

In [3]:
# Change to the main directory
# So, it's executed from main directory
os.chdir("../")

In [4]:
with open('.env') as f:
    os.environ.update(
        line.strip().split('=') for line in f
)

In [5]:
%pwd

'/root/pacmann/loan-default-project'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainEvaluationConfig:
    root_dir: Path
    input_train_path: Path
    input_test_path: Path
    output_train_path: Path
    output_test_path: Path
    encoded_train_path: Path
    encoded_test_path: Path
    encoder_model_path: Path
    scaler_model_path: Path
    model_path: Path
    score_path: Path
    mlflow_dataset_path: Path
    mlflow_dataset_column: list
    minio_endpoint_url: str
    minio_access_key_id: str
    minio_secret_access_key: str
    mlflow_tracking_uri: str
    mlflow_exp_name: str
    mlflow_dataset_bucket: str
    mlflow_run_name: str

In [7]:
from LoanDefault.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LoanDefault.utils.common import read_yaml, create_directories, save_json

In [8]:
class ConfigurationManager:
    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_train_eval_config(self) -> TrainEvaluationConfig:
        """read training evaluation config file and store as 
        config entity then apply the dataclasses
        
        Returns:
            config: TrainEvaluationConfig type
        """
        data_dump_config = self.config.dump_data
        encoded_data_config = self.config.encoded_data
        train_config = self.config.train_model
        eval_config = self.config.train_evaluation

        create_directories([eval_config.root_dir])

        config = TrainEvaluationConfig(
            root_dir=eval_config.root_dir,
            input_train_path=Path(data_dump_config.input_train_path),
            input_test_path=Path(data_dump_config.input_test_path),
            output_train_path=Path(data_dump_config.output_train_path),
            output_test_path=Path(data_dump_config.output_test_path),
            encoded_train_path=Path(encoded_data_config.encoded_train_path),
            encoded_test_path=Path(encoded_data_config.encoded_test_path),
            encoder_model_path=Path(encoded_data_config.encoder_model_path),
            scaler_model_path=Path(encoded_data_config.scaler_model_path),
            model_path=Path(train_config.model_path),
            score_path=Path(eval_config.score_path),
            mlflow_dataset_path=Path(eval_config.mlflow_dataset_path),
            mlflow_dataset_column=eval_config.mlflow_dataset_column,
            minio_endpoint_url=os.environ['MLFLOW_S3_ENDPOINT_URL'],
            minio_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
            minio_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            mlflow_tracking_uri=os.environ["MLFLOW_TRACKING_URI"],
            mlflow_exp_name=eval_config.mlflow_exp_name,
            mlflow_dataset_bucket=os.environ["PROJECT_BUCKET"],
            mlflow_run_name=eval_config.mlflow_run_name
        )

        return config

In [9]:
import boto3
import json
import joblib
import mlflow
import string
import random
import pandas as pd

from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.dataset_source import DatasetSource
from sklearn.metrics import classification_report

from LoanDefault import logger

class TrainEvaluation:
    def __init__(self, config: TrainEvaluationConfig):
        self.config = config

    def get_prediction(self, model, X_input_enc, X_input) -> pd.DataFrame:
        if isinstance(X_input_enc, tuple):
            X_input_enc = X_input_enc[0]
        y_predict = pd.Series(model.predict(X_input_enc), index=X_input.index)
        return y_predict
    
    def get_report(self, y_output, y_predict) -> dict:
        metrics = classification_report(y_output, y_predict, output_dict=True)
        
        logger.info(f"Save report as json.")
        save_json(path=self.config.score_path, data=metrics)
        
        logger.info(f"Show the training report.")
        print(f"\n{classification_report(y_output, y_predict)}")
        
        return metrics
    
    def get_mlflow_metrics(self, metrics) -> dict:
        mlflow_metrics = {}

        max_iterations = min(len(metrics), 2)

        for rating in range(max_iterations):
            data_metric = metrics.get(str(rating))
            if data_metric:
                for name, value in data_metric.items():
                    mlflow_metrics[name + "_" + str(rating)] = value
            
        return mlflow_metrics
    
    def get_dataset(self, X_input, y_output, y_predict) -> pd.DataFrame:
        logger.info(f"Creating dataset for MLflow logging.")

        # Convert y_output and y_predict to DataFrame with appropriate names
        y_output_df = pd.DataFrame(y_output, index=X_input.index, columns=['Default'])
        y_predict_df = pd.DataFrame(y_predict, index=X_input.index, columns=['predictions'])

        # Combine X_input, y_output, and y_predict into a single DataFrame
        train_eval_result = pd.concat([X_input, y_output_df, y_predict_df], axis=1)

        # Log the number of columns and their names before renaming
        logger.info(f"Number of columns in X_input: {X_input.shape[1]}")
        logger.info(f"Number of rows in y_output: {y_output.shape[0]}")
        logger.info(f"Number of rows in y_predict: {y_predict.shape[0]}")
        logger.info(f"Number of columns in combined dataset: {train_eval_result.shape[1]}")
        logger.info(f"Column names in combined dataset: {train_eval_result.columns.tolist()}")

        # Log the expected column names
        logger.info(f"Expected column names: {self.config.mlflow_dataset_column}")

        # Check if the number of columns matches the expected number
        if len(self.config.mlflow_dataset_column) != train_eval_result.shape[1]:
            logger.error(f"Length mismatch: Expected axis has {train_eval_result.shape[1]} elements, new values have {len(self.config.mlflow_dataset_column)} elements")
            raise ValueError(f"Length mismatch: Expected axis has {train_eval_result.shape[1]} elements, new values have {len(self.config.mlflow_dataset_column)} elements")

        # Assign new column names
        train_eval_result.columns = self.config.mlflow_dataset_column
        logger.info(f"Dataset columns after renaming: {train_eval_result.columns.tolist()}")

        # Save the DataFrame to a CSV file
        train_eval_result.to_csv(self.config.mlflow_dataset_path, index=False)

        return train_eval_result
        
    def get_mlflow_dataset(self, mlflow_dataset, run_name) -> PandasDataset:
        mlflow_dataset = mlflow.data.from_pandas(
            mlflow_dataset,
            source=DatasetSource.load(f"s3://{self.config.mlflow_dataset_bucket}/{run_name}.csv"),
            name=f"{run_name}",
            targets=self.config.mlflow_dataset_column[-2],  # 'Default'
            predictions=self.config.mlflow_dataset_column[-1]  # 'predictions'
        )
        
        return mlflow_dataset

    
    def s3_upload_mlflow_dataset(self, run_name) -> None:
        s3 = boto3.client('s3',
                              endpoint_url=self.config.minio_endpoint_url,
                              aws_access_key_id=self.config.minio_access_key_id,
                              aws_secret_access_key=self.config.minio_secret_access_key)
        
        try:
            s3.upload_file(
                self.config.mlflow_dataset_path, 
                self.config.mlflow_dataset_bucket, 
                f'{run_name}.csv'
            )
        except Exception as e:
            logger.error(e)
            raise e
    
    def mlflow_log_train(self) -> None:
        try:
            logger.info(f"Load encoded data train from {self.config.encoded_train_path}.")
            X_train_enc = joblib.load(self.config.encoded_train_path)

            logger.info(f"Load data train from {self.config.input_train_path}.")
            X_train = joblib.load(self.config.input_train_path)
            X_test = joblib.load(self.config.input_test_path)

            logger.info(f"Load data train output from {self.config.output_train_path}.")
            y_train = joblib.load(self.config.output_train_path)

            logger.info(f"Load the model.")
            model = joblib.load(self.config.model_path)

            logger.info(f"Type of X_train_enc: {type(X_train_enc)}")
            logger.info(f"Example of X_train_enc: {X_train_enc[:1]}")
            logger.info(f"Type of X_train: {type(X_train)}")
            logger.info(f"Example of X_train: {X_train[:1]}")

            if isinstance(X_train_enc, tuple):
                X_train_enc = X_train_enc[0]

            if hasattr(X_train_enc, 'shape') and hasattr(X_train, 'shape'):
                logger.info(f"Checking shapes of X_train_enc: {X_train_enc.shape} and X_train: {X_train.shape}")
            else:
                logger.error("X_train_enc or X_train does not have attribute 'shape'. Please check the data type.")
                raise ValueError("X_train_enc or X_train does not have attribute 'shape'. Please check the data type.")

            logger.info(f"Predicting the data train.")
            y_train_pred = self.get_prediction(model, X_train_enc, X_train)

            logger.info(f"Generate classification report.")
            train_report = self.get_report(y_train, y_train_pred)

            logger.info(f"Set tracking URI.")
            mlflow.set_tracking_uri(self.config.mlflow_tracking_uri)

            logger.info(f"Set experiment name.")
            mlflow.set_experiment(self.config.mlflow_exp_name)

            logger.info(f"Set run name.")
            flag = ''.join(random.choices(
                string.ascii_uppercase + string.ascii_lowercase + string.digits, 
                k=5))
            run_name = f"{self.config.mlflow_run_name}-{flag}"

            logger.info(f"Construct report for MLflow.")
            mlflow_metrics = self.get_mlflow_metrics(train_report)

            logger.info(f"Construct MLflow dataset file in {self.config.mlflow_dataset_path}.")
            
            logger.info(f"Number of columns in X_train: {X_train.shape[1]}")
            logger.info(f"Number of rows in y_train: {y_train.shape[0]}")
            logger.info(f"Number of rows in y_train_pred: {y_train_pred.shape[0]}")
            
            mlflow_train_dataset = self.get_dataset(X_train, y_train, y_train_pred)

            logger.info(f"Construct MLflow input example.")
            sample = 10
            input_example = {"loanContents": X_train.head(sample).to_dict(orient='records')}

            logger.info(f"Experiment tracking to evaluate model with MLflow.")
            with mlflow.start_run(run_name=run_name):
                logger.info(f"Upload {self.config.mlflow_dataset_path} file to MinIO.")
                self.s3_upload_mlflow_dataset(run_name)

                logger.info(f"Set MLflow dataset.")
                dataset = self.get_mlflow_dataset(mlflow_train_dataset, run_name)

                logger.info(f"Logging to MLflow as an experiment.")
                model_params = model.get_params()
                mlflow.log_params(model_params)
                mlflow.log_metrics(mlflow_metrics)
                mlflow.log_input(dataset, context="training")
                mlflow.log_artifact(self.config.encoder_model_path, "encoder")
                mlflow.log_artifact(self.config.scaler_model_path, "scaler")
                mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="models",
                    serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
                    registered_model_name="logistic_regression",
                    input_example=input_example
                )
                mlflow.set_tags(
                    {
                        "dataset": "loan default training dataset and prediction result",
                        "model": "logistic_regression"
                    }
                )

        except Exception as e:
            logger.error(e)
            raise e

### Evaluate the Model

In [10]:
try:
    config = ConfigurationManager()
    eval_config = config.get_train_eval_config()
    evaluation = TrainEvaluation(config=eval_config)
    evaluation.mlflow_log_train()
except Exception as e:
    logger.error(e)
    raise e

[2024-07-13 10:05:36,079: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-13 10:05:36,094: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-13 10:05:36,096: INFO: common: created directory at: artifacts]
[2024-07-13 10:05:36,105: INFO: common: created directory at: artifacts/predict]
[2024-07-13 10:05:36,107: INFO: 1594587430: Load encoded data train from artifacts/preprocessing/X_train_encoded.pkl.]
[2024-07-13 10:05:36,319: INFO: 1594587430: Load data train from artifacts/data/X_train.pkl.]


[2024-07-13 10:05:40,419: INFO: 1594587430: Load data train output from artifacts/data/y_train.pkl.]
[2024-07-13 10:05:40,451: INFO: 1594587430: Load the model.]
[2024-07-13 10:05:40,666: INFO: 1594587430: Type of X_train_enc: <class 'tuple'>]
[2024-07-13 10:05:40,668: INFO: 1594587430: Example of X_train_enc: (array([[-0.49614949, -1.27925561, -0.34566167, ...,  1.        ,
         1.        ,  0.        ],
       [-1.6988325 ,  1.59211763, -0.70186312, ...,  0.        ,
         0.        ,  1.        ],
       [-0.29570232, -1.47514003, -0.63525393, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.64195363, -1.60723381, -0.90683951, ...,  0.        ,
         0.        ,  1.        ],
       [-1.16430672, -1.42409214, -1.07679267, ...,  0.        ,
         1.        ,  0.        ],
       [-1.56520105, -0.12235815, -0.78158481, ...,  0.        ,
         0.        ,  1.        ]]),)]
[2024-07-13 10:05:40,670: INFO: 1594587430: Type of X_train: <class 'pa

In [None]:
logger.info(f"Number of columns in DataFrame: {X_train.shape[1]}")
logger.info(f"Number of new column names: {len(new_column_names)}")


In [None]:
run = mlflow.get_run(mlflow.last_active_run().info.run_id)
dataset_info = run.inputs.dataset_inputs[0].dataset
print(f"Dataset name: {dataset_info.name}")
print(f"Dataset digest: {dataset_info.digest}")
print(f"Dataset profile: {dataset_info.profile}")
print(f"Dataset schema: {dataset_info.schema}")

In [None]:
try:
    config = ConfigurationManager()
    eval_config = config.get_train_eval_config()

    s3 = boto3.client('s3',
                    endpoint_url=eval_config.minio_endpoint_url,
                    aws_access_key_id=eval_config.minio_access_key_id,
                    aws_secret_access_key=eval_config.minio_secret_access_key)

    obj = s3.get_object(Bucket=eval_config.mlflow_dataset_bucket, Key=f"{dataset_info.name}.csv") 
    df = pd.read_csv(obj['Body'])
except Exception as e:
    logger.error(e)
    raise e

In [None]:
df

In [None]:
df.info()