In [None]:
import mlflow
import mlflow.pyfunc
import mlflow.xgboost

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, hour, to_timestamp, lit, udf
from pyspark.sql.types import StringType

from xgboost import XGBClassifier

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, SparkTrials

import joblib
import logging
import sys
import json
import traceback
from typing import List, Dict, Any, Tuple
import yaml
import time
from datetime import datetime, timedelta


# Run the `reference` notebook in this session. ENV_VARS and MODELS_NAME, which are
# printed below and used later in this notebook, are not defined anywhere in this
# file, so they are presumably defined by `reference` — TODO confirm.
# NOTE(review): Databricks documents that `%run` should be alone in its cell; confirm
# this cell executes as intended when mixed with the imports above.
%run reference
print(ENV_VARS, MODELS_NAME)

# Configure logging at INFO level with a "timestamp - level - message" format and
# create the module-level `logger` used by the functions defined below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [None]:
def get_task_values():
    """Read the values published by the upstream retraining task.

    Looks up the ``run_id`` and ``model_uri`` task values of the
    ``model_retraining_task_1`` task through ``dbutils.jobs.taskValues`` and
    echoes each one to stdout as soon as it is read.

    Returns:
        A ``(run_id, model_uri)`` tuple.

    Raises:
        Exception: any error raised by the task-value lookup is logged and
            re-raised unchanged.
    """
    def _fetch(key):
        # Both values are published by the same upstream task.
        return dbutils.jobs.taskValues.get(taskKey="model_retraining_task_1", key=key)

    try:
        run_id = _fetch("run_id")
        print(f"Received run_id: {run_id}")

        model_uri = _fetch("model_uri")
        print(f"Received model_uri: {model_uri}")
    except Exception as e:
        logger.error(f"Error getting task values: {e}")
        raise

    return run_id, model_uri
     

In [None]:
def download_artifacts(run_id: str) -> Dict[str, str]:
    """Download the production artifacts logged under an MLflow run.

    Args:
        run_id: ID of the MLflow run that holds the ``prod_artifacts`` folder.

    Returns:
        A dict with the keys ``'model'``, ``'features_types'`` and
        ``'encodings'``, each mapped to the value returned by
        ``mlflow.artifacts.download_artifacts`` for that artifact.

    Raises:
        Exception: any download failure is logged and re-raised unchanged.
    """
    # (result key, artifact path inside the run); downloaded in this order.
    # The conda environment file is fetched separately by download_conda_env_yaml.
    artifact_paths = (
        ('model', 'prod_artifacts/model.sav'),
        ('features_types', 'prod_artifacts/features_types.sav'),
        ('encodings', 'prod_artifacts/encodings.sav'),
    )
    try:
        return {
            name: mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path=path)
            for name, path in artifact_paths
        }
    except Exception as e:
        logger.error(f"Error downloading artifacts: {e}")
        raise

In [None]:
def download_conda_env_yaml(run_id: str) -> Dict[str, str]:
    """Download the conda environment file logged with the XGBoost model.

    Args:
        run_id: ID of the MLflow run that holds ``xgboost-model/conda.yaml``.

    Returns:
        A single-entry dict mapping ``'conda_env'`` to the value returned by
        ``mlflow.artifacts.download_artifacts`` for that file.

    Raises:
        Exception: any download failure is logged and re-raised unchanged.
    """
    try:
        conda_yaml_path = mlflow.artifacts.download_artifacts(
            run_id=run_id,
            artifact_path='xgboost-model/conda.yaml',
        )
    except Exception as e:
        logger.error(f"Error downloading conda env: {e}")
        raise
    return {'conda_env': conda_yaml_path}

In [None]:
class PreprocessingModelWrapper(mlflow.pyfunc.PythonModel):
    """pyfunc model that cleans raw input rows and scores them with the trained classifier.

    On construction the wrapper downloads three joblib artifacts from the given
    MLflow run and keeps them on the instance:

    * ``model`` - object exposing ``predict_proba``.
    * ``feature_types`` - mapping of feature column name to dtype; it is passed to
      ``DataFrame.astype`` and its keys select the columns given to the model.
    * ``encodings`` - mapping of feature column name to the list of categories used
      to build a ``CategoricalDtype`` for that column.

    ``predict`` returns a DataFrame with the columns ``campaign_id``, ``position``,
    ``model_ctr`` and ``model_version``.
    """

    # String tokens that the cleaners treat as "no value".
    _MISSING_STRINGS = ("null", "NA", "")
    # OS names passed through unchanged; every other non-missing value becomes "Other".
    _KNOWN_OS = ("Android", "iOS", "Windows", "Mac")
    # Numeric types accepted by the numeric cleaners. The numpy scalar types are
    # included because np.int64 & co. are not subclasses of the builtin int.
    _NUMERIC_TYPES = (int, float, np.integer, np.floating)
    # Columns returned by predict().
    _OUTPUT_COLUMNS = ['campaign_id', 'position', 'model_ctr', "model_version"]

    def __init__(self, run_id: str):
        """Download and load the model, feature types and encodings of ``run_id``.

        Args:
            run_id: MLflow run whose ``prod_artifacts`` are loaded.

        Raises:
            Exception: any download or load failure is logged and re-raised.
        """
        try:
            self.RUN_ID = run_id
            artifacts = download_artifacts(self.RUN_ID)
            # NOTE(security): joblib.load unpickles arbitrary objects. This is only
            # acceptable if the run's artifacts come from a trusted training job.
            self.model = joblib.load(artifacts['model'])
            self.feature_types = joblib.load(artifacts['features_types'])
            self.encodings = joblib.load(artifacts['encodings'])
            print("Model, feature types, and encodings loaded successfully.")
        except Exception as e:
            logger.error(f"Error loading model or preprocessors: {e}")
            raise

    @classmethod
    def _is_missing(cls, value) -> bool:
        """Return True for None, NaN/NA-like scalars and the missing-value string tokens."""
        if value is None:
            return True
        if isinstance(value, str):
            return value in cls._MISSING_STRINGS
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar input (e.g. a list) yields an array with no single truth value.
            return False

    @classmethod
    def _clean_numeric(cls, value) -> float:
        """Convert ``value`` to float, mapping missing or unparseable input and -1 to NaN."""
        if cls._is_missing(value):
            return np.nan
        if isinstance(value, str):
            # NOTE(review): the string "-1" is parsed to -1.0 while the numeric -1 is
            # treated as missing. Kept as-is to preserve existing model inputs.
            try:
                return float(value)
            except ValueError:
                return np.nan
        if isinstance(value, cls._NUMERIC_TYPES):
            # -1 is the numeric sentinel for "unknown".
            return np.nan if value == -1 else float(value)
        return np.nan

    def clean_os(self, os_name: str) -> str:
        """Return "" for missing values, the name itself for known OSes, else "Other"."""
        if self._is_missing(os_name):
            return ""
        if os_name in self._KNOWN_OS:
            return os_name
        return "Other"

    def clean_gender(self, gender: str) -> str:
        """Normalize gender to "F" or "M"; anything else (including non-strings) becomes ""."""
        # Non-string values used to crash on .lower(); they are now treated as unknown.
        if self._is_missing(gender) or not isinstance(gender, str):
            return ""
        gender = gender.lower()
        if gender in ("f", "female"):
            return "F"
        if gender in ("m", "male"):
            return "M"
        return ""

    def clean_age(self, age) -> float:
        """Return the age as a float, or NaN when it is missing, -1 or not parseable."""
        return self._clean_numeric(age)

    def clean_household_income(self, household_income) -> float:
        """Return the income as a float, or NaN when it is missing, -1 or not parseable."""
        return self._clean_numeric(household_income)

    def clean_traffic_source_id(self, traffic_source_id) -> float:
        """Return the id as a float; None, non-numeric strings and other types become NaN."""
        if isinstance(traffic_source_id, self._NUMERIC_TYPES):
            return float(traffic_source_id)
        if isinstance(traffic_source_id, str) and traffic_source_id.isnumeric():
            # str.isnumeric() accepts characters such as "²" that float() rejects.
            try:
                return float(traffic_source_id)
            except ValueError:
                return np.nan
        return np.nan

    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the caller's DataFrame is left untouched.

        Args:
            df: frame containing the columns ``os_name``, ``gender``, ``age``,
                ``household_income`` and ``traffic_source_id``.

        Returns:
            A new DataFrame with those five columns cleaned element-wise.

        Raises:
            KeyError: if one of the five columns is absent.
        """
        column_cleaners = {
            "os_name": self.clean_os,
            "gender": self.clean_gender,
            "age": self.clean_age,
            "household_income": self.clean_household_income,
            "traffic_source_id": self.clean_traffic_source_id,
        }
        # Work on a copy so the input handed to predict() is not mutated in place.
        cleaned = df.copy()
        for column, cleaner in column_cleaners.items():
            cleaned[column] = cleaned[column].apply(cleaner)
        return cleaned

    def predict(self, context, model_input):
        """Clean, type-cast and score ``model_input``.

        Args:
            context: pyfunc context; not used by this method.
            model_input: a DataFrame, a list accepted by ``pd.DataFrame``, or a dict
                holding such a list under the ``'instances'`` key.

        Returns:
            A DataFrame with ``campaign_id``, ``position``, ``model_ctr`` (the
            second column of ``predict_proba``) and ``model_version``; or the string
            ``'Invalid model input specified'`` for any other input type.

        Raises:
            Exception: any processing failure is logged and re-raised.
        """
        try:
            # Unwrap payloads of the form {"instances": [...]}.
            if isinstance(model_input, dict) and 'instances' in model_input:
                model_input = model_input['instances']

            if isinstance(model_input, pd.DataFrame):
                df = model_input
            elif isinstance(model_input, list):
                df = pd.DataFrame(model_input)
            else:
                # Kept as a string return for backward compatibility with existing callers.
                logger.warning(f"Unsupported model input type: {type(model_input).__name__}")
                return 'Invalid model input specified'

            # clean_data returns a copy, so nothing below touches the caller's frame.
            df = self.clean_data(df)
            df = df.astype(self.feature_types)

            # The categorical cast below may alter campaign_id (values outside the
            # known categories become NaN), so keep the typed values for the output.
            original_campaign_id = df['campaign_id'].copy()

            for feature in self.encodings:
                df[feature] = df[feature].astype(CategoricalDtype(categories=self.encodings[feature]))

            # Select model features with an explicit list, in feature_types order.
            feature_columns = list(self.feature_types.keys())
            pred_prob = self.model.predict_proba(df[feature_columns])

            df['model_ctr'] = pred_prob[:, 1]
            df['campaign_id'] = original_campaign_id
            # MODELS_NAME is a notebook-level global; it must be resolvable wherever
            # predict() runs — TODO confirm for the serving environment.
            df['model_version'] = f'adflow_click_{MODELS_NAME["MODEL1"]} - ' + str(self.RUN_ID)

            return df[self._OUTPUT_COLUMNS]

        except Exception as e:
            error_message = f"Error processing input: {e}"
            logger.error(error_message)
            raise

In [None]:
def register_and_log_model(run_id: str, model_name: str):
    """Log the preprocessing wrapper as a pyfunc model on an existing run and register it.

    The run ``run_id`` is resumed, a ``PreprocessingModelWrapper`` built from that
    run's artifacts is logged under the artifact path ``model`` with pinned pip
    requirements, and ``runs:/<run_id>/model`` is registered as ``model_name``.

    Args:
        run_id: MLflow run to resume and to load the wrapper's artifacts from.
        model_name: name to register the logged model under.

    Returns:
        A ``(registered_name, registered_version)`` tuple.

    Raises:
        mlflow.exceptions.MlflowException: MLflow logging/registration failures.
        Exception: any other failure. Every failure is logged and re-raised, so the
            function never returns ``None`` (which previously made the caller fail
            with an unrelated unpacking error).
    """
    # Package versions recorded with the logged model.
    pip_requirements = [
        "mlflow==2.11.3", "scikit-learn==1.3.0", "scipy==1.10.0",
        "psutil==5.9.0", "pandas==1.5.3", "cloudpickle==2.2.1",
        "numpy==1.23.5", "category-encoders==2.6.3", "xgboost==2.0.3",
        "lz4==4.3.2", "typing-extensions==4.10.0"
    ]

    try:
        with mlflow.start_run(run_id=run_id) as run:
            # The wrapper downloads and loads the run's artifacts in its constructor.
            model_wrapper = PreprocessingModelWrapper(run_id)

            mlflow.pyfunc.log_model(
                artifact_path="model",
                python_model=model_wrapper,
                pip_requirements=pip_requirements
            )

            model_uri = f"runs:/{run.info.run_id}/model"
            registered_model = mlflow.register_model(model_uri, model_name)

            print(f"Model '{registered_model.name}' registered successfully with version '{registered_model.version}' with URI: {model_uri}")

            return registered_model.name, registered_model.version

    except mlflow.exceptions.MlflowException as e:
        if "Model with name" in str(e):
            # Keep the original hint, but do not swallow the failure: callers unpack
            # the return value and cannot continue without a name and version.
            logger.error(f"Model '{model_name}' already exists. Consider using a different name or version.")
        logger.error(f"Error registering model: {e}")
        raise

    except Exception as e:
        logger.error(f"Unexpected error during model registration: {e}")
        raise

In [None]:
def main():
    """Register the retrained model and publish its name and version as task values.

    Reads ``run_id`` and ``model_uri`` via ``get_task_values``, registers the model
    under ``MODELS_NAME['MODEL1']`` via ``register_and_log_model``, then stores the
    result under the task-value keys ``registered_model_name_1`` and
    ``registered_model_version_1``.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    try:
        run_id, model_uri = get_task_values()
        target_name = MODELS_NAME['MODEL1']
        print(f"Received run_id: {run_id}, model_name: {MODELS_NAME['MODEL1']}, model_uri: {model_uri}")

        # Log the wrapper on the run and register it under the target name.
        registration = register_and_log_model(run_id, target_name)
        registered_model_name, registered_model_version = registration

        # Publish the registration result as task values of this task.
        published = (
            ("registered_model_name_1", registered_model_name),
            ("registered_model_version_1", registered_model_version),
        )
        for task_key, task_value in published:
            dbutils.jobs.taskValues.set(task_key, task_value)

        print(f"Model '{registered_model_name}' registered successfully with version '{registered_model_version}' with URI: {model_uri}")

    except Exception as e:
        logger.error(f"Error during model registration: {e}")
        raise e


# Entry point: run the registration flow when this code executes as the main module
# (presumably the case when the notebook is run as a job task — TODO confirm).
if __name__ == "__main__":
    main()