In [None]:
import mlrun
from config import get_params

In [None]:
# Load the MLRun project rooted at the current directory, creating it if needed.
project = mlrun.get_or_create_project(
    name="bert-aipc-project-prova1",
    context="./"
)
minio_parameters = get_params("minio")
# Show only the endpoint: printing the whole mapping would write the MinIO
# access/secret keys into the saved notebook output.
print(minio_parameters["minio_url"])
# Project secrets read by the job functions through context.get_secret(...).
# The WANDB_* and HF_TOKEN entries are left empty here.
secrets = {"MINIO_URL": minio_parameters["minio_url"],
           "MINIO_AK": minio_parameters["minio_ak"],
           "MINIO_SK": minio_parameters["minio_sk"],
           "WANDB_ENTITY": "", "WANDB_PROJECT": "", "WANDB_API_KEY": "", "HF_TOKEN": ""}
project.set_secrets(secrets=secrets)

# Prepare data

In [None]:
%%writefile preprocessing/data-prep.py

from minio import Minio
import pandas as pd

def prep_data(context, filename):
    """Fetch a parquet file from the "datalake" bucket and log it as a dataset.

    Args:
        context: Run context object; used here for ``get_secret`` (MinIO
            endpoint and credentials) and ``log_dataset``.
        filename: Object name under ``projects/tourism/meteo/``; the same
            name is used as the local download path.
    """
    endpoint = context.get_secret("MINIO_URL")
    credentials = {
        "access_key": context.get_secret("MINIO_AK"),
        "secret_key": context.get_secret("MINIO_SK"),
    }
    storage = Minio(endpoint, **credentials)

    # Download the object next to the running code, then read it back.
    object_key = f"projects/tourism/meteo/{filename}"
    storage.fget_object("datalake", object_key, filename)
    frame = pd.read_parquet(filename)

    # Log the frame as the "cleaned_data" artifact, without the index.
    context.log_dataset("cleaned_data", df=frame, index=False, format="parquet")

In [None]:
# Register the data-preparation script as the "data-prep" job of the project.
data_prep_func = project.set_function(
    func="preprocessing/data-prep.py",
    name="data-prep",
    kind="job",
    handler="prep_data",
    image="mlrun/mlrun",
    requirements=["minio", "pandas"],
)
# Build the function image so the listed requirements are available at run time.
project.build_function(data_prep_func)

In [None]:
# Run the data-preparation job on the meteo bulletin parquet file.
prep_data_run = data_prep_func.run(
    name="prep_data",
    handler="prep_data",
    params={"filename": "meteotrentino_bollettino.parquet"},
)

In [None]:
# Reference to the dataset that prep_data logged under the "cleaned_data" key;
# it is passed as the input of the training run further down.
training_data_item = prep_data_run.outputs['cleaned_data']
training_data_item

# Training function

In [None]:
%%writefile functions/train_prova.py
import glob
import mlrun
from zipfile import ZipFile

def save_model(output_dir):
    """Zip the top-level entries of ``output_dir`` into ``bert_model.zip``.

    The archive is created in the current working directory. Entries are
    stored under the same paths the glob returns (not recursive: a
    sub-directory is added as an entry without its contents).

    Args:
        output_dir: Directory whose direct children are archived.

    Returns:
        The archive file name, ``"bert_model.zip"``.
    """
    import os  # local import: the generated module does not import os at top level

    archive_name = "bert_model.zip"
    archive_path = os.path.abspath(archive_name)
    with ZipFile(archive_name, "w") as zip_file:
        # sorted() keeps the entry order deterministic across file systems.
        for file in sorted(glob.glob(f"{output_dir}/*")):
            # The archive already exists on disk at this point; when
            # output_dir is the working directory the glob returns it too,
            # and it must not be written into itself.
            if os.path.abspath(file) == archive_path:
                continue
            zip_file.write(file)
    return archive_name

@mlrun.handler()
def train(context, training_data_item: mlrun.DataItem):
    """Placeholder training handler: show the input data and log a model archive.

    Args:
        context: Run context object; used here for ``log_model``.
        training_data_item: Input data item, read as a DataFrame via ``as_df()``.
    """
    frame = training_data_item.as_df()
    print(frame)
    # Train the model using the dataframe obtained from the dataitem

    # Archive the contents of the working directory and register the archive
    # as the "bert_model" artifact.
    archive = save_model(".")
    model_details = dict(
        parameters={"max_steps": 1000},
        metrics={},  # TODO
        model_file=archive,
        labels={"class": "AutoModelForCausalLM"},
        algorithm="AutoModelForCausalLM",
        framework="transformers",
    )
    context.log_model("bert_model", **model_details)

In [None]:
# Training function: register functions/train_prova.py as the "training" job,
# build its image, then persist the project definition.
fn = project.set_function(
    func="functions/train_prova.py",
    name="training",
    kind="job",
    handler="train",
    image="mlrun/mlrun",
)
project.build_function(fn)
project.save()

In [139]:
# Run the "training" job, feeding it the dataset reference obtained from prep_data.
# Store URI kept from the original cell for reference:
#"store://models/bert-aipc-project-prova1/prova_bert_model#0:latest"
training_function = project.run_function(
    "training",
    inputs={"training_data_item": training_data_item},
)

> 2024-06-05 15:47:39,067 [info] Storing function: {'name': 'training-train', 'uid': '32df5aab857247f682bf22f57b8d53e8', 'db': 'http://mlrun-api:8080'}
> 2024-06-05 15:47:39,187 [info] Job is running in the background, pod: training-train-lv97t


The clone_target_dir attribute is deprecated in 1.6.2 and will be removed in 1.8.0. Use spec.build.source_code_target_dir instead.


             data                    comune  ... intprec12-18 probtemp12-18
0      2018-03-03          BASELGA DI PINE'  ...       debole            --
1      2018-03-03                   CANAZEI  ...       debole            --
2      2018-03-03                  CAVALESE  ...       debole            --
3      2018-03-03                  FOLGARIA  ...       debole            --
4      2018-03-03                     FONDO  ...       debole            --
...           ...                       ...  ...          ...           ...
41408  2024-05-31              COMANO TERME  ...     moderata         bassa
41409  2024-05-31                  BEZZECCA  ...     moderata         bassa
41410  2024-05-31             PIEVE DI BONO  ...     moderata         bassa
41411  2024-05-31          CENTA SAN NICOLO  ...     moderata         bassa
41412  2024-05-31  SAN MARTINO DI CASTROZZA  ...     moderata         bassa

[41413 rows x 12 columns]
> 2024-06-05 15:47:42,194 [info] To track results use the CLI

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
bert-aipc-project-prova1,...8d53e8,0,Jun 05 15:47:41,completed,training-train,v3io_user=acelepijakind=jobowner=acelepijamlrun/client_version=1.6.2-rc1mlrun/client_python_version=3.9.13host=training-train-lv97t,training_data_item,,,bert_model





> 2024-06-05 15:47:45,317 [info] Run execution finished: {'status': 'completed', 'name': 'training-train'}


In [141]:
# Display the outputs of the training run (output name -> store URI).
training_function.outputs

{'bert_model': 'store://artifacts/bert-aipc-project-prova1/training-train_bert_model@32df5aab857247f682bf22f57b8d53e8'}

## Access data item

In [None]:
# Turn the dataset reference returned by the prep_data run into a data item
# that can be read from the notebook.
di = mlrun.get_dataitem(training_data_item)

In [None]:
# Read the data item as a DataFrame and display it.
di.as_df()

# Prova function

In [None]:
%%writefile functions/prova.py
import mlrun
from minio import Minio

def get_model(context):
    """Download ``bert_model.zip`` from MinIO and log it as the "bert_model" model.

    Args:
        context: Run context object; used here for ``get_secret`` (MinIO
            endpoint and credentials) and ``log_model``.
    """
    endpoint = context.get_secret("MINIO_URL")
    credentials = {
        "access_key": context.get_secret("MINIO_AK"),
        "secret_key": context.get_secret("MINIO_SK"),
    }
    storage = Minio(endpoint, **credentials)

    # Fetch the archive from the project's artifacts prefix in the "datalake"
    # bucket into the working directory, keeping the same file name.
    model_file = "bert_model.zip"
    object_key = f"projects/bert-aipc-project-prova1/artifacts/{model_file}"
    storage.fget_object("datalake", object_key, model_file)

    model_details = dict(
        parameters={"max_steps": 1000},
        metrics={},  # TODO
        model_file=model_file,
        labels={"class": "AutoModelForCausalLM"},
        algorithm="AutoModelForCausalLM",
        framework="transformers",
    )
    context.log_model("bert_model", **model_details)

In [None]:
# Register functions/prova.py as the "prova" job, build its image and run it.
get_model_func = project.set_function(
    func="functions/prova.py",
    name="prova",
    kind="job",
    handler="get_model",
    image="mlrun/mlrun",
    requirements=["minio", "pandas"],
)
project.build_function(get_model_func)
get_model = get_model_func.run(name="prova", handler="get_model")

# Serving function

In [None]:
%%writefile serving/model_serving.py

import json
import logging
import mlrun
import yaml
from os import path
from zipfile import ZipFile
from transformers import pipeline

class ClassifierModel(mlrun.serving.V2ModelServer):
    """Serve a zipped Hugging Face text-classification model.

    Optional model parameters, read with ``self.get_param`` (they can be given
    as extra keyword arguments when the model is added to the function):

    * ``top_k``: forwarded to ``transformers.pipeline`` (default ``None``).
    * ``device``: forwarded to ``transformers.pipeline`` (default ``None``).
    * ``threshold``: a label is returned only when its score is strictly
      greater than this value (default ``0.5``).

    NOTE(review): the original code used ``top_k``, ``device`` and
    ``threshold`` without defining them, so the defaults above are
    placeholders; confirm the intended values.
    """

    def load(self):
        """Download the model archive, unpack it and build the pipeline once."""
        logging.basicConfig(level=logging.INFO)
        model_file, _ = self.get_model('.zip')
        logging.info("Model archive: %s", model_file)

        # Unpack the archive; the pipeline is loaded from the extracted files.
        self.model_dir = '/tmp/model'
        with ZipFile(model_file) as archive:
            archive.extractall(self.model_dir)

        self.threshold = self.get_param("threshold", 0.5)
        # Build the pipeline here rather than in predict() so the model is
        # loaded a single time instead of on every request.
        self.model = pipeline(
            "text-classification",
            model=self.model_dir,
            tokenizer=self.model_dir,
            config=path.join(self.model_dir, "config.json"),
            top_k=self.get_param("top_k", None),
            device=self.get_param("device", None),
        )

    def predict(self, body: dict) -> list:
        """Classify every input row and keep the labels above the threshold.

        Args:
            body: Request body; ``body["inputs"]`` is a list of dicts, each
                with the text to classify under the ``"row"`` key. Other keys
                of an input are ignored.

        Returns:
            One ``{"text": ..., "predictions": [{"label", "score"}, ...]}``
            dict per input, in input order.
        """
        logging.info("Request body: %s", body)
        response = []
        for el in body["inputs"]:
            row = el["row"]
            preds = self.model(row)

            # As in the original code, the scores of a single text are taken
            # from the first element of the pipeline result.
            predictions = [
                {"label": pred["label"], "score": pred["score"]}
                for pred in preds[0]
                if pred["score"] > self.threshold
            ]
            response.append({"text": row, "predictions": predictions})

        return response

In [None]:
# Serving function: wrap serving/model_serving.py as a "serving" function
# based on the GPU image.
serving_fn = mlrun.code_to_function(
    name="serving-classifier",
    filename="serving/model_serving.py",
    kind="serving",
    image="mlrun/mlrun-gpu",
)
# Extra packages installed while building the function image.
serving_fn.spec.build.commands = ["pip install torch peft transformers bitsandbytes accelerate minio"]

In [None]:
# Attach the model logged by the training run to the serving function under
# the "bert_classifier" name, then deploy the function and persist the project.
serving_fn.add_model(
    "bert_classifier",
    # Alternative model path kept from the original cell:
    #"store://models/bert-aipc-project-prova1/prova_bert_model#0:latest",
    # The trailing comma below was missing, which made this cell a SyntaxError.
    model_path=training_function.outputs["bert_model"],
    class_name="ClassifierModel",
)
project.deploy_function(serving_fn)
project.save()

In [None]:
# Example request against the deployed "bert_classifier" model.
text = "Example text"
skip_special_tokens = False
max_new_tokens = 250
do_sample = False

# One input entry; the request body carries a list of such entries.
sample = dict(
    row=text,
    skip_special_tokens=skip_special_tokens,
    max_new_tokens=max_new_tokens,
    do_sample=do_sample,
)
response = serving_fn.invoke(
    path="/v2/models/bert_classifier/infer",
    body={"inputs": [sample]},
)