### Upload input data to MinIO

In [12]:
import os
import boto3

# Connect to the MinIO object store through its S3-compatible API.
# Credentials are read from the environment (the same variable names boto3
# itself recognises) so that secrets are not committed with the notebook:
#   export AWS_ACCESS_KEY_ID=...
#   export AWS_SECRET_ACCESS_KEY=...
# MINIO_ENDPOINT_URL optionally overrides the default endpoint below.
# Other endpoints used during development:
#   http://127.0.0.1:61403
#   http://127.0.0.1:30080
s3 = boto3.resource(
    's3',
    endpoint_url=os.environ.get(
        'MINIO_ENDPOINT_URL',
        'https://minio-api.digitalhub-test.smartcommunitylab.it/',
    ),
    # A missing variable raises KeyError here, before any request is sent.
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    aws_session_token=None,
    config=boto3.session.Config(signature_version='s3v4'),
)

# Print the name of every bucket visible to these credentials.
for bucket in s3.buckets.all():
    print(bucket.name)

datalake
ipzs


In [13]:
ipzs_bucket = s3.Bucket("ipzs")
# Local root folder of the input data. Set the IPZS_INPUT_FOLDER environment
# variable to override the machine-specific default below.
input_folder = os.environ.get(
    "IPZS_INPUT_FOLDER",
    "/home/albana/Desktop/Projects/AIxPA/ai_product_card_templates/document_classification_model/fasttext/ipzs",
)
years = ["2021"]

# Upload every .json/.csv file found under <input_folder>/<year> to the bucket.
# The object key is the file path relative to input_folder (e.g. "2021/...").
for year in years:
    for root, subfolders, files in os.walk(os.path.join(input_folder, year)):
        for item in files:
            if item.endswith((".json", ".csv")):
                fileNamePath = os.path.join(root, item)
                # relpath strips only the leading input_folder prefix, whereas
                # str.replace would remove every occurrence of that text;
                # object keys always use "/" regardless of the local OS separator.
                object_key = os.path.relpath(fileNamePath, input_folder).replace(os.sep, "/")
                ipzs_bucket.upload_file(fileNamePath, object_key)

In [14]:
# Walk the whole input folder and delete every file whose name starts with
# ".DS" (presumably macOS .DS_Store metadata -- TODO confirm), printing each
# path before it is removed.
for current_dir, _subdirs, filenames in os.walk(input_folder):
    ds_paths = [
        os.path.join(current_dir, filename)
        for filename in filenames
        if filename.startswith(".DS")
    ]
    for ds_path in ds_paths:
        print(ds_path)
        os.remove(ds_path)

### Configure MLRun

In [15]:
import mlrun

ModuleNotFoundError: No module named 'mlrun'

In [None]:
# Set the path of the MLRun API running on Minikube.
#mlrun.set_environment("http://127.0.0.1:30070")
mlrun.set_environment("http://localhost:8060")
# Set the path of the MLRun API running on Kubernetes.
#mlrun.set_environment("https://mlrun-api.digitalhub-test.smartcommunitylab.it", username="digitalhub-dev")

In [None]:
# Display the value currently resolved for MLRUN_DBPATH.
mlrun.get_secret_or_env("MLRUN_DBPATH")

### Create a project

In [None]:
# Alternative kept for reference: create the project from scratch, overwriting any existing one.
#project = mlrun.new_project("document-classification", context="./", overwrite=True, init_git=False, user_project=False)
# Get (or create) the project used by every following cell, with the current directory as its context.
project = mlrun.get_or_create_project("text-classification-fasttext3", context="./", init_git=False, user_project=False)

### Register the pre-processing function and run it

In [None]:
# Register the pre-processing step on the project: handler `parse_ipzs`
# from 01-preprocessing_handlers.py, declared as a job.
preproc_fn = project.set_function(
    func="01-preprocessing_handlers.py",
    name="pre-processing",
    kind="job",
    handler="parse_ipzs",
    image="mlrun/mlrun",  # includes sklearn, pandas, numpy
    # requirements=[]  # list or path to a requirements.txt
)

In [None]:
# Save the project after registering the pre-processing function.
project.save()

In [None]:
# Run the registered "pre-processing" function against the "ipzs" bucket.
# Later cells read the "preprocessed_data" entry from this run's outputs.
preproc_run = project.run_function(
    "pre-processing",
    outputs=["preprocessed_data"],
    params={"bucket_name": "ipzs", "idPrefix": "ipzs-", "limit": 10, "max_documents": 250},
    # local=False,
)

In [14]:
# Show the "preprocessed_data" output of the pre-processing run.
# Requires the cell that defines `preproc_run` to have been executed first.
preproc_run.outputs["preprocessed_data"]

NameError: name 'preproc_run' is not defined

### Register the parsing function and run it

**NOTE**: building/auto-building images does not work on ARM machines, because the resulting images target ARM while AMD64 (x86-64) images are required. Manually building the images and pushing them to Docker Hub is the current workaround.

In [None]:
# The image referenced below was built manually with:
# docker build -t classification-parsing:latest -<<EOF
# FROM mlrun/mlrun:1.2.1
# RUN pip install tqdm==4.61.1
# RUN pip install requests==2.25.1
# RUN pip install stanza==1.4.2
# EOF
parsing_fn = project.set_function(
    func="02-parsing_handlers.py",
    name="parsing",
    kind="job",
    handler="parse",
    image="ertomaselli/classification-parsing:latest",
)

# Alternative for a VM where auto-build is available:
# parsing_fn = project.set_function(
#     name="parsing",
#     func="02-parsing_handlers.py",
#     handler="parse",
#     kind="job",
#     image="mlrun/mlrun",
#     requirements=["tqdm==4.61.1", "requests==2.25.1", "stanza==1.4.2"] #list or path to a requirements.txt
# )

In [None]:
# Save the project after registering the parsing function.
project.save()

In [None]:
# Run the "parsing" function on the pre-processing output; no Tint URL is passed.
parsing_run = project.run_function(
    "parsing",
    params={"tint_url": None},
    inputs={"input_file": preproc_run.outputs["preprocessed_data"]},
)

In [None]:
# Inspect the outputs recorded for the parsing run.
parsing_run.outputs

### Register the function that extracts test sets and run it

In [None]:
# Register the test-set extraction step: handler `extract_test_sets`
# from 03-extracting_test_handlers.py, declared as a job.
extracting_fn = project.set_function(
    func="03-extracting_test_handlers.py",
    name="extracting_test",
    kind="job",
    handler="extract_test_sets",
    image="mlrun/mlrun",
)

In [None]:
# Save the project after registering the extraction function.
project.save()

In [None]:
# Run "extracting_test" on the pre-processed data and the parsing run's
# "tint_files" output, passing the test and dev ratios as parameters.
extracting_run = project.run_function(
    "extracting_test",
    params={"testRatio": 0.2, "devRatio": 0.2},
    inputs={
        "input_file": preproc_run.outputs["preprocessed_data"],
        "tint_files": parsing_run.outputs["tint_files"],
    },
)

In [None]:
# Inspect the outputs recorded for the extraction run.
extracting_run.outputs

### Register the function for saving data and run it

In [None]:
# Register the data-saving step: handler `save_data` from
# 04-saving_data_handlers.py, declared as a job on the parsing image.
saving_fn = project.set_function(
    func="04-saving_data_handlers.py",
    name="saving_data",
    kind="job",
    handler="save_data",
    image="ertomaselli/classification-parsing:latest",
)

In [None]:
# Save the project after registering the data-saving function.
project.save()

In [None]:
# Run "saving_data" on the outputs of the three previous runs:
# pre-processing, test/dev list extraction and parsing.
saving_inputs = {
    "input_file": preproc_run.outputs["preprocessed_data"],
    "test_list_file": extracting_run.outputs["testlist"],
    "dev_list_file": extracting_run.outputs["devlist"],
    "tint_files": parsing_run.outputs["tint_files"],
}
saving_run = project.run_function("saving_data", inputs=saving_inputs)

In [None]:
# Inspect the outputs recorded for the data-saving run.
saving_run.outputs

### Register the filtering function and run it

In [None]:
# Register the filtering step: handler `filter` from
# 05-filtering_handlers.py, declared as a job on the parsing image.
filtering_fn = project.set_function(
    func="05-filtering_handlers.py",
    name="filtering",
    kind="job",
    handler="filter",
    image="ertomaselli/classification-parsing:latest",
)

In [None]:
# Save the project after registering the filtering function.
project.save()

In [None]:
# Run "filtering" on the "complete" output of the data-saving run,
# with minFreq set to 3.
filtering_run = project.run_function(
    "filtering",
    params={"minFreq": 3},
    inputs={"complete_json_file": saving_run.outputs["complete"]},
)

In [None]:
# Inspect the outputs recorded for the filtering run.
filtering_run.outputs

### Register the training function and run it

In [None]:
# The image referenced below was built manually with:
# docker build -t classification-training:latest -<<EOF
# FROM mlrun/mlrun:1.2.1
# RUN apt-get update
# RUN apt-get install build-essential -y
# RUN pip install fasttext
# EOF
# NOTE(review): unlike the other handler files (01-... to 06-...), this one
# has no numeric prefix -- confirm the file name is correct.
training_fn = project.set_function(
    func="training_handlers.py",
    name="training",
    kind="job",
    handler="train",
    image="ertomaselli/classification-training:latest",
)

In [None]:
# Save the project after registering the training function.
project.save()

In [None]:
# Run "training" on the "filtering_files" output of the filtering run.
training_inputs = {"training_files": filtering_run.outputs["filtering_files"]}
training_run = project.run_function("training", inputs=training_inputs)

In [None]:
# Inspect the outputs recorded for the training run.
training_run.outputs

### Register the evaluation function and run it

In [None]:
# Register the evaluation step: handler `evaluate` from
# 06-evaluation_handlers.py, declared as a job.
evaluation_fn = project.set_function(
    func="06-evaluation_handlers.py",
    name="evaluation",
    kind="job",
    handler="evaluate",
    image="mlrun/mlrun",
)

In [None]:
# Save the project after registering the evaluation function.
project.save()

In [None]:
# Run "evaluation" with the training run's "results" as predictions and the
# filtering run's "filtering_files" as gold data; show_cm is enabled.
evaluation_run = project.run_function(
    "evaluation",
    params={"show_cm": True},
    inputs={
        "pred_files": training_run.outputs["results"],
        "gold_files": filtering_run.outputs["filtering_files"],
    },
)

In [None]:
# Inspect the outputs recorded for the evaluation run.
evaluation_run.outputs

### Define and run a pipeline

In [None]:
# Register the "classification" workflow on the project: handler
# `classification_pipeline` from classification_pipeline.py, on the kfp engine.
project.set_workflow(
    "classification",
    handler="classification_pipeline",
    engine="kfp",
    workflow_path="classification_pipeline.py",
)

In [None]:
# Save the project after registering the workflow.
project.save()

In [None]:
# Launch the "classification" workflow with watch enabled.
# The commented-out `arguments` show values that can be passed explicitly.
run_id = project.run(
    watch=True,
    name="classification",
    # arguments={
    #     "data_format": "parse_ipzs",
    #     "bucket_name": "ipzs", "idPrefix": "ipzs-", "limit": 10, "max_documents": 100,
    #     "tint_url": None,
    #     "testRatio": 0.2, "devRatio": 0.2
    # },
)

### Deploy the models

In [None]:
# Create the serving function inside the project created earlier in this
# notebook. The project name is taken from `project` itself: the previous
# hard-coded value ("document-classification") did not match the project the
# function is attached to below.
serving_fn = mlrun.new_function(
    "model-server",
    kind="serving",
    image="ertomaselli/classification-tqdm-stanza-fasttext:latest",
    project=project.name,
)

#model_path = training_run.outputs["allTokens_unfiltered_model"]
model_path = "./allTokens_unfiltered_model.bin" #test with single local model

# set the topology/router and add models
graph = serving_fn.set_topology("router")
serving_fn.add_model("allTokens_unfiltered_model", model_path=model_path, class_name="model_serving.ClassifierModel")

# Attach the serving function to the project and save the project.
project.set_function(serving_fn)
project.save()

In [None]:
#test function locally
# Build a mock server from the serving function so it can be tested locally.
server = serving_fn.to_mock_server()

In [None]:
# Path of the CSV file sent as the first input of the inference request.
# Set the CLASSIFICATION_CSV_PATH environment variable to override the
# machine-specific default below.
csv_path = os.environ.get(
    "CLASSIFICATION_CSV_PATH",
    "/Users/erica/document-classification/input-folder/atti_materie_SG_nov2021.csv",
)
# Sample text sent as the second input.
text = "Norme in materia tributaria, di previdenza, di assunzioni nella pubblica amministrazione ed altre disposizioni urgenti."

# Send one inference request to the model registered on the mock server.
server.test("/v2/models/allTokens_unfiltered_model/infer", body={"inputs": [csv_path, text]})

In [None]:
# Deploy the serving function (alternative call kept for reference below).
#serving_fn.deploy()
mlrun.deploy_function(serving_fn)