In [0]:
import os
import pandas as pd

import accelerate

from transformers import pipeline
from huggingface_hub import snapshot_download

import mlflow
from mlflow.models import infer_signature
from mlflow.transformers import generate_signature_output
from mlflow.tracking import MlflowClient

In [0]:
# model = pipeline('text-generation', model='meta-llama/Llama-2-7b-chat-hf', device_map = "auto",load_in_8bit=True)
snapshot_location = os.path.expanduser("~/.cache/huggingface/model")
os.makedirs(snapshot_location, exist_ok=True)
# model.save_pretrained(snapshot_location)

In [0]:
snapshot_location = snapshot_download(repo_id="meta-llama/Llama-2-7b-chat-hf", local_dir=snapshot_location)

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Downloading (…)434a8/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)fdadf434a8/README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading (…)adf434a8/LICENSE.txt:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

Downloading (…)f434a8/USE_POLICY.md:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading (…)adf434a8/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading (…)434a8/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

In [0]:
snapshot_location

'/root/.cache/huggingface/model'

In [0]:
class Llama_pyfunc(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        """
        This method initializes the tokenizer and language model
        using the specified model snapshot directory.
        """
        from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
        import accelerate


        # Initialize tokenizer and language model
        self.tokenizer = AutoTokenizer.from_pretrained(
            context.artifacts["snapshot"], padding_side="left"
        )

        # config = AutoConfig.from_pretrained(
        #     context.artifacts["snapshot"], trust_remote_code=True
        # )
        # If you are running this in a system that has a sufficiently powerful GPU with available VRAM,
        # uncomment the configuration setting below to leverage triton.
        # Note that triton dramatically improves the inference speed performance

        # config.attn_config["attn_impl"] = "triton"

        self.model = AutoModelForCausalLM.from_pretrained(
            context.artifacts["snapshot"],
            # torch_dtype=torch.bfloat16,
            load_in_4bit=True,
            trust_remote_code=True,
            device_map="auto"
        )

    def _build_prompt(self, instruction):
        """
        This method generates the prompt for the model.
        """

        return instruction


    def predict(self, context, model_input, params=None):
        """
        This method generates prediction for the given input.
        """
        prompt = model_input["prompt"][0]

        # Retrieve or use default values for temperature and max_tokens
        temperature = params["temperature"] if params else 0.1
        max_tokens = params["max_tokens" ]if params else 100

        # Build the prompt
        prompt = self._build_prompt(prompt)

        # Encode the input and generate prediction
        # NB: Sending the tokenized inputs to the GPU here explicitly will not work if your system does not have CUDA support.
        # If attempting to run this with GPU support, change 'cpu' to 'cuda' for maximum performance
        encoded_input = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        output = self.model.generate(
            encoded_input,
            do_sample=True,
            temperature=temperature,
            max_new_tokens=max_tokens,
        )

        # Decode the prediction to text
        generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)

        # Removing the prompt from the generated text
        prompt_length = len(self.tokenizer.encode(prompt, return_tensors="pt")[0])
        generated_response = self.tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        )

        return {"predictions": [generated_response]}


In [0]:
import pandas as pd
import numpy as np
import mlflow
from mlflow.models.signature import ModelSignature, infer_signature
from mlflow.types import DataType, Schema, ColSpec, ParamSchema, ParamSpec

# Define input and output schema
# input_schema = Schema(
#     [
#         ColSpec(DataType.string, "prompt"),
#     ]
# )
# output_schema = Schema([ColSpec(DataType.string, "candidates")])




parameters = {"temperature":0.1,"max_tokens":150}

input_example = pd.DataFrame({"prompt": ["Hello, I'm a language model,"]})
output_example = pd.DataFrame({"predictions": ["How can I help you"]})
# inference_config={"max_new_tokens": 50, "temperature": 0.1}
signature = infer_signature(input_example, output_example, params=parameters)

# signature = ModelSignature(inputs=input_schema, outputs=output_schema, params=parameters)


# Define input example
input_example = pd.DataFrame({"prompt": ["What is machine learning?"]})


In [0]:
# Get the current base version of torch that is installed, without specific version modifiers
# torch_version = torch.__version__.split("+")[0]

# Start an MLflow run context and log the MPT-7B model wrapper along with the param-included signature to
# allow for overriding parameters at inference time

mlflow.set_registry_uri('databricks-uc')
CATALOG ="capgemini"
SCHEMA = "chatbot"
registered_model_name = f"{CATALOG}.{SCHEMA}.llama_pyfunc_model"

with mlflow.start_run(run_name="llm_as_pyfunc") as run:
    model_info = mlflow.pyfunc.log_model(
        artifact_path="llama_deployment_uc",
        python_model=Llama_pyfunc(),
        # NOTE: the artifacts dictionary mapping is critical! This dict is used by the load_context() method in our MPT() class.
        artifacts={"snapshot": snapshot_location},
        pip_requirements=['pandas==1.4.4',
    'torch==2.1.0',
    'transformers==4.34.0',
    'accelerate==0.23.0',
    'bitsandbytes==0.41.1',
    'tiktoken==0.5.1'],
        input_example=input_example,
        signature=signature,
    )


Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

2023/11/06 12:51:00 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Uploading artifacts:   0%|          | 0/22 [00:00<?, ?it/s]

2023/11/06 12:51:08 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Uploading /local_disk0/repl_tmp_data/ReplId-74f99-74693-1346d-6/tmp96vi8bay/model/artifacts/model/pytorch_mode…

Uploading /local_disk0/repl_tmp_data/ReplId-74f99-74693-1346d-6/tmp96vi8bay/model/artifacts/model/pytorch_mode…

Uploading /local_disk0/repl_tmp_data/ReplId-74f99-74693-1346d-6/tmp96vi8bay/model/artifacts/model/model-00002-…

Uploading /local_disk0/repl_tmp_data/ReplId-74f99-74693-1346d-6/tmp96vi8bay/model/artifacts/model/model-00001-…

In [0]:
latest_model = mlflow.register_model(f'runs:/{run.info.run_id}/llama_deployment_uc', registered_model_name)

Registered model 'capgemini.chatbot.llama_pyfunc_model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/22 [00:00<?, ?it/s]

2023/11/06 12:51:55 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Downloading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/model-00001-of-00002.safetensors:   0%|      …

Downloading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/pytorch_model-00001-of-00002.bin:   0%|      …

Downloading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/pytorch_model-00002-of-00002.bin:   0%|      …

Downloading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/model-00002-of-00002.safetensors:   0%|      …

Uploading artifacts:   0%|          | 0/22 [00:00<?, ?it/s]

2023/11/06 12:52:19 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Uploading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/pytorch_model-00002-of-00002.bin:   0%|        …

Uploading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/model-00001-of-00002.safetensors:   0%|        …

Uploading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/pytorch_model-00001-of-00002.bin:   0%|        …

Uploading /tmp/tmpxjbnjgpa/llama_deployment_uc/artifacts/model/model-00002-of-00002.safetensors:   0%|        …

2023/11/06 12:53:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: capgemini.chatbot.llama_pyfunc_model, version 3
Created version '3' of model 'capgemini.chatbot.llama_pyfunc_model'.


In [0]:
from langchain.llms import Databricks

llm = Databricks(endpoint_name="pyfunc_llama_2",model_kwargs={"max_tokens":30})


In [0]:
llm(prompt="what is machine learning?")

[0;31m---------------------------------------------------------------------------[0m
[0;31mValidationError[0m                           Traceback (most recent call last)
File [0;32m<command-4261348161710289>, line 1[0m
[0;32m----> 1[0m [43mllm[49m[43m([49m[43mprompt[49m[38;5;241;43m=[39;49m[38;5;124;43m"[39;49m[38;5;124;43mwhat is machine learning?[39;49m[38;5;124;43m"[39;49m[43m)[49m

File [0;32m/databricks/python/lib/python3.10/site-packages/langchain/llms/base.py:382[0m, in [0;36mBaseLLM.__call__[0;34m(self, prompt, stop, callbacks, tags, metadata, **kwargs)[0m
[1;32m    375[0m [38;5;28;01mif[39;00m [38;5;129;01mnot[39;00m [38;5;28misinstance[39m(prompt, [38;5;28mstr[39m):
[1;32m    376[0m     [38;5;28;01mraise[39;00m [38;5;167;01mValueError[39;00m(
[1;32m    377[0m         [38;5;124m"[39m[38;5;124mArgument `prompt` is expected to be a string. Instead found [39m[38;5;124m"[39m
[1;32m    378[0m         [38;5;124mf[39m[38;5;124