In [None]:
import mlrun

# Create the 'LLM chatbot' AIPC project

In [None]:
# Load the AIPC project (creating it if it does not exist yet), using the
# current directory as the project context.
project = mlrun.get_or_create_project(
    name="llm-text2seq-aipc",
    context="./",
    parameters=dict(
        source="https://github.com/smartcommunitylab/ai_product_card_templates/tree/main/aipc_examples/text2seq",
    ),
)

# Secrets registered on the project. Every value is an empty string here;
# fill them in before running (presumably they are read by the training and
# serving code -- verify against functions/training.py and serve2.py).
secrets = {
    "MINIO_URL": "",
    "MINIO_AK": "",
    "MINIO_SK": "",
    "WANDB_ENTITY": "",
    "WANDB_PROJECT": "",
    "WANDB_API_KEY": "",
    "HF_TOKEN": "",
}
project.set_secrets(secrets=secrets)

# Training function

In [None]:
# Register the training job on the project, then build its image.
fn = project.set_function(
    func="functions/training.py",
    name="training",
    handler="training",
    kind="job",
    image="mlrun/mlrun-gpu",
    # Python packages requested for the function image.
    requirements=[
        "peft",
        "transformers",
        "datasets",
        "bitsandbytes",
        "trl",
        "accelerate",
        "wandb",
    ],
)
project.build_function(fn)

In [None]:
from kubernetes import client

# Tolerations attached to the training function's pod spec.
tol = [
    client.V1Toleration(
        key="virtual-kubelet.io/provider",
        operator="Equal",
        value="k8sgpu",
        effect="NoExecute",
    ),
    client.V1Toleration(
        key="node.kubernetes.io/network-unavailable",
        operator="Exists",
        effect="NoSchedule",
    ),
]

# Select the node whose hostname label is "k8s.gpu" and apply the tolerations.
fn.with_node_selection(
    node_selector={"kubernetes.io/hostname": "k8s.gpu"},
    tolerations=tol,
)
# NOTE(review): called with no arguments, so no explicit cpu/mem/gpu limits
# are passed for this GPU training job -- confirm this is intended.
fn.with_limits()

In [None]:
# Run the training function; the returned run object is the cell's output.
fn.run()

# Serving function

In [None]:
# Wrap serve2.py as an MLRun function of kind "serving" on the GPU image.
serving_fn = mlrun.code_to_function(
    name="serving-llama2",
    kind="serving",
    filename="serve2.py",
    image="mlrun/mlrun-gpu",
)

# Build command installing the Python packages named in the pip line.
serving_fn.spec.build.commands = [
    "pip install torch peft transformers bitsandbytes accelerate minio",
]

In [None]:
from kubernetes import client

# Resource settings for the serving function: one replica.
serving_fn.spec.replicas = 1

# Toleration for the "nvidia.com/gpu=a100:NoSchedule" taint.
tol = [
    client.V1Toleration(
        key="nvidia.com/gpu",
        operator="Equal",
        value="a100",
        effect="NoSchedule",
    ),
]
serving_fn.with_node_selection(tolerations=tol)

# Limit the function to one GPU and 200G of memory.
serving_fn.with_limits(mem="200G", gpus=1)

In [None]:
# Register the model on the serving function under the key "llama2"; this key
# is the model name used in the inference URL (/v2/models/llama2/infer).
# NOTE(review): model_path is a single-space placeholder and adapter_path is a
# bare checkpoint name -- confirm how the ChatBot class in serve2.py resolves
# both before deploying.
serving_fn.add_model(
    "llama2",
    model_path=" ",
    model_name="meta-llama/Llama-2-7b-hf",
    adapter_path='checkpoint-400',
    class_name="ChatBot",
)

# Deploy and save through `project`, the handle returned by
# mlrun.get_or_create_project earlier in this notebook. The previous name
# `myproject` was never defined, so this cell raised NameError on a fresh
# kernel.
project.deploy_function(serving_fn)
project.save()

## Chat with the fine-tuned model

In [None]:
# Text-to-SQL prompt sent to the deployed model.
# NOTE(review): the prompt begins with a stray single quote and carries runs
# of spaces before some newlines; both are kept verbatim here -- confirm
# against the prompt format used for fine-tuning.
text = (
    "'You are a powerful text-to-SQL model. Your job is to answer questions about a database. "
    "You are given a question and context regarding one or more tables. "
    "    \n\nYou must output the SQL query that answers the question."
    "    \n\n### Input:\nWho won the points classification when the teams classification winner was Lampre-Farnese? "
    "\n\n### Context:\nCREATE TABLE table_28092844_16 (points_classification_klasyfikacja_punktowa VARCHAR, teams_classification VARCHAR)"
    "\n\n### Response:\n"
)

# Generation options forwarded alongside the prompt.
skip_special_tokens = False
max_new_tokens = 250
do_sample = False

# Single inference input: the prompt plus its generation options.
sample = dict(
    row=text,
    skip_special_tokens=skip_special_tokens,
    max_new_tokens=max_new_tokens,
    do_sample=do_sample,
)

# Call the "llama2" model endpoint of the serving function.
response = serving_fn.invoke(
    path="/v2/models/llama2/infer",
    body={"inputs": [sample]},
)