In [None]:
from huggingface_hub import HfApi
import os
import tqdm

# NOTE(review): "HF_ENABLE" does not appear among the environment variables
# documented by huggingface_hub, so this assignment probably has no effect.
# If accelerated downloads were intended, the documented switch is
# HF_HUB_ENABLE_HF_TRANSFER, which requires the `hf_transfer` package and is
# presumably read when `huggingface_hub` is imported (i.e. it would have to be
# set before the import) -- confirm the intent before changing it.
os.environ["HF_ENABLE"] = "1"

api = HfApi()
repo_id = "microsoft/Phi-3-mini-4k-instruct"

# Download the repository snapshot at a pinned commit so that re-running the
# notebook fetches the same files. `resume_download` is intentionally not
# passed: recent huggingface_hub releases document it as deprecated and
# ignored, because interrupted downloads are always resumed when possible.
artifacts_path = api.snapshot_download(
    repo_id=repo_id,
    revision="c1358f8a35e6d2af81890deffbbfa575b978c62f",
    max_workers=16,
    # Files matching these glob patterns are not downloaded.
    ignore_patterns=["*.bin", "*.pkl", "*.pth"],
    tqdm_class=tqdm.tqdm,
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# The tokenizer is read from the snapshot directory produced by the download step.
tokenizer = AutoTokenizer.from_pretrained(artifacts_path)

# Loading options collected in one place: bfloat16 weights, the
# "flash_attention_2" attention implementation, and placement on "cuda".
model_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "cuda",
}
model = AutoModelForCausalLM.from_pretrained(artifacts_path, **model_load_kwargs)

In [None]:
import mlflow
from transformers import pipeline

# Select the MLflow experiment used for the rest of this cell.
experiment_name = "phi-3-trace"
mlflow.set_experiment(experiment_name=experiment_name)

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Mark any function with the trace decorator to automatically capture input(s) and output(s)
@mlflow.trace
def generate(prompt: str) -> list:
    """Run the text-generation pipeline on ``prompt`` and return its raw output.

    Args:
        prompt: Text passed unchanged to the module-level ``generator`` pipeline.

    Returns:
        Whatever ``generator(prompt)`` returns, unmodified. For a single string
        prompt the text-generation pipeline is documented to return a list of
        dicts carrying a ``"generated_text"`` entry (not a bare ``str``) --
        confirm against the installed transformers version.
    """
    return generator(prompt)


# Calling the traced function produces a trace that is logged to the active experiment
generate(prompt="What's ML?")
