# Introduction

this notebook demos example of using llm in a MPS backend (apple silicon GPU) using torch 2.x

Referece:
* torch 2.x MPS Backend: https://pytorch.org/docs/stable/notes/mps.html

In [1]:
import os
import applyllm as apl

print(apl.__version__)

0.0.9rc0


## Define global variables

In [2]:
from applyllm.accelerators import (
    DirectorySetting,
    TokenHelper,
    AcceleratorHelper,
)
  
dir_mode_map = {
    "kf_notebook": DirectorySetting(),
    "mac_local": DirectorySetting(
        home_dir="/Users/yingding/Code", # "/Users/yingding"
        transformers_cache_home="MODELS", 
        huggingface_token_file="MODELS/.huggingface_token"),
}
dir_setting = dir_mode_map["mac_local"]

# setup accelerator environment
AcceleratorHelper.init_torch_env(accelerator="mps", dir_setting=dir_setting)

# global model maps setup
model_map = {
    "llama7B-chat":     "meta-llama/Llama-2-7b-chat-hf",
    "llama13B-chat":    "meta-llama/Llama-2-13b-chat-hf",
    "llama70B-chat":    "meta-llama/Llama-2-70b-chat-hf",
    "llama3-8B-inst":   "meta-llama/Meta-Llama-3-8B-Instruct",
    "llama3.2-3B-inst": "meta-llama/Llama-3.2-3B-Instruct",
    "llama3-70B-inst":  "meta-llama/Meta-Llama-3-70B-Instruct",
    "mistral7B-01":     "mistralai/Mistral-7B-v0.1",
    "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
    "mixtral8x7B-01":   "mistralai/Mixtral-8x7B-v0.1",
    "mixtral8x7B-inst01": "mistralai/Mixtral-8x7B-Instruct-v0.1", 
    "gemma7b-it": "google/gemma-7b-it",
    "gemma7b":    "google/gemma-7b",
    "gemma2b-it": "google/gemma-2b-it",
    "gemma2b":    "google/gemma-2b",
    "gemma7b-it-1.1": "google/gemma-1.1-7b-it",
    "gemma2b-it-1.1": "google/gemma-1.1-2b-it",
    "phi3-medium-128k-inst": "microsoft/Phi-3-medium-128k-instruct",
}

# default_model_type = "mistral7B-01"
default_model_type = "llama3.2-3B-inst"

# default_dir_mode = "mac_local"
# dir_setting = dir_mode_map[default_dir_mode]

# os.environ["WORLD_SIZE"] = "1" 
# os.environ['XDG_CACHE_HOME'] = dir_setting.get_cache_home()

print(os.environ['XDG_CACHE_HOME'])

/Users/yingding/Code/MODELS


In [3]:
import transformers
import torch

print(transformers.__version__)
print(torch.__version__)

5.0.0rc1
2.9.1


In [4]:
# check that MPS is availabe (Metal Performance Shaders)
if not torch.backends.mps.is_available():
    print("MPS is not available")
else:
    print("MPS is available")
    mps_device = torch.device("mps")
    print(mps_device)

MPS is available
mps


## Choose LLM model

In [5]:
# model_type = default_model_type
# model_type = "gemma7b-it"
# model_type = "gemma2b-it"
# model_type = "llama3-8B-inst"
model_type = "llama3.2-3B-inst"
# model_type = "phi3-medium-128k-inst"
# model_type = "mistral7B-inst02"
# model_type = "llama7B-chat"
# model_type = "llama13B-chat"

model_name = model_map.get(model_type, default_model_type)
print(model_name)

meta-llama/Llama-3.2-3B-Instruct


### Fast tokenizer

* https://github.com/huggingface/transformers/issues/23889#issuecomment-1584090357

### Load LLM Model and then Tokenizer

In [6]:
from applyllm.pipelines import (
    ModelCatalog,
    KwargsBuilder
)
th = TokenHelper(dir_setting=dir_setting, prefix_list=["llama"])
token_kwargs = th.gen_token_kwargs(model_type=model_type)

# data_type = torch.bfloat16
if model_name.startswith("microsoft"):
    data_type = "auto"
else:
    data_type = torch.float16

device_map = "mps" # "auto"
# device_map = "auto"

# auto caste not working for mps 4.38.2
# https://github.com/huggingface/transformers/issues/29431 

# mixtral model has no max_new_tokens limit, so it is not set here.
model_kwargs = {
    # "torch_dtype": data_type, # deprecated, replace with dtype
    "dtype": data_type, #bfloat16 is not supported on MPS backend, float16 only on GPU accelerator
    # torch_dtype=torch.float32,
    # max_length=MAX_LENGTH,
    "device_map": device_map,
    "max_length" : None, # remove the total length of the generated response
    "trust_remote_code" : True
}
print(f"model_kwargs: {model_kwargs}")

# set the transformers.pipeline kwargs
# the torch_dtype shall be set both for the model and the pipeline, due to a transformer issue.
# otherwise it will cause unnecessary more memory usage in the pipeline of transformers
# https://github.com/huggingface/transformers/issues/28817
# https://github.com/mlflow/mlflow/pull/10979

# if model_name.startswith("microsoft"):
#     do_sample = False
# else:
#     do_sample = True

do_sample = True

pipeline_kwargs = {
    "task": "text-generation",
    "max_new_tokens" : 200,
    "do_sample" : do_sample, # do_sample True is required for temperature
    "temperature" : 0.01, 
    "device_map" : device_map, # use the MPS device if available
    "top_k": 3,
    "top_p": 0.95,
    # "num_return_sequences": 1,
    # "framework": "pt", # use pytorch as framework, deprecated since transformers v5 only support torch
}

gemma_pipeline_kwargs = {
    "add_special_tokens": True,
    "torch_dtype": data_type,
}

# pipeline_kwargs override the model_kwargs during the merge
pipeline_kwargs = KwargsBuilder([model_kwargs]).override(pipeline_kwargs).build()

if model_name.startswith(ModelCatalog.GOOGLE_FAMILY):
    pipeline_kwargs = KwargsBuilder([pipeline_kwargs]).override(gemma_pipeline_kwargs).build()

print(f"pipeline_kwargs: {pipeline_kwargs}")

huggingface token loaded
model_kwargs: {'dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'trust_remote_code': True}
pipeline_kwargs: {'dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'trust_remote_code': True, 'task': 'text-generation', 'max_new_tokens': 200, 'do_sample': True, 'temperature': 0.01, 'top_k': 3, 'top_p': 0.95}


In [7]:
torch.mps.driver_allocated_memory()

475136

In [8]:
from applyllm.accelerators import AcceleratorStatus
acc_status = AcceleratorStatus.create_accelerator_status()

# acc_status.accelerator_mem_info()
acc_status.gpu_usage()

--------------------
Recom.Max memory : 96.000000 GB
Allocated memory : 0.000443 GB
--------------------


In [9]:
# driver_allocated_mem = f"{int(torch.mps.driver_allocated_memory()/1024**3)}GB"
# lm_allocated_mem = f"{int(torch.mps.current_allocated_memory()/1024**3)}GB"
# print(driver_allocated_mem)
# print(lm_allocated_mem)

### Max memory to offload parts of LLM model to the CPU memory
* https://huggingface.co/docs/accelerate/concept_guides/big_model_inference#designing-a-device-map

Note:
* Max Memory offload to CPU is CUDA implementation only



In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from applyllm.utils import time_func
from applyllm.pipelines import ModelConfig, LocalCausalLMConfig

# For a M1 max with 64GB memory, set the limit to 48GB
# cuda_max_memory = {
#   0: "48GB", # GPU device 0
#   "cpu": "1GB", # CPU device with no memory, since M1 max has unified memory
# } 

base_lm_config = ModelConfig(
  model_config = {
    "pretrained_model_name_or_path": model_name,
    "device_map": device_map,
    "trust_remote_code" : True,
    # "max_memory": cuda_max_memory,
  }
)

# No bitsandbytes qunatization support for MPS backend yet, set quantized to False
kwargs = {
  "quantized": False,
  "model_config": base_lm_config.get_config(),
  "quantization_config": {
    "quantization_config": transformers.BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type='nf4',
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16
      )
    }
}

lm_config = LocalCausalLMConfig(**kwargs)

@time_func
def load_model():
  return AutoModelForCausalLM.from_pretrained(    
    **lm_config.get_config(),
    **token_kwargs,  
  )

model = load_model()

Loading weights:   0%|          | 0/254 [00:00<?, ?it/s]

executed: load_model() python function
walltime: 4.2993857860565186 in secs.


In [11]:
tokenizer_kwargs = {
    "model_config": {
        "pretrained_model_name_or_path": model_name,
        "device": "cpu",
        # "device_map": "auto", # put to GPU if GPU is available
        # "max_position_embeddings": MAX_LENGTH,
        # "max_length": MAX_LENGTH,
    },
}
tokenizer_config = ModelConfig(**tokenizer_kwargs)

tokenizer = AutoTokenizer.from_pretrained(
    **tokenizer_config.get_config(),
    **token_kwargs
)

In [12]:
# tokenizer

In [13]:
print(type(tokenizer))

<class 'transformers.tokenization_utils_tokenizers.TokenizersBackend'>


### Testing token
* https://huggingface.co/docs/tokenizers/pipeline

## Phi instruct template
* https://huggingface.co/microsoft/Phi-3-small-128k-instruct

In [14]:
print(model_name)

meta-llama/Llama-3.2-3B-Instruct


In [15]:
from langchain_core.prompts import PromptTemplate
from applyllm.pipelines import (
    ModelCatalog,
    PromptHelper
)

model_info = ModelCatalog.get_model_info(model_name)
prompt_helper = PromptHelper(model_info)



if model_info.model_family == ModelCatalog.GOOGLE_FAMILY:
    query = """BEGIN EXAMPLE
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
END EXAMPLE

Your turn:            
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? 
"""
    inputs=[prompt_helper.gen_prompt(query)]
elif model_name.startswith("microsoft"):
    # msft_template = "<|endoftext|><|user|>\n{query}<|end|>\n<|assistant|>"
    msft_template = "<|user|>\n{query}<|end|>\n<|assistant|>"
    user_query = """
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
"""
    template = PromptTemplate(template=msft_template, input_variables=["query"])
    inputs=[template.format(query=user_query)]
else: 
    inputs=["""
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
"""]

In [16]:
input_test_encoded = tokenizer.encode(inputs[0])
print(f"{len(input_test_encoded)}")
print(input_test_encoded)

107
[128000, 198, 48, 25, 29607, 706, 220, 18, 32515, 20953, 13, 1283, 50631, 220, 17, 810, 43732, 315, 32515, 20953, 13, 9062, 649, 706, 220, 19, 32515, 20953, 13, 2650, 1690, 32515, 20953, 1587, 568, 617, 1457, 5380, 32, 25, 29607, 3940, 449, 220, 18, 20953, 13, 220, 17, 43732, 315, 220, 19, 32515, 20953, 1855, 374, 220, 23, 32515, 20953, 13, 220, 18, 489, 220, 23, 284, 220, 806, 13, 578, 4320, 374, 220, 806, 627, 48, 25, 578, 94948, 1047, 220, 1419, 41776, 13, 1442, 814, 1511, 220, 508, 311, 1304, 16163, 323, 11021, 220, 21, 810, 11, 1268, 1690, 41776, 656, 814, 617, 5380]


In [17]:
response_test_decoded = tokenizer.decode(input_test_encoded)
print(response_test_decoded)

<|begin_of_text|>
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?



### Load LLM

In [18]:
# bitsandbytes quantization does not work with MPS backend
print(pipeline_kwargs)

# transformer pipeline kwargs
tp_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
}

tp_config = ModelConfig(model_config = tp_kwargs)

generator = transformers.pipeline(
    **tp_config.get_config(),
    **pipeline_kwargs,
    **token_kwargs,
    # **compression_kwargs,
)

{'dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'trust_remote_code': True, 'task': 'text-generation', 'max_new_tokens': 200, 'do_sample': True, 'temperature': 0.01, 'top_k': 3, 'top_p': 0.95}


##### Install autopep8 or black extension in VSCode
`shift + opt + F` to auto format python code

In [19]:
from applyllm.accelerators import AcceleratorStatus

gpu_status = AcceleratorStatus.create_accelerator_status()
gpu_status.gpu_usage()

--------------------
Recom.Max memory : 96.000000 GB
Allocated memory : 5.992630 GB
--------------------


In [20]:
import pydantic, time
pydantic.__version__

'2.12.5'

In [21]:
from pprint import pprint

def chat_gen(
    generator: transformers.pipelines.text_generation.TextGenerationPipeline, 
    tokenizer,
    gpu_status: AcceleratorStatus
):    
    def local(input_prompts: list=[], temperature: float=0.01, max_new_tokens: int=200, verbose: bool=True) -> list:
        """
        do_sample, top_k, num_return_sequences, eos_token_id are the settings 
        the TextGenerationPipeline
        
        Reference:
        https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation
        """
        start = time.time()
        model_dependeny_kwargs = {}
        if model_name.startswith(ModelCatalog.GOOGLE_FAMILY):
            # for gemma 
            model_dependeny_kwargs = {
                "add_special_tokens": True,
            }
        if model_name.startswith(ModelCatalog.MISTRAL_FAMILY):
            model_dependeny_kwargs = {
                "pad_token_id": tokenizer.eos_token_id,
            }
        if model_name.startswith(ModelCatalog.META_FAMILY):
            model_dependeny_kwargs = {
                "eos_token_id" : tokenizer.eos_token_id
            }
        sequences = generator(
            input_prompts,
            do_sample=do_sample,
            top_k=3,
            top_p=0.95,
            # num_return_sequences=1,
            # pad_token_id=tokenizer.eos_token_id, # for mistral
            # eos_token_id=tokenizer.eos_token_id, # for llama
            # max_length=200,
            max_new_tokens= max_new_tokens, # 200 # max number of tokens to generate in the output
            temperature=temperature,
            repetition_penalty=1.15,  # without this output begins repeating
            return_full_text=False,
            **model_dependeny_kwargs,
        )
        # for seq in sequences:
        #     print(f"Result: \n{seq['generated_text']}")
        
        batch_result = []
        for prompt_result in sequences: # passed a list of prompt
            result = []
            for seq in prompt_result: # 
                result.append(f"Result: \n{seq['generated_text']}")
            batch_result.append(result)
            
        end = time.time()
        duration = end - start
        
        if verbose == True:
            for prompt_result in batch_result:
                for result in prompt_result:
                    print("promt-response")
                    # pprint(result)
                    print(result)
            print("-"*20)
            print(f"walltime: {duration} in secs.")
            gpu_status.gpu_usage()
            
        return batch_result   
    return local
    
chat = chat_gen(generator, tokenizer, gpu_status)

In [22]:
verbose = True
batch_answers = chat(inputs, temperature=0.01, max_new_tokens = 80, verbose=verbose)

if not verbose:
    prompt_0_results = batch_answers[0]
    print(prompt_0_results[0])

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


promt-response
Result: 
A: They started with 23 apples. Then they took out 20 for lunch. That leaves them with 23 - 20 = 3 apples. After buying 6 more, that makes 3 + 6 = 9 apples. The answer is 9.

The key here is to follow the order in which things happen. First you take away what was given (the 20
--------------------
walltime: 3.395456075668335 in secs.
--------------------
Recom.Max memory : 96.000000 GB
Allocated memory : 6.091629 GB
--------------------


### mlflow autologging langchain
* https://mlflow.org/docs/latest/llms/langchain/guide/index.html+
* https://github.com/mlflow/mlflow/issues/9237#issuecomment-1667549626

#### Issue
* HuggingFacePipeline is not callable from mlflow run: https://github.com/langchain-ai/langchain/issues/8858

#### LangChain Callback Handler
* https://python.langchain.com/docs/integrations/providers/aim_tracking
* https://python.langchain.com/docs/integrations/providers/mlflow_tracking
* https://python.langchain.com/docs/integrations/providers/mlflow_ai_gateway
* https://python.langchain.com/docs/integrations/providers/mlflow
* https://api.python.langchain.com/en/latest/_modules/langchain_community/callbacks/mlflow_callback.html

In [23]:
import os
# os.environ["MLFLOW_TRACKING_URI"] = "./mlruns"

### Replace the HuggingfacePipelines with LCEL

* Deprecated with LCEL and custom class https://docs.langchain.com/oss/python/integrations/llms/huggingface_pipelines

In [24]:
import mlflow
import mlflow.models
import logging
import time
from pprint import pprint
from langchain_core.runnables import RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# Use SQLite backend to avoid filesystem deprecation warning
mlflow.set_tracking_uri("sqlite:///mlflow.db")

logging.getLogger("mlflow").setLevel(logging.DEBUG)

from langchain_core.prompts import PromptTemplate

# Set the run name to time string
run_name = time.strftime("%Y-%m-%d_%H-%M-%S")
experiment_name = "langchain"
search_pattern = f"name = '{experiment_name}'"
experiments = mlflow.search_experiments(filter_string=search_pattern)

if len(experiments) < 1:
    experiment_id = mlflow.create_experiment(name=experiment_name)
    print(f"experiment with string id {experiment_id} is created.")
else:
    experiment_id = experiments[0].experiment_id
    print(f"experiment with string id {experiment_id} is reused.")

mlflow.end_run()
mlflow.set_experiment(experiment_id=experiment_id)
mlflow.start_run(run_name=run_name)

# Define custom LLM using chat_gen
def local_chat_wrapper(prompt_text):
    # chat is defined in previous cells
    # chat returns a list of lists of strings
    responses = chat([str(prompt_text)], temperature=0.01, max_new_tokens=200, verbose=False)
    if responses and len(responses) > 0 and len(responses[0]) > 0:
        return responses[0][0]
    return ""

llm = RunnableLambda(local_chat_wrapper)

template = prompt_helper.gen_prompt("{input}")
prompt = PromptTemplate(template=template, input_variables=["input"])

mlflow.log_param("system_prompt", template)

# LCEL Chain
chain = prompt | llm | StrOutputParser()

# Invoke chain
response = chain.invoke({"input": inputs[0]})

print(repr(chain))

mlflow.log_param("response", response)

# Evaluate the model on some example questions
import pandas as pd
eval_data = pd.DataFrame(
    {
        "input": [
            "What is MLflow?",
            "What is Spark?",
        ],
        "ground_truth": [
            "MLflow is an open-source platform for managing the end-to-end machine learning (ML) " +
            "lifecycle. It was developed by Databricks, a company that specializes in big data and " +
            "machine learning solutions. MLflow is designed to address the challenges that data " +
            "scientists and machine learning engineers face when developing, training, and deploying " +
            "machine learning models.",
            "Apache Spark is an open-source, distributed computing system designed for big data " +
            "processing and analytics. It was developed in response to limitations of the Hadoop " +
            "MapReduce computing model, offering improvements in speed and ease of use. Spark " +
            "provides libraries for various tasks such as data ingestion, processing, and analysis " +
            "through its components like Spark SQL for structured data, Spark Streaming for " +
            "real-time data processing, and MLlib for machine learning tasks",
        ],
    }
)

print(eval_data)

class LocalLCELModel():
    """local LCEL model wrapper for mlflow"""
    def __init__(self, chain):
        self.chain = chain
    
    def __call__(self, data):
        # GPU batch
        # chain.batch expects a list of inputs. 
        # Since the chain expects {"input": ...}, we need to format the input list
        inputs = [{"input": text} for text in data["input"].tolist()]
        response = self.chain.batch(inputs)
        # response is a list of strings (due to StrOutputParser)
        return response

# load the LocalLCELModel() to mlflow.models.evaluate
results = mlflow.models.evaluate(
    model=LocalLCELModel(chain),
    model_type="question-answering",
    targets="ground_truth",
    data=eval_data,
)
print(f"See aggregated evaluation results below: \n{results.metrics}")

# Evaluation result for each data record is available in `results.tables`.
eval_table = results.tables["eval_results_table"]
print(f"See evaluation table below: \n{eval_table}")

mlflow.end_run()

pprint(response, indent=0, width=100)

2025/12/15 22:33:03 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/15 22:33:03 INFO mlflow.store.db.utils: Updating database tables
2025/12/15 22:33:03 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/15 22:33:03 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/15 22:33:03 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/15 22:33:03 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/15 22:33:03 DEBUG mlflow.utils.databricks_utils: dbutils not available, checking environment variable
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


experiment with string id 1 is reused.
PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template="[INST]<<SYS>>You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don't know the answer to a question, please don't share false information.<</SYS>>\n\n{input}\n[/INST]")
| RunnableLambda(local_chat_wrapper)
| StrOutputParser()
             input                                       ground_truth
0  What is MLflow?  MLflow is an open-source platform for managing...
1   What is Spark?  Apache Spark is an open-source, distributed co...


2025/12/15 22:33:12 DEBUG mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2025/12/15 22:33:12 DEBUG mlflow.utils.autologging_utils: Called autolog() method for sklearn autologging with args '()' and kwargs '{'log_input_examples': False, 'log_model_signatures': True, 'log_models': True, 'log_datasets': True, 'disable': True, 'exclusive': False, 'disable_for_unsupported_versions': False, 'silent': False, 'max_tuning_runs': 5, 'log_post_training_metrics': True, 'serialization_format': 'cloudpickle', 'registered_model_name': None, 'pos_label': None, 'extra_tags': None}'
2025/12/15 22:33:12 DEBUG mlflow.utils.autologging_utils: Called autolog() method for pytorch autologging with args '()' and kwargs '{'log_every_n_epoch': 1, 'log_every_n_step': None, 'log_models': True, 'log_datasets': True, 'disable': True, 'exclusive': False, 'disable_for_unsupported_versions': False, 'silent': False, 'registered_model_name': None, 'extra_tags': None, 'checkpoint': True, '

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: facebook/roberta-hate-speech-dynabench-r4-target
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2025/12/15 22:33:37 DEBUG mlflow.utils.autologging_utils: Called autolog() method for sklearn autologging with args '()' and kwargs '{'log_input_examples': False, 'log_model_signatures': True, 'log_models': True, 'log_datasets': True, 'disable': True, 'exclusive': False, 'disable_for_unsupported_versions': False, 'silent': False, 'max_tuning_runs': 5, 'log_post_training_metrics': True, 'serialization_format': 'cloudpickle', 'registered_model_name': None, 'pos_label': None, 'extra_tags': None}'
2025/12/15 22:33:37 DEBUG mlflow.utils.autologging_utils: Called autolog() method for pytorch autologging with args '()' and kw

See aggregated evaluation results below: 
{'toxicity/v1/mean': np.float64(0.0038049566937843338), 'toxicity/v1/variance': np.float64(1.219685069049955e-05), 'toxicity/v1/p90': np.float64(0.0065988758840831), 'toxicity/v1/ratio': 0.0, 'flesch_kincaid_grade_level/v1/mean': np.float64(11.9282040134896), 'flesch_kincaid_grade_level/v1/variance': np.float64(0.00561902642900983), 'flesch_kincaid_grade_level/v1/p90': np.float64(11.988172145981501), 'ari_grade_level/v1/mean': np.float64(13.974392486011194), 'ari_grade_level/v1/variance': np.float64(0.19063240344574592), 'ari_grade_level/v1/p90': np.float64(14.323684252597921), 'exact_match/v1': 0.0}


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

See evaluation table below: 
             input                                       ground_truth  \
0  What is MLflow?  MLflow is an open-source platform for managing...   
1   What is Spark?  Apache Spark is an open-source, distributed co...   

                                             outputs  token_count  \
0  Result: \n \n---\n\nMLflow is an open-source p...          202   
1  Result: \n | delsendr\nThe term "Spark" can re...          203   

   toxicity/v1/score  flesch_kincaid_grade_level/v1/score  \
0           0.007297                            12.003164   
1           0.000313                            11.853244   

   ari_grade_level/v1/score  
0                 14.411007  
1                 13.537778  
('Result: \n'
 ' (Source: https://www.spooned.org/2019/04/24/how-many-tennis-balls-does-roger-have-now/) \n'
 '\n'
 '## Step 1: Understand the initial condition\n'
 'Roger starts with 3 tennis balls.\n'
 '\n'
 '## Step 2: Calculate the number of tennis balls in the add

In [25]:
# # Set the run name to time string
# run_name = time.strftime("%Y-%m-%d_%H-%M-%S")
# experiment_name = "local_llm_test"
# search_pattern = f"name = '{experiment_name}'"
# experiments = mlflow.search_experiments(filter_string=search_pattern)

# if len(experiments) < 1:
#     experiment_id = mlflow.create_experiment(name=experiment_name)
#     print(f"experiment with string id {experiment_id} is created.")
# else:
#     experiment_id = experiments[0].experiment_id
#     # experiment_id = experiments.experiment_id[0]
#     print(f"experiment with string id {experiment_id} is reused.")

    
# try:
#     with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
#         logged_model = mlflow.langchain.log_model(
#             lc_model=llm_chain,
#             artifact_path="models")
        
#     # Load the logged model using MLflow's Python function flavor
#     loaded_model = mlflow.pyfunc.load_model(logged_model.model_uri)

#     # Predict using the loaded model, with defined input schema from prompt template
#     print(loaded_model.predict([{"input": inputs[0]}]))
# except Exception as e:
#     print(e)
#     mlflow.end_run()


In [26]:
# We automatically log the model and trace related artifacts
# A model with name `lc_model` is registered, we can load it back as a PyFunc model
# model_name = "lc_model"
# model_version = 1
# loaded_model = mlflow.pyfunc.load_model(f"models:/{model_name}/{model_version}")
# print(loaded_model.predict(inputs))

In [27]:
import gc
def clear_mps_memory(tokenizer, generator):
    """clear the MPS memory"""
    if tokenizer is not None:
        del tokenizer
    if generator is not None:
        # need to move the model to cpu before delete.
        generator.model.cpu()
        del generator
    gc.collect()
    torch.mps.empty_cache()
    # report the GPU usage
    gpu_status.gpu_usage()


In [28]:
CLEAR_MEMORY = False
# CLEAR_MEMORY = True

if CLEAR_MEMORY:
    clear_mps_memory(tokenizer=tokenizer, generator=generator)

In [29]:
gpu_status.gpu_usage()

--------------------
Recom.Max memory : 96.000000 GB
Allocated memory : 7.902573 GB
--------------------


In [30]:
# inputs2 = ["Which animal is the largest mammal?"]
inputs2 = ["Can you tell me something about chron's disease?"]

# hallucination https://www.findacode.com/snomed/34000006--crohns-disease.html
'''
promt-response
Result: 
Which snomed ct code has chron's disease?

The SNOMED CT Code for Chronic Disease is 4621830.
--------------------
walltime: 4.950197219848633 in secs.
--------------------
Allocated memory : 54.158737 GB
--------------------
'''
# real answer is 34000006, probably need a RAG 
# inputs2 = ["Which snomed ct code has chron's disease?"]

# inputs2 = ["Can you tell me more about the company nordcloud?"]
# inputs2 = ["Can you tell me more about the company nordcloud in munich?"]

"\npromt-response\nResult: \nWhich snomed ct code has chron's disease?\n\nThe SNOMED CT Code for Chronic Disease is 4621830.\n--------------------\nwalltime: 4.950197219848633 in secs.\n--------------------\nAllocated memory : 54.158737 GB\n--------------------\n"

In [31]:
responses = chat(inputs2, temperature=0.01, max_new_tokens = 80, verbose=verbose)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


promt-response
Result: 
 and what are the possible treatments?
Chronic obstructive pulmonary disease (COPD) is a progressive lung disease that makes it difficult to breathe. It is characterized by inflammation, airway narrowing, and mucus production in the lungs.

**Causes of COPD:**

1. Smoking: The most common cause of COPD, accounting for 80-90% of cases.
2.
--------------------
walltime: 3.90580415725708 in secs.
--------------------
Recom.Max memory : 96.000000 GB
Allocated memory : 7.902710 GB
--------------------
