## Test MPT model

- MPT official blog : https://www.mosaicml.com/blog/mpt-7b
- Instruct MPT : https://huggingface.co/mosaicml/mpt-7b-instruct
- Chat MPT (Non-commercial) : https://huggingface.co/mosaicml/mpt-7b-chat

In [None]:
!pip install -q transformers accelerate sentencepiece bitsandbytes einops

In [None]:
import sagemaker
import transformers
import torch
# Report the installed sagemaker and transformers versions, one per line.
print(sagemaker.__version__, transformers.__version__, sep="\n")

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path
import os

# Local directory handed to snapshot_download as its cache location.
local_model_path = Path("./pretrained-models")
local_model_path.mkdir(exist_ok=True)

# Hub repository ids of the two MPT-7B variants used in this notebook.
instruct_model_name = "mosaicml/mpt-7b-instruct"
chat_model_name = "mosaicml/mpt-7b-chat"

# Glob patterns passed as allow_patterns to restrict which repo files are fetched.
allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model", "*.py"]

# Keyword arguments shared by both downloads.
download_options = {
    "cache_dir": local_model_path,
    "allow_patterns": allow_patterns,
}

# Each call returns the local path of the downloaded snapshot.
instruct_model_path = snapshot_download(repo_id=instruct_model_name, **download_options)
chat_model_path = snapshot_download(repo_id=chat_model_name, **download_options)

In [None]:
# Show where each model snapshot was stored, one path per line.
print(
    f"Instruct model path: {instruct_model_path}",
    f"Chat model path: {chat_model_path}",
    sep="\n",
)

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList

# Choose which downloaded snapshot to load; uncomment the first line (and
# comment out the second) to switch to the chat model.
# model_path = chat_model_path
model_path = instruct_model_path

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Alternative load call without low_cpu_mem_usage:
# model = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16,
# )

# NOTE: int8 quantization does not work at the moment.
# The instruct model works well, but the chat model hits a CUDA OOM error
# unless torch_dtype=torch.bfloat16 is specified.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# Move the weights to the first GPU (as the cell's last expression, the
# returned model is also displayed by the notebook).
model.to(device="cuda:0")

In [None]:
# Instruct prompt example
# The template is: intro blurb, instruction header, the instruction itself,
# then the response header -- each on its own line.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# Only "{instruction}" is left as a placeholder; it is filled in per query below.
PROMPT_FOR_GENERATION_FORMAT = (
    f"{INTRO_BLURB}\n"
    f"{INSTRUCTION_KEY}\n"
    "{instruction}\n"
    f"{RESPONSE_KEY}\n"
)

# Alternative query:
# query = "I can't find my car key today. I visited home, bus, train stop and company today. How can I find my key? Explain it step by step."
query = "I got a cold 10 days ago, but still it has no progress. How can I get better? Explain it step by step."

prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=query)
print(prompt)

In [None]:
# Chat prompt example
# NOTE: this rebinds `prompt`, replacing the instruct-formatted prompt built in
# the instruct prompt cell. Run only the prompt cell you want to send to the model.
prompt = "I can't find my car key today. I visited home, bus, train stop and company today. How can I find my key?"
print(prompt)

In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the device the model
# lives on, so inputs and weights stay co-located regardless of which GPU is
# the current CUDA device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [None]:
%%time
# Sample a continuation of the prompt: at most 256 new tokens, temperature 0.5.
tokens = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.5,
    max_new_tokens=256,
)

# Decode the first returned sequence to text, skipping special tokens.
output = tokenizer.decode(tokens[0], skip_special_tokens=True)


In [None]:
# Display the decoded text produced by the generation cell.
print(output)