In [1]:
install_requires = [
    'composer[nlp,wandb]>=0.14.1,<0.15',
    'mosaicml-streaming>=0.4.1,<0.5',
    'torch==1.13.1',
    'datasets==2.10.1',
    'sentencepiece==0.1.97',
    'einops==0.5.0',
    'omegaconf>=2.2.3,<3',
    'slack-sdk<4',
    'mosaicml-cli>=0.3,<1',
    'onnx==1.13.1',
    'onnxruntime==1.14.1',
    'langchain==0.0.169',
    'bitsandbytes==0.35.0',
    'accelerate==0.18.0',
    'peft==0.2.0',
    'sentence-transformers==2.2.2'
]

pkgs = " ".join([f"'{x}'" for x in install_requires])
!pip uninstall -y torchaudio
!pip install {pkgs}

Found existing installation: torchaudio 2.0.1
Uninstalling torchaudio-2.0.1:
  Successfully uninstalled torchaudio-2.0.1
[0mCollecting composer[nlp,wandb]<0.15,>=0.14.1
  Downloading composer-0.14.1-py3-none-any.whl (565 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.9/565.9 kB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mosaicml-streaming<0.5,>=0.4.1
  Downloading mosaicml_streaming-0.4.1-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.2/148.2 kB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==1.13.1
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets==2.10.1
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m8

In [2]:
import os
from pathlib import Path

# Directory used as the Hugging Face transformers download cache.
cache_dir = '/workspace/.cache'

# parents=True creates any missing parent directory as well, so the cell also
# works when /workspace does not exist yet; exist_ok=True keeps re-runs idempotent.
Path(cache_dir).mkdir(parents=True, exist_ok=True)

# Must be set before `transformers` is imported (done in a later cell).
os.environ['TRANSFORMERS_CACHE'] = cache_dir

Downloading (…)okenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [3]:
from typing import Any, Dict, Tuple
import warnings

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import torch

# Identifier of the checkpoint handed to `from_pretrained` in model_fn.
model_name = 'mosaicml/mpt-7b-instruct'

# Section markers used to lay out the prompt. END_KEY is defined here but is
# not referenced by the code visible in this notebook.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"

# Preamble placed at the very top of every prompt.
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request based on the Input if any input is provided."

# Line carrying the optional context; `{context}` is filled in per request.
INPUT_CONTEXT = "### Input: {context}"

# Prompt template with two remaining placeholders, `{context}` and
# `{instruction}`. Layout: one section per line, ending with a newline after
# the response marker.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INPUT_CONTEXT,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
    ]
) + "\n"


class InstructionTextGenerationPipeline:
    """Minimal instruction-following generation wrapper.

    Wraps an instruction (and optional context) in
    ``PROMPT_FOR_GENERATION_FORMAT``, runs ``model.generate`` on the tokenized
    prompt and returns only the text generated after the prompt.
    """

    def __init__(
        self,
        model,
        tokenizer
    ) -> None:
        """Store the model and tokenizer used for every call.

        Args:
            model: object exposing ``device`` and ``generate(input_ids, **kwargs)``.
            tokenizer: callable returning an object with ``input_ids`` and
                exposing ``decode``.
        """
        self.model = model
        self.tokenizer = tokenizer

    def format_instruction(self, instruction, context=""):
        """Return the full prompt text for ``instruction`` and ``context``."""
        return PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction, context=context)

    def __call__(
        self, instruction: str, context: str = "", **generate_kwargs: Any
    ) -> str:
        """Generate a response for ``instruction``.

        Args:
            instruction: the task text placed under the instruction marker.
            context: optional supporting text placed on the input line;
                defaults to an empty string, matching ``format_instruction``.
            **generate_kwargs: forwarded unchanged to ``model.generate``.

        Returns:
            The decoded text of the newly generated tokens only (the prompt
            tokens are sliced off), with special tokens skipped.
        """
        # Single source of truth for prompt construction.
        prompt = self.format_instruction(instruction, context)
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        input_ids = input_ids.to(self.model.device)
        with torch.no_grad():
            output_ids = self.model.generate(input_ids, **generate_kwargs)
        # generate() returns prompt + continuation for the first (only)
        # sequence; keep just the continuation.
        prompt_length = input_ids.shape[-1]
        new_tokens = output_ids[0, prompt_length:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)
    

def model_fn():
    """Load the model and tokenizer identified by ``model_name``.

    The model is loaded in bfloat16, switched to eval mode and moved to CUDA
    when available (CPU otherwise). The tokenizer is configured for left
    padding and falls back to the EOS token when it has no pad token.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NOTE(review): trust_remote_code=True executes code downloaded with the
    # checkpoint; the recorded output recommends pinning a `revision` - consider
    # passing one once a vetted commit is chosen.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    # `device_map` is a model-loading option and has no meaning for a
    # tokenizer, so it is no longer passed here.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device=device, dtype=torch.bfloat16)
    return model, tokenizer

def predict_fn(data, model_and_tokenizer):
    """Run one generation request.

    Args:
        data: mapping with an ``"inputs"`` entry (the instruction), an
            optional ``"context"`` entry, and any further entries, which are
            forwarded as keyword arguments to the generation call. The
            mapping is not modified.
        model_and_tokenizer: ``(model, tokenizer)`` pair, as returned by
            ``model_fn``.

    Returns:
        Whatever the ``InstructionTextGenerationPipeline`` call returns.

    Raises:
        ValueError: if ``data`` has no ``"inputs"`` entry.
    """
    model, tokenizer = model_and_tokenizer
    # Work on a shallow copy so the caller's payload can be reused.
    params = dict(data)
    if "inputs" not in params:
        # Previously the whole payload dict was silently used as the prompt.
        raise ValueError("data must contain an 'inputs' entry with the instruction text")
    prompt = params.pop("inputs")
    # A missing context means "no context", not "the rest of the payload".
    context = params.pop("context", "")
    generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
    return generate_text(prompt, context, **params)


'Below is an instruction that describes a task. Write a response that appropriately completes the request based on the Input if any input is provided.\n### Input: {context}\n### Instruction:\n{instruction}\n### Response:\n'

In [6]:
# Load the model and tokenizer once; the (model, tokenizer) pair returned by
# model_fn is reused by every predict_fn call in the cells below.
model_and_tokenizer = model_fn()

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading (…)configuration_mpt.py:   0%|          | 0.00/9.08k [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading (…)main/modeling_mpt.py:   0%|          | 0.00/17.4k [00:00<?, ?B/s]

Downloading (…)ve/main/attention.py:   0%|          | 0.00/16.8k [00:00<?, ?B/s]

Downloading (…)resolve/main/norm.py:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

Downloading (…)flash_attn_triton.py:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

Downloading (…)n/adapt_tokenizer.py:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Downloading (…)meta_init_context.py:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

Downloading (…)solve/main/blocks.py:   0%|          | 0.00/2.49k [00:00<?, ?B/s]

Downloading (…)refixlm_converter.py:   0%|          | 0.00/27.2k [00:00<?, ?B/s]

Downloading (…)in/param_init_fns.py:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/91.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [24]:

def get_input_data(inputs, context="", temperature=0.1, max_length=512, do_sample=False):
    """Build the request payload consumed by ``predict_fn``.

    Args:
        inputs: the instruction text.
        context: optional supporting text; empty by default.
        temperature: value stored under ``"temperature"``.
        max_length: value stored under ``"max_length"``. Previously this
            argument was ignored and 512 was always used.
        do_sample: value stored under ``"do_sample"``; defaults to ``False``,
            the value that used to be hard-coded.
            NOTE(review): with sampling disabled, ``temperature`` presumably
            has no effect on generation - confirm against the generate API.

    Returns:
        A new dict with the keys ``"inputs"``, ``"context"``,
        ``"temperature"``, ``"max_length"`` and ``"do_sample"``.
    """
    return {
        "inputs": inputs,
        "context": context,
        "temperature": temperature,
        "max_length": max_length,
        "do_sample": do_sample,
    }




In [None]:
# Example 1: creative instruction with a short piece of context.
prompt = "Write a short story about a robot that has a nice day."
context = "Name of the robot is Jane"

# Build the payload first, then run it through the loaded model.
# NOTE(review): get_input_data sets do_sample to False, so temperature=0.5
# presumably does not change the output - confirm.
request = get_input_data(inputs=prompt, context=context, temperature=0.5)
response = predict_fn(request, model_and_tokenizer)
response

In [None]:
# Example 2: question answered with the supporting passage supplied as context.
prompt = "What is Driver protection cover? Provide a one line short answer."
context = "Driver Protection Cover is extra cover We automatically provide with Your NRMA Compulsory Third Party (CTP) Insurance on most passenger Vehicles and some goods Vehicles. See Table 1. for the Vehicles that We cover and those We do not cover.Driver Protection Cover provides the driver of a covered Vehicle who suffers one or more of the injuries listed in the specified injuries table with a set payment if the driver was at-fault in the accident in which they were injured"

# Default temperature and max_length from get_input_data are used here.
request = get_input_data(inputs=prompt, context=context)
response = predict_fn(request, model_and_tokenizer)
response

In [16]:
# Example 3: the same question with an empty context, so the model has to
# answer without the supporting passage.
prompt = "what is Driver protection cover? Provide a one line short answer."
context = ""

request = get_input_data(inputs=prompt, context=context)
response = predict_fn(request, model_and_tokenizer)
response

'Driver protection cover is a type of insurance that covers the driver of a vehicle for any legal liability arising from an accident.#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n'