In [1]:
# Runtime dependencies for this notebook. Every project is listed exactly once:
# the earlier list additionally carried 'accelerate>=0.16.0,<1' and a bare
# 'einops', both already satisfied by the exact pins below, so dropping them
# leaves the resolved versions unchanged while removing competing specifiers.
install_requires = [
    'composer[nlp,wandb]>=0.14.1,<0.15',
    'mosaicml-streaming>=0.4.1,<0.5',
    'torch==1.13.1',
    'datasets==2.10.1',
    'sentencepiece==0.1.97',
    'einops==0.5.0',
    'omegaconf>=2.2.3,<3',
    'slack-sdk<4',
    'mosaicml-cli>=0.3,<1',
    'onnx==1.13.1',
    'onnxruntime==1.14.1',
    'langchain==0.0.169',
    'bitsandbytes==0.35.0',
    'accelerate==0.18.0',
    'peft==0.2.0',
    'sentence-transformers==2.2.2',
]

pkgs = " ".join([f"'{x}'" for x in install_requires])
!pip uninstall -y torchaudio
!pip install {pkgs}

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import os
from pathlib import Path

# Create the cache directory, including any missing parent directories, so the
# cell also succeeds on a machine where /workspace does not exist yet.
Path('/workspace/.cache').mkdir(parents=True, exist_ok=True)

# Point the transformers download cache at that directory. This is set here,
# ahead of the later cell that imports transformers.
os.environ['TRANSFORMERS_CACHE'] = '/workspace/.cache'

Downloading (…)okenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [3]:
from typing import Any, Dict, Tuple
import warnings

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import torch

# Hugging Face Hub identifier of the checkpoint loaded by model_fn().
model_name = 'mosaicml/mpt-7b-instruct'

# Section markers used when assembling the instruction prompt.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request "
    "based on the Input if any input is provided."
)
INPUT_CONTEXT = "### Input: {context}"

# Final prompt template. It still contains the `{context}` and `{instruction}`
# placeholders, which are filled in per request via str.format(). Each section
# sits on its own line and the template ends with a newline after the response
# key.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INPUT_CONTEXT,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "",
    ]
)


class InstructionTextGenerationPipeline:
    """Minimal instruction-following wrapper around a causal LM and its tokenizer.

    The instruction (and optional context) is inserted into
    PROMPT_FOR_GENERATION_FORMAT, tokenized, passed to ``model.generate`` and
    only the newly generated tokens are decoded and returned.
    """

    def __init__(
        self,
        model,
        tokenizer
    ) -> None:
        """Store the model and tokenizer used for every call.

        Args:
            model: Object exposing ``device`` and ``generate(input_ids, **kwargs)``.
            tokenizer: Callable returning an encoding with ``input_ids`` and
                exposing ``decode(ids, skip_special_tokens=...)``.
        """
        self.model = model
        self.tokenizer = tokenizer

    def format_instruction(self, instruction, context=""):
        """Return the full prompt text for ``instruction`` and ``context``."""
        return PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction, context=context)

    def __call__(
        self, instruction: str, context: str = "", **generate_kwargs: Any
    ) -> str:
        """Generate a response for ``instruction``.

        Args:
            instruction: The task description inserted into the prompt.
            context: Optional supporting text for the "### Input:" section;
                defaults to an empty string, matching ``format_instruction``.
            **generate_kwargs: Forwarded unchanged to ``model.generate``.

        Returns:
            The decoded text of the newly generated tokens only (the prompt is
            stripped), with special tokens removed.
        """
        # Reuse the single prompt-building helper instead of duplicating it.
        prompt_text = self.format_instruction(instruction, context)
        encoded = self.tokenizer(prompt_text, return_tensors="pt")
        input_ids = encoded.input_ids.to(self.model.device)

        # Forward the tokenizer's attention mask when it provides one, unless
        # the caller supplied their own. Without it generate() has to infer
        # the mask, which is ambiguous when the pad token equals the EOS token.
        attention_mask = getattr(encoded, "attention_mask", None)
        if attention_mask is not None:
            generate_kwargs.setdefault(
                "attention_mask", attention_mask.to(self.model.device)
            )

        with torch.no_grad():
            output_ids = self.model.generate(input_ids, **generate_kwargs)

        # generate() returns prompt + continuation; keep only the continuation
        # of the first (and only) sequence.
        new_tokens = output_ids[0, len(input_ids[0]):]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)
    

def model_fn():
    """Load the causal LM and tokenizer named by ``model_name``.

    The model is loaded in bfloat16, switched to eval mode and moved to CUDA
    when available (CPU otherwise). The tokenizer is configured for left
    padding and falls back to the EOS token when no pad token is defined.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NOTE(review): trust_remote_code=True runs Python shipped in the model
    # repository. The loader itself warns that passing an explicit `revision`
    # is encouraged; consider pinning one for both calls below.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        # Forwarded to from_pretrained as an extra keyword; presumably
        # overrides the model config's maximum sequence length - confirm.
        max_seq_len=2048,
    )
    # `device_map` is a model-placement option, not a tokenizer option, so it
    # is no longer passed here; the tokenizer holds no device state.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device=device, dtype=torch.bfloat16)
    return model, tokenizer

def predict_fn(data, model_and_tokenizer):
    """Run one instruction request through the model.

    Args:
        data: Mapping with a required ``"inputs"`` entry (the instruction), an
            optional ``"context"`` entry (defaults to ``""``) and any further
            entries, which are forwarded as keyword arguments to generation.
            The mapping is not modified.
        model_and_tokenizer: ``(model, tokenizer)`` pair as returned by
            ``model_fn``.

    Returns:
        Whatever ``InstructionTextGenerationPipeline.__call__`` returns for the
        request.

    Raises:
        ValueError: If ``data`` has no ``"inputs"`` entry.
    """
    model, tokenizer = model_and_tokenizer

    # Work on a shallow copy so pop() does not strip keys from the caller's dict.
    params = dict(data)
    if "inputs" not in params:
        # Previously the whole payload dict silently became the prompt.
        raise ValueError("predict_fn requires an 'inputs' entry containing the instruction text")
    prompt = params.pop("inputs")
    # A missing context means "no context", not "the entire payload".
    context = params.pop("context", "")

    generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
    return generate_text(prompt, context, **params)


'Below is an instruction that describes a task. Write a response that appropriately completes the request based on the Input if any input is provided.\n### Input: {context}\n### Instruction:\n{instruction}\n### Response:\n'

In [4]:
# Load the model and tokenizer once; later cells reuse this (model, tokenizer) pair.
model_and_tokenizer = model_fn()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:

def get_input_data(inputs, context="", temperature=0.1, max_length=512, do_sample=False):
    """Build the request payload consumed by ``predict_fn``.

    Args:
        inputs: The instruction text.
        context: Optional supporting text for the prompt's input section.
        temperature: Sampling temperature placed in the payload. Generation
            libraries typically only use it when sampling is enabled.
        max_length: Value stored under ``"max_length"``. It is now honoured
            instead of being replaced by a hard-coded 512.
        do_sample: Whether sampling is requested; defaults to ``False`` as
            before.

    Returns:
        A new dict with the keys ``inputs``, ``context``, ``temperature``,
        ``max_length`` and ``do_sample``.
    """
    return {
        "inputs": inputs,
        "context": context,
        "temperature": temperature,
        "max_length": max_length,
        "do_sample": do_sample,
    }




In [None]:
# Demo: instruction plus a short context, sent through predict_fn.
prompt = "Write a short story about a robot that has a nice day."
context = "Name of the robot is Jane"


# NOTE(review): get_input_data leaves do_sample False, so the temperature
# passed here presumably has no effect on generation - confirm.
response = predict_fn(get_input_data(inputs=prompt, context=context, temperature=0.5), model_and_tokenizer)
response

In [None]:
# Demo: question answered with the policy excerpt supplied as context.
prompt = "What is Driver protection cover? Provide a one line short answer."
context = "Driver Protection Cover is extra cover We automatically provide with Your NRMA Compulsory Third Party (CTP) Insurance on most passenger Vehicles and some goods Vehicles. See Table 1. for the Vehicles that We cover and those We do not cover.Driver Protection Cover provides the driver of a covered Vehicle who suffers one or more of the injuries listed in the specified injuries table with a set payment if the driver was at-fault in the accident in which they were injured"


response = predict_fn(get_input_data(inputs=prompt, context=context), model_and_tokenizer)
response

In [16]:
# Demo: the same kind of question with an empty context, for comparison.
prompt = "what is Driver protection cover? Provide a one line short answer."
context = ""


response = predict_fn(get_input_data(inputs=prompt, context=context), model_and_tokenizer)
response

'Driver protection cover is a type of insurance that covers the driver of a vehicle for any legal liability arising from an accident.#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n\n#\n'

In [6]:
import torch
from transformers import StoppingCriteria, StoppingCriteriaList

# Unpack the pair loaded earlier so the pipeline cell can use both objects directly.
model, tokenizer = model_and_tokenizer

# mpt-7b is trained to add "<|endoftext|>" at the end of generations
# Token ids that should end generation; read by StopOnTokens below.
stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])

# Custom stopping criterion driven by the module-level `stop_token_ids` list.
class StopOnTokens(StoppingCriteria):
    """Signal a stop once the newest token of the first sequence is a stop token."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only the first sequence in the batch is inspected.
        newest_token = input_ids[0][-1]
        return any(newest_token == stop_id for stop_id in stop_token_ids)

# Wrap the criterion in a StoppingCriteriaList; this object is handed to the generation pipeline below.
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [25]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from torch import cuda
import transformers 

# Use the current CUDA device when one is available, otherwise fall back to CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'
print(f"Device {device}")

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=True,  # the output includes the prompt; the LangChain wrapper is given the full text
    # Generation settings are passed to the pipeline as well:
    stopping_criteria=stopping_criteria,  # end generation on the configured stop tokens
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # discourages the output from repeating itself
    # Sampling knobs. NOTE(review): do_sample is not set in this call, so these
    # presumably only matter if sampling is enabled elsewhere - confirm.
    temperature=0.1,
    top_p=0.15,
    top_k=0,
)

# Prompt template for an instruction without extra input.
prompt = PromptTemplate(
    template="{instruction}\n\n###RESPONSE:",
    input_variables=["instruction"],
)
# Prompt template for an instruction accompanied by a context passage.
# NOTE(review): these markers differ from the "### Instruction:" /
# "### Response:" keys defined earlier in the notebook - confirm this is intended.
prompt_with_context = PromptTemplate(
    template="{instruction}\n\nInput:\n{context} \n\n###RESPONSE:",
    input_variables=["instruction", "context"],
)

# Expose the pipeline to LangChain and build one chain per template.
llm = HuggingFacePipeline(pipeline=generate_text)
llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain_wc = LLMChain(llm=llm, prompt=prompt_with_context)


The model 'MPTForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCa

Device cuda:0


In [24]:
# Ask the same question through both chains: first without, then with the policy excerpt.
query = "Explain to me What is Driver protection cover based on the Input below. Provide a one line short answer."
context = "Driver Protection Cover is extra cover We automatically provide with Your NRMA Compulsory Third Party (CTP) Insurance on most passenger Vehicles and some goods Vehicles. See Table 1. for the Vehicles that We cover and those We do not cover.Driver Protection Cover provides the driver of a covered Vehicle who suffers one or more of the injuries listed in the specified injuries table with a set payment if the driver was at-fault in the accident in which they were injured"

# Instruction-only chain; lstrip() drops leading whitespace from the returned text.
print(llm_chain.predict(instruction=query).lstrip())

# Chain whose template also receives the context passage.
print(llm_chain_wc.predict(instruction=query, context=context).lstrip())


Driver Protection Cover provides you with an additional layer of financial security in case your car meets with any accident and needs repairs or replacement, which may not be covered under standard insurance policies
NRMA CTP insurance covers drivers from third party claims, but does not protect them against their own accidents


In [17]:
# General-knowledge question sent through the instruction-only chain.
query = "Explain to me the difference between nuclear fission and fusion."
print(llm_chain.predict(instruction=query).lstrip())


Nuclear Fission is a process that splits heavy atoms into smaller, lighter ones by splitting their nuclei apart using high-energy particles or neutrons (a type of subatomic particle). Nuclear Fusion occurs when two light atomic nuclei are combined together in such a way as to form one heavier nucleus with the release of energy.
