In [1]:
import os
from dotenv import load_dotenv 
from datasets import load_dataset

# Load HF Dataset
instruct_tune_dataset = load_dataset("mosaicml/instruct-v3")

In [2]:
instruct_tune_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 56167
    })
    test: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 6807
    })
})

In [3]:
# filter away all the subset datasets and only used the dolly_hhrlhf component
instruct_tune_dataset = instruct_tune_dataset.filter(lambda x: x["source"] == "dolly_hhrlhf")

In [4]:
sample = instruct_tune_dataset["train"][0]
sample

{'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction\nWhat are different types of grass?\n\n### Response\n',
 'response': 'There are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.',
 'source': 'dolly_hhrlhf'}

In [5]:
# Create Formatted Prompt
# Merging our prompt and response columns by creating the following template:
# <s>[INST] Use the provided input to create an instruction that could have been used to generate the response with an LLM.
# {input} [/INST] {response}</s>
# https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
# <s> [INST] Instruction [/INST] Model answer</s> [INST] Follow-up instruction [/INST]

def create_prompt(sample):
  bos_token = "<s>"
  original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
  system_message = "[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM."
  input = sample["prompt"].replace(original_system_message, "").replace("\n\n### Instruction\n", "").replace("\n### Response\n", "").strip()
  response = sample["response"]   ## TODO: why input and response are reversed here???
  eos_token = "</s>"

  full_prompt = ""
  full_prompt += bos_token
  full_prompt += system_message
  full_prompt += "\n" + input
  full_prompt += "[/INST]"
  full_prompt += response
  full_prompt += eos_token

  return full_prompt

In [6]:
create_prompt(sample)

'<s>[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM.\nWhat are different types of grass?[/INST]There are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.</s>'

In [7]:
# Loading the Base Model
# Load the model in 4bit, with double quantization, with bfloat16 as the compute dtype.
# Use the instruct-tuned model - instead of the base model. 
# https://mistral.ai/news/mixtral-of-experts/ 

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# nf4_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_use_double_quant=True,
#    bnb_4bit_compute_dtype=torch.bfloat16
# )  # only work for CUDA 

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='mps',
    #quantization_config=nf4_config,
    load_in_8bit=False,
    use_cache=False,
    #attn_implementation="flash_attention_2"
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 18.01 GB, other allocations: 384.00 KB, max allowed: 18.13 GB). Tried to allocate 224.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).