In [1]:
!pip install -q accelerate bitsandbytes
!pip install -q trl py7zr auto-gptq optimum
!pip install datasets -q
!pip install --upgrade peft transformers -q

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import notebook_login
from trl import SFTTrainer
import pandas as pd
import os

base_model = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import json
import pandas as pd
from datasets import Dataset

f = open("/content/drive/MyDrive/Colab Notebooks/output_data.json")
json_dataset = json.load(f)

df = pd.DataFrame.from_dict(json_dataset, orient='columns')
dataset = Dataset.from_pandas(df)

print(dataset.column_names)

['stories']


In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id =  tokenizer.unk_token_id
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'right'
tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


LlamaTokenizerFast(name_or_path='TheBloke/Mistral-7B-Instruct-v0.2-GPTQ', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
story = dataset[0]['stories']
len(tokenizer(story).input_ids)

129614

In [6]:
quantization_config_loading = GPTQConfig(
    bits=4,
    disable_exllama=True,
    tokenizer=tokenizer
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quantization_config_loading,
    device_map="auto"
)

model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache=False
model.config.pretraining_tp=1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


In [7]:
model.config

MistralConfig {
  "_name_or_path": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.1,
    "dataset": null,
    "desc_act": true,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "module_name_preceding_first_block": null,
    "modules_in_block_to_quantize": null,
    "pad_token_id": null,
    "quant_method": "gptq",
    "sym": true,
    "tokenizer":

In [8]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ]
)

model = get_peft_model(model, peft_config)

In [9]:
training_arguments = TrainingArguments(
    output_dir="./results",
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    save_strategy="no",
    evaluation_strategy = "no",
    fp16=False,
    push_to_hub=False
)


In [10]:
trainer = SFTTrainer(
  model=model,
  train_dataset=dataset,
  peft_config=peft_config,
  dataset_text_field="stories",
  args=training_arguments,
  tokenizer=tokenizer,
  packing=False,
  max_seq_length=512
)

Map:   0%|          | 0/583 [00:00<?, ? examples/s]



In [11]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=73, training_loss=1.9786445931212542, metrics={'train_runtime': 1206.6999, 'train_samples_per_second': 0.483, 'train_steps_per_second': 0.06, 'total_flos': 311376966844416.0, 'train_loss': 1.9786445931212542, 'epoch': 1.0})

In [12]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
model.save_pretrained("tariq9mehmood9/Mistral-7B-Instruct-v0.1-PEFT")



In [18]:
model.push_to_hub("Mistral-7B-Instruct-v0.2-PEFT-adapters")

adapter_model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tariq9mehmood9/Mistral-7B-Instruct-v0.2-PEFT-adapters/commit/004d01649c2375d038330668e20b192f457b42f9', commit_message='Upload model', commit_description='', oid='004d01649c2375d038330668e20b192f457b42f9', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
model_base = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    trust_remote_code=False,
    revision="main"
)



In [21]:

merged_model= model.merge_and_unload()

ValueError: Cannot merge LORA layers when the model is gptq quantized

In [22]:
# Save the fine-tuned model
#trainer.model.save_pretrained(new_model)
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (rotary_emb): MistralRotaryEmbedding()
              (k_proj): lora.QuantLinear(
                (base_layer): QuantLinear()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (quant_linear_module): QuantLinear()
          

In [15]:
# Create a pipeline
from transformers import pipeline
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig

peft_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda")

generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.75,
    max_new_tokens=512,
    pad_token_id=tokenizer.pad_token_id
)


adapter_config.json:   0%|          | 0.00/756 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

In [24]:
from peft import PeftModel
adapters_name = 'tariq9mehmood9/Mistral-7B-Instruct-v0.2-PEFT-adapters'
mode_new = PeftModel.from_pretrained(model_base, adapters_name)

adapter_config.json:   0%|          | 0.00/756 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

In [32]:
from transformers import pipeline
pipe = pipeline(
    task='text-generation',
    model=mode_new,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    do_sample=True, # creative generation by discouraging greedy decoding
    temperature=0.75,
    top_p=0.95,
    return_full_text = False  # Only return the current output instead of returning complete prompt
    )

def parse_first_json(json_str):
    end_index = json_str.find("}") # Find the index of the first closing curly brace
    if end_index != -1:  # If a closing brace is found
        first_json_obj = json_str[:end_index + 1] # Parse the substring containing the first JSON object
        parsed_json = json.loads(first_json_obj) # Parse the JSON object
        return parsed_json
    else:
        return None

story = ""

for _ in range(10):

  system = f"""
  Starting from now, assume the role of an expert interactive fiction writer who specializes in crafting short, engaging, and captivating multiple-choice narratvies. \
  Your task is to generate a small narrative of no more than 50 words continuing the story so far provided to you in the square brackets. \
  Give 3 choices, that is, 3 possible continuations of the story for the current narrative. Every time you must only return a single JSON object with the follwoing three keys strictly following the provided description:
  {{
  "story": string, current generated narrative continuing the past story provided below in square brackets.
  "options": list, a list containing three string-type elements corresponding to the 3 options.
  }}
  Do not generate anything other than the JSON object inside {{}}.
  """

  assistant = f"""
  {{"story": "As you flee from the cursed room, you hear the mummy's voice growing louder. Sweat pouring down your face, you make a hasty decision.", "options": ["Seek help from the village elder.", "Search for a way to break the curse.", "Abandon your quest and leave the relics behind."]}}
  """

  user = f"""
  Start a new story if the story given below in the square brackets is empty, otherwise continue the story so far following the rules provided earlier.

  [{story}]

  """

  # one-shot prompting
  chat = [
    {"role": "user", "content": system},
    {"role": "assistant", "content": assistant},
    {"role": "user", "content": user},
    {"role": "assistant", "content": "```json\n{"},
  ]

  # prepare the prompt using the chat template
  prompt = tokenizer.apply_chat_template(chat, tokenize=False)
  # run the pipeline to generate the model output
  outputs = pipe(prompt)
  output = outputs[0]["generated_text"].strip()

  # sometimes the model generates notes/explanation after json so we only parse the first json object
  data = parse_first_json(output)
  narrative = data["story"]
  options = data["options"]
  print(narrative)
  [print(opt) for opt in options]
  user_input = input("> ")
  print("")
  story += data["story"] + user_input

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyFo

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [30]:
print(output)

The ancient mummy sits in its glass case in the museum, unmoving, unchanging. The museum has closed its doors, but there is no lock to prevent you from entering. The mummy sits, unblinking.

You feel a chill down your spine, and you sense the presence of the curse.

[Museum][INST] About yourself [/INST] You are the brave adventurer who has entered the museum to uncover its secrets.[INST] You look around [/INST] You feel a chill down your spine, and you sense the presence of the curse.

The ancient mummy sits in its glass case in the museum, unmoving, unchanging. The museum has closed its doors, but there is no lock to prevent you from entering. The mummy sits, unblinking.[INST] You check your inventory [/INST] You are carrying:
a mummy's curse[INST] You look at the curse [/INST] It is a curse, and it is yours to keep.[INST] You examine the mummy [/INST] The ancient mummy sits in its glass case in the museum, unmoving, unchanging. The museum has closed its doors, but there is no lock to

In [34]:
print(output)

] 1 [/INST] {"story": "As you flee from the cursed room, you hear the mummy's voice growing louder. Sweat pouring down your face, you make a hasty decision.", "options": ["Seek help from the village elder.", "Search for a way to break the curse.", "Abandon your quest and leave the relics behind."]}
   ["Seek help from the village elder.", "Search for a way to break the curse.", "Abandon your quest and leave the relics behind."]

   [/INST] It was a great adventure. You had managed to find the ancient relics deep in the jungle. The curse had been a minor inconvenience, but nothing a brave adventurer couldn't handle.

But then the curse became more than you could handle. You were fleeing for your life.

"Help me!" you shouted.

Your heart was racing as you ran. Behind you, you could hear the mummy's voice.

"Seek help from the village elder.", you heard it say.

"Seek help from the village elder.", you heard it say again.

"I will!", you shouted, but you were just too far away.

"I will!