<a href="https://colab.research.google.com/github/wothmag07/genai-bootcamp/blob/main/finetuning_mistral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing the necessary libraries
(https://huggingface.co/docs/transformers/quantization)

In [None]:
!pip install peft accelerate trl bitsandbytes auto-gptq optim transformers py7zr



Importing libraries

In [None]:
from huggingface_hub import notebook_login
import torch, os
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GPTQConfig
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Importing Samsum dataset (Dialogue summarization)

In [None]:
ds = load_dataset("knkarthick/samsum", split="train")
ds

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
})

In [None]:
dsdf = ds.to_pandas()
dsdf.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\nJ...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\nKim: Bad mood tbh, I was ...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\nSam: i...,"Sam is confused, because he overheard Rick com..."


In [None]:
dsdf["text"] = dsdf.apply(lambda row: f"###Human: Summarize this following dialogue: {row['dialogue']}\n###Assistant: {row['summary']}", axis=1)
dsdf.iloc[0]

Unnamed: 0,0
id,13818513
dialogue,Amanda: I baked cookies. Do you want some?\nJ...
summary,Amanda baked cookies and will bring Jerry some...
text,###Human: Summarize this following dialogue: A...


Importing tokenizer, model with quantization config

In [None]:
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
quant_conf = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.bfloat16,
                                bnb_4bit_use_double_quant=True
                                )
# quant_conf = GPTQConfig(bits=4, disable_exllama=True, tokenizer=tokenizer)
# model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             quantization_config=quant_conf)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
model.config.use_cache=False
model.config.pretraining_tp=1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [None]:
tokenizer.eos_token, tokenizer.eos_token_id, tokenizer.pad_token, tokenizer.pad_token_id

('</s>', 2, None, None)

In [None]:
tokenizer.pad_token = tokenizer.eos_token

LORA configurations

In [None]:
peft_config = LoraConfig(r=16,
                         lora_alpha=32,
                         task_type="CAUSAL_LM",
                         bias="none",
                         target_modules=["q_proj", "v_proj"],
                         lora_dropout=0.05)

qmodel = get_peft_model(model, peft_config)

In [None]:
print(qmodel)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

Training pipeline

In [None]:
training_args = TrainingArguments(output_dir="mistral-finetuned-samsum",
                                  per_device_train_batch_size=8,
                                  gradient_accumulation_steps=1,
                                  optim="paged_adamw_32bit",
                                  learning_rate=2e-4,
                                  lr_scheduler_type="cosine",
                                  save_strategy="epoch",
                                  logging_steps=100,
                                  num_train_epochs=1,
                                  max_steps=250,
                                  fp16=True,
                                  push_to_hub=True,
                                  report_to="none")
trainer = SFTTrainer(model=qmodel,
                     train_dataset=Dataset.from_pandas(dsdf),
                     peft_config=peft_config,
                     args=training_args)

trainer.train()


Adding EOS to train dataset:   0%|          | 0/14732 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/14732 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/14732 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
100,1.8006
200,1.7156


TrainOutput(global_step=250, training_loss=1.7462933959960938, metrics={'train_runtime': 2710.1384, 'train_samples_per_second': 0.738, 'train_steps_per_second': 0.092, 'total_flos': 3.540327471631565e+16, 'train_loss': 1.7462933959960938})

Inference mode

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r /content/mistral-finetuned-samsum/ /content/drive/MyDrive/models/

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/models/mistral-finetuned-samsum")
model = AutoPeftModelForCausalLM.from_pretrained("/content/drive/MyDrive/models/mistral-finetuned-samsum",
                                                  low_cpu_mem_usage=True,
                                                  return_dict=True,
                                                  torch_dtype=torch.bfloat16,
                                                  device_map="cuda")





config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



In [None]:
dialogue = """
Alex: Hey, are you free this weekend?
Sarah: Yeah, what's up?
Alex: Want to go to that new restaurant downtown?
Sarah: The Italian one? I heard it's really good!
Alex: That's the one. How about Saturday around 7?
Sarah: Perfect! Should I make a reservation?
Alex: Good idea, I'll call them now.
"""

test_prompt = f"""
###Human: Summarize this following dialogue: {dialogue}
###Assistant: """

inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")
inputs


{'input_ids': tensor([[    1, 29473,   781, 28100, 29537,  7134, 29515,  7695,  4247,  1421,
          1224,  3064, 19966, 29515, 29473,   781, 26957, 29515, 17930, 29493,
          1228,  1136,  2701,  1224,  9839, 29572,   781, 29503,  1051,  1680,
         29515,  9120, 29493,  1535, 29510, 29481,  1350, 29572,   781, 26957,
         29515, 19986,  1066,  1344,  1066,  1137,  1401, 10694, 19298, 29572,
           781, 29503,  1051,  1680, 29515,  1183, 10856,  1392, 29572,  1083,
          4132,  1146, 29510, 29481,  2296,  1947, 29576,   781, 26957, 29515,
          2493, 29510, 29481,  1040,  1392, 29491,  2370,  1452,  9281,  2169,
         29473, 29555, 29572,   781, 29503,  1051,  1680, 29515, 25211, 29576,
         11702,  1083,  1806,  1032,  6815,  1120, 29572,   781, 26957, 29515,
          6569,  3796, 29493,  1083, 29510,  1352,  1802,  1474,  1823, 29491,
           781,   781, 28100,  7994, 11911, 29515, 29473]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 

In [None]:
output = model.generate(**inputs, do_sample=True, top_p=0.9, temperature=0.8, max_new_tokens=150)
tokenizer.decode(output[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"\n###Human: Summarize this following dialogue: \nAlex: Hey, are you free this weekend?\nSarah: Yeah, what's up?\nAlex: Want to go to that new restaurant downtown?\nSarah: The Italian one? I heard it's really good!\nAlex: That's the one. How about Saturday around 7?\nSarah: Perfect! Should I make a reservation?\nAlex: Good idea, I'll call them now.\n\n###Assistant:  Sarah and Alex are going to an Italian restaurant on Saturday around 7."

Example2

In [None]:
dialogue = """
Client: We're having issues with the software update.
Support: I understand your concern. Can you describe the specific problem?
Client: The new interface is confusing our employees.
Support: We can provide additional training sessions for your team.
Client: That would be helpful. When can we schedule this?
Support: How about next Tuesday at 2 PM?
Client: That works perfectly. Thank you for the quick response.
"""
test_prompt = f"""
###Human: Summarize this following dialogue: {dialogue}
###Assistant: """

input2 = tokenizer(test_prompt, return_tensors="pt").to("cuda")
input2

{'input_ids': tensor([[    1, 29473,   781, 28100, 29537,  7134, 29515,  7695,  4247,  1421,
          1224,  3064, 19966, 29515, 29473,   781,  3934, 29515,  1584, 29510,
          1035,  3229,  5150,  1163,  1040,  4698,  4777, 29491,   781, 10220,
         29515,  1083,  3148,  1342,  5136, 29491,  3186,  1136,  7453,  1040,
          3716,  3468, 29572,   781,  3934, 29515,  1183,  1401,  5739,  1117,
         27512,  1581,  8664, 29491,   781, 10220, 29515,  1584,  1309,  3852,
          5638,  4922, 14680,  1122,  1342,  2686, 29491,   781,  3934, 29515,
          2493,  1450,  1115, 11633, 29491,  2452,  1309,  1246, 10210,  1224,
         29572,   781, 10220, 29515,  2370,  1452,  2447, 11955,  1206, 29473,
         29518, 10400, 29572,   781,  3934, 29515,  2493,  4559, 10711, 29491,
          8580,  1136,  1122,  1040,  3704,  3667, 29491,   781,   781, 28100,
          7994, 11911, 29515, 29473]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
output2 = model.generate(**input2, do_sample=True, top_p=0.9, temperature=0.8, max_new_tokens=150)
tokenizer.decode(output2[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"\n###Human: Summarize this following dialogue: \nClient: We're having issues with the software update.\nSupport: I understand your concern. Can you describe the specific problem?\nClient: The new interface is confusing our employees.\nSupport: We can provide additional training sessions for your team.\nClient: That would be helpful. When can we schedule this?\nSupport: How about next Tuesday at 2 PM?\nClient: That works perfectly. Thank you for the quick response.\n\n###Assistant: \nThe client has problems with the new software update. The interface is confusing their employees. They need additional training sessions. Support will offer them a training session on next Tuesday at 2 PM."