In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# `%%capture` suppresses the (long) pip output of this cell.
# Unsloth itself is installed straight from its GitHub repository with the `colab-new` extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# `--no-deps` installs only the packages named here without pulling in their own
# dependencies; xformers is constrained to versions below 0.0.26.
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

In [None]:
import regex

In [None]:
from unsloth import FastLanguageModel
import torch
# --- Loader settings (also reused by later cells) ---------------------------
# Sequence length: any value may be chosen, RoPE scaling is supported internally.
max_seq_length = 2048
# None lets the dtype be auto-detected. Float16 for Tesla T4, V100; Bfloat16 for Ampere+.
dtype = None
# 4bit quantization reduces memory usage. Can be False.
load_in_4bit = True

# Reference list of 4bit pre-quantized checkpoints (4x faster downloading, no OOMs).
# It is informational only: the checkpoint actually loaded is named in `load_kwargs`.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",  # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",  # Instruct version of Gemma 2b
    "unsloth/llama-3-8b-bnb-4bit",   # [NEW] 15 Trillion token Llama-3
]

# Keyword arguments for the loader, gathered in one place.
# For gated models such as meta-llama/Llama-2-7b-hf, add a `token = "hf_..."` entry.
load_kwargs = dict(
    model_name = "unsloth/llama-2-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)



config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import json

# The file has a `.jsonl` extension, but a plain `json.load` only understands a
# single JSON document (e.g. one top-level array). Try that first, and fall back
# to JSON Lines (one JSON value per non-empty line) when it does not parse, so
# both layouts load correctly. The file is read explicitly as UTF-8.
with open('train_data.jsonl', 'r', encoding='utf-8') as file:
    raw_text = file.read()

try:
    data = json.loads(raw_text)
except json.JSONDecodeError:
    # Split on "\n" only: a trailing "\r" is valid JSON whitespace, whereas
    # str.splitlines() would also split on separators that may occur inside strings.
    data = [json.loads(line) for line in raw_text.split("\n") if line.strip()]

# A one-line JSON Lines file parses as a single object; keep `data` a list of records.
if isinstance(data, dict):
    data = [data]

# Number of training records loaded.
print(len(data))

10000


In [None]:
# Display a single training record (index 30) to inspect its fields.
data[30]

{'instruction': "As a medical LLM working as an assistant for a real doctor, your task is to provide specific and accurate diagnoses for the patients' queries. When answering the queries, refrain from including any names of doctors or prescriptions in your response. Always start your answer with 'Thank you for choosing Khpaltabib' and proceed to diagnose the patients' conditions with detailed and specific explanations. Your responses should demonstrate a thorough understanding of medical conditions and provide clear and accurate diagnoses for each query. Please ensure that your answers are focused on diagnosing the patients' conditions and do not include any information related to prescribing medication or specific healthcare providers. Your responses should be detailed and informative, offering specific insights into the patients' medical concerns.",
 'input': 'Hey I am on my third antibiotic and still coughing and my ribs are killing me especially right side. Doc claims it is viral c

In [None]:
from datasets import Dataset

# Alpaca-style template with three positional slots: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# The tokenizer's end-of-sequence token; it must be appended to every training text.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(item):
    """Render one record into a single training string.

    `item` is indexed with the keys "instruction", "input" and "output".
    Returns a dict with one key, "text", holding the filled-in prompt
    followed by EOS_TOKEN.
    """
    prompt = alpaca_prompt.format(item["instruction"], item["input"], item["output"])
    return {"text": prompt + EOS_TOKEN}


# Render every record, then wrap the resulting strings in a one-column Dataset.
formatted_data = list(map(formatting_prompts_func, data))
dataset = Dataset.from_dict({"text": [row["text"] for row in formatted_data]})

In [None]:
# Show the first fully formatted training text (prompt + response + EOS token).
dataset["text"][0]

"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nAs a medical LLM working as an assistant for a real doctor, your task is to provide specific and accurate diagnoses for the patients' queries. When answering the queries, refrain from including any names of doctors or prescriptions in your response. Always start your answer with 'Thank you for choosing Khpaltabib' and proceed to diagnose the patients' conditions with detailed and specific explanations. Your responses should demonstrate a thorough understanding of medical conditions and provide clear and accurate diagnoses for each query. Please ensure that your answers are focused on diagnosing the patients' conditions and do not include any information related to prescribing medication or specific healthcare providers. Your responses should be detailed and informative, offering specific insights into the pat

### Train the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Query bf16 support once: bf16 is used when available, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

# Optimisation hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    num_train_epochs = 2,
    per_device_train_batch_size = 5,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    fp16 = not bf16_supported,
    bf16 = bf16_supported,
    logging_steps = 1,
)

# Supervised fine-tuning trainer over the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # Packing can make training 5x faster for short sequences.
    args = training_args,
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (three divisions by 1024), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Properties of GPU 0 and the peak memory reserved so far (the pre-training baseline).
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = _bytes_to_gb(gpu_stats.total_memory)
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
3.824 GB of memory reserved.


In [None]:
# Run the fine-tuning loop. The result is kept because a later cell reads
# `trainer_stats.metrics['train_runtime']` to report the training time.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 5 | Gradient Accumulation steps = 4
\        /    Total batch size = 20 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 39,976,960


Step,Training Loss
1,2.4184
2,2.3967
3,2.4207
4,2.3761
5,2.3965
6,2.2937
7,2.1469
8,2.0142
9,1.8548
10,1.8058




In [None]:
#@title Show final memory and time stats
# Wall-clock training time as reported by the trainer.
train_runtime = trainer_stats.metrics['train_runtime']

# Peak reserved GPU memory in GB, and the part of it added since the baseline
# (`start_gpu_memory`) was recorded before training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the total GPU memory (`max_memory`).
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = [
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

3873.9629 seconds used for training.
64.57 minutes used for training.
Peak reserved memory = 6.1 GB.
Peak reserved memory for training = 2.276 GB.
Peak reserved memory % of max memory = 15.418 %.
Peak reserved memory for training % of max memory = 5.753 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
import json

# The file has a `.jsonl` extension, but a plain `json.load` only understands a
# single JSON document (e.g. one top-level array). Try that first, and fall back
# to JSON Lines (one JSON value per non-empty line) when it does not parse, so
# both layouts load correctly. The file is read explicitly as UTF-8.
with open('test_data.jsonl', 'r', encoding='utf-8') as file:
    raw_text = file.read()

try:
    test_data = json.loads(raw_text)
except json.JSONDecodeError:
    # Split on "\n" only: a trailing "\r" is valid JSON whitespace, whereas
    # str.splitlines() would also split on separators that may occur inside strings.
    test_data = [json.loads(line) for line in raw_text.split("\n") if line.strip()]

# A one-line JSON Lines file parses as a single object; keep `test_data` a list of records.
if isinstance(test_data, dict):
    test_data = [test_data]

# For a quick smoke test, a small slice of the training data can be used instead:
# test_data = data[200:204]


In [None]:
import re

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# One entry per test sample; each entry is the list returned by `batch_decode`.
raw_responses = []

for sample in test_data:
    # Fill the template with the sample's instruction and input, leaving the
    # response slot blank so the model generates it.
    prompt = alpaca_prompt.format(sample["instruction"], sample["input"], "")
    inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 1000, use_cache = True)

    # Decoded output is stored unprocessed; the answer is extracted in a later cell.
    text = tokenizer.batch_decode(outputs)
    raw_responses.append(text)


In [None]:
print(len(raw_responses))

# Compiled once instead of on every iteration. DOTALL lets `.` match newlines so
# the whole multi-line answer after the marker is captured.
RESPONSE_PATTERN = re.compile(r"### Response:(.*)", re.DOTALL)


def extract_response(decoded_text, eos_token):
    """Return the generated answer contained in one decoded generation.

    Args:
        decoded_text: Full decoded string (prompt followed by the generation).
        eos_token: The tokenizer's end-of-sequence string, or None/"" if unknown.

    Returns:
        The text after the first "### Response:" marker, stripped of surrounding
        whitespace and of one trailing `eos_token` when present. If the marker is
        missing, the whole stripped text is returned instead of raising.
    """
    match = RESPONSE_PATTERN.search(decoded_text)
    response_text = (match.group(1) if match else decoded_text).strip()

    # Remove the EOS token only when it is really there, and only its real length.
    # The former fixed `[:-15]` slice assumed a 15-character EOS token and cut
    # into the answer for shorter tokens (e.g. "</s>") or when generation stopped
    # at the token limit without emitting EOS at all.
    if eos_token and response_text.endswith(eos_token):
        response_text = response_text[:-len(eos_token)].rstrip()

    return response_text


# Each raw entry is a list; its first element is the decoded string for the sample.
responses = [
    extract_response(raw_res[0], tokenizer.eos_token)
    for raw_res in raw_responses
]

print(len(responses))

100
100


In [None]:
import json

# Save responses to a file.
# Adjust the file name to the model that was used (llama, mistral, gemma).
with open('llama2_responses_token1k.json', 'w') as out_file:
    out_file.write(json.dumps(responses))

In [None]:
# Read the saved responses back to check that the file was written correctly.
with open('llama2_responses_token1k.json', 'r') as saved_file:
    loaded_list = json.loads(saved_file.read())

# Print the number of responses that were loaded
print(len(loaded_list))
# print(loaded_list)

100


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# model.save_pretrained("llama3_lora_model") # Local saving
# tokenizer.save_pretrained("llama3_lora_model")
# # model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# # tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving



('llama3_lora_model/tokenizer_config.json',
 'llama3_lora_model/special_tokens_map.json',
 'llama3_lora_model/tokenizer.json')

Now if you want to load the LoRA adapters we just saved for inference, uncomment the code in the cell below:

In [None]:
# from unsloth import FastLanguageModel
# lora_model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "llama3_lora_model", # YOUR MODEL YOU USED FOR TRAINING
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit,
# )
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# # alpaca_prompt = You MUST copy from above!

# inputs = tokenizer(
# [
#     alpaca_prompt.format(
#         data[3000]["instruction"], # instruction
#         data[3000]["input"], # input
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# # outputs = lora_model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# # tokenizer.batch_decode(outputs)

# outputs = lora_model.generate(**inputs, max_new_tokens = 500, use_cache = True)
# text = tokenizer.batch_decode(outputs)
# text

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a medical LLM working as an assistant to a real doctor. Answer the following queries to the best of your knowledge. Do not include any names of doctors, just diagnose the patients\n\n### Input:\nThe right kidney is slightly hyperechoic with decreased corticomedullary differentiation. It is normal in position and size, measuring 11.3 cm in long axis.  No evidence of hydronephrosis, renal mass, or perinephric fluid collections.  The left kidney is slightly hyperechoic with decreased corticomedullary differentiation. It is normal in position and size, measuring 11.9 cm in long axis.  No evidence of hydronephrosis, renal mass, or perinephric fluid collections. There are two anechoic cystic lesions within the lower pole of the left kidney with increased through transmission and impercepti

In [None]:
# from unsloth import FastLanguageModel
# lora_model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "llama3_model_16bit", # YOUR MODEL YOU USED FOR TRAINING
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit,
# )
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# # alpaca_prompt = You MUST copy from above!

# inputs = tokenizer(
# [
#     alpaca_prompt.format(
#         data[3000]["instruction"], # instruction
#         data[3000]["input"], # input
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# # outputs = lora_model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# # tokenizer.batch_decode(outputs)

# outputs = lora_model.generate(**inputs, max_new_tokens = 500, use_cache = True)
# text = tokenizer.batch_decode(outputs)
# text

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a medical LLM working as an assistant to a real doctor. Answer the following queries to the best of your knowledge. Do not include any names of doctors, just diagnose the patients\n\n### Input:\nThe right kidney is slightly hyperechoic with decreased corticomedullary differentiation. It is normal in position and size, measuring 11.3 cm in long axis.  No evidence of hydronephrosis, renal mass, or perinephric fluid collections.  The left kidney is slightly hyperechoic with decreased corticomedullary differentiation. It is normal in position and size, measuring 11.9 cm in long axis.  No evidence of hydronephrosis, renal mass, or perinephric fluid collections. There are two anechoic cystic lesions within the lower pole of the left kidney with increased through transmission and impercepti

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# NOTE: never hard-code a Hugging Face access token in a notebook; read it from the environment instead (e.g. os.environ["HF_TOKEN"]).
# model.push_to_hub("zainalikhokhar/llama2_default", token = "hf_...")
# # Merge to 16bit
# if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
# model.push_to_hub_merged("zainalikhokhar/llama2_VLLM_16bit", tokenizer, save_method = "merged_16bit", token = "hf_...")

# # Merge to 4bit
# if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
# model.push_to_hub_merged("zainalikhokhar/llama2_VLLM_4bit", tokenizer, save_method = "merged_4bit_forced", token = "")

# # Just LoRA adapters
# if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
# model.push_to_hub_merged("zainalikhokhar/llama2_VLLM_LORA", tokenizer, save_method = "lora", token = "")



config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Saved model to https://huggingface.co/zainalikhokhar/llama2_default


And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi%20button.png" width="145"></a> Support our work if you can! Thanks!
</div>