In [1]:
# https://medium.com/@prasadmahamulkar/fine-tuning-phi-2-a-step-by-step-guide-e672e7f1d009
# https://colab.research.google.com/drive/1zki5smRQDDIYGZ9LuTzVLV_qcq4dH8Aj#scrollTo=nltkxvCS7wl_

In [2]:
!pip install torch 
!pip install peft
!pip install bitsandbytes
!pip install transformers
!pip install trl 
!pip install accelerate
!pip install einops



In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

In [4]:
# Model
base_model = "microsoft/phi-2"
#base_model = "microsoft/Phi-3-mini-128k-instruct"
#base_model = "microsoft/Phi-3-mini-4k-instruct"
new_model = "phi-2-medquad"

# Dataset
dataset = load_dataset("prsdm/medquad-phi2-1k", split="train")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load base moodel
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0}
)

model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# LoRA configuration
peft_config = LoraConfig(
    r= 64,          
    lora_alpha= 16,
    lora_dropout=0.05, #0.1
    bias="none",
    task_type="CAUSAL_LM",
    #target_modules= ["Wqkv", "out_proj"],
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
)

In [7]:
# Set training arguments
training_arguments = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 1,
    fp16 = False,
    bf16 = False,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 1,
    gradient_checkpointing = True,
    max_grad_norm = 0.3,
    learning_rate = 2e-4,
    weight_decay = 0.001,
    optim = "paged_adamw_32bit",
    lr_scheduler_type = "cosine",
    max_steps = -1,
    warmup_ratio = 0.03,
    group_by_length = True,
    save_steps = 0,
    logging_steps = 25,
)

In [8]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length= None,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train model
trainer.train()



  0%|          | 0/250 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 1.3902, 'grad_norm': 0.15410493314266205, 'learning_rate': 0.0001975746552556772, 'epoch': 0.1}
{'loss': 1.3314, 'grad_norm': 0.24028967320919037, 'learning_rate': 0.00018550053929480202, 'epoch': 0.2}
{'loss': 1.1431, 'grad_norm': 0.16636060178279877, 'learning_rate': 0.00016449948488669639, 'epoch': 0.3}
{'loss': 1.1119, 'grad_norm': 0.18997769057750702, 'learning_rate': 0.000136764169663272, 'epoch': 0.4}
{'loss': 1.1636, 'grad_norm': 0.1386626809835434, 'learning_rate': 0.00010519038181318999, 'epoch': 0.5}
{'loss': 1.1151, 'grad_norm': 0.1405104100704193, 'learning_rate': 7.307467669163655e-05, 'epoch': 0.6}
{'loss': 1.1166, 'grad_norm': 0.12577061355113983, 'learning_rate': 4.377019014049223e-05, 'epoch': 0.7}
{'loss': 1.0995, 'grad_norm': 0.1307191252708435, 'learning_rate': 2.03365443542764e-05, 'epoch': 0.8}
{'loss': 1.1257, 'grad_norm': 0.15648455917835236, 'learning_rate': 5.22039891260262e-06, 'epoch': 0.9}
{'loss': 1.0724, 'grad_norm': 0.152920663356781, 'learning

TrainOutput(global_step=250, training_loss=1.1669507522583007, metrics={'train_runtime': 1365.3448, 'train_samples_per_second': 0.732, 'train_steps_per_second': 0.183, 'total_flos': 1.255518376759296e+16, 'train_loss': 1.1669507522583007, 'epoch': 1.0})

In [9]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [10]:
prompt = "What are the treatments for Gastrointestinal Carcinoid Tumors?"
instruction = f"### Instruction: {prompt} "
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(instruction)
print(result[0]['generated_text'][len(instruction):])

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


 ### Assistant: Gastrointestinal Carcinoid tumors are usually treated with surgery. The type of surgery depends on the location of the tumor and whether it has spread to other parts of the body.
                
If the tumor is in the stomach, the surgeon may remove the tumor and some of the surrounding tissue. If the tumor is in the small intestine, the surgeon may remove the tumor and a small part of the intestine. If the tumor is in the colon, the surgeon may remove the tumor and a small part of the colon.
                
If the tumor has spread to other parts of the body, the surgeon may remove the tumor and some of the surrounding tissue. If the tumor has spread to the liver, the surgeon may remove the tumor and part of the liver. If the tumor has spread to the lungs, the surgeon may remove the tumor and part of the lung.
                

