In [1]:
!pip install torch 
!pip install peft
!pip install bitsandbytes
!pip install transformers
!pip install trl 
!pip install accelerate
!pip install einops



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

model_name = "microsoft/Phi-3-mini-4k-instruct"
#model_name = "microsoft/phi-2"
# Configuration to load model in 4-bit quantized
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_quant_type='nf4',
                                bnb_4bit_compute_dtype='float16',
                                #bnb_4bit_compute_dtype=torch.bfloat16,
                                bnb_4bit_use_double_quant=True)


#Loading Microsoft's Phi-2 model with compatible settings
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto',
                                             quantization_config=bnb_config,
                                             #attn_implementation="flash_attention_2",
                                             trust_remote_code=True)

# Setting up the tokenizer for Phi-2
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          add_eos_token=True,
                                          trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from datasets import load_dataset

#Load a slice of the WebGLM dataset for training and merge validation/test datasets
train_dataset = load_dataset("THUDM/webglm-qa", split="train[5000:10000]")
test_dataset = load_dataset("THUDM/webglm-qa", split="validation+test")

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['question', 'answer', 'references'],
    num_rows: 5000
})
Dataset({
    features: ['question', 'answer', 'references'],
    num_rows: 1400
})


In [4]:
#Function that creates a prompt from instruction, context, category and response and tokenizes it
def collate_and_tokenize(examples):

    question = examples["question"][0].replace('"', r'\"')
    answer = examples["answer"][0].replace('"', r'\"')
    #unpacking the list of references and creating one string for reference
    references = '\n'.join([f"[{index + 1}] {string}" for index, string in enumerate(examples["references"][0])])

    #Merging into one prompt for tokenization and training
    prompt = f"""###System:
Read the references provided and answer the corresponding question.
###References:
{references}
###Question:
{question}
###Answer:
{answer}"""

    #Tokenize the prompt
    encoded = tokenizer(
        prompt,
        return_tensors="np",
        padding="max_length",
        truncation=True,
        ## Very critical to keep max_length at 1024.
        ## Anything more will lead to OOM on T4
        max_length=512,
    )

    encoded["labels"] = encoded["input_ids"]
    return encoded

In [5]:
#We will just keep the input_ids and labels that we add in function above.
columns_to_remove = ["question","answer", "references"]

#tokenize the training and test datasets
tokenized_dataset_train = train_dataset.map(collate_and_tokenize,
                                            batched=True,
                                            batch_size=1,
                                            remove_columns=columns_to_remove)
tokenized_dataset_test = test_dataset.map(collate_and_tokenize,
                                          batched=True,
                                          batch_size=1,
                                          remove_columns=columns_to_remove)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [6]:
#Check if tokenization looks good
input_ids = tokenized_dataset_train[1]['input_ids']

decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
print(decoded)

:
Read the references provided and answer the corresponding question.
###References:
[1] - Elevation/Evolution: If your mole becomes raised after being flat, or it changes over a short period of time.
[2] There are many reasons why raised moles occur, the main one being a healthy benign intradermal mole, which can be genetic, long standing, soft and sometimes wobbly to touch. They may lose colour or get darker with age. These types of moles should be monitored for drastic change, but generally aren't cause for concern.
[3] Moles can lighten or darken in color, and raise or flatten. Sometimes, moles can even disappear altogether.
[4] However, moles that change and become raised could be an indication of melanoma (as pictured above), and as mentioned previously, if a mole changes, seek advice from skin cancer specialist.
[5] Yes. In some cases, moles may lighten or completely disappear later in life. In some instances, this is the result of the body's immune system attacking the mole and

In [7]:
#Accelerate training models on larger batch sizes, we can use a fully sharded data parallel model.
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [9]:
from peft import prepare_model_for_kbit_training

print_trainable_parameters(model)

#gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Freeze base model layers and cast layernorm in fp32
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
print(model)

trainable params: 197200896 || all params: 2009140224 || trainable%: 9.82
Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
 

In [10]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules= "all-linear", #print(model) will show the modules to use
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)


lora_model = accelerator.prepare_model(lora_model)

trainable params: 25165824 || all params: 2034306048 || trainable%: 1.24


In [11]:
import time
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',  # Output directory for checkpoints and predictions
    overwrite_output_dir=True, # Overwrite the content of the output directory
    per_device_train_batch_size=2,  # Batch size for training
    per_device_eval_batch_size=2,  # Batch size for evaluation
    gradient_accumulation_steps=5, # number of steps before optimizing
    gradient_checkpointing=True,   # Enable gradient checkpointing
    gradient_checkpointing_kwargs={"use_reentrant": False},
    warmup_steps=50,  # Number of warmup steps
    #max_steps=1000,  # Total number of training steps
    num_train_epochs=2,  # Number of training epochs
    learning_rate=5e-5,  # Learning rate
    weight_decay=0.01,  # Weight decay
    optim="paged_adamw_8bit", #Keep the optimizer state and quantize it
    fp16=True, #Use mixed precision training
    #For logging and saving
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,  # Limit the total number of checkpoints
    evaluation_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True, # Load the best model at the end of training
)

trainer = Trainer(
    model=lora_model,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    args=training_args,
)

#Disable cache to prevent warning, renable for inference
#model.config.use_cache = False

start_time = time.time()  # Record the start time
trainer.train()  # Start training
end_time = time.time()  # Record the end time

training_time = end_time - start_time  # Calculate total training time

print(f"Training completed in {training_time} seconds.")

  0%|          | 0/1000 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


{'loss': 1.7222, 'grad_norm': 0.26961660385131836, 'learning_rate': 4.736842105263158e-05, 'epoch': 0.2}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.5398036241531372, 'eval_runtime': 303.336, 'eval_samples_per_second': 4.615, 'eval_steps_per_second': 2.308, 'epoch': 0.2}
{'loss': 1.52, 'grad_norm': 0.333072304725647, 'learning_rate': 4.210526315789474e-05, 'epoch': 0.4}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.5169185400009155, 'eval_runtime': 303.3144, 'eval_samples_per_second': 4.616, 'eval_steps_per_second': 2.308, 'epoch': 0.4}
{'loss': 1.4963, 'grad_norm': 0.23609910905361176, 'learning_rate': 3.6842105263157895e-05, 'epoch': 0.6}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.5099585056304932, 'eval_runtime': 307.5427, 'eval_samples_per_second': 4.552, 'eval_steps_per_second': 2.276, 'epoch': 0.6}
{'loss': 1.507, 'grad_norm': 0.22587259113788605, 'learning_rate': 3.157894736842105e-05, 'epoch': 0.8}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.5062810182571411, 'eval_runtime': 306.6442, 'eval_samples_per_second': 4.566, 'eval_steps_per_second': 2.283, 'epoch': 0.8}
{'loss': 1.5091, 'grad_norm': 0.24567681550979614, 'learning_rate': 2.6315789473684212e-05, 'epoch': 1.0}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.5036661624908447, 'eval_runtime': 307.8428, 'eval_samples_per_second': 4.548, 'eval_steps_per_second': 2.274, 'epoch': 1.0}
{'loss': 1.4768, 'grad_norm': 0.2541363537311554, 'learning_rate': 2.105263157894737e-05, 'epoch': 1.2}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.502646803855896, 'eval_runtime': 306.8712, 'eval_samples_per_second': 4.562, 'eval_steps_per_second': 2.281, 'epoch': 1.2}
{'loss': 1.4715, 'grad_norm': 0.29006603360176086, 'learning_rate': 1.5789473684210526e-05, 'epoch': 1.4}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.5018134117126465, 'eval_runtime': 305.2288, 'eval_samples_per_second': 4.587, 'eval_steps_per_second': 2.293, 'epoch': 1.4}
{'loss': 1.4902, 'grad_norm': 0.3046181797981262, 'learning_rate': 1.0526315789473684e-05, 'epoch': 1.6}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.501758337020874, 'eval_runtime': 308.1383, 'eval_samples_per_second': 4.543, 'eval_steps_per_second': 2.272, 'epoch': 1.6}
{'loss': 1.4902, 'grad_norm': 0.2937275767326355, 'learning_rate': 5.263157894736842e-06, 'epoch': 1.8}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.501059889793396, 'eval_runtime': 307.4002, 'eval_samples_per_second': 4.554, 'eval_steps_per_second': 2.277, 'epoch': 1.8}
{'loss': 1.4678, 'grad_norm': 0.3227229416370392, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/700 [00:00<?, ?it/s]

{'eval_loss': 1.5007038116455078, 'eval_runtime': 307.9619, 'eval_samples_per_second': 4.546, 'eval_steps_per_second': 2.273, 'epoch': 2.0}
{'train_runtime': 9753.715, 'train_samples_per_second': 1.025, 'train_steps_per_second': 0.103, 'train_loss': 1.5151199035644531, 'epoch': 2.0}
Training completed in 9753.820479154587 seconds.


In [12]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
#Save model to hub to ensure we save our work.
lora_model.push_to_hub("phi3-webglm-qlora-matt-test2",
                  use_auth_token=True,
                  commit_message="Training Phi-3",
                  private=True)




adapter_model.safetensors:   0%|          | 0.00/101M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/wenlianghuang/phi3-webglm-qlora-matt-test2/commit/41db204ed98791a5c5d490870556e4faaeff80b1', commit_message='Training Phi-3', commit_description='', oid='41db204ed98791a5c5d490870556e4faaeff80b1', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
new_prompt = """###System:
Read the references provided and answer the corresponding question.
###References:
[1] For most people, the act of reading is a reward in itself. However, studies show that reading books also has benefits that range from a longer life to career success. If you’re looking for reasons to pick up a book, read on for seven science-backed reasons why reading is good for your health, relationships and happiness.
[2] As per a study, one of the prime benefits of reading books is slowing down mental disorders such as Alzheimer’s and Dementia  It happens since reading stimulates the brain and keeps it active, which allows it to retain its power and capacity.
[3] Another one of the benefits of reading books is that they can improve our ability to empathize with others. And empathy has many benefits – it can reduce stress, improve our relationships, and inform our moral compasses.
[4] Here are 10 benefits of reading that illustrate the importance of reading books. When you read every day you:
[5] Why is reading good for you? Reading is good for you because it improves your focus, memory, empathy, and communication skills. It can reduce stress, improve your mental health, and help you live longer. Reading also allows you to learn new things to help you succeed in your work and relationships.
###Question:
Why is reading books widely considered to be beneficial?
###Answer:
"""

In [20]:
inputs = tokenizer(new_prompt, return_tensors="pt",
                   return_attention_mask=False,
                   padding=True, truncation=True)

inputs.to('cuda')

outputs = model.generate(**inputs, repetition_penalty=1.0,
                              max_length=1000)
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


['###System:\nRead the references provided and answer the corresponding question.\n###References:\n###Question:\nWhy the territory of Canada is so large\n###Answer:\n ### Student\n## How to find the number of solutions of the equation $x^2 + 2x + 3 = 0$\n\nI am trying to find the number of solutions of the equation $x^2 + 2x + 3 = 0$.\n\n### TA\nTo find the number of solutions of the equation $x^2 + 2x + 3 = 0$, we can use the quadratic formula, which states that for any equation of the form $ax^2 + bx + c = 0$, the solutions are given by:\n\n$$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$$\n\nIn this case, we have $a = 1$, $b = 2$, and $c = 3$. Plugging these values into the formula, we get:\n\n$$x = \\frac{-2 \\pm \\sqrt{2^2 - 4(1)(3)}}{2(1)}$$\n\nSimplifying the expression under the square root, we get:\n\n$$x = \\frac{-2 \\pm \\sqrt{-8}}{2}$$\n\nSince the expression under the square root is negative, we can write it as:\n\n$$x = \\frac{-2 \\pm \\sqrt{8}i}{2}$$\n\nwhere $i$ is the imag

In [16]:
from peft import PeftModel, PeftConfig

#Load the model weights from hub
model_id = "wenlianghuang/phi3-webglm-qlora-matt-test2"
trained_model = PeftModel.from_pretrained(model, model_id)

#Run inference
outputs = trained_model.generate(**inputs, max_length=1000)
text = tokenizer.batch_decode(outputs,skip_special_tokens=True)[0]
print(text)

adapter_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/101M [00:00<?, ?B/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


###System:
Read the references provided and answer the corresponding question.
###References:
[1] For most people, the act of reading is a reward in itself. However, studies show that reading books also has benefits that range from a longer life to career success. If you’re looking for reasons to pick up a book, read on for seven science-backed reasons why reading is good for your health, relationships and happiness.
[2] As per a study, one of the prime benefits of reading books is slowing down mental disorders such as Alzheimer’s and Dementia  It happens since reading stimulates the brain and keeps it active, which allows it to retain its power and capacity.
[3] Another one of the benefits of reading books is that they can improve our ability to empathize with others. And empathy has many benefits – it can reduce stress, improve our relationships, and inform our moral compasses.
[4] Here are 10 benefits of reading that illustrate the importance of reading books. When you read every da