In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
model_name = "abhishek/llama-2-7b-hf-small-shards"  # base model repo on the Hugging Face Hub
new_model = "llama-2-7b-vedica-qna"  # name given to the fine-tuned model

# ---------------------------------------------------------------------------
# QLoRA
# ---------------------------------------------------------------------------
lora_r = 64  # LoRA attention dimension
lora_alpha = 16  # alpha factor used for LoRA scaling
lora_dropout = 0.1  # dropout probability applied in the LoRA layers

# ---------------------------------------------------------------------------
# bitsandbytes
# ---------------------------------------------------------------------------
use_4bit = True  # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # compute dtype for the 4-bit base model
bnb_4bit_quant_type = "nf4"  # quantization type: "fp4" or "nf4"
use_nested_quant = False  # nested (double) quantization for the 4-bit base model

# ---------------------------------------------------------------------------
# TrainingArguments
# ---------------------------------------------------------------------------
output_dir = "./results"  # where model predictions and checkpoints are written
num_train_epochs = 1  # number of training epochs
fp16 = False  # enable fp16 training
bf16 = False  # enable bf16 training
per_device_train_batch_size = 2  # per-GPU batch size for training
per_device_eval_batch_size = 2  # per-GPU batch size for evaluation
gradient_accumulation_steps = 4  # update steps over which gradients are accumulated
gradient_checkpointing = True  # enable gradient checkpointing
max_grad_norm = 0.3  # maximum gradient norm (gradient clipping)
learning_rate = 2e-4  # initial learning rate (AdamW optimizer)
weight_decay = 0.001  # applied to all layers except bias/LayerNorm weights
optim = "paged_adamw_32bit"  # optimizer to use
lr_scheduler_type = "constant"  # learning-rate schedule (constant a bit better than cosine)
max_steps = -1  # number of training steps (overrides num_train_epochs)
warmup_ratio = 0.03  # ratio of steps for a linear warmup (from 0 to learning rate)
group_by_length = True  # batch sequences of the same length together (saves memory, speeds up training)
save_steps = 75  # save a checkpoint every N update steps
logging_steps = 75  # log every N update steps

# ---------------------------------------------------------------------------
# SFT
# ---------------------------------------------------------------------------
max_seq_length = None  # maximum sequence length to use
packing = False  # pack multiple short examples into the same input sequence
device_map = {"": 0}  # load the entire model on GPU 0


In [4]:
# Load tokenizer and model with QLoRA configuration

# Turn the dtype name from the configuration cell (e.g. "float16") into the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to from_pretrained() below
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# This is only an informational hint, so it is guarded with is_available():
# the capability query must not be the thing that fails on a machine without CUDA.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model (quantized according to bnb_config, placed according to device_map)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer.
# trust_remote_code is intentionally NOT enabled: the checkpoint resolves to the
# built-in LlamaTokenizerFast, so there is no need to allow code from the Hub repo to run.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = "right" # Fix overflow issue with fp16 training

# Load LoRA configuration (rank / alpha / dropout come from the configuration cell)
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/10 [00:00<?, ?it/s]

pytorch_model-00001-of-00010.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00010.bin:   0%|          | 0.00/2.88G [00:00<?, ?B/s]

pytorch_model-00003-of-00010.bin:   0%|          | 0.00/2.99G [00:00<?, ?B/s]

pytorch_model-00004-of-00010.bin:   0%|          | 0.00/2.86G [00:00<?, ?B/s]

pytorch_model-00005-of-00010.bin:   0%|          | 0.00/2.88G [00:00<?, ?B/s]

pytorch_model-00006-of-00010.bin:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

pytorch_model-00007-of-00010.bin:   0%|          | 0.00/2.88G [00:00<?, ?B/s]

pytorch_model-00008-of-00010.bin:   0%|          | 0.00/2.99G [00:00<?, ?B/s]

pytorch_model-00009-of-00010.bin:   0%|          | 0.00/2.86G [00:00<?, ?B/s]

pytorch_model-00010-of-00010.bin:   0%|          | 0.00/705M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Set training parameters (all values come from the configuration cell)
training_arguments = TrainingArguments(
    output_dir=output_dir,
    # Use the configured value; this was a hard-coded 1, which silently ignored the setting.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)
# NOTE(review): `gradient_checkpointing` and `per_device_eval_batch_size` are set in the
# configuration cell but are not forwarded above. Left as-is to keep training behaviour
# unchanged; confirm whether they were meant to be passed.

# Training data: to train on a different file, just change this path.
train_csv_path = '/content/train2.csv'

# Read the CSV as a single column named "text"; the first row of the file is skipped
# (presumably the header row; verify against the CSV).
dataset = load_dataset(
    "csv",
    data_files=train_csv_path,
    column_names=['text'],
    skiprows=1,
    split="train",
    delimiter=',',
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model under the name configured in `new_model`
trainer.model.save_pretrained(new_model)

Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/4999 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
75,0.9934
150,0.9143
225,0.7545
300,0.866
375,0.7392
450,0.8695
525,0.7572
600,0.8628




In [6]:
# Only let CRITICAL-level messages from transformers through
logging.set_verbosity(logging.CRITICAL)

# Build a text-generation pipeline around the trained model
prompt = "How to treat Glaucoma?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# Wrap the question in the [INST] / <<SYS>> template, generate, and show the text
formatted_prompt = f"[INST] <<SYS>> Answer the Questions <</SYS>> {prompt} [/INST]"
result = pipe(formatted_prompt)
print(result[0]['generated_text'])



[INST] <<SYS>> Answer the Questions <</SYS>> How to treat Glaucoma? [/INST] The goal of treatment is to reduce pressure in the eye to a level that will prevent damage to the optic nerve and vision loss. Treatment may include medicines, laser surgery, conventional surgery, or a combination of these treatments. The type of treatment that is best for you will depend on the type of glaucoma you have, how severe it is, and whether you have other eye diseases. Your ophthalmologist will work with you to determine the best treatment for your condition.    Medicines - Medicines used to treat glaucoma work by either reducing the amount of fluid in the eye or by improving the flow through the drainage angle. These medicines are taken by mouth or applied directly to the eye.    Laser surgery - Laser surgery is used to


In [7]:
# Ask the trained model a definition-style question
prompt = "What is Osteoporosis?"

# Text-generation pipeline over the trained model and its tokenizer
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# Generate from the templated prompt and print the first returned sequence
result = pipe(f"[INST] <<SYS>> Answer the Questions <</SYS>> {prompt} [/INST]")
first_output = result[0]
print(first_output['generated_text'])

[INST] <<SYS>> Answer the Questions <</SYS>> What is Osteoporosis? [/INST] Osteoporosis is a disease that causes bones to become weak and brittle. It is a silent disease because there are no symptoms. Osteoporosis is called the "silent disease" because it is not detected until a bone breaks. Osteoporosis is a major health problem in the United States. It affects 10 million Americans, including 1 million men. It is most common in women over age 50. Osteoporosis is a major cause of broken bones in older people. It is also a major cause of disability in older people. Osteoporosis is a major cause of broken bones in older people. It is also a major cause of disability in older people. Osteoporosis is a major cause of broken bones in


In [9]:
# Ask the trained model to list symptoms
prompt = "Give Symptoms of Lung Cancer"

# Keyword arguments for the text-generation pipeline
pipeline_kwargs = dict(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
pipe = pipeline(**pipeline_kwargs)

# Run the templated prompt through the pipeline and print the generated text
result = pipe(f"[INST] <<SYS>> Answer the Questions <</SYS>> {prompt} [/INST]")
generated_text = result[0]['generated_text']
print(generated_text)

[INST] <<SYS>> Answer the Questions <</SYS>> Give Symptoms of Lung Cancer [/INST] Signs and symptoms of lung cancer include       -  A cough that doesn't go away or gets worse over time    -  Chest pain    -  Shortness of breath    -  Wheezing    -  Hoarseness    -  Trouble swallowing    -  Weight loss for no known reason    -  Loss of appetite    -  Fatigue    -  Coughing up blood    -  Chest infections that keep coming back       Lung cancer can cause other signs and symptoms. Other conditions can cause the same signs and symptoms. It's important to see your doctor if you have any of these problems. Only a doctor can make a diagnosis.    NIH: National Cancer Institute    NIH: National Institute of Environmental


In [10]:
# Empty VRAM: drop every reference to the training-time objects, collect them,
# then ask PyTorch to release its cached GPU memory.
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

# Deleting the Python objects alone leaves the freed blocks in PyTorch's CUDA caching
# allocator; empty_cache() releases the unoccupied cached memory.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

20933

In [11]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
# Attach the adapter stored under `new_model`, then fold it into the base weights
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it.
# trust_remote_code is intentionally NOT enabled: the checkpoint resolves to the
# built-in LlamaTokenizerFast, so there is no need to allow code from the Hub repo to run.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
import locale

# Workaround: make locale.getpreferredencoding() report UTF-8 for the rest of this session
# (presumably needed so the `!` shell command in the next cell works on Colab; confirm).
# The replacement accepts the same optional `do_setlocale` argument as the stdlib function,
# so callers that invoke getpreferredencoding(False) do not hit a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [13]:
!huggingface-cli login
# IN ORDER TO UPLOAD TO HUGGING FACE
model.push_to_hub("vbafnaa/supportiv_QnA", check_pr=True)

tokenizer.push_to_hub("vbafnaa/supportiv_QnA",check_pr=True)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vbafnaa/supportiv_QnA/commit/49c2a2c6d4cabaccfd6d86f7023340adeb0490b4', commit_message='Upload tokenizer', commit_description='', oid='49c2a2c6d4cabaccfd6d86f7023340adeb0490b4', pr_url=None, pr_revision=None, pr_num=None)