# Week 8: Fine-tuning Llama 2 with QLoRA


*   Change data format
*   Change tokenizer
*   Change lora configuration

    [Ref](https://medium.com/@geronimo7/finetuning-llama2-mistral-945f9c200611)




In [None]:
%%capture output

! pip install datasets
! pip install peft==0.6.0
! pip install bitsandbytes==0.41.1
! pip install accelerate==0.24.1
! pip install trl==0.4.7
! pip install transformers
! pip install pypdfium2
! pip install faiss-gpu
! pip install langchain
! pip install rouge

## 1. Data Loading

In [None]:
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Project root on Drive; later cells reuse `path` to build output locations.
path = '/content/drive/MyDrive/Capstone/'
file = os.path.join(path, 'QA_Sample', 'Chevron_2018.csv')

# One row per question, with its source context and reference answer.
df = pd.read_csv(file)
df.head(5)

Mounted at /content/drive


Unnamed: 0,Questions,Context,Answers
0,What is Chevron Corporation's address in San R...,2018 \nannual report\n140 years of human p...,"Chevron Corporation's address in San Ramon, CA..."
1,How many metric tons of carbon dioxide are exp...,2018 \nannual report\n140 years of human p...,"Over the life of the Gorgon facility, it is ex..."
2,What are some of the competitive advantages of...,2018 \nannual report\n140 years of human p...,Chevron Corporation has several competitive ad...
3,What is the purpose of the Gorgon Carbon Dioxi...,2018 \nannual report\n140 years of human p...,The purpose of the Gorgon Carbon Dioxide Injec...
4,How does Chevron Corporation contribute to red...,2018 \nannual report\n140 years of human p...,Chevron Corporation contributes to reducing gr...


### Prompt reformulation

In [None]:
# Build one training string per row out of two <|im_start|> ... <|im_end|>
# delimited turns:
#   1. a "system" turn carrying the instruction, the context and the question
#   2. an "assistant" turn carrying the reference answer
prompt_turn = (
    '<|im_start|>system\nAnswer question based on the context. '
    'The context is: ' + df['Context']
    + '. The question is: ' + df['Questions'] + '<|im_end|>\n'
)
answer_turn = '<|im_start|>assistant\n' + df['Answers'] + '<|im_end|>\n'

df['text'] = prompt_turn + answer_turn
df.head()

Unnamed: 0,Questions,Context,Answers,text
0,What is Chevron Corporation's address in San R...,2018 \nannual report\n140 years of human p...,"Chevron Corporation's address in San Ramon, CA...",<|im_start|>system\nAnswer question based on t...
1,How many metric tons of carbon dioxide are exp...,2018 \nannual report\n140 years of human p...,"Over the life of the Gorgon facility, it is ex...",<|im_start|>system\nAnswer question based on t...
2,What are some of the competitive advantages of...,2018 \nannual report\n140 years of human p...,Chevron Corporation has several competitive ad...,<|im_start|>system\nAnswer question based on t...
3,What is the purpose of the Gorgon Carbon Dioxi...,2018 \nannual report\n140 years of human p...,The purpose of the Gorgon Carbon Dioxide Injec...,<|im_start|>system\nAnswer question based on t...
4,How does Chevron Corporation contribute to red...,2018 \nannual report\n140 years of human p...,Chevron Corporation contributes to reducing gr...,<|im_start|>system\nAnswer question based on t...


In [None]:
from datasets import Dataset

# Fixed seed so the 90/10 train/test split is identical on every
# Restart & Run All (without it the split is reshuffled on each run).
SPLIT_SEED = 42

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

## 2. Model & Tokenizer Loading with QLoRA Quantization

In [None]:
import os
from getpass import getpass

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


# Credentials must never live in the notebook: read the Hugging Face token from
# the HF_TOKEN environment variable, or prompt for it (input is not echoed and
# is not stored in the saved notebook).
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked in the Hugging Face account settings.
access_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
model_id = 'meta-llama/Llama-2-7b-chat-hf'   # base checkpoint to fine-tune
new_model_id = 'llama-2-7b-qa'               # folder name used when saving the adapter


# QLoRA configuration: load the base weights in 4-bit NF4 and run the
# de-quantized computation in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# --- Tokenizer -------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
tokenizer.padding_side = "right"
# Keep the tokenizer's original EOS as the padding token. This has to happen
# before EOS is replaced two lines below.
tokenizer.pad_token = tokenizer.eos_token
# Register the turn delimiters used in the training text: <|im_start|> as an
# added token and <|im_end|> as the new end-of-sequence token.
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens({"eos_token": "<|im_end|>"})

# --- Base model ------------------------------------------------------------
# Quantized at load time according to bnb_config; the whole model is mapped to
# device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    token=access_token,
)
# The vocabulary grew, so the embedding matrices have to grow with it.
model.resize_token_embeddings(len(tokenizer))
model.config.use_cache = False
model.config.eos_token_id = tokenizer.eos_token_id

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

## 3. QLoRA Configuration

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model = prepare_model_for_kbit_training(model)

# LoRA configuration: rank-64 adapters on every attention and MLP projection.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    # The tokenizer cell adds <|im_start|> / <|im_end|> and resizes the token
    # embeddings. Without this entry the freshly initialised rows for those
    # tokens stay frozen and are not written out with the adapter, so the model
    # never learns to read or emit the turn delimiters.
    modules_to_save=['lm_head', 'embed_tokens'],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

## 4. Prepare data for training

In [None]:
def tokenize(element):
    """Tokenize the ``text`` field of a (batched) dataset element.

    Sequences longer than 4096 tokens are truncated. No special tokens are
    added by the tokenizer; presumably because the text already carries its own
    ``<|im_start|>`` / ``<|im_end|>`` markers.

    NOTE(review): truncation keeps the beginning of the string, so for very
    long contexts the answer at the end may be cut off; verify sample lengths.
    """
    encode_kwargs = dict(truncation=True, max_length=4096, add_special_tokens=False)
    return tokenizer(element["text"], **encode_kwargs)

# Tokenize both splits in batches, using one worker process per CPU reported
# by the OS. The raw string columns are dropped once token ids exist.
string_columns = ['Questions', 'Context', 'Answers', 'text']
dataset_tokenized = dataset.map(
    tokenize,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=string_columns,
)


# Collate function: turns a list of per-sample dicts [{input_ids: [...]}, ...]
# into a single right-padded batch {input_ids, labels, attention_mask}.
def collate(elements, pad_token_id=None):
    """Pad a list of tokenized samples to a common length and stack them.

    Args:
        elements: list of dicts, each with an ``"input_ids"`` list of token ids.
        pad_token_id: id used to pad ``input_ids``. When ``None`` (the default,
            and what the Trainer's one-argument call uses) it is read from the
            notebook-level ``tokenizer`` at call time.

    Returns:
        dict of three ``torch`` tensors, one row per sample:
        ``input_ids`` (padded with ``pad_token_id``), ``labels`` (same tokens,
        padded with -100) and ``attention_mask`` (1 on real tokens, 0 on
        padding).

    Raises:
        ValueError: if ``elements`` is empty (raised by ``max``).
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    sequences = [sample["input_ids"] for sample in elements]
    longest = max(len(seq) for seq in sequences)  # every row is padded to this length

    input_ids, labels, attention_masks = [], [], []
    for seq in sequences:
        pad_len = longest - len(seq)
        input_ids.append(seq + [pad_token_id] * pad_len)
        # -100 is the label value this collator uses to mark padding positions.
        labels.append(seq + [-100] * pad_len)
        attention_masks.append([1] * len(seq) + [0] * pad_len)

    return {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks),
    }

Map (num_proc=8):   0%|          | 0/528 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/59 [00:00<?, ? examples/s]

## 5. Training Configuration

In [None]:
from trl import SFTTrainer
from transformers import Trainer, TrainingArguments

# All checkpoints and TensorBoard logs for this experiment go under exp/v4 on Drive.
output_dir = os.path.join(path, 'exp', 'v4')

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim='paged_adamw_32bit',
    save_steps=10,
    # A checkpoint is written every 10 steps; without a cap they pile up on
    # Drive for the whole run. Keep only the two most recent ones.
    save_total_limit=2,
    logging_steps=10,
    learning_rate=1e-3,
    weight_decay=0.001,
    fp16=True,
    bf16=False, # set to true if using A100
    max_grad_norm=0.3,
    max_steps=-1,   # -1: run length is governed by num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain 'constant' schedule has no warmup phase, so warmup_ratio above
    # was silently ignored; this variant ramps up first, then holds the rate.
    lr_scheduler_type='constant_with_warmup',
    report_to="tensorboard",
    ddp_find_unused_parameters=False
)


# Plain Trainer with the custom collate function; only the train split is used.
trainer = Trainer(
    model=model,
    train_dataset=dataset_tokenized["train"],
    data_collator=collate,
    tokenizer=tokenizer,
    args=training_arguments,
)

## Training and Saving

In [None]:
trainer.train()

# Save the trained adapter together with the tokenizer. The tokenizer was
# extended with <|im_start|> and given a new EOS token before training, so the
# adapter is only usable later with this exact tokenizer.
save_path = os.path.join(output_dir, new_model_id)
trainer.model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)



Step,Training Loss
10,1.928
20,2.102
30,1.8417
40,1.8636
50,1.6804
60,1.3119
70,1.5557
80,1.7581
90,1.3881
100,1.1098




RuntimeError: ignored

In [None]:
# Load the TensorBoard notebook extension, then open the dashboard on the
# `runs` folder under the experiment directory (exp/v4 on Drive). This is
# presumably where the Trainer writes its event files, since it reports to
# TensorBoard -- confirm the folder exists after training.
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Capstone/exp/v4/runs