# Llama 2 Fine Tuning For Scientific Question Answers

Install necessary packages.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

Colab: Connect Google Drive.

## Preparation

Import libraries and logging to Hugging Face Hub.

**Note:**
- You need to get access to Llama 2 by sending request [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) before training or using the model.
- Paste your Hugging Face token to the variable `login_token` (the access token must be in READ mode)

In [3]:
import os, torch, logging, re

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from huggingface_hub import login


login_token = "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"

login(login_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


All paths needed to train
- `dataset_path`: folder containing the dataset
- `data_files`: train.csv file. The training data can be accessed [here](https://www.kaggle.com/datasets/thedevastator/sciq-a-dataset-for-science-question-answering)
- `llama_model_path`: Hugging Face Hub of Llama 2
- `save_dir`:
    - `save_dir/model`: save the final model after fine tuned
    - `save_dir/result`: save checkpoints

You can change your dataset path and save directory where you want

In [4]:
# Dataset
dataset_path = "/content/drive/MyDrive/llama_2_science/data"
data_files = {"train": "train.csv"}
# Model and tokenizer names
llama_model_path = "meta-llama/Llama-2-7b-chat-hf"
# Save directory
save_dir = "/content/drive/MyDrive/llama_2_science/"

Preprocessing data to associate with Llama 2 format.

In [5]:
# Dataset
dataset_path = "/content/drive/MyDrive/llama_2_science/data"
data_files = {"train": "train.csv"}
# Model and tokenizer names
llama_model_path = "meta-llama/Llama-2-7b-chat-hf"
# Save directory
save_dir = "/content/drive/MyDrive/llama_2_science/"


def replace_at_start(text):
    pattern = r'\d+\.\d+|\d+\.'
    if re.match(pattern, text):
        return re.sub(pattern, '', text.strip(), count=1)
    return text

def contains_remove_words(text):
    remove_words = ["figure", "chapter", "http", "www"]
    return any(word in text for word in remove_words)

def preprocess(sample):
    #sp = re.sub(r'\d+\.\d+|\d+\.', "", sample["support"])
    sp = replace_at_start(sample["support"])
    sample["text"] = '<s>[INST] ' + sample["question"] + " [/INST] " + "Answer: " + sample["correct_answer"][0].upper() + sample["correct_answer"][1:].lower() + ". " + sp + " </s>"
    return sample


dataset = load_dataset(dataset_path, data_files=data_files, split="train").filter(lambda x : x["support"] is not None)

full_dataset = dataset.filter(lambda x : not contains_remove_words(x['support'].lower()))
full_dataset = full_dataset.shuffle(seed = 7)
full_dataset = full_dataset.map(preprocess).remove_columns(['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'])

print("DATA LEN: ", len(full_dataset))
for i in full_dataset.select(range(50)):
  print(i)
  #break


Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/11679 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10481 [00:00<?, ? examples/s]

Map:   0%|          | 0/7652 [00:00<?, ? examples/s]

DATA LEN:  7652
{'text': '<s>[INST] Today, new elements are usually named after famous scientists. the names of the elements can be cumbersome to write in full, especially when combined to form the names of compounds. therefore, each element name is abbreviated as a one- or two-letter chemical this? [/INST] Answer: Symbol. Note Today, new elements are usually named after famous scientists. The names of the elements can be cumbersome to write in full, especially when combined to form the names of compounds. Therefore, each element name is abbreviated as a one- or two-letter chemical symbol. By convention, the first letter of a chemical symbol is a capital letter, while the second letter (if there is one) is a lowercase letter. The first letter of the symbol is usually the first letter of the element’s name, while the second letter is some other letter from the name. Some elements have symbols that derive from earlier, mostly Latin names, so the symbols may not contain any letters from t

In [6]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(llama_model_path, trust_remote_code=True)

tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [7]:
# Ensure pad_token is unique
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Set the pad_token
tokenizer.pad_token = '<pad>'

# Verify tokens
print(f"PAD token id: {tokenizer.pad_token_id}")
print(f"EOS token id: {tokenizer.eos_token_id}")

Using pad_token, but it is not set yet.


PAD token id: 32000
EOS token id: 2


In [8]:
sentence = "<s> HELLO </s>"
print(tokenizer.pad_token)
print(tokenizer.pad_token_id)

tokenizer(sentence, padding = "max_length", max_length = 30)

<pad>
32000


{'input_ids': [1, 1, 29871, 17714, 2208, 29949, 29871, 2, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

Quantization configuration and calling model.

In [9]:

# Quantization Config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)
# Model
model = AutoModelForCausalLM.from_pretrained(
    llama_model_path,
    quantization_config=quantization_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [10]:
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 4096)

LoRA configuration for training

**Note:** For first training, please comment the line `model = PeftModel.from_pretrained(model, save_dir + "model", config = lora_parameters, is_trainable = True)` (line 9) - this line is used to load the model to continue training.


In [14]:
# LoRA Config
lora_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.12,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)
#model = PeftModel.from_pretrained(model, save_dir + "model", config = lora_parameters, is_trainable = True)

Trainer configuration

**Note:** For first training, please uncomment the line `warmup_ratio=0.03` (line 12) - This line is used for first training to help the model warm up with new data (learning rate will increase linearly from 0 to the defined learning rate).

In [12]:
# Training Params
train_params = TrainingArguments(
    output_dir= save_dir + "result_2",
    num_train_epochs=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=200,
    logging_steps=100,
    learning_rate=1.6e-4,
    weight_decay=0.001,
    max_grad_norm=0.35,
    warmup_ratio=0.053,
    group_by_length=True,
    lr_scheduler_type="constant",
    per_device_train_batch_size=4
)


# Trainer
fine_tuning = SFTTrainer(
    model = model,
    train_dataset = full_dataset,
    peft_config = lora_parameters,
    dataset_text_field = "text",
    tokenizer = tokenizer,
    args = train_params
)




Map:   0%|          | 0/7652 [00:00<?, ? examples/s]

Run the fine tuning

In [13]:
# Training
fine_tuning.train()



You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,1.5786
200,1.3469
300,1.3241
400,1.3294
500,1.3385
600,1.2922
700,1.2997
800,1.3133
900,1.2763
1000,1.3008




TrainOutput(global_step=3826, training_loss=1.2553469408137974, metrics={'train_runtime': 7905.534, 'train_samples_per_second': 1.936, 'train_steps_per_second': 0.484, 'total_flos': 4.038369470447616e+16, 'train_loss': 1.2553469408137974, 'epoch': 2.0})

Save the fine tuned model

In [16]:
# Save Model
fine_tuning.model.save_pretrained(save_dir+"model_2")


Test after fine tuning

In [19]:
query = "How can fish breathe under water?"
text_gen = pipeline(task="text-generation", model=fine_tuning.model, tokenizer=tokenizer, max_length=200)
output = text_gen(f"<s>[INST] {query} [/INST]")
print(output[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

<s>[INST] How can fish breathe under water? [/INST] Answer: Gills. Fish can breathe under water because they have gills. Gills are structures that extract oxygen from the water. 
