In [1]:
import pandas as pd
import json

In [2]:
# Read the tab-separated Galaxy Q&A conversations file into a DataFrame
# and show it as the cell output.
conv_dataframe = pd.read_csv(
    "../data/conversations-galaxy-q-a.csv",
    delimiter="\t",
)
conv_dataframe

Unnamed: 0,conversations,tokens
0,"\n\n<s>[INST] Hi, I met an error when I used t...",478
1,"\n\n<s>[INST] Hi,\nI’m attempting to run HISAT...",909
2,\n\n<s>[INST] submitting a job to a SGE 8.1.9 ...,1128
3,\n\n<s>[INST] I need a tool which can change t...,125
4,\n\n<s>[INST] hi\ni am working with galaxy for...,290
...,...,...
1251,"\n\n<s>[INST] hello,\n.\ni am working with can...",214
1252,\n\n<s>[INST] I have been trying to create a G...,175
1253,\n\n<s>[INST] I am trying to follow this trans...,250
1254,\n\n<s>[INST] Hello. Two questions\n\nDoes an...,394


In [3]:
# Number of conversations (rows) that were loaded.
conv_dataframe.shape[0]

1256

In [4]:
from datasets import load_dataset
from datasets import Dataset

# Positional boundaries of the train / held-out split. The active values are
# a small debugging subset; the trailing comments carry the full-run values.
tr_index = 20 #1240
final_index = 25 #len(conv_dataframe)

# Rows [0, tr_index) are used for fine-tuning.
tr_conv = conv_dataframe.iloc[:tr_index]
# Rows [tr_index, final_index) are held out for generation. The slice starts
# at tr_index itself: starting at tr_index + 1 would leave row tr_index in
# neither the training nor the held-out set.
eval_conv = conv_dataframe.iloc[tr_index:final_index]

# Split the fine-tuning rows again into the trainer's train (80%) and
# test (20%) partitions, with a fixed seed so the split is repeatable.
dataset = Dataset.from_pandas(tr_conv).train_test_split(test_size=0.2, seed=42)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Persist the held-out conversations as a tab-separated file, without the index column.
eval_conv.to_csv(
    "../data/eval_dataset.csv",
    sep="\t",
    index=False,
)

In [5]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

# Base checkpoint to fine-tune, and a name set aside for the fine-tuned model
# (new_model is only referenced from commented-out code further down).
model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "llama-2-3b-galaxy-help"

# Compute dtype handed to the quantization config (torch.bfloat16 was the alternative tried).
compute_dtype = torch.float16

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantized base model; device placement is delegated to device_map="auto"
# and the KV cache is switched off.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    use_cache=False,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1

# The end-of-sequence token doubles as the padding token; padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Disabled alternatives kept for reference:
#   resize the embeddings
#model.resize_token_embeddings(len(tokenizer))
#   configure the pad token in the model
#model.config.pad_token_id = tokenizer.pad_token_id
#   gradient checkpointing is used by default but not compatible with caching
#model.config.use_cache = False

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.41s/it]


In [6]:
from peft import get_peft_config, prepare_model_for_kbit_training, get_peft_model, LoraConfig
from trl import SFTTrainer
import sys
import time

# Other LoRA target-module sets that were considered:
#target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#or
#target_modules = ['q_proj','v_proj', 'k_proj', 'o_proj']

# LoRA configuration: rank-8 adapters on the q_proj / v_proj modules only.
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["q_proj","v_proj"]
)

# Prepare the quantized base model for k-bit training and wrap it with the
# LoRA adapters, timing the two calls together.
print("Extracting parameter efficient model ...")
s_time = time.time()
refined_model = prepare_model_for_kbit_training(model)
refined_model = get_peft_model(refined_model, peft_config)
e_time = time.time()
refined_model.print_trainable_parameters()
print("PEFT loading time: {} seconds".format(e_time - s_time))

# Directory handed to the trainer as output_dir.
base_dir = "llama"

print("Setting up Training arguments ...")

# The step intervals below are debugging values; the trailing comments carry
# the values intended for a full run.
training_arguments = TrainingArguments(
    output_dir=base_dir,
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=4,
    optim="adamw_hf", #"paged_adamw_32bit",
    save_steps=4, #change to 500
    logging_steps=4, #change to 100
    learning_rate=1e-4,
    eval_steps=4, #change to 200
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=1, # remove "#"
    #max_steps=10, #remove this
    # NOTE(review): warmup_ratio presumably has no effect together with the
    # plain "constant" scheduler ("constant_with_warmup" looks like the
    # variant that warms up) - confirm before relying on it.
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)

print("Setting up SFTTrainer ...")

s_time = time.time()

# Supervised fine-tuning on the raw text of the "conversations" column, with
# max_seq_length set to 512.
trainer = SFTTrainer(
    model=refined_model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    dataset_text_field="conversations",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

e_time = time.time()
print("SFTTTrainer setting up time: {} seconds".format(e_time - s_time))

print("Start training ...")
trainer.train()

#trainer.save_model("saved-model")
#trainer.model.save_pretrained(new_model)

# Create the output directory before writing into it: on a fresh checkout
# "saved-model" does not exist yet, and writing config.json into a missing
# directory fails before save_pretrained ever runs.
save_dir = "saved-model"
os.makedirs(save_dir, exist_ok=True)
# The model config is stored next to the saved weights so the model can be
# reconstructed from this folder.
refined_model.config.to_json_file(os.path.join(save_dir, "config.json"))
refined_model.save_pretrained(save_dir)



Extracting parameter efficient model ...
trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199
PEFT loading time: 0.17100143432617188 seconds
Setting up Training arguments ...
Setting up SFTTrainer ...


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 612.87 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 536.66 examples/s]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


SFTTTrainer setting up time: 0.15502619743347168 seconds
Start training ...


Step,Training Loss,Validation Loss
4,3.33,3.818697


In [7]:
# Display the tokenizer's repr to inspect its settings (special tokens, padding side, ...).
tokenizer

LlamaTokenizerFast(name_or_path='NousResearch/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '</s>'}, clean_up_tokenization_spaces=False)

In [8]:
def _split_conversation(conversation, start_marker, end_marker, end_tag):
    """Split one conversation string into (instruction, prompt, answer).

    The instruction is the text between the first ``start_marker`` and the
    first ``end_marker``; the prompt is the same span including both markers;
    the answer is everything after ``end_marker`` with surrounding whitespace
    and one trailing ``end_tag`` removed.

    Returns None when ``conversation`` is not a string or when either marker
    is missing or out of order, so the caller can skip the row instead of
    slicing with the -1 that ``str.find`` returns on failure.
    """
    if not isinstance(conversation, str):
        return None
    start_index = conversation.find(start_marker)
    end_index = conversation.find(end_marker)
    if start_index == -1 or end_index == -1 or end_index < start_index:
        return None
    instruction = conversation[start_index + len(start_marker):end_index].strip()
    prompt = conversation[start_index:end_index + len(end_marker)].strip()
    answer = conversation[end_index + len(end_marker):].strip()
    # Drop the closing tag by content rather than by a fixed character count,
    # so the result does not depend on how much whitespace follows the tag.
    if answer.endswith(end_tag):
        answer = answer[:-len(end_tag)].strip()
    return instruction, prompt, answer


predictions = []
original_instructions = []
ground_truth_answer = []

start_marker = '<s>[INST]'
end_marker = '[/INST]'
end_tag = "</s>"

# Number of held-out rows to generate answers for. 1 keeps this a quick smoke
# test; set to None to process every row of eval_conv.
max_eval_examples = 1
rows_to_generate = eval_conv if max_eval_examples is None else eval_conv.head(max_eval_examples)

# Switch to inference mode: the model may still be in training mode after
# fine-tuning, which would leave the LoRA dropout active while generating.
refined_model.eval()

for ri, row in rows_to_generate.iterrows():
    parts = _split_conversation(row["conversations"], start_marker, end_marker, end_tag)
    if parts is None:
        print("skipping row {}: conversation markers not found".format(ri))
        continue
    instruction, prompt, original_answer = parts
    print("encoding prompt ...")
    # Send the prompt to whichever device the model lives on instead of assuming 'cuda'.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(refined_model.device)
    print("generating response ...")
    outputs = refined_model.generate(input_ids=input_ids,
        max_new_tokens=156,
        do_sample=True,
    )
    # The whole returned sequence is decoded, so the stored text is whatever
    # generate() returned for this prompt, not a trimmed continuation.
    pred = tokenizer.decode(outputs[0])
    # Append to all three lists together, after generation succeeded, so the
    # columns of the result frame always stay aligned.
    original_instructions.append(instruction)
    ground_truth_answer.append(original_answer)
    predictions.append(pred)

pred_dataframe = pd.DataFrame(
    list(zip(original_instructions, ground_truth_answer, predictions)),
    columns=["instructions", "ground truth answers", "generated answers"],
)
pred_dataframe.to_csv("../data/generated_answers.csv", sep="\t", index=False)

print("Finished generation =======")

encoding prompt ...
generating response ...






