In [8]:
import pandas as pd
import json

In [9]:
# Read the tab-separated conversations file into a DataFrame and display it.
# File names noted by the author: all-conv-galaxy-q-a.csv, conversations-galaxy-q-a.csv
conv_dataframe = pd.read_csv(
    "../data/all-conv-galaxy-q-a.csv",
    sep="\t",
)
conv_dataframe

Unnamed: 0,conversations,tensor_size
0,Act like Bioinformatician who uses Galaxy plat...,381
1,Act like Bioinformatician who uses Galaxy plat...,429
2,Act like Bioinformatician who uses Galaxy plat...,504
3,Act like Bioinformatician who uses Galaxy plat...,570
4,Act like Bioinformatician who uses Galaxy plat...,492
...,...,...
1984,Act like Bioinformatician who uses Galaxy plat...,192
1985,Act like Bioinformatician who uses Galaxy plat...,108
1986,Act like Bioinformatician who uses Galaxy plat...,179
1987,Act like Bioinformatician who uses Galaxy plat...,267


In [10]:
# Number of rows loaded from the conversations file.
conv_dataframe.shape[0]

1989

In [11]:
from datasets import load_dataset
from datasets import Dataset

# Rows before `tr_index` are used for fine-tuning; rows from `tr_index`
# onwards are held out in `eval_conv`.
tr_index = 1800
final_index = len(conv_dataframe)
tr_conv = conv_dataframe[:tr_index]
# The hold-out slice starts at `tr_index` itself: starting at `tr_index + 1`
# would leave row `tr_index` in neither partition.
eval_conv = conv_dataframe[tr_index:final_index]
assert len(tr_conv) + len(eval_conv) == final_index, "split must cover every row exactly once"

# Split the training partition 80/20 into 'train' and 'test' with a fixed seed.
dataset = Dataset.from_pandas(tr_conv).train_test_split(test_size=0.2, seed=42)

In [7]:
# Persist the held-out rows as a tab-separated file without the DataFrame index.
# `index` is documented as a bool, so pass False explicitly instead of None.
eval_conv.to_csv("../data/eval_dataset.csv", sep="\t", index=False)

In [7]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    AdamW
)


[2023-12-01 15:55:06,790] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [8]:
# Hub id of the base model, and a name for the fine-tuned model
# (`new_model` is only referenced from commented-out code in this notebook).
model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "llama-2-3b-galaxy-help"

# dtype handed to bitsandbytes as the 4-bit compute dtype
compute_dtype = torch.float16

# 4-bit NF4 quantization settings with double quantization enabled
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model with the quantization config; the KV cache is disabled.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    use_cache=False,
)
model.config.pretraining_tp = 1

# The tokenizer reuses the EOS token for padding and pads on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.48s/it]


In [9]:
from peft import get_peft_config, prepare_model_for_kbit_training, get_peft_model, LoraConfig
from trl import SFTTrainer
import sys
import time

# Alternative target-module sets left by the author:
#   ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#   ['q_proj','v_proj', 'k_proj', 'o_proj']
# Active choice: adapters on `q_proj` and `v_proj` only.
target_modules = ["q_proj", "v_proj"]

learning_rate = 1e-4

# LoRA adapter configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

print("Extracting parameter efficient model ...")
s_time = time.time()
# Prepare the quantized base model for k-bit training, then wrap it with LoRA.
refined_model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
e_time = time.time()
refined_model.print_trainable_parameters()
print(f"PEFT loading time: {e_time - s_time} seconds")

# Directory passed to TrainingArguments as output_dir
base_dir = "llama-test-all-conv-dec-1-1"

print("Setting up Training arguments ...")

# Explicit optimizer left disabled by the author:
#optimizer = AdamW(refined_model.parameters(), lr=learning_rate)

training_arguments = TrainingArguments(
    # --- output / reporting ---
    output_dir=base_dir,
    report_to="tensorboard",
    logging_steps=2, #change to 100
    save_steps=2, #change to 500
    # --- evaluation ---
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=2, #change to 200
    per_device_eval_batch_size=8,
    # --- batching / duration ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=1, # remove "#"
    # --- optimisation ---
    optim="adamw_hf", #"adamw_hf", #"paged_adamw_32bit",
    learning_rate=learning_rate,
    # NOTE(review): with the "constant" scheduler the warmup ratio below
    # presumably has no effect ("constant_with_warmup" is the variant that
    # applies warmup) - confirm which behaviour is intended.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    fp16=True,
)

print("Setting up SFTTrainer ...")

s_time = time.time()

# Supervised fine-tuning on the "conversations" text column: the 'train' split
# is used for training and the 'test' split for periodic evaluation.
# NOTE(review): refined_model is already wrapped by get_peft_model, so the
# peft_config argument is presumably redundant here - confirm against the
# installed trl version before removing it.
trainer = SFTTrainer(
    model=refined_model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    dataset_text_field="conversations",
    max_seq_length=700,
    tokenizer=tokenizer,
    args=training_arguments,
)

e_time = time.time()
# Log message previously misspelled the class name as "SFTTTrainer".
print("SFTTrainer setting up time: {} seconds".format(e_time - s_time))

print("Start training ...")
trainer.train()

#trainer.save_model("saved-model")
#trainer.model.save_pretrained(new_model)
# move this config to checkpoint folder for model reconstruction
#refined_model.config.to_json_file("saved-model/config.json")
#refined_model.save_pretrained('saved-model')
#trainer.save_model()



Extracting parameter efficient model ...
trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035
PEFT loading time: 1.2442035675048828 seconds
Setting up Training arguments ...
Setting up SFTTrainer ...


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 693.31 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 546.28 examples/s]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


SFTTTrainer setting up time: 0.29886937141418457 seconds
Start training ...




Step,Training Loss,Validation Loss
2,3.5837,2.972639
4,3.5817,2.908731




TrainOutput(global_step=5, training_loss=3.5200754165649415, metrics={'train_runtime': 223.0593, 'train_samples_per_second': 0.179, 'train_steps_per_second': 0.022, 'total_flos': 492793694453760.0, 'train_loss': 3.5200754165649415, 'epoch': 1.0})

In [10]:
# Disabled snippet kept as a bare string literal, so none of it executes.
# As the only expression in the cell, the string itself is echoed as the
# cell output. It sketches saving refined_model.state_dict() under a
# timestamped path inside "saved-model".
'''import datetime

final_model_path = "saved-model"
filedt = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_path = final_model_path + "/saved_model_" + filedt
opt_path = final_model_path + "/saved_opt_" + filedt

torch.save(refined_model.state_dict(), model_path)
#torch.save(optimizer.state_dict(), opt_path)'''

'import datetime\n\nfinal_model_path = "saved-model"\nfiledt = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")\nmodel_path = final_model_path + "/saved_model_" + filedt\nopt_path = final_model_path + "/saved_opt_" + filedt\n\ntorch.save(refined_model.state_dict(), model_path)\n#torch.save(optimizer.state_dict(), opt_path)'