In [1]:
import os
os.environ['HF_HOME'] = '/mnt/hf_cache/'
import sys
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import get_peft_config, prepare_model_for_kbit_training, get_peft_model, LoraConfig
from trl import SFTTrainer
import time
import datetime

# Load conversation dataset
max_token_size = 700

galaxy_conv_dataframe = pd.read_csv("../data/conversations-galaxy-q-a.csv", sep="\t")
galaxy_conv_dataframe = galaxy_conv_dataframe[galaxy_conv_dataframe["tokens"] <= max_token_size]

biostar_conv_dataframe = pd.read_csv("../data/conversations-biostars-q-a.csv", sep="\t")
biostar_conv_dataframe = biostar_conv_dataframe[biostar_conv_dataframe["tokens"] <= max_token_size]

print("Size of Galaxy conversation data: {}".format(len(galaxy_conv_dataframe)))
print("Size of Biostars conversation data: {}".format(len(biostar_conv_dataframe)))

# Split dataset into training and evaluation sets, but only for Galaxy conversations
tr_index = 200
final_index = 250 #len(galaxy_conv_dataframe)
tr_conv = galaxy_conv_dataframe[:tr_index]
eval_conv = galaxy_conv_dataframe[tr_index + 1: final_index]

#biostar_conv_dataframe = biostar_conv_dataframe[:20]
# combine tr_conv with biostars data for training
#tr_conv = pd.concat([tr_conv, biostar_conv_dataframe], axis=0)

print("Size of tr/te: {}/{}".format(len(tr_conv), len(eval_conv)))
dataset = Dataset.from_pandas(tr_conv).train_test_split(test_size=0.2, seed=42)

# Save evaluation dataset to a CSV file
eval_conv.to_csv("../data/eval_dataset.csv", sep="\t", index=None)

# Load pre-trained model and tokenizer


  from .autonotebook import tqdm as notebook_tqdm


[2024-06-11 12:04:37,775] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Size of Galaxy conversation data: 1122
Size of Biostars conversation data: 3782
Size of tr/te: 200/49




In [2]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_cache=True, device_map="auto")
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.49s/it]


In [3]:
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#target_modules = ['q_proj','v_proj', 'k_proj', 'o_proj']
#target_modules = ["q_proj","v_proj"]

# Load LoRA configuration
peft_config = LoraConfig(lora_alpha=32, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM",
                          target_modules=target_modules)

print("Extracting parameter efficient model ...")
start_time = time.time()
refined_model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
end_time = time.time()
refined_model.print_trainable_parameters()
print(f"PEFT loading time: {end_time - start_time} seconds")

Extracting parameter efficient model ...
trainable params: 162,217,984 || all params: 6,900,633,600 || trainable%: 2.350769413405749
PEFT loading time: 2.6068170070648193 seconds


In [8]:
base_dir = "llama-linear-layers-all-conv-June-11-24"

print("Setting up Training arguments ...")

log_steps = 100
save_steps = 100

# Set up training arguments
training_arguments = TrainingArguments(
    output_dir=base_dir,
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=6,
    optim="adamw_hf",
    save_steps=save_steps,
    logging_steps=log_steps,
    eval_steps=log_steps,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=5,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

Setting up Training arguments ...


In [None]:
# https://github.com/huggingface/trl/issues/953

from transformers import Trainer

print("Setting up SFTTrainer ...")

start_time = time.time()
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_cache=True, device_map="auto")
def model_init_hs(trial):
    #base_model_hs = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_cache=False, device_map="auto")
    #peft_config_hs = LoraConfig(lora_alpha=32, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM", target_modules=target_modules)
    return refined_model #get_peft_model(base_model_hs, peft_config_hs)


trainer = SFTTrainer(
    model=refined_model,
    model_init=model_init_hs,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    dataset_text_field="conversations",
    max_seq_length=max_token_size,
    tokenizer=tokenizer,
    args=training_arguments,
)

'''trainer = Trainer(
    model=None,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    #dataset_text_field="conversations",
    #max_seq_length=max_token_size,
    tokenizer=tokenizer,
    args=training_arguments,
    model_init=model_init_hs,
)'''

end_time = time.time()
print(f"SFTTTrainer setting up time: {end_time - start_time} seconds")

print("Start Hyperparameter search ...")


def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True),
        "max_grad_norm": trial.suggest_float("max_grad_norm", 0.01, 0.5),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 1e-4, 1e-1, log=True),
    }


best_trial = trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=5
)

#trainer.train()
#trainer.save_model()

print("Finished Hyperparameter search ...")

Setting up SFTTrainer ...


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 160/160 [00:00<00:00, 3389.94 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 2702.82 examples/s]
[I 2024-06-11 13:39:07,658] A new study created in memory with name: no-name-90f7af65-1d7d-4750-950c-ce25a5f77761


SFTTTrainer setting up time: 0.22990632057189941 seconds
Start Hyperparameter search ...




Step,Training Loss,Validation Loss


In [None]:
best_trial

In [None]:
save_path = base_dir + "/best_params.txt"
with open(save_path) as f:
    f.write(best_trial)

In [6]:
'''
training_arguments = TrainingArguments(
    output_dir=base_dir,
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=8,
    optim="adamw_hf",
    save_steps=100,
    logging_steps=100,
    eval_steps=100,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=1,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)
'''

'''def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True),
        "max_grad_norm": trial.suggest_float("max_grad_norm", 0.01, 0.5),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 1e-4, 1e-1, log=True),
    }
    

best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=20,
    #compute_objective=compute_objective,
)'''

'def optuna_hp_space(trial):\n    return {\n        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True),\n        "max_grad_norm": trial.suggest_float("max_grad_norm", 0.01, 0.5),\n        "warmup_ratio": trial.suggest_float("warmup_ratio", 1e-4, 1e-1, log=True),\n    }\n    \n\nbest_trial = trainer.hyperparameter_search(\n    direction="maximize",\n    backend="optuna",\n    hp_space=optuna_hp_space,\n    n_trials=20,\n    #compute_objective=compute_objective,\n)'