In [1]:
import pandas as pd
import json

In [2]:
conv_dataframe = pd.read_csv("../data/conversations-galaxy-q-a.csv", sep="\t")
# all-conv-galaxy-q-a.csv # conversations-galaxy-q-a.csv
# conversations-galaxy-q-a.csv
conv_dataframe

Unnamed: 0,conversations,tokens
0,Act like Bioinformatician who uses Galaxy plat...,462
1,Act like Bioinformatician who uses Galaxy plat...,143
2,Act like Bioinformatician who uses Galaxy plat...,302
3,Act like Bioinformatician who uses Galaxy plat...,204
4,Act like Bioinformatician who uses Galaxy plat...,445
...,...,...
1122,Act like Bioinformatician who uses Galaxy plat...,232
1123,Act like Bioinformatician who uses Galaxy plat...,187
1124,Act like Bioinformatician who uses Galaxy plat...,262
1125,Act like Bioinformatician who uses Galaxy plat...,383


In [3]:
len(conv_dataframe)

1127

In [4]:
from datasets import load_dataset
from datasets import Dataset

tr_index = 20
final_index = len(conv_dataframe)
tr_conv = conv_dataframe[:tr_index]
eval_conv = conv_dataframe[tr_index + 1: final_index]
dataset = Dataset.from_pandas(tr_conv).train_test_split(test_size=0.2, seed=42)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
eval_conv.to_csv("../data/eval_dataset.csv", sep="\t", index=None)

In [6]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    AdamW
)

[2023-12-08 15:18:15,711] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [7]:
model_name = "NousResearch/Llama-2-7b-chat-hf"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype
)

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto"
)

model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:22<00:00, 11.48s/it]


In [8]:
from peft import get_peft_config, prepare_model_for_kbit_training, get_peft_model, LoraConfig
from trl import SFTTrainer
import sys
import time
import datetime

target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#or
#target_modules = ['q_proj','v_proj', 'k_proj', 'o_proj']
#target_modules = ["q_proj", "v_proj"]

learning_rate = 1e-4

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

print("Extracting parameter efficient model ...")
s_time = time.time()
refined_model = prepare_model_for_kbit_training(model)
refined_model = get_peft_model(refined_model, peft_config)
e_time = time.time()
refined_model.print_trainable_parameters()
print("PEFT loading time: {} seconds".format(e_time - s_time))

base_dir = "llama-test-galaxy-conv-dec-8-1"

print("Setting up Training arguments ...")

#optimizer = AdamW(refined_model.parameters(), lr=learning_rate)

training_arguments = TrainingArguments(
    output_dir=base_dir,
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=8,
    optim="adamw_hf",
    save_steps=2,
    logging_steps=2,
    learning_rate=learning_rate,
    eval_steps=2,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=1, # remove "#"
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

print("Setting up SFTTrainer ...")

s_time = time.time()

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=refined_model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    dataset_text_field="conversations",
    max_seq_length=700,
    tokenizer=tokenizer,
    args=training_arguments,
)

e_time = time.time()
print("SFTTTrainer setting up time: {} seconds".format(e_time - s_time))

print("Start training ...")
trainer.train()



Extracting parameter efficient model ...
trainable params: 162,217,984 || all params: 6,900,633,600 || trainable%: 2.350769413405749
PEFT loading time: 4.816115617752075 seconds
Setting up Training arguments ...
Setting up SFTTrainer ...


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 559.27 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 383.89 examples/s]

SFTTTrainer setting up time: 0.18945598602294922 seconds
Start training ...



You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
2,3.5189,3.087666


TrainOutput(global_step=2, training_loss=3.5188612937927246, metrics={'train_runtime': 94.4784, 'train_samples_per_second': 0.169, 'train_steps_per_second': 0.021, 'total_flos': 135272832565248.0, 'train_loss': 3.5188612937927246, 'epoch': 1.0})

In [9]:
import pandas as pd
import time

eval_conv = pd.read_csv("../data/eval_dataset.csv", sep="\t")
eval_conv

Unnamed: 0,conversations,tokens
0,Act like Bioinformatician who uses Galaxy plat...,301
1,Act like Bioinformatician who uses Galaxy plat...,410
2,Act like Bioinformatician who uses Galaxy plat...,185
3,Act like Bioinformatician who uses Galaxy plat...,563
4,Act like Bioinformatician who uses Galaxy plat...,173
...,...,...
1101,Act like Bioinformatician who uses Galaxy plat...,232
1102,Act like Bioinformatician who uses Galaxy plat...,187
1103,Act like Bioinformatician who uses Galaxy plat...,262
1104,Act like Bioinformatician who uses Galaxy plat...,383


In [10]:
import datetime
filedt = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
filedt

'20231208-152029'

In [None]:
predictions = []
original_instructions = []
ground_truth_answer = []
extracted_answers = []

start_marker = 'Act like'#'[INST]'
end_marker = '[/INST]'
end_tag = ""

s_time = time.time()

for ri, row in eval_conv.iterrows():
    entire_conv = row["conversations"]
    start_index = entire_conv.find(start_marker)
    end_index = entire_conv.find(end_marker)
    instruction = entire_conv[start_index:end_index].strip()
    prompt = entire_conv[start_index:end_index + len(end_marker)].strip()
    original_answer = entire_conv[end_index + len(end_marker): len(entire_conv) - len(end_tag) - 1].strip()
    original_instructions.append(instruction)
    ground_truth_answer.append(original_answer)
    print("Prompt: \n")
    print(prompt)
    print("Instruction: \n")
    print(instruction)
    print()
    print("Ground truth answer: \n")
    print(original_answer)
    print()
    print("encoding prompt number {}...".format(ri+1))
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda')
    print("generating response number {} ...".format(ri+1))
    outputs = refined_model.generate(input_ids=input_ids, 
        max_new_tokens=256,
        do_sample=True,
    )
    pred = tokenizer.decode(outputs[0])
    extracted_pred = pred[pred.find(end_marker) + len(end_marker): len(pred)].strip()
    predictions.append(pred)
    extracted_answers.append(extracted_pred)
    print("Generated answer: \n")
    print(extracted_pred)
    print()
    print("====================")
    if ri == 5:
        break

output_file_name = "generated_answers_peft_{}_{}".format(base_dir, filedt)
pred_dataframe = pd.DataFrame(zip(original_instructions, ground_truth_answer, extracted_answers, predictions), columns=["Instructions", "Ground truth answers", "Predicted answers", "Full generated answers"])
pred_dataframe.to_csv("../data/{}.csv".format(output_file_name), sep="\t", index=None)

e_time = time.time()

print("Finished generation in {} seconds".format(e_time - s_time))

Prompt: 

Act like Bioinformatician who uses Galaxy platform for biological data analysis. Understand the following instruction and prepare a suitable response.

[INST] hello, i am trying to assemble a plant genome using ont data with flye. prior to launch the assembly with all my data (about 37 gb in fastqsanger.gz) i recently tried using half of them and the job was successful after 2 days running with a descent genome size obtained. i then launched with all the available sequences (37 gb in fastqsanger.gz) and it is now been running for 5 days. i am just wondering if i am using more ressources than allowed for this tool or if it is still running. what makes me worrying is that now my quota usage now indicates 0% suggesting there might be a bug somewhere ? thank you very much. ben [/INST]
Instruction: 

Act like Bioinformatician who uses Galaxy platform for biological data analysis. Understand the following instruction and prepare a suitable response.

[INST] hello, i am trying to as

