In [1]:
import pandas as pd
import json

In [2]:
conv_dataframe = pd.read_csv("../data/conversations-galaxy-q-a.csv", sep="\t")
conv_dataframe

Unnamed: 0,conversations,tokens
0,"\n\n<s>[INST] Hi, I met an error when I used t...",478
1,"\n\n<s>[INST] Hi,\nI’m attempting to run HISAT...",909
2,\n\n<s>[INST] submitting a job to a SGE 8.1.9 ...,1128
3,\n\n<s>[INST] I need a tool which can change t...,125
4,\n\n<s>[INST] hi\ni am working with galaxy for...,290
...,...,...
1251,"\n\n<s>[INST] hello,\n.\ni am working with can...",214
1252,\n\n<s>[INST] I have been trying to create a G...,175
1253,\n\n<s>[INST] I am trying to follow this trans...,250
1254,\n\n<s>[INST] Hello. Two questions\n\nDoes an...,394


In [3]:
from datasets import load_dataset
from datasets import Dataset

dataset = Dataset.from_pandas(conv_dataframe).train_test_split(test_size=0.2, seed=42)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "llama-2-3b-galaxy-help"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype #torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto"
)

model.config.pretraining_tp = 1


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#Resize the embeddings
#model.resize_token_embeddings(len(tokenizer))
#Configure the pad token in the model
#model.config.pad_token_id = tokenizer.pad_token_id
#model.config.use_cache = False # Gradient checkpointing is used by default but not compatible with caching

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.00s/it]


In [None]:
from peft import get_peft_config, prepare_model_for_kbit_training, get_peft_model, LoraConfig
from trl import SFTTrainer
import sys
import time

#target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#or
#target_modules = ['q_proj','v_proj', 'k_proj', 'o_proj']

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["q_proj","v_proj"]
)

print("Extracting parameter efficient model ...")
s_time = time.time()
refined_model = prepare_model_for_kbit_training(model)
refined_model = get_peft_model(refined_model, peft_config)
e_time = time.time()
refined_model.print_trainable_parameters()
print("PEFT loading time: {} seconds".format(e_time - s_time))

base_dir = "llama"

'''per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"'''

print("Setting up Training arguments ...")

# Set training parameters
'''training_params = TrainingArguments(
    output_dir=base_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="adamw_hf",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="linear",
    report_to="tensorboard"
)'''

training_arguments = TrainingArguments(
    output_dir=base_dir,
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=4,
    optim="adamw_hf", #"paged_adamw_32bit",
    save_steps=200, #change to 500
    logging_steps=25, #change to 100
    learning_rate=1e-4,
    eval_steps=25, #change to 200
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=5, # remove "#"
    #max_steps=10, #remove this
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)

print("Setting up SFTTrainer ...")

s_time = time.time()

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    dataset_text_field="conversations",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)


'''trainer = SFTTrainer(
    refined_model,
    train_dataset=dataset['train'],
    eval_dataset = dataset['test'],
    dataset_text_field="conversations",
    max_seq_length=128,
    args=training_args,
)'''

# Set supervised fine-tuning parameters
'''trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset = dataset['test'],
    peft_config=peft_args,
    dataset_text_field="conversations",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    #packing=False,
)'''

e_time = time.time()
print("SFTTTrainer setting up time: {} seconds".format(e_time - s_time))

print("Start training ...")
trainer.train()

#trainer.model.save_pretrained(new_model)
# move this config to checkpoint folder for model reconstruction
#refined_model.config.to_json_file("saved-config/config.json")



Extracting parameter efficient model ...
trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199
PEFT loading time: 0.15964317321777344 seconds
Setting up Training arguments ...
Setting up SFTTrainer ...


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1004/1004 [00:00<00:00, 1838.19 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 2263.46 examples/s]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


SFTTTrainer setting up time: 0.9094860553741455 seconds
Start training ...


Step,Training Loss,Validation Loss
25,2.6694,2.488999
50,2.3895,2.275794


In [None]:
from tensorboard import notebook
log_dir = "llama-qlora/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))

In [None]:
'''user_template = """ Below is a question asked by a user: \n
### Instruction:
{}
"""

system_template = """### Response by the system to the above instruction: \n
{}
"""'''

'''
user_template = """### Instruction:{}"""

system_template = """### Response: {}"""
'''

# test fine-tuned model
# Number 1000, 1001 from data.json

test_strings = ["Hello!\nI am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0+galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error “Fatal error: Exit code 127 ()” comes up. Further, it says that the tool generated the following error: “line 9: run_deepvariant: command not found”.\nWhen I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface:\nuvicorn.access INFO 2022-12-22 13:43:26,629 [pN:main.1,p:100965,tN:MainThread] 127.0.0.1:58158 - “GET /api/tool_shed_repositories?name=deepvariant&owner=iuc HTTP/1.1” 200\nI don’t understand this error. Could someone please help me out?\nI am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the local instance in errors out pretty much instantly.\nThanks!",
"Dear Sir,\nKindly help in this regards I was trying to make a de novo contig using trinity and it is running since from one week.\nIs it ok??? or did I out something wrong\nKindly help"
               ]


#response_test_strings = [
#"Hello, I can’t give a full answer but I can maybe guide you in the right direction and maybe someone that can give a better answer will reply.\nGiven the error it looks like deepvariant is not installed (not found). The tool is using a “docker tool dependency”, in other words it needs a container where deepvariant is installed. If you have not checked this yet then I think this is the place to start. Below two links where you may find some more information.\nhttps://docs.galaxyproject.org/en/master/admin/special_topics/mulled_containers.html\n  \n      \n\n      training.galaxyproject.org\n  \n\n  \n    \n\nGalaxy Training: Tool Dependencies and Containers\n\n  Galaxy is an open-source project. Everyone can contribute...\n\n\n  \n\n  \n    \n    \n  \n\n  \n\n\nThe requirement can be seen here:\n  \n\n      github.com\n  \n\n  \n    galaxyproject/tools-iuc/blob/master/tools/deepvariant/macros.xml\n\n\n      <macros>\n    <token name="@TOOL_VERSION@">1.4.0</token>\n    <token name="@SUFFIX_VERSION@">0</token>\n    <xml name="edam_ontology">\n        <edam_topics>                                                                                  \n            <edam_topic>topic_0199</edam_topic>\n        </edam_topics>\n        <edam_operations>\n            <edam_operation>operation_3227</edam_operation>\n        </edam_operations>\n    </xml>\n    <xml name="requirements">\n        <requirements>\n            <container type="docker">google/deepvariant:@TOOL_VERSION@</container>\n        </requirements>\n    </xml>\n    <xml name="citations">\n        <citations>\n            <citation type="doi">10.1038/nbt.4235</citation>\n        </citations>\n\n\n\n\n  This file has been truncated. show original\n\n  \n\n  \n    \n    \n  \n\n  \n\n", 
# "Hello @Sachin_Srivastava\nIf the job is running (yellow/peach dataset), it is usually best to allow it to run. The same is true for queued jobs (grey dataset). This applies to jobs (any tool) executed at a public Galaxy server.\n20 GB of fastq data – uncompressed – creates a very large assembly job. If it fails later on for exceeding resources (red dataset), you’ll need to do one or more of these:\n\nTry a rerun to eliminate cluster issues\nMore QA/QC on the input reads (always recommended)\nConsider downsampling the reads (tool: Seqtk)\nPossibly need to move to your own Galaxy server where more resources can be allocated. The GVL version of Cloudman is one option: https://launch.usegalaxy.org/catalog\n\n\nI added some tags to your post that will find prior Q&A about the above actions. Or, you can search the forum with those keywords (not all posts get tagged).\nYou didn’t state where you are working. But, if by chance at Galaxy Main https://usegalaxy.org, I can let you know that the cluster that runs Trinity (and Unicycler + RNA-Star) is very busy. Longer queue times are expected. If you delete the current job and rerun, that will only place your job back at the end of the queue again, extending wait time.\nThanks!"]

predictions = []

for test in test_strings:
  prompt = """### Instruction:
  {}. 
  \n
  ### Response: \n""".format(test)

  input_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda')

  outputs = refined_model.generate(input_ids=input_ids, 
                           max_new_tokens=156, 
                           do_sample=True, 
                           #top_p=0.9,
                           #temperature=0.9
                          )
  predictions.append(tokenizer.decode(outputs[0]))
    

def extract_response_text(input_string):
    start_marker = 'Response: \n'
    end_marker = '###'
    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None
    
    start_index += len(start_marker)
    
    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]
    
    return input_string[start_index:end_index].strip()

for i in range(len(test_strings)): 
  pred = predictions[i]
  text = test_strings[i]
  print(text+'\n')
  print(pred+'\n')
  print(extract_response_text(pred))
  print("==============================")

In [None]:
### Original Llama2 model output

# test fine-tuned model
# Number 1000, 1001 from data.json

test_strings = ["Hello!\nI am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0+galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error “Fatal error: Exit code 127 ()” comes up. Further, it says that the tool generated the following error: “line 9: run_deepvariant: command not found”.\nWhen I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface:\nuvicorn.access INFO 2022-12-22 13:43:26,629 [pN:main.1,p:100965,tN:MainThread] 127.0.0.1:58158 - “GET /api/tool_shed_repositories?name=deepvariant&owner=iuc HTTP/1.1” 200\nI don’t understand this error. Could someone please help me out?\nI am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the local instance in errors out pretty much instantly.\nThanks!",
"Dear Sir,\nKindly help in this regards I was trying to make a de novo contig using trinity and it is running since from one week.\nIs it ok??? or did I out something wrong\nKindly help"
               ]

#response_test_strings = [
#"Hello, I can’t give a full answer but I can maybe guide you in the right direction and maybe someone that can give a better answer will reply.\nGiven the error it looks like deepvariant is not installed (not found). The tool is using a “docker tool dependency”, in other words it needs a container where deepvariant is installed. If you have not checked this yet then I think this is the place to start. Below two links where you may find some more information.\nhttps://docs.galaxyproject.org/en/master/admin/special_topics/mulled_containers.html\n  \n      \n\n      training.galaxyproject.org\n  \n\n  \n    \n\nGalaxy Training: Tool Dependencies and Containers\n\n  Galaxy is an open-source project. Everyone can contribute...\n\n\n  \n\n  \n    \n    \n  \n\n  \n\n\nThe requirement can be seen here:\n  \n\n      github.com\n  \n\n  \n    galaxyproject/tools-iuc/blob/master/tools/deepvariant/macros.xml\n\n\n      <macros>\n    <token name="@TOOL_VERSION@">1.4.0</token>\n    <token name="@SUFFIX_VERSION@">0</token>\n    <xml name="edam_ontology">\n        <edam_topics>                                                                                  \n            <edam_topic>topic_0199</edam_topic>\n        </edam_topics>\n        <edam_operations>\n            <edam_operation>operation_3227</edam_operation>\n        </edam_operations>\n    </xml>\n    <xml name="requirements">\n        <requirements>\n            <container type="docker">google/deepvariant:@TOOL_VERSION@</container>\n        </requirements>\n    </xml>\n    <xml name="citations">\n        <citations>\n            <citation type="doi">10.1038/nbt.4235</citation>\n        </citations>\n\n\n\n\n  This file has been truncated. show original\n\n  \n\n  \n    \n    \n  \n\n  \n\n", 
# "Hello @Sachin_Srivastava\nIf the job is running (yellow/peach dataset), it is usually best to allow it to run. The same is true for queued jobs (grey dataset). This applies to jobs (any tool) executed at a public Galaxy server.\n20 GB of fastq data – uncompressed – creates a very large assembly job. If it fails later on for exceeding resources (red dataset), you’ll need to do one or more of these:\n\nTry a rerun to eliminate cluster issues\nMore QA/QC on the input reads (always recommended)\nConsider downsampling the reads (tool: Seqtk)\nPossibly need to move to your own Galaxy server where more resources can be allocated. The GVL version of Cloudman is one option: https://launch.usegalaxy.org/catalog\n\n\nI added some tags to your post that will find prior Q&A about the above actions. Or, you can search the forum with those keywords (not all posts get tagged).\nYou didn’t state where you are working. But, if by chance at Galaxy Main https://usegalaxy.org, I can let you know that the cluster that runs Trinity (and Unicycler + RNA-Star) is very busy. Longer queue times are expected. If you delete the current job and rerun, that will only place your job back at the end of the queue again, extending wait time.\nThanks!"]

predictions = []

for test in test_strings:
  prompt = """### Instruction: {}.
  ### Response: \n""".format(test)

  input_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda')

  outputs = model.generate(input_ids=input_ids, 
                           max_new_tokens=156, 
                           do_sample=True, 
                           #top_p=0.9,
                           #temperature=0.9
                          )
  predictions.append(tokenizer.decode(outputs[0]))

def extract_response_text(input_string):
    start_marker = 'Response: \n'
    end_marker = '###'
    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None
    
    start_index += len(start_marker)
    
    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]
    
    return input_string[start_index:end_index].strip()

for i in range(len(test_strings)): 
  pred = predictions[i]
  text = test_strings[i]
  print(text+'\n')
  print(pred+'\n')
  print(extract_response_text(pred))
  print("==============================")

In [None]:
# Reload PEFT fined tuned model
# https://stackoverflow.com/questions/76459034/how-to-load-a-fine-tuned-peft-lora-model-based-on-llama-with-huggingface-transfo
'''from transformers import LlamaTokenizer, LlamaForCausalLM

saved_model_path = 'llama/checkpoint-2'
re_tokenizer = LlamaTokenizer.from_pretrained(saved_model_path)
re_model = LlamaForCausalLM.from_pretrained(saved_model_path, device_map='auto')'''

In [None]:

'''# test fine-tuned model
# Number 1000, 1001 from data.json

#re_tokenizer.pad_token = "[PAD]"
re_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
re_tokenizer.padding_side = "left"

test_strings = ["Hello!\nI am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0+galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error “Fatal error: Exit code 127 ()” comes up. Further, it says that the tool generated the following error: “line 9: run_deepvariant: command not found”.\nWhen I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface:\nuvicorn.access INFO 2022-12-22 13:43:26,629 [pN:main.1,p:100965,tN:MainThread] 127.0.0.1:58158 - “GET /api/tool_shed_repositories?name=deepvariant&owner=iuc HTTP/1.1” 200\nI don’t understand this error. Could someone please help me out?\nI am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the local instance in errors out pretty much instantly.\nThanks!",
"Dear Sir,\nKindly help in this regards I was trying to make a de novo contig using trinity and it is running since from one week.\nIs it ok??? or did I out something wrong\nKindly help"
               ]

#response_test_strings = [
#"Hello, I can’t give a full answer but I can maybe guide you in the right direction and maybe someone that can give a better answer will reply.\nGiven the error it looks like deepvariant is not installed (not found). The tool is using a “docker tool dependency”, in other words it needs a container where deepvariant is installed. If you have not checked this yet then I think this is the place to start. Below two links where you may find some more information.\nhttps://docs.galaxyproject.org/en/master/admin/special_topics/mulled_containers.html\n  \n      \n\n      training.galaxyproject.org\n  \n\n  \n    \n\nGalaxy Training: Tool Dependencies and Containers\n\n  Galaxy is an open-source project. Everyone can contribute...\n\n\n  \n\n  \n    \n    \n  \n\n  \n\n\nThe requirement can be seen here:\n  \n\n      github.com\n  \n\n  \n    galaxyproject/tools-iuc/blob/master/tools/deepvariant/macros.xml\n\n\n      <macros>\n    <token name="@TOOL_VERSION@">1.4.0</token>\n    <token name="@SUFFIX_VERSION@">0</token>\n    <xml name="edam_ontology">\n        <edam_topics>                                                                                  \n            <edam_topic>topic_0199</edam_topic>\n        </edam_topics>\n        <edam_operations>\n            <edam_operation>operation_3227</edam_operation>\n        </edam_operations>\n    </xml>\n    <xml name="requirements">\n        <requirements>\n            <container type="docker">google/deepvariant:@TOOL_VERSION@</container>\n        </requirements>\n    </xml>\n    <xml name="citations">\n        <citations>\n            <citation type="doi">10.1038/nbt.4235</citation>\n        </citations>\n\n\n\n\n  This file has been truncated. show original\n\n  \n\n  \n    \n    \n  \n\n  \n\n", 
# "Hello @Sachin_Srivastava\nIf the job is running (yellow/peach dataset), it is usually best to allow it to run. The same is true for queued jobs (grey dataset). This applies to jobs (any tool) executed at a public Galaxy server.\n20 GB of fastq data – uncompressed – creates a very large assembly job. If it fails later on for exceeding resources (red dataset), you’ll need to do one or more of these:\n\nTry a rerun to eliminate cluster issues\nMore QA/QC on the input reads (always recommended)\nConsider downsampling the reads (tool: Seqtk)\nPossibly need to move to your own Galaxy server where more resources can be allocated. The GVL version of Cloudman is one option: https://launch.usegalaxy.org/catalog\n\n\nI added some tags to your post that will find prior Q&A about the above actions. Or, you can search the forum with those keywords (not all posts get tagged).\nYou didn’t state where you are working. But, if by chance at Galaxy Main https://usegalaxy.org, I can let you know that the cluster that runs Trinity (and Unicycler + RNA-Star) is very busy. Longer queue times are expected. If you delete the current job and rerun, that will only place your job back at the end of the queue again, extending wait time.\nThanks!"]

predictions = []

for test in test_strings:
  prompt = """ Below is a question asked by a user: \n
  ### Instruction:
  {}. 
  ### Response by the system to the above instruction:""".format(test)

  input_ids = re_tokenizer.encode(prompt, return_tensors="pt").to('cuda')

  outputs = re_model.generate(input_ids=input_ids, 
                           max_new_tokens=156, 
                           do_sample=True, 
                           top_p=0.9,
                           temperature=0.9
                          )
  predictions.append(tokenizer.decode(outputs[0]))
    

def extract_response_text(input_string):
    start_marker = 'Response by the system to the above instruction:'
    end_marker = '###'
    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None
    
    start_index += len(start_marker)
    
    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]
    
    return input_string[start_index:end_index].strip()

for i in range(len(test_strings)): 
  pred = predictions[i]
  text = test_strings[i]
  print(text+'\n')
  print(pred+'\n')
  print(extract_response_text(pred))
  print("==============================")'''