In [2]:
import pandas as pd
import json

In [3]:
with open("../out/data.json") as fout:
    raw_data = json.load(fout)

#user_template = """ Below is a question asked by a user: \n
### Instruction:
{}
#"""

user_template = """[INST]
{} [/INST]
"""

conversation_template = """<s> {} </s>"""


agg_conversations = []
for idx_thread, thread in enumerate(raw_data):
    conversations = ""
    prev_owner = ""
    
    for idx_post in range(len(thread)):
        post_curr = thread[idx_post]
        if post_curr["role"] == "user":
            if prev_owner == "user":
                conversations += "\n" + post_curr["text"]
            elif prev_owner == "system":
                conversations += user_template.format(post_curr["text"])
            else:
                conversations += user_template.format(post_curr["text"])

        if post_curr["role"] == "system":
            if prev_owner == "system":
                conversations += "\n" + post_curr["text"]
            elif prev_owner == "user":
                conversations += post_curr["text"]
            else:
                conversations += user_template.format(post_curr["text"])

        prev_owner = post_curr["role"]
    agg_conversations.append(conversation_template.format(conversations))

# create dataframe
conv_dataframe = pd.DataFrame(agg_conversations, columns=["conversations"])
conv_dataframe

Unnamed: 0,conversations
0,"<s> [INST]\nHi, I have a very basic notebook r..."
1,"<s> [INST]\nHi, I met an error when I used the..."
2,"<s> [INST]\nHi,\nI’m attempting to run HISAT2 ..."
3,<s> [INST]\nsubmitting a job to a SGE 8.1.9 Cl...
4,<s> [INST]\nI need a tool which can change the...
...,...
1406,"<s> [INST]\nhello,\n.\ni am working with candi..."
1407,<s> [INST]\nI have been trying to create a Gal...
1408,<s> [INST]\nI am trying to follow this transcr...
1409,<s> [INST]\nHello. Two questions\n\nDoes anyo...


In [4]:
conv_dataframe.to_csv("../data/conversations.csv", sep="\t", index=None)

In [5]:
from datasets import load_dataset
from datasets import Dataset
conv_dataframe = conv_dataframe[:998]
dataset = Dataset.from_pandas(conv_dataframe).train_test_split(test_size=0.2, seed=42)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import re
import sys
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

model_path = 'openlm-research/open_llama_3b_v2'

model = LlamaForCausalLM.from_pretrained(
    model_path, load_in_8bit=True, device_map='auto',
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = LlamaTokenizer.from_pretrained(model_path)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model_modules = str(model.modules)
print(model_modules)



<bound method Module.modules of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear8bitLt(in_features=8640, out_features=3200, bias=False)
          (up_proj): Linear8bitLt(in_features=3200, out_features=8640, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_at

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from transformers.trainer_callback import TrainerCallback
import os
import sys
import time
from transformers import BitsAndBytesConfig
from trl import SFTTrainer
from transformers import TrainingArguments


#target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#or
target_modules = ['q_proj','v_proj', 'k_proj', 'o_proj']

lora_config = LoraConfig(
    r=8,#or r=16
    lora_alpha=8, #8,
    lora_dropout=0.05,
    bias="none",
    target_modules = target_modules,
    task_type="CAUSAL_LM",
)

#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#tokenizer.padding_side = 'right'

print("Extracting parameter efficient model ...")
s_time = time.time()
refined_model = get_peft_model(model, lora_config)
e_time = time.time()
refined_model.print_trainable_parameters()
print("PEFT loading time: {} seconds".format(e_time - s_time))

base_dir = "llama"

per_device_train_batch_size = 4
gradient_accumulation_steps = 4 #4
optim = 'adamw_hf'
learning_rate = 2e-4 #1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear" #"linear"

print("Setting up Training arguments ...")

training_args = TrainingArguments(
    output_dir=base_dir,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs = 10.0,
    logging_strategy="epoch",
    logging_steps=25,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    weight_decay=0.001,
    learning_rate=learning_rate,
    fp16=False, #True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    max_steps=-1,
    report_to="tensorboard"
)

print("Setting up SFTTrainer ...")

s_time = time.time()
trainer = SFTTrainer(
    refined_model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="conversations",
    max_seq_length=128,
    args=training_args,
    #max_seq_length=None,
    tokenizer=tokenizer,
    #args=training_params,
    packing=False,
)

e_time = time.time()
print("SFTTTrainer setting up time: {} seconds".format(e_time - s_time))

print("Start training ...")
trainer.train()

# move this config to checkpoint folder for model reconstruction
#refined_model.config.to_json_file("saved-config/config.json")



Extracting parameter efficient model ...
trainable params: 5,324,800 || all params: 3,431,798,400 || trainable%: 0.15516062948219803
PEFT loading time: 0.22310543060302734 seconds
Setting up Training arguments ...
Setting up SFTTrainer ...


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 798/798 [00:04<00:00, 194.83 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 185.89 examples/s]


SFTTTrainer setting up time: 5.195271968841553 seconds
Start training ...


Epoch,Training Loss,Validation Loss
1,3.2575,3.962891
2,3.9741,3.699219
3,3.6611,3.544922
4,3.6011,3.513672


In [None]:
from transformers import pipeline

# test fine-tuned model
# Number 1000, 1001 from data.json

test_strings = ["Hello!\nI am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0+galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error “Fatal error: Exit code 127 ()” comes up. Further, it says that the tool generated the following error: “line 9: run_deepvariant: command not found”.\nWhen I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface:\nuvicorn.access INFO 2022-12-22 13:43:26,629 [pN:main.1,p:100965,tN:MainThread] 127.0.0.1:58158 - “GET /api/tool_shed_repositories?name=deepvariant&owner=iuc HTTP/1.1” 200\nI don’t understand this error. Could someone please help me out?\nI am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the local instance in errors out pretty much instantly.\nThanks!",
"Dear Sir,\nKindly help in this regards I was trying to make a de novo contig using trinity and it is running since from one week.\nIs it ok??? or did I out something wrong\nKindly help"
               ]


#response_test_strings = [
#"Hello, I can’t give a full answer but I can maybe guide you in the right direction and maybe someone that can give a better answer will reply.\nGiven the error it looks like deepvariant is not installed (not found). The tool is using a “docker tool dependency”, in other words it needs a container where deepvariant is installed. If you have not checked this yet then I think this is the place to start. Below two links where you may find some more information.\nhttps://docs.galaxyproject.org/en/master/admin/special_topics/mulled_containers.html\n  \n      \n\n      training.galaxyproject.org\n  \n\n  \n    \n\nGalaxy Training: Tool Dependencies and Containers\n\n  Galaxy is an open-source project. Everyone can contribute...\n\n\n  \n\n  \n    \n    \n  \n\n  \n\n\nThe requirement can be seen here:\n  \n\n      github.com\n  \n\n  \n    galaxyproject/tools-iuc/blob/master/tools/deepvariant/macros.xml\n\n\n      <macros>\n    <token name="@TOOL_VERSION@">1.4.0</token>\n    <token name="@SUFFIX_VERSION@">0</token>\n    <xml name="edam_ontology">\n        <edam_topics>                                                                                  \n            <edam_topic>topic_0199</edam_topic>\n        </edam_topics>\n        <edam_operations>\n            <edam_operation>operation_3227</edam_operation>\n        </edam_operations>\n    </xml>\n    <xml name="requirements">\n        <requirements>\n            <container type="docker">google/deepvariant:@TOOL_VERSION@</container>\n        </requirements>\n    </xml>\n    <xml name="citations">\n        <citations>\n            <citation type="doi">10.1038/nbt.4235</citation>\n        </citations>\n\n\n\n\n  This file has been truncated. show original\n\n  \n\n  \n    \n    \n  \n\n  \n\n", 
# "Hello @Sachin_Srivastava\nIf the job is running (yellow/peach dataset), it is usually best to allow it to run. The same is true for queued jobs (grey dataset). This applies to jobs (any tool) executed at a public Galaxy server.\n20 GB of fastq data – uncompressed – creates a very large assembly job. If it fails later on for exceeding resources (red dataset), you’ll need to do one or more of these:\n\nTry a rerun to eliminate cluster issues\nMore QA/QC on the input reads (always recommended)\nConsider downsampling the reads (tool: Seqtk)\nPossibly need to move to your own Galaxy server where more resources can be allocated. The GVL version of Cloudman is one option: https://launch.usegalaxy.org/catalog\n\n\nI added some tags to your post that will find prior Q&A about the above actions. Or, you can search the forum with those keywords (not all posts get tagged).\nYou didn’t state where you are working. But, if by chance at Galaxy Main https://usegalaxy.org, I can let you know that the cluster that runs Trinity (and Unicycler + RNA-Star) is very busy. Longer queue times are expected. If you delete the current job and rerun, that will only place your job back at the end of the queue again, extending wait time.\nThanks!"]

predictions = []

for test in test_strings:
    prompt = """<s> [INST] {} [/INST]""".format(test)

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda')

    outputs = refined_model.generate(
        input_ids=input_ids,
        max_new_tokens=156,
        do_sample=True,
        #top_p=0.9,
        #temperature=0.9
    )
    predictions.append(tokenizer.decode(outputs[0]))

for i in range(len(test_strings)): 
    pred = predictions[i]
    text = test_strings[i]
    print(text+'\n')
    print(pred+'\n')
    print("==============================")

In [None]:
### Original Llama2 model output

# test fine-tuned model
# Number 1000, 1001 from data.json

test_strings = ["Hello!\nI am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0+galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error “Fatal error: Exit code 127 ()” comes up. Further, it says that the tool generated the following error: “line 9: run_deepvariant: command not found”.\nWhen I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface:\nuvicorn.access INFO 2022-12-22 13:43:26,629 [pN:main.1,p:100965,tN:MainThread] 127.0.0.1:58158 - “GET /api/tool_shed_repositories?name=deepvariant&owner=iuc HTTP/1.1” 200\nI don’t understand this error. Could someone please help me out?\nI am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the local instance in errors out pretty much instantly.\nThanks!",
"Dear Sir,\nKindly help in this regards I was trying to make a de novo contig using trinity and it is running since from one week.\nIs it ok??? or did I out something wrong\nKindly help"
               ]

#response_test_strings = [
#"Hello, I can’t give a full answer but I can maybe guide you in the right direction and maybe someone that can give a better answer will reply.\nGiven the error it looks like deepvariant is not installed (not found). The tool is using a “docker tool dependency”, in other words it needs a container where deepvariant is installed. If you have not checked this yet then I think this is the place to start. Below two links where you may find some more information.\nhttps://docs.galaxyproject.org/en/master/admin/special_topics/mulled_containers.html\n  \n      \n\n      training.galaxyproject.org\n  \n\n  \n    \n\nGalaxy Training: Tool Dependencies and Containers\n\n  Galaxy is an open-source project. Everyone can contribute...\n\n\n  \n\n  \n    \n    \n  \n\n  \n\n\nThe requirement can be seen here:\n  \n\n      github.com\n  \n\n  \n    galaxyproject/tools-iuc/blob/master/tools/deepvariant/macros.xml\n\n\n      <macros>\n    <token name="@TOOL_VERSION@">1.4.0</token>\n    <token name="@SUFFIX_VERSION@">0</token>\n    <xml name="edam_ontology">\n        <edam_topics>                                                                                  \n            <edam_topic>topic_0199</edam_topic>\n        </edam_topics>\n        <edam_operations>\n            <edam_operation>operation_3227</edam_operation>\n        </edam_operations>\n    </xml>\n    <xml name="requirements">\n        <requirements>\n            <container type="docker">google/deepvariant:@TOOL_VERSION@</container>\n        </requirements>\n    </xml>\n    <xml name="citations">\n        <citations>\n            <citation type="doi">10.1038/nbt.4235</citation>\n        </citations>\n\n\n\n\n  This file has been truncated. show original\n\n  \n\n  \n    \n    \n  \n\n  \n\n", 
# "Hello @Sachin_Srivastava\nIf the job is running (yellow/peach dataset), it is usually best to allow it to run. The same is true for queued jobs (grey dataset). This applies to jobs (any tool) executed at a public Galaxy server.\n20 GB of fastq data – uncompressed – creates a very large assembly job. If it fails later on for exceeding resources (red dataset), you’ll need to do one or more of these:\n\nTry a rerun to eliminate cluster issues\nMore QA/QC on the input reads (always recommended)\nConsider downsampling the reads (tool: Seqtk)\nPossibly need to move to your own Galaxy server where more resources can be allocated. The GVL version of Cloudman is one option: https://launch.usegalaxy.org/catalog\n\n\nI added some tags to your post that will find prior Q&A about the above actions. Or, you can search the forum with those keywords (not all posts get tagged).\nYou didn’t state where you are working. But, if by chance at Galaxy Main https://usegalaxy.org, I can let you know that the cluster that runs Trinity (and Unicycler + RNA-Star) is very busy. Longer queue times are expected. If you delete the current job and rerun, that will only place your job back at the end of the queue again, extending wait time.\nThanks!"]

predictions = []

for test in test_strings:
  prompt = """### Instruction: {}.
  ### Response: \n""".format(test)

  input_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda')

  outputs = model.generate(input_ids=input_ids, 
                           max_new_tokens=156, 
                           do_sample=True, 
                           #top_p=0.9,
                           #temperature=0.9
                          )
  predictions.append(tokenizer.decode(outputs[0]))

def extract_response_text(input_string):
    start_marker = 'Response: \n'
    end_marker = '###'
    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None
    
    start_index += len(start_marker)
    
    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]
    
    return input_string[start_index:end_index].strip()

for i in range(len(test_strings)): 
  pred = predictions[i]
  text = test_strings[i]
  print(text+'\n')
  print(pred+'\n')
  print(extract_response_text(pred))
  print("==============================")

In [None]:
# Reload PEFT fined tuned model
# https://stackoverflow.com/questions/76459034/how-to-load-a-fine-tuned-peft-lora-model-based-on-llama-with-huggingface-transfo
'''from transformers import LlamaTokenizer, LlamaForCausalLM

saved_model_path = 'llama/checkpoint-2'
re_tokenizer = LlamaTokenizer.from_pretrained(saved_model_path)
re_model = LlamaForCausalLM.from_pretrained(saved_model_path, device_map='auto')'''

In [None]:

'''# test fine-tuned model
# Number 1000, 1001 from data.json

#re_tokenizer.pad_token = "[PAD]"
re_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
re_tokenizer.padding_side = "left"

test_strings = ["Hello!\nI am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0+galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error “Fatal error: Exit code 127 ()” comes up. Further, it says that the tool generated the following error: “line 9: run_deepvariant: command not found”.\nWhen I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface:\nuvicorn.access INFO 2022-12-22 13:43:26,629 [pN:main.1,p:100965,tN:MainThread] 127.0.0.1:58158 - “GET /api/tool_shed_repositories?name=deepvariant&owner=iuc HTTP/1.1” 200\nI don’t understand this error. Could someone please help me out?\nI am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the local instance in errors out pretty much instantly.\nThanks!",
"Dear Sir,\nKindly help in this regards I was trying to make a de novo contig using trinity and it is running since from one week.\nIs it ok??? or did I out something wrong\nKindly help"
               ]

#response_test_strings = [
#"Hello, I can’t give a full answer but I can maybe guide you in the right direction and maybe someone that can give a better answer will reply.\nGiven the error it looks like deepvariant is not installed (not found). The tool is using a “docker tool dependency”, in other words it needs a container where deepvariant is installed. If you have not checked this yet then I think this is the place to start. Below two links where you may find some more information.\nhttps://docs.galaxyproject.org/en/master/admin/special_topics/mulled_containers.html\n  \n      \n\n      training.galaxyproject.org\n  \n\n  \n    \n\nGalaxy Training: Tool Dependencies and Containers\n\n  Galaxy is an open-source project. Everyone can contribute...\n\n\n  \n\n  \n    \n    \n  \n\n  \n\n\nThe requirement can be seen here:\n  \n\n      github.com\n  \n\n  \n    galaxyproject/tools-iuc/blob/master/tools/deepvariant/macros.xml\n\n\n      <macros>\n    <token name="@TOOL_VERSION@">1.4.0</token>\n    <token name="@SUFFIX_VERSION@">0</token>\n    <xml name="edam_ontology">\n        <edam_topics>                                                                                  \n            <edam_topic>topic_0199</edam_topic>\n        </edam_topics>\n        <edam_operations>\n            <edam_operation>operation_3227</edam_operation>\n        </edam_operations>\n    </xml>\n    <xml name="requirements">\n        <requirements>\n            <container type="docker">google/deepvariant:@TOOL_VERSION@</container>\n        </requirements>\n    </xml>\n    <xml name="citations">\n        <citations>\n            <citation type="doi">10.1038/nbt.4235</citation>\n        </citations>\n\n\n\n\n  This file has been truncated. show original\n\n  \n\n  \n    \n    \n  \n\n  \n\n", 
# "Hello @Sachin_Srivastava\nIf the job is running (yellow/peach dataset), it is usually best to allow it to run. The same is true for queued jobs (grey dataset). This applies to jobs (any tool) executed at a public Galaxy server.\n20 GB of fastq data – uncompressed – creates a very large assembly job. If it fails later on for exceeding resources (red dataset), you’ll need to do one or more of these:\n\nTry a rerun to eliminate cluster issues\nMore QA/QC on the input reads (always recommended)\nConsider downsampling the reads (tool: Seqtk)\nPossibly need to move to your own Galaxy server where more resources can be allocated. The GVL version of Cloudman is one option: https://launch.usegalaxy.org/catalog\n\n\nI added some tags to your post that will find prior Q&A about the above actions. Or, you can search the forum with those keywords (not all posts get tagged).\nYou didn’t state where you are working. But, if by chance at Galaxy Main https://usegalaxy.org, I can let you know that the cluster that runs Trinity (and Unicycler + RNA-Star) is very busy. Longer queue times are expected. If you delete the current job and rerun, that will only place your job back at the end of the queue again, extending wait time.\nThanks!"]

predictions = []

for test in test_strings:
  prompt = """ Below is a question asked by a user: \n
  ### Instruction:
  {}. 
  ### Response by the system to the above instruction:""".format(test)

  input_ids = re_tokenizer.encode(prompt, return_tensors="pt").to('cuda')

  outputs = re_model.generate(input_ids=input_ids, 
                           max_new_tokens=156, 
                           do_sample=True, 
                           top_p=0.9,
                           temperature=0.9
                          )
  predictions.append(tokenizer.decode(outputs[0]))
    

def extract_response_text(input_string):
    start_marker = 'Response by the system to the above instruction:'
    end_marker = '###'
    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None
    
    start_index += len(start_marker)
    
    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]
    
    return input_string[start_index:end_index].strip()

for i in range(len(test_strings)): 
  pred = predictions[i]
  text = test_strings[i]
  print(text+'\n')
  print(pred+'\n')
  print(extract_response_text(pred))
  print("==============================")'''