In [1]:
import pandas as pd
import json

In [7]:
with open("../out/data.json") as fout:
    raw_data = json.load(fout)

#question = []
#response = []
user_template = """Below is a question asked by a user: \n
{}
"""

system_template = """Below is the response by the system: \n
{}
"""

agg_conversations = []
for idx_thread, thread in enumerate(raw_data):
    conversations = ""
    for idx_post in range(len(thread)):
        post = thread[idx_post]
        if post["role"] == "user":
            conversations += user_template.format(post["text"])
            conversations += "\n"
        if post["role"] == "system":
            conversations += system_template.format(post["text"])
            conversations += "\n"
    agg_conversations.append(conversations)

# create dataframe
conv_dataframe = pd.DataFrame(agg_conversations, columns=["conversations"])
conv_dataframe

Unnamed: 0,conversations
0,"Below is a question asked by a user: \n\nHi, I..."
1,"Below is a question asked by a user: \n\nHi, I..."
2,"Below is a question asked by a user: \n\nHi,\n..."
3,Below is a question asked by a user: \n\nsubmi...
4,Below is a question asked by a user: \n\nI nee...
...,...
1406,Below is a question asked by a user: \n\nhello...
1407,Below is a question asked by a user: \n\nI hav...
1408,Below is a question asked by a user: \n\nI am ...
1409,Below is a question asked by a user: \n\nHello...


In [11]:
from datasets import load_dataset
from datasets import Dataset
dataset = Dataset.from_pandas(conv_dataframe).train_test_split(test_size=0.2, seed=42)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
import torch
from transformers import BioGptTokenizer, BioGptForCausalLM, set_seed
import re
import sys

model_path = "microsoft/biogpt"

tokenizer = BioGptTokenizer.from_pretrained(model_path)
model = BioGptForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)

model_modules = str(model.modules)
print(model_modules)

<bound method Module.modules of BioGptForCausalLM(
  (biogpt): BioGptModel(
    (embed_tokens): Embedding(42384, 1024, padding_idx=1)
    (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-23): 24 x BioGptDecoderLayer(
        (self_attn): BioGptAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm):

In [14]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import torch
from transformers.trainer_callback import TrainerCallback
import os
import sys
from transformers import BitsAndBytesConfig
from trl import SFTTrainer
from transformers import TrainingArguments

target_modules = ['q_proj','v_proj']

lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    target_modules = target_modules,
    task_type="CAUSAL_LM",
)

base_dir = "biogpt"

per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"

training_args = TrainingArguments(
    output_dir=base_dir,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs = 5.0,
    logging_strategy="epoch",
    logging_steps=200,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)
    
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset = dataset['test'],
    dataset_text_field="conversations",
    max_seq_length=256,
    args=training_args,
)

trainer.train()

trainable params: 786,432 || all params: 347,549,696 || trainable%: 0.22627900672944337


Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1128/1128 [00:23<00:00, 48.28 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 283/283 [00:05<00:00, 52.34 examples/s]


Epoch,Training Loss,Validation Loss
0,5.1455,4.999636
2,5.0363,4.955318
2,5.0659,4.919374
4,4.9701,4.896855
4,4.9891,4.889808


TrainOutput(global_step=350, training_loss=5.041433803013393, metrics={'train_runtime': 1617.6174, 'train_samples_per_second': 3.487, 'train_steps_per_second': 0.216, 'total_flos': 2336047919677440.0, 'train_loss': 5.041433803013393, 'epoch': 4.96})

In [15]:
'''fine_tuned_model_path = "saved-model"
tokenizer.save_pretrained(fine_tuned_model_path)
model.save_pretrained(fine_tuned_model_path)
model.config.to_json_file(fine_tuned_model_path + "/config.json")'''

'fine_tuned_model_path = "saved-model"\ntokenizer.save_pretrained(fine_tuned_model_path)\nmodel.save_pretrained(fine_tuned_model_path)\nmodel.config.to_json_file(fine_tuned_model_path + "/config.json")'

In [16]:
#trainer.save_model(fine_tuned_model_path)

In [17]:
# load the best model
'''import torch
from transformers import BioGptTokenizer, BioGptForCausalLM

#best_checkpoint_path = "biogpt/checkpoint-1820"
#model_path = "microsoft/biogpt"

reload_tokenizer = BioGptTokenizer.from_pretrained("saved-model")
reload_model = BioGptForCausalLM.from_pretrained("saved-model", torch_dtype=torch.float32)'''

'import torch\nfrom transformers import BioGptTokenizer, BioGptForCausalLM\n\n#best_checkpoint_path = "biogpt/checkpoint-1820"\n#model_path = "microsoft/biogpt"\n\nreload_tokenizer = BioGptTokenizer.from_pretrained("saved-model")\nreload_model = BioGptForCausalLM.from_pretrained("saved-model", torch_dtype=torch.float32)'

In [18]:
#checkpoint = torch.load(best_checkpoint_path + "/pytorch_model.bin")
#reload_model.load_state_dict(checkpoint['model_state_dict'])
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [23]:
# test fine-tuned model
# Number 1000, 1001 from data.json
test_strings = ["Hello!\nI am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0+galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error “Fatal error: Exit code 127 ()” comes up. Further, it says that the tool generated the following error: “line 9: run_deepvariant: command not found”.\nWhen I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface:\nuvicorn.access INFO 2022-12-22 13:43:26,629 [pN:main.1,p:100965,tN:MainThread] 127.0.0.1:58158 - “GET /api/tool_shed_repositories?name=deepvariant&owner=iuc HTTP/1.1” 200\nI don’t understand this error. Could someone please help me out?\nI am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the local instance in errors out pretty much instantly.\nThanks!",
"Dear Sir,\nKindly help in this regards I was trying to make a de novo contig using trinity and it is running since from one week.\nIs it ok??? or did I out something wrong\nKindly help",]


'''user_template = """Below is a question asked by a user: \n
{}.
"""

system_template = """Below is the response by the system: \n
{}'''

#response_test_strings = [
#"Hello, I can’t give a full answer but I can maybe guide you in the right direction and maybe someone that can give a better answer will reply.\nGiven the error it looks like deepvariant is not installed (not found). The tool is using a “docker tool dependency”, in other words it needs a container where deepvariant is installed. If you have not checked this yet then I think this is the place to start. Below two links where you may find some more information.\nhttps://docs.galaxyproject.org/en/master/admin/special_topics/mulled_containers.html\n  \n      \n\n      training.galaxyproject.org\n  \n\n  \n    \n\nGalaxy Training: Tool Dependencies and Containers\n\n  Galaxy is an open-source project. Everyone can contribute...\n\n\n  \n\n  \n    \n    \n  \n\n  \n\n\nThe requirement can be seen here:\n  \n\n      github.com\n  \n\n  \n    galaxyproject/tools-iuc/blob/master/tools/deepvariant/macros.xml\n\n\n      <macros>\n    <token name="@TOOL_VERSION@">1.4.0</token>\n    <token name="@SUFFIX_VERSION@">0</token>\n    <xml name="edam_ontology">\n        <edam_topics>                                                                                  \n            <edam_topic>topic_0199</edam_topic>\n        </edam_topics>\n        <edam_operations>\n            <edam_operation>operation_3227</edam_operation>\n        </edam_operations>\n    </xml>\n    <xml name="requirements">\n        <requirements>\n            <container type="docker">google/deepvariant:@TOOL_VERSION@</container>\n        </requirements>\n    </xml>\n    <xml name="citations">\n        <citations>\n            <citation type="doi">10.1038/nbt.4235</citation>\n        </citations>\n\n\n\n\n  This file has been truncated. show original\n\n  \n\n  \n    \n    \n  \n\n  \n\n", 
# "Hello @Sachin_Srivastava\nIf the job is running (yellow/peach dataset), it is usually best to allow it to run. The same is true for queued jobs (grey dataset). This applies to jobs (any tool) executed at a public Galaxy server.\n20 GB of fastq data – uncompressed – creates a very large assembly job. If it fails later on for exceeding resources (red dataset), you’ll need to do one or more of these:\n\nTry a rerun to eliminate cluster issues\nMore QA/QC on the input reads (always recommended)\nConsider downsampling the reads (tool: Seqtk)\nPossibly need to move to your own Galaxy server where more resources can be allocated. The GVL version of Cloudman is one option: https://launch.usegalaxy.org/catalog\n\n\nI added some tags to your post that will find prior Q&A about the above actions. Or, you can search the forum with those keywords (not all posts get tagged).\nYou didn’t state where you are working. But, if by chance at Galaxy Main https://usegalaxy.org, I can let you know that the cluster that runs Trinity (and Unicycler + RNA-Star) is very busy. Longer queue times are expected. If you delete the current job and rerun, that will only place your job back at the end of the queue again, extending wait time.\nThanks!"]


predictions = []


for test in test_strings:
  prompt = """ Below is a question asked by a user: \n;
  {}.
  Below is the response by the system:""".format(test)
    
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

  generation_output = model.generate(
      input_ids=input_ids, max_new_tokens=256
  )
  predictions.append(tokenizer.decode(generation_output[0]))
    

def extract_response_text(input_string):
    start_marker = 'Below is the response by the system:'
    end_marker = ''
    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None
    
    start_index += len(start_marker)
    
    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]
    
    return input_string[start_index:end_index].strip()

for i in range(len(test_strings)): 
  pred = predictions[i]
  text = test_strings[i]
  #print(text+'\n')
  print(pred+'\n')
  #print(extract_response_text(pred))
  print('--------')

</s>Below is a question asked by a user:; Hello! I am running a local instance of Galaxy (build 22.05). I installed the latest version of Deepvariant (1.4.0 + galaxy0) which installed without any errors. However, when I try to run Deepvariant on BAM files output from HISAT2, the error <unk>Fatal error: Exit code 127 () <unk>comes up. Further, it says that the tool generated the following error: <unk>line 9: run _ deepvariant: command not found <unk>. When I look at the backend to see what process Galaxy is going through, even after installation of the tool, the following line keeps repeating on the command line interface: uvicorn.access INFO 2022-12-22 13: 43: 26,629 [pN: main.1, p: 100965, tN: MainThread] 127.0.0.1: 58158 - <unk>GET / api / tool _ shed _ repositories? name = deepvariant & owner = iuc HTTP / 1.1 <unk>200 I don <unk>t understand this error. Could someone please help me out? I am running the same job on Galaxy.eu server and it is running (for a few hours now) but in the 

In [16]:
'''sentence = "COVID-19 is"
inputs = tokenizer(sentence, return_tensors="pt").input_ids.to('cuda')

set_seed(42)

#with torch.no_grad():
beam_output = model.generate(input_ids=inputs,
                            min_length=100,
                            max_length=1024,
                            num_beams=5,
                            early_stopping=True
                            )
    
output = tokenizer.decode(beam_output[0], skip_special_tokens=True)
output
'''

'COVID-19 is a global pandemic caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), the causative agent of coronavirus disease 2019 (COVID-19), which has spread to more than 200 countries and territories, including the United States (US), Canada, Australia, New Zealand, the United Kingdom (UK), and the United States of America (USA), as of March 11, 2020, with more than 800,000 confirmed cases and more than 800,000 deaths.'