In [1]:
import re
import sys
import torch
from transformers import (LlamaTokenizer, LlamaForCausalLM)

# Model identifier handed to `from_pretrained`; only the tokenizer is loaded in this cell.
model_path = 'NousResearch/Llama-2-7b-chat-hf'
# The tokenizer is used further down to count tokens per extracted conversation.
tokenizer = LlamaTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


[2023-12-01 17:01:46,048] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
import nltk
import nltk.corpus
# Fetch the NLTK resources needed by this notebook, in the original order.
for _nltk_resource in ('stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/centos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/centos/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
import pandas as pd
import json
from bs4 import BeautifulSoup
import string
import contractions
import ftfy


## Text preprocessing for fine tuning
## https://www.linkedin.com/pulse/pre-processing-text-data-gpt-models-techniques-best-practices-tilix/
## TODO: https://ftfy.readthedocs.io/en/latest/
## TODO: Read this: https://arxiv.org/abs/2212.10496

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

def lowercase_text(text):
    """Return a lower-cased copy of ``text`` using its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def wikitext_detokenizer(string):
    """Undo WikiText-style tokenisation artefacts in ``string``.

    Re-attaches apostrophes, collapses the ``@-@`` / ``@,@`` / ``@.@`` number
    separators, removes the space before common punctuation marks, trims
    whitespace just inside brackets and quotes, and normalises a few
    miscellaneous sequences (heading markers, degree sign, spaces around
    newlines, `` N `` and `` 's``).

    Note: the parameter is named ``string``, so the standard-library ``string``
    module is shadowed inside this function (it is not needed here).

    Args:
        string: Text to clean up.

    Returns:
        The detokenised text.
    """
    # https://github.com/kingoflolz/mesh-transformer-jax/blob/master/create_finetune_tfrecords.py
    # contractions
    string = string.replace("s '", "s'")
    # Re-attach a digit to the apostrophe in front of it ("' 90s" -> "'90s").
    # The earlier pattern was a sed-style expression with literal slashes and a
    # literal "[0-9]" replacement, so it never matched ordinary text.
    string = re.sub(r"' ([0-9])", r"'\1", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets: strip whitespace directly inside each bracket/quote pair
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous (longest heading marker first so shorter ones do not split it)
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    string = string.replace(" " + chr(176) + " ", chr(176))  # chr(176) is the degree sign
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")

    return string

def clean_html_text(soup: BeautifulSoup):
    """Convert one parsed post into cleaned, lower-cased plain text.

    The tree is rewritten in place (quotes, links, images and SVG elements are
    replaced or removed), then serialised and passed through a series of regex
    and text-normalisation steps.

    Args:
        soup: Parsed HTML of a single post. It is mutated by this function.

    Returns:
        The cleaned text as a string.
    """
    # Process quote aside tags
    for quote in soup.select('aside.quote'):
        blockquote = quote.find('blockquote')
        # find() returns None when the aside has no <blockquote>; fall back to
        # the aside's own text so a malformed quote does not raise.
        quoted = blockquote if blockquote is not None else quote
        blockquote_content = quoted.get_text(strip=True)
        quote.replace_with(f"@quote\n{blockquote_content}")

    # Remove all a.anchor elements
    for anchor in soup.select('a.anchor'):
        anchor.decompose()

    # Rewrite the remaining links as "@link" markers; the link text is kept
    # only when it is not already contained in the URL.
    for a_tag in soup.find_all('a'):
        text = a_tag.get_text()
        href = a_tag.get('href', '')

        if text in href:
            a_tag.replace_with(f"@link {href}")
        else:
            a_tag.replace_with(f"{text} (@link {href})")

    # Replace all img tags with their alt text
    for img in soup.find_all('img'):
        alt_text = img.get('alt', '')
        img.replace_with(alt_text)

    # Remove all svg elements
    for svg in soup.find_all('svg'):
        svg.decompose()

    # Strip any remaining tags and HTML entities from the serialised tree.
    rgx = "<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
    cleanr = re.compile(rgx)
    cleantext = re.sub(cleanr, '', str(soup))

    # Drop boilerplate fragments (presumably attachment captions, greetings and
    # sign-offs). The patterns are greedy: each removes everything up to the
    # last matching character on the same line.
    cleantext = re.sub(r'Screen.+KB', '', cleantext)
    cleantext = re.sub(r'[0-9].+KB', '', cleantext)
    cleantext = re.sub(r'Kind.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Dear @.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Hi @.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Hello @.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Cheers', '', cleantext)
    cleantext = re.sub(r'Best .+[a-z0-9]', '', cleantext)
    cleantext = wikitext_detokenizer(cleantext)
    cleantext = lowercase_text(cleantext)
    cleantext = ftfy.fix_text(cleantext)

    # Drop tokens tagged as proper nouns (NNP / NNPS).
    # NOTE(review): the text has already been lower-cased at this point; confirm
    # that the tagger still labels the intended tokens as proper nouns.
    tagged_sentence = nltk.tag.pos_tag(cleantext.split())
    cleantext = [word for word, tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    # Joining on single spaces also collapses all original whitespace and newlines.
    cleantext = " ".join(cleantext)
    # fix contractions
    cleantext = contractions.fix(cleantext)
    return cleantext


# Load the raw threads consumed by the aggregation loop below. JSON text is
# UTF-8 by specification, so decode it explicitly instead of relying on the
# platform's default encoding.
with open("../out/data.json", encoding="utf-8") as data_file:
    raw_data = json.load(data_file)

# Single-turn template with two slots (instruction, response); not referenced
# in the cells visible here.
prompt_template_no_sys = "\n[INST] {} [/INST] {}\n"

# Opens a user turn: generic task header, the [INST] marker, then the post text.
user_template = (
    "\n"
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
    "[INST]\n"
    "{}\n"
)

# Used for a user turn that follows a system turn; the text is currently
# identical to ``user_template``.
user_template_end_tag = user_template

# Closes the instruction and introduces the reply text.
system_template = "[/INST] {}"

# Wrapper applied to a whole aggregated thread (currently the identity).
end_template = "{}"

# Build one training string per thread by walking its posts in order.
# A post whose role repeats the previous one is appended verbatim; a role
# change opens a new templated turn. A "system" post that does not follow a
# "user" or "system" post contributes nothing.
agg_conversations = []
for thread in raw_data:
    segments = []
    prev_owner = ""

    for post_curr in thread:
        soup = BeautifulSoup(post_curr["text"], 'html.parser')
        cleantext = clean_html_text(soup)
        role = post_curr["role"]

        if role == prev_owner and role in ("user", "system"):
            # Same speaker as before: continue the current turn.
            segments.append("\n" + cleantext)
        elif role == "user":
            if prev_owner == "system":
                segments.append("\n" + user_template_end_tag.format(cleantext))
            else:
                segments.append("\n" + user_template.format(cleantext))
        elif role == "system" and prev_owner == "user":
            segments.append("\n" + system_template.format(cleantext))

        prev_owner = role

    conversations = "".join(segments)
    agg_conversations.append(end_template.format(conversations))

# create dataframe
test_conv_dataframe = pd.DataFrame(agg_conversations, columns=["conversations"])
test_conv_dataframe

  soup = BeautifulSoup(post_curr["text"], 'html.parser')
  soup = BeautifulSoup(post_curr["text"], 'html.parser')


Unnamed: 0,conversations
0,\n\nBelow is an instruction that describes a t...
1,\n\nBelow is an instruction that describes a t...
2,\n\nBelow is an instruction that describes a t...
3,\n\nBelow is an instruction that describes a t...
4,\n\nBelow is an instruction that describes a t...
...,...
1406,\n\nBelow is an instruction that describes a t...
1407,\n\nBelow is an instruction that describes a t...
1408,\n\nBelow is an instruction that describes a t...
1409,\n\nBelow is an instruction that describes a t...


In [4]:
# Tab-separated export of the aggregated threads (despite the .csv suffix).
# index=False is the documented way to omit the index column.
test_conv_dataframe.to_csv("../data/test-conversations-all-conv.csv", sep="\t", index=False)

In [13]:
import re

pre_instruction = "Act like Bioinformatician who uses Galaxy platform for biological data analysis. Understand the following instruction and prepare a suitable response."

def extract_ins_res_pairs(input_string):
    """Split one aggregated conversation into instruction/response samples.

    The input is cut at every occurrence of the generic task header. A segment
    is kept only if it contains ``[/INST]``, i.e. the instruction is followed
    by a reply. Each kept segment is prefixed with the module-level
    ``pre_instruction`` text.

    Args:
        input_string: Aggregated conversation text of one thread.

    Returns:
        A tuple ``(pairs, sizes)``: the kept samples, and for each one the
        number of token ids the module-level ``tokenizer`` produces for the
        segment. The count is taken before ``pre_instruction`` is prepended.
    """
    pattern = "Below is an instruction that describes a task. Write a response that appropriately completes the request." #[INST]
    # The header is literal text; escape it so the "." characters are not
    # treated as regex wildcards.
    matches = [m.start() for m in re.finditer(re.escape(pattern), input_string)]
    # Every segment ends where the next header starts; the last one runs to
    # the end of the string.
    segment_ends = matches[1:] + [len(input_string)]

    pairs_ins_res = []
    size_pairs = []
    for start, end in zip(matches, segment_ends):
        extracted_conv = input_string[start + len(pattern):end]
        if "[/INST]" not in extracted_conv:
            continue
        input_ids = tokenizer.encode(extracted_conv, return_tensors="pt")
        pairs_ins_res.append(pre_instruction + "\n" + extracted_conv)
        size_pairs.append(len(input_ids[0]))
    return pairs_ins_res, size_pairs

# Flatten the per-thread samples and their token counts into two parallel lists.
ins_res_dataset = []
prs_size_dataset = []
for conversation_text in test_conv_dataframe["conversations"]:
    prs, size_prs = extract_ins_res_pairs(conversation_text)
    ins_res_dataset += prs
    prs_size_dataset += size_prs

# One row per extracted sample.
sample_rows = list(zip(ins_res_dataset, prs_size_dataset))
individual_conversations = pd.DataFrame(sample_rows, columns=["conversations", "tensor_size"])

In [14]:
# Token count of each extracted sample.
size = individual_conversations["tensor_size"]

In [15]:
import numpy as np

# Number of samples, then mean and median token count per sample.
len(size), np.mean(size), np.median(size)

(1989, 362.74459527400705, 235.0)

In [16]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
small_df = individual_conversations
small_df

Unnamed: 0,conversations,tensor_size
0,Act like Bioinformatician who uses Galaxy plat...,381
1,Act like Bioinformatician who uses Galaxy plat...,429
2,Act like Bioinformatician who uses Galaxy plat...,504
3,Act like Bioinformatician who uses Galaxy plat...,570
4,Act like Bioinformatician who uses Galaxy plat...,492
...,...,...
1984,Act like Bioinformatician who uses Galaxy plat...,192
1985,Act like Bioinformatician who uses Galaxy plat...,108
1986,Act like Bioinformatician who uses Galaxy plat...,179
1987,Act like Bioinformatician who uses Galaxy plat...,267


In [17]:
# Tab-separated export of the extracted samples (despite the .csv suffix).
# index=False is the documented way to omit the index column.
small_df.to_csv("../data/all-conv-galaxy-q-a.csv", sep="\t", index=False)

In [18]:
# Optional length filter, currently disabled: would keep only samples whose
# tensor_size is at most 700.
#small_df = small_df[small_df["tensor_size"] <= 700]
small_df

Unnamed: 0,conversations,tensor_size
0,Act like Bioinformatician who uses Galaxy plat...,381
1,Act like Bioinformatician who uses Galaxy plat...,429
2,Act like Bioinformatician who uses Galaxy plat...,504
3,Act like Bioinformatician who uses Galaxy plat...,570
4,Act like Bioinformatician who uses Galaxy plat...,492
...,...,...
1984,Act like Bioinformatician who uses Galaxy plat...,192
1985,Act like Bioinformatician who uses Galaxy plat...,108
1986,Act like Bioinformatician who uses Galaxy plat...,179
1987,Act like Bioinformatician who uses Galaxy plat...,267


In [19]:
# Re-export to the same path as the earlier export cell; with the length
# filter commented out, small_df is unchanged since then.
# index=False is the documented way to omit the index column.
small_df.to_csv("../data/all-conv-galaxy-q-a.csv", sep="\t", index=False)

In [20]:
# Print the first samples for a visual check. The loop stops once the index
# label equals 10, i.e. after 11 rows when the index is the default 0..n-1.
for i, row in small_df.iterrows():
    print(row["conversations"])
    print("-------------------")
    if i == 10:
        break

Act like Bioinformatician who uses Galaxy platform for biological data analysis. Understand the following instruction and prepare a suitable response.

[INST]
hi, i have a very basic notebook running in our local galaxy portal with interactive tools. as you see, put() and get() functions are not recognized: the tool-wrapper is the following taken from some official galaxy jupyterlab/jupyternotebook tutorial page. what might i be missing to be able to have get() and put() working? tool wrapper: default empty notebook, reuse a previous one, or upload a new quay.io/bgruening/docker-jupyter-notebook:2021-03-05 8888 lab $__galaxy_url__ 8080 re export galaxy_working_dir=`pwd` mkdir mkdir re.sub('[^\w\-\.]', str($input.element_identifier)) ln '$input' './jupyter/data/${cleaned_name}' && ## change into the directory where the notebooks are located cd export path=/home/jovyan/.local/bin:\$path copy default notebook cp '$__tool_directory__/default_notebook.ipynb' jupyter trust jupyter lab cp '$j