In [1]:
import re
import sys
import torch
from transformers import (LlamaTokenizer, LlamaForCausalLM)

# Hugging Face model identifier of the checkpoint whose tokenizer is loaded below.
model_path = "NousResearch/Llama-2-7b-chat-hf"
# Only the tokenizer is instantiated here; a later cell uses it to count the
# tokens of each formatted conversation.
tokenizer = LlamaTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


[2024-02-07 10:35:47,790] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
## Clean text
import nltk
import nltk.corpus
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

import pandas as pd
import json
from bs4 import BeautifulSoup
import string
import contractions
import ftfy


## Text preprocessing for fine tuning
## https://www.linkedin.com/pulse/pre-processing-text-data-gpt-models-techniques-best-practices-tilix/
## TODO: https://ftfy.readthedocs.io/en/latest/
## TODO: Read this: https://arxiv.org/abs/2212.10496

def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

def lowercase_text(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def wikitext_detokenizer(string):
    """Undo WikiText-style tokenisation artefacts in ``string``.

    The rules are applied in a fixed order: contractions, number separators
    (``@-@``, ``@,@``, ``@.@``), spacing before punctuation, padding inside
    brackets and quotes, then miscellaneous heading/degree/newline fixes.

    Args:
        string: Text to detokenise. Inside this function the parameter name
            shadows the imported ``string`` module.

    Returns:
        The detokenised text.
    """
    # https://github.com/kingoflolz/mesh-transformer-jax/blob/master/create_finetune_tfrecords.py
    # contractions
    string = string.replace("s '", "s'")
    # Re-attach an apostrophe to the digit that follows it ("' 90s" -> "'90s").
    # The earlier pattern kept JavaScript-style "/.../" delimiters and a literal
    # "[0-9]" in the replacement, so it never matched ordinary text and would
    # have inserted the literal characters "[0-9]" if it had.
    string = re.sub(r"' ([0-9])", r"'\1", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation: drop the space that precedes the mark
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets: strip whitespace padding just inside each delimiter pair
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    # Longest heading marker first so shorter replacements do not split it.
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    # chr(176) is the degree sign.
    string = string.replace(" " + chr(176) + " ", chr(176))
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")

    return string

def clean_html_text(soup: BeautifulSoup):
    """Turn a parsed forum post into plain, normalised text.

    Steps, in order: strip markup and entities from ``str(soup)``, delete
    spans matching a fixed list of boilerplate patterns, detokenise,
    lower-case, repair the text with ``ftfy``, drop words tagged as proper
    nouns, and expand contractions.

    Args:
        soup: The parsed post; it is converted with ``str(soup)`` before any
            cleaning happens.

    Returns:
        The cleaned text as a single space-joined string.
    """
    # Tags ("<...>") and lower-case named / numeric / hex entities are removed
    # outright; entities are not decoded into their characters.
    markup_pattern = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    text = markup_pattern.sub('', str(soup))

    # Applied in this exact order. "." does not match a newline here, so each
    # greedy ".+" removal stays within a single line.
    # Presumably these target attachment captions, greetings and sign-offs --
    # TODO confirm against the raw posts.
    boilerplate_patterns = (
        r'Screen.+KB',
        r'[0-9].+KB',
        r'Kind.+[a-z0-9]',
        r'Dear @.+[a-z0-9]',
        r'Hi @.+[a-z0-9]',
        r'Hello @.+[a-z0-9]',
        r'Cheers',
        r'Best .+[a-z0-9]',
    )
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text)

    text = wikitext_detokenizer(text)
    text = lowercase_text(text)
    text = ftfy.fix_text(text)

    # NOTE(review): the text is lower-cased before POS tagging; if the tagger
    # relies on capitalisation to spot proper nouns, this filter may remove
    # very little -- confirm the intended order.
    proper_noun_tags = ('NNP', 'NNPS')
    kept_words = [
        word
        for word, tag in nltk.tag.pos_tag(text.split())
        if tag not in proper_noun_tags
    ]

    # fix contractions
    return contractions.fix(" ".join(kept_words))

[nltk_data] Downloading package stopwords to /home/centos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/centos/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
import pandas as pd
import json

file_name = "data_biostars_q_a.json" #"data_galaxy_q_a.json"
output_file_name = "conversations-biostars-q-a.csv" #"conversations-galaxy-q-a.csv"

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open("../out/" + file_name, encoding="utf-8") as fin:
    raw_data = json.load(fin)

# Reference layout of the Llama-2 chat prompt (kept for documentation only):
#   <s>[INST] <<SYS>>\n \n
#   <</SYS>>\n\n {} [/INST] {} </s>
#   <s>[INST] {user_message_2} [/INST]

#system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
system_message = "Galaxy is a web server to process scientific datasets. Act like a Bioinformatician who uses the Galaxy platform for biological data analysis. Understand the following instructions and prepare a suitable response."

# Template with an inline system block; defined here but not used by the loop below.
prompt_template = """
[INST] <<SYS>> \n {} \n <</SYS>> \n\n {} [/INST] {}
"""

# Template actually used: the system message is prepended as plain text instead.
prompt_template_no_sys = """
[INST] {} [/INST] {}
"""

agg_conversations = []   # formatted prompt + answer strings
size_conversations = []  # token count of each formatted string

for thread in raw_data:
    # Only the first two posts of a thread are used. Threads with fewer than
    # two posts are skipped, like any other unusable thread, instead of
    # raising IndexError.
    if len(thread) < 2:
        continue
    user_post, system_post = thread[0], thread[1]
    if user_post["role"] != "user" or system_post["role"] != "system":
        continue

    usersoup = BeautifulSoup(user_post["text"], 'html.parser')
    systemsoup = BeautifulSoup(system_post["text"], 'html.parser')
    user_cleantext = clean_html_text(usersoup)
    system_cleantext = clean_html_text(systemsoup)
    conversations = system_message + "\n" + prompt_template_no_sys.format(user_cleantext, system_cleantext)
    # encode(..., return_tensors="pt") returns a batch of one sequence, so the
    # token count is the length of its first row.
    input_ids = tokenizer.encode(conversations, return_tensors="pt")
    size_conversations.append(len(input_ids[0]))
    agg_conversations.append(conversations)

# create dataframe
test_conv_dataframe = pd.DataFrame(
    {"conversations": agg_conversations, "tokens": size_conversations}
)
test_conv_dataframe

  systemsoup = BeautifulSoup(system_post["text"], 'html.parser')
  usersoup = BeautifulSoup(user_post["text"], 'html.parser')
  systemsoup = BeautifulSoup(system_post["text"], 'html.parser')


Unnamed: 0,conversations,tokens
0,Galaxy is a web server to process scientific d...,311
1,Galaxy is a web server to process scientific d...,1584
2,Galaxy is a web server to process scientific d...,1452
3,Galaxy is a web server to process scientific d...,429
4,Galaxy is a web server to process scientific d...,131
...,...,...
4798,Galaxy is a web server to process scientific d...,1757
4799,Galaxy is a web server to process scientific d...,660
4800,Galaxy is a web server to process scientific d...,193
4801,Galaxy is a web server to process scientific d...,203


In [13]:
import numpy as np

# Per-conversation token counts and their summary statistics, displayed as
# (number of conversations, mean, median).
size = test_conv_dataframe["tokens"]
n_conversations = len(size)
mean_tokens = np.mean(size)
median_tokens = np.median(size)
n_conversations, mean_tokens, median_tokens

(4803, 548.2904434728295, 413.0)

In [14]:
#small_df = test_conv_dataframe[test_conv_dataframe["tokens"] <= 700]
#small_df

In [15]:
# Write the conversations table without the row index.
# The file is tab-separated (sep="\t") even though output_file_name ends in
# ".csv"; readers must pass the same separator.
# index=False is the documented boolean form; the previous index=None only
# worked because None is falsy.
test_conv_dataframe.to_csv("../data/" + output_file_name, sep="\t", index=False)