In [1]:
import re
import sys
import torch
from transformers import (LlamaTokenizer, LlamaForCausalLM)

model_path = 'NousResearch/Llama-2-7b-chat-hf'
tokenizer = LlamaTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-12 16:36:29,056] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  return torch._C._cuda_getDeviceCount() > 0


In [2]:
## Clean text
import nltk
import nltk.corpus
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

import pandas as pd
import json
from bs4 import BeautifulSoup
import string
import contractions
import ftfy


## Text preprocessing for fine tuning
## https://www.linkedin.com/pulse/pre-processing-text-data-gpt-models-techniques-best-practices-tilix/
## TODO: https://ftfy.readthedocs.io/en/latest/
## TODO: Read this: https://arxiv.org/abs/2212.10496

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    clean_text = text.translate(translator)
    return clean_text

def lowercase_text(text):
    return text.lower()

def wikitext_detokenizer(string):
    # https://github.com/kingoflolz/mesh-transformer-jax/blob/master/create_finetune_tfrecords.py
    # contractions
    string = string.replace("s '", "s'")
    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    string = string.replace(" " + chr(176) + " ", chr(176))
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")

    return string

def clean_html_text(soup: BeautifulSoup):
    # Process quote aside tags
    rgx = "<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
    cleanr = re.compile(rgx)
    cleantext = re.sub(cleanr, '', str(soup))

    cleantext = re.sub(r'Screen.+KB', '', cleantext)
    cleantext = re.sub(r'[0-9].+KB', '', cleantext)
    cleantext = re.sub(r'Kind.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Dear @.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Hi @.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Hello @.+[a-z0-9]', '', cleantext)
    cleantext = re.sub(r'Cheers', '', cleantext)
    cleantext = re.sub(r'Best .+[a-z0-9]', '', cleantext)
    cleantext = wikitext_detokenizer(cleantext)
    cleantext = lowercase_text(cleantext)
    cleantext = ftfy.fix_text(cleantext)
    tagged_sentence = nltk.tag.pos_tag(cleantext.split())
    cleantext = [word for word, tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    cleantext = " ".join(cleantext)
    # fix contractions
    cleantext = contractions.fix(cleantext)
    return cleantext

[nltk_data] Downloading package stopwords to /home/centos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/centos/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
import pandas as pd
import json


with open("../out/data.json") as fout:
    raw_data = json.load(fout)

user_template = """

### Instruction:

{}

"""

system_template = """### Response:

{} 

"""

end_template = """{}### End"""

agg_conversations = []
for idx_thread, thread in enumerate(raw_data):
    conversations = ""
    prev_owner = ""
    
    
    for idx_post in range(len(thread)):
        post_curr = thread[idx_post]
        soup = BeautifulSoup(post_curr["text"], 'html.parser')
        #systemsoup = BeautifulSoup(system_post["text"], 'html.parser')
        cleantext = clean_html_text(soup)
        #system_cleantext = clean_html_text(systemsoup)
        
        if post_curr["role"] == "user":
            if prev_owner == "user":
                conversations += "\n" + cleantext #post_curr["text"]
            elif prev_owner == "system":
                conversations += "\n" + user_template.format(cleantext) #post_curr["text"]
            else:
                conversations += "\n" + user_template.format(cleantext) #post_curr["text"]

        if post_curr["role"] == "system":
            if prev_owner == "system":
                conversations += "\n" + cleantext #post_curr["text"]
            elif prev_owner == "user":
                conversations += "\n" + system_template.format(cleantext)
            else:
                conversations += "\n" + user_template.format(cleantext)

        prev_owner = post_curr["role"]
    agg_conversations.append(end_template.format(conversations))

# create dataframe
test_conv_dataframe = pd.DataFrame(agg_conversations, columns=["conversations"])
test_conv_dataframe

  soup = BeautifulSoup(post_curr["text"], 'html.parser')
  soup = BeautifulSoup(post_curr["text"], 'html.parser')


Unnamed: 0,conversations
0,"\n\n\n### Instruction:\n\nhi, i have a very ba..."
1,"\n\n\n### Instruction:\n\nhi, i met an error w..."
2,"\n\n\n### Instruction:\n\nhi, i am attempting ..."
3,\n\n\n### Instruction:\n\nsubmitting a job to ...
4,\n\n\n### Instruction:\n\ni need a tool which ...
...,...
1406,"\n\n\n### Instruction:\n\nhello, . i am workin..."
1407,\n\n\n### Instruction:\n\ni have been trying t...
1408,\n\n\n### Instruction:\n\ni am trying to follo...
1409,\n\n\n### Instruction:\n\nhello. two questions...


In [4]:
#test_conv_dataframe.to_csv("../data/test-conversations.csv", sep="\t", index=None)
test_conv_dataframe.to_csv("../data/documents_conversations_galaxy_help.csv", sep="\t", index=None)

In [3]:
import re
import sys
import torch
from transformers import (LlamaTokenizer, LlamaForCausalLM)

model_path = 'openlm-research/open_llama_3b_v2'

'''model = LlamaForCausalLM.from_pretrained(
    model_path, 
    load_in_8bit=True,
    device_map='auto', 
    # If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or 'sequential'.
)

model.config.use_cache = False
model.config.pretraining_tp = 1'''

tokenizer = LlamaTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


[2023-11-24 08:58:46,792] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [11]:
import re

def extract_ins_res_pairs(input_string):
    pairs_ins_res = []
    size_pairs = []
    pattern = "Below is an instruction that describes a task. Write a response that appropriately completes the request"
    matches = [m.start() for m in re.finditer(pattern, input_string)]
    #print(matches, len(input_string))
    for i in range(len(matches)-1):
        f,n = matches[i], matches[i+1]
        if i+1 < len(matches)-1:
            extracted_conv = input_string[f:n]
        else:
            extracted_conv = input_string[matches[i+1]:len(input_string)]
        input_ids = tokenizer.encode(extracted_conv, return_tensors="pt")
        pairs_ins_res.append(extracted_conv)
        size_pairs.append(len(input_ids[0]))
    return pairs_ins_res, size_pairs

ins_res_dataset = []
prs_size_dataset = []
for idx, item in test_conv_dataframe.iterrows():
    prs, size_prs = extract_ins_res_pairs(item["conversations"])
    ins_res_dataset.extend(prs)
    prs_size_dataset.extend(size_prs)
    print(idx, len(ins_res_dataset), len(prs_size_dataset))
    if idx == 2:
        break
individual_conversations = pd.DataFrame(zip(ins_res_dataset, prs_size_dataset), columns=["conversations", "tensor_size"])

0 7 7
1 15 15
2 15 15


In [13]:
size = individual_conversations["tensor_size"]

In [14]:
import numpy as np

len(size), np.mean(size), np.median(size)

(15, 659.8, 645.0)

In [15]:
small_df = individual_conversations #individual_conversations[individual_conversations["tensor_size"] <= 200]
small_df

Unnamed: 0,conversations,tensor_size
0,Below is an instruction that describes a task....,1368
1,Below is an instruction that describes a task....,497
2,Below is an instruction that describes a task....,917
3,Below is an instruction that describes a task....,732
4,Below is an instruction that describes a task....,639
5,Below is an instruction that describes a task....,103
6,Below is an instruction that describes a task....,1027
7,Below is an instruction that describes a task....,498
8,Below is an instruction that describes a task....,908
9,Below is an instruction that describes a task....,868


In [16]:
small_df.to_csv("../data/small-conversations.csv", sep="\t", index=None)