In [1]:
import argparse
from concurrent.futures import ProcessPoolExecutor
import json
import transformers
from tqdm import tqdm

from rosemary import jpt_parse_args

In [3]:

# Command-line interface for the ShareGPT splitting step. The same parser is
# reused inside the notebook so that notebook and script runs share defaults.
parser = argparse.ArgumentParser()
# Required: without it `args.in_files` would be None and the loading loop
# would fail later with an unhelpful "NoneType is not iterable" error.
parser.add_argument(
    "--in-files", nargs="+", type=str, required=True,
    help="One or more input JSON files whose contents are concatenated.",
)
parser.add_argument(
    "--out-file", type=str, default="sharegpt_split.json",
    help="Path of the JSON file the processed conversations are written to.",
)
# --begin / --end default to None and are forwarded unchanged to split_all.
parser.add_argument(
    "--begin", type=int,
    help="Optional start bound forwarded to split_all.",
)
parser.add_argument(
    "--end", type=int,
    help="Optional end bound forwarded to split_all.",
)
parser.add_argument(
    "--model-name-or-path", type=str, required=True,
    help="Name or path handed to AutoTokenizer.from_pretrained.",
)
parser.add_argument(
    "--max-length", type=int, default=2048,
    help="Maximum length forwarded to split_all together with the tokenizer.",
)
# Script-mode equivalent, kept for reference:
# args = parser.parse_args()

# Argument string used for the notebook run; it has the same shape as the
# command line the parser expects. All paths are relative, so they resolve
# against the notebook's working directory.
cmd = """
--in-files ../data/raw_train/sharegpt/sg_90k_part1_html_cleaned.json ../data/raw_train/sharegpt/sg_90k_part2_html_cleaned.json
--out-file ../data/raw_train/sharegpt/sharegpt_html_cleaned_and_split.json
--model-name-or-path mosaicml/mpt-7b
"""

# Alternative tokenizer setting (swap into `cmd` above to use it):
# --model-name-or-path huggyllama/llama-7b

# NOTE(review): `jpt_parse_args` is imported from the `rosemary` package and is
# not visible here; presumably it parses `cmd` with `parser` in place of
# sys.argv when running under Jupyter -- confirm against its implementation.
args = jpt_parse_args(parser, cmd=cmd)

In [4]:
# Concatenate the records of every input file into one list.
# Each file is expected to hold a JSON array (its items are appended via
# `extend`). The `with` block closes each handle deterministically instead of
# leaving it to the garbage collector, and UTF-8 is requested explicitly so
# decoding does not depend on the platform's locale.
content = []
for file in args.in_files:
    with open(file, encoding="utf-8") as f:
        content.extend(json.load(f))

In [5]:
# Sanity check of the loaded data: total record count, the keys of a record,
# and the first few turns of one sample conversation.
print(len(content))
print(content[0].keys())

sample_idx = 1  # which record to inspect
conversations = content[sample_idx]['conversations']
print(len(conversations))
# Show the keys of a single turn. The turn is indexed independently of
# `sample_idx`; previously the record index was reused as the turn index,
# which only worked by coincidence.
if conversations:
    print(conversations[0].keys())

N = 3  # number of leading turns to print

# Slicing tolerates conversations with fewer than N turns (indexing via
# range(N) would raise IndexError) and avoids clobbering the record index.
for turn in conversations[:N]:
    print(f"\n===== {turn['from']} =====")
    print(turn['value'])

76920
dict_keys(['id', 'conversations'])
12
dict_keys(['from', 'value'])

===== human =====
Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients...

===== gpt =====
Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:

1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.
2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.
3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.
4. Post-launch: Follow up with customers, gather feedback, and contin

In [7]:

# Tokenizer for the model chosen via --model-name-or-path (mosaicml/mpt-7b in
# the notebook run), configured to pad on the right.
# `use_fast=False` was left disabled, so the library default applies.
tokenizer = transformers.AutoTokenizer.from_pretrained(args.model_name_or_path, padding_side="right")

In [10]:
from split_sharegpt_conversations import filter_invalid_roles, split_all

# Stage 1: split the conversations using the tokenizer and --max-length.
# Recorded number of resulting items per tokenizer:
#   llama:  178604
#   mpt-7b: 168815
split_content = split_all(
    content,
    args.begin,
    args.end,
    tokenizer,
    args.max_length,
)

# Stage 2: apply the role filter to the split result.
# Recorded number of remaining items:
#   mpt-7b: 160564
new_content = filter_invalid_roles(split_content)


Token indices sequence length is longer than the specified maximum sequence length for this model (2149 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2932 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (12931 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2539 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2452 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence len

  2%|▏         | 1503/76920 [00:01<01:13, 1027.95it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2158 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2299 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2650 > 2048). Running this sequence through the model will result in indexing errors
  2%|▏         | 1623/76920 [00:01<01:40, 748.38it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (2422 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2849 > 2048). Running this sequence through the model wil

Token indices sequence length is longer than the specified maximum sequence length for this model (3200 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2399 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2556 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3114 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2576 > 2048). Running this sequence through the model will result in indexing errors
  6%|▌         | 4241/76920 [00:03<01:13, 985.24it/s] Token indices sequence len

 21%|██▏       | 16510/76920 [00:13<00:43, 1382.73it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3125 > 2048). Running this sequence through the model will result in indexing errors
 24%|██▎       | 18207/76920 [00:15<00:42, 1368.50it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2314 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4969 > 2048). Running this sequence through the model will result in indexing errors
 28%|██▊       | 21361/76920 [00:17<00:32, 1706.51it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2269 > 2048). Running this sequence through the model will result in indexing errors
100%|██████████| 76920/76920 [00:59<00:00, 1291.53it/s]


In [11]:

# Report how many records went in and how many came out, then persist the
# result to --out-file as indented JSON.
print(f"total: {len(content)}, new: {len(new_content)}")
# The `with` block guarantees the handle is flushed and closed; passing a bare
# open(...) to json.dump leaves that to the garbage collector and can yield a
# truncated file.
with open(args.out_file, "w", encoding="utf-8") as out_f:
    json.dump(new_content, out_f, indent=2)


total: 76920, new: 160564
