In [1]:
from glob import glob
import json
import os
import re
import sys
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
from typing import List, Dict, Mapping, Tuple, Union, Optional
from transformers import AutoTokenizer

# Register `progress_apply` on pandas objects (used when tokenizing the datasets).
tqdm.pandas()

# The notebook is expected to be started three directories below the repo root.
# Compute ROOT_DIR only once per kernel: the `os.chdir` below changes the current
# working directory, so recomputing it when this cell is re-run would climb three
# more levels above the repo root.
if "ROOT_DIR" not in globals():
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))) # Should be your path to the repo `mint`
# Keep ROOT_DIR at the front of sys.path without stacking duplicates on re-runs.
if sys.path[:1] != [ROOT_DIR]:
    sys.path.insert(0, ROOT_DIR)
os.chdir(ROOT_DIR)
print(f"Working directory: {os.getcwd()}")


Working directory: /home/xingyao6/llm-agent


# Tokenizer and Special Symbols

In [2]:
# Turn delimiters for the chat markup, following
#   https://github.com/openai/openai-python/blob/main/chatml.md
#   https://huggingface.co/OpenAssistant/codellama-13b-oasst-sft-v10
#
# A prompt in this markup looks like:
#
#   <|im_start|>system
#   {system_message}<|im_end|>
#   <|im_start|>user
#   {prompt}<|im_end|>
#   <|im_start|>assistant
INST_START = "<|im_start|>"
INST_END = "<|im_end|>"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Register both delimiters with the tokenizer as additional special tokens,
# then display `len(tokenizer)` as the cell output.
tokenizer.add_special_tokens({"additional_special_tokens": [INST_START, INST_END]})
len(tokenizer)


32002

# Analyze Dataset

In [3]:
# Fragment of the original system prompt that instructs the model to wrap its
# reasoning in <thought> tags. `convert_state_to_traj` swaps it for SYSTEM_MSG_NEW.
SYSTEM_MSG_TO_REPLACE = """At each turn, you should first provide your step-by-step thinking for solving the task. Your thought process should be enclosed using "<thought>" tag, for example: <thought> I need to print "Hello World!" </thought>.

After that, you have two options:"""

# Replacement wording: same instruction without the <thought>-tag requirement.
SYSTEM_MSG_NEW = """At each turn, you should first provide your step-by-step thinking for solving the task. After that, you have two options:"""

def convert_state_to_traj(state) -> List[Mapping[str, str]]:
    """Convert a saved interaction state into a list of chat turns.

    Args:
        state: Mapping with a "history" entry holding a sequence of turns; each
            turn is a mapping with "role" and "content" keys.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts:

        * The first turn (which must have role "user") is split on
          ``"\\n---\\n\\nTask:"`` into system message, in-context example and task.
          It yields a "system" turn (with SYSTEM_MSG_TO_REPLACE swapped for
          SYSTEM_MSG_NEW) and a "user" turn containing only the task; the
          in-context example is dropped.
        * Later "user" turns are copied with surrounding whitespace stripped.
        * Later "assistant" turns additionally have their ``<thought>`` /
          ``</thought>`` tags removed (the enclosed text is kept).
        * Later turns with any other role are skipped.

        A single leading "Assistant:" prefix is removed from every turn's text.

    Raises:
        AssertionError: If the first turn is not a "user" turn or does not split
            into exactly three parts.
    """
    STRIP_PREFIXES = (
        "Assistant:",
    )
    history = state["history"]
    res = []

    for i, turn in enumerate(history):
        role = turn["role"]
        text = turn["content"].strip()
        for prefix in STRIP_PREFIXES:
            # Remove the literal prefix only. `str.lstrip(prefix)` would treat the
            # argument as a *set of characters* and also eat legitimate leading
            # text made of those letters (e.g. "static ..." -> "c ...").
            if text.startswith(prefix):
                text = text[len(prefix):]
        text = text.strip()
        if i == 0:
            assert role == "user"
            # specifically handle the first turn
            splited_text = text.split("\n---\n\nTask:")
            assert len(splited_text) == 3, f"Expecting 3 parts. But got {len(splited_text)} parts: \n{text}"
            system_message, in_context_example, task = splited_text
            system_message = system_message.replace(SYSTEM_MSG_TO_REPLACE, SYSTEM_MSG_NEW)
            res.append({
                "role": "system",
                "content": system_message.strip()
            })
            res.append({
                "role": "user",
                "content": f"Task:\n{task.strip()}"
            })

        elif role == "user":
            res.append({
                "role": "user",
                "content": text
            })
        elif role == "assistant":
            # Drop the <thought> and </thought> tags but keep the text between
            # them, trimming the whitespace right after <thought> and right
            # before </thought>. Text outside the tags is left untouched.
            text = re.sub(r"<thought>(.*?)</thought>", lambda match: match.group(1).strip(), text, flags=re.DOTALL)

            res.append({
                "role": "assistant",
                "content": text
            })
    return res

def format_traj_to_str(traj: List[Mapping[str, str]]) -> str:
    """Render a trajectory as one string in the chat markup.

    Each turn becomes ``INST_START + role + "\\n" + content + INST_END + "\\n"``;
    the rendered turns are concatenated in order. An empty trajectory gives "".
    """
    return "".join(
        f"{INST_START}{turn['role']}\n{turn['content']}{INST_END}\n"
        for turn in traj
    )

def visualize_traj(traj: List[Mapping[str, str]]):
    """Print a trajectory to stdout, one ANSI-coloured block per turn.

    A separator line is printed first and again after every printed turn.
    Turns whose role is not "user", "assistant" or "system" print nothing.
    """
    separator = "=========================="
    reset = "\033[0m"
    # role -> (ANSI colour prefix, label printed before the content)
    styles = {
        "user": ("\033[1;34;40m", "USER"),            # blue
        "assistant": ("\033[1;32;40m", "ASSISTANT"),  # green
        "system": ("\033[1;33;40m", "SYSTEM"),        # yellow
    }

    print(separator)
    for turn in traj:
        style = styles.get(turn["role"])
        if style is None:
            continue
        colour, label = style
        print(colour + f"{label}:\n{turn['content']}" + reset)
        print(separator)


## Token Count

In [4]:
def analyze_dataset(df, max_tokens=4096):
    """Count tokens per conversation, capped at `max_tokens`.

    Each entry of ``df["conversations"]`` is rendered with `format_traj_to_str`
    and tokenized with the module-level `tokenizer`; the number of `input_ids`
    is the conversation's length. Lengths above `max_tokens` are clipped to it.

    Prints the sum of the capped lengths and returns them as a Series named
    "token_length".
    """
    def _count_tokens(rendered):
        return len(tokenizer(rendered)["input_ids"])

    rendered_convs = df["conversations"].apply(format_traj_to_str)
    # `progress_apply` shows a progress bar while tokenizing.
    token_lengths = rendered_convs.progress_apply(_count_tokens)
    token_lengths = token_lengths.rename("token_length")

    # Cap every length at max_tokens before summing.
    capped_lengths = token_lengths.clip(upper=max_tokens)
    total_tokens = capped_lengths.sum()
    print(f"Total number of tokens: {total_tokens:,}")

    return capped_lengths


In [5]:
# Show which dataset files are present under data/datasets/ (relative to the current working directory).
os.listdir("data/datasets/")


['oct24_full5545.jsonl',
 'oct20_apps354.jsonl',
 'openorca.n50000.jsonl',
 'openorca.n10000.jsonl',
 'agent_instruct.jsonl',
 'sharegpt.n10000.jsonl',
 'openorca.n30000.jsonl',
 'oct30_easy8155.jsonl',
 'evolcode.n10000.jsonl',
 'sharegpt.jsonl',
 'nov2_gpt4hard411.jsonl',
 'oct28_full6728.jsonl',
 'sharegpt_gpt4.jsonl']

In [7]:
dataset_dir = 'data/datasets'
DATASET_NAMES = [
    'openorca.n50000.jsonl',
    'openorca.n10000.jsonl',
    'openorca.n30000.jsonl',

    'agent_instruct.jsonl',

    'sharegpt.n10000.jsonl',
    'sharegpt.jsonl',
    'sharegpt_gpt4.jsonl',

    'nov2_gpt4hard411.jsonl',
    'oct24_full5545.jsonl',
]

# Maps dataset file name -> Series of capped per-conversation token lengths.
dataset_stats = {}

for dataset in tqdm(DATASET_NAMES):
    print(f'- Analyzing dataset {dataset}')
    ds_path = os.path.join(dataset_dir, dataset)
    df = pd.read_json(ds_path, lines=True, orient="records")
    if "conversations" not in df.columns:
        # No "conversations" column: treat every JSON line as one conversation
        # and build an (id, conversations) frame from the parsed lines.
        with open(ds_path) as f:
            conversations = [json.loads(line) for line in f]
        df = (
            pd.Series(conversations)
            .to_frame()
            .reset_index()
            .rename(columns={"index": "id", 0: "conversations"})
        )
    print(f"Number of conversations: {len(df)}")
    dataset_stats[dataset] = analyze_dataset(df)


# One summary row per dataset. The lengths are clipped at 4096 again here;
# `analyze_dataset` already caps them at its default of 4096.
dataset_stats_df = []

for dataset_name, seq_lens in dataset_stats.items():
    num_tokens = seq_lens.clip(upper=4096).sum()
    num_examples = len(seq_lens)
    summary_row = {
        "dataset": dataset_name,
        "num_examples": num_examples,
        "num_tokens": num_tokens,
        "avg_tokens": num_tokens / num_examples
    }
    dataset_stats_df.append(summary_row)

dataset_stats_df = pd.DataFrame(dataset_stats_df)


  0%|          | 0/9 [00:00<?, ?it/s]

- Analyzing dataset openorca.n50000.jsonl
Number of conversations: 50000


100%|██████████| 50000/50000 [00:32<00:00, 1555.34it/s]
 11%|█         | 1/9 [00:33<04:26, 33.35s/it]

Total number of tokens: 14,028,347
- Analyzing dataset openorca.n10000.jsonl
Number of conversations: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1775.18it/s]
 22%|██▏       | 2/9 [00:39<02:00, 17.17s/it]

Total number of tokens: 2,813,879
- Analyzing dataset openorca.n30000.jsonl
Number of conversations: 30000


100%|██████████| 30000/30000 [00:18<00:00, 1614.55it/s]
 33%|███▎      | 3/9 [00:58<01:48, 18.04s/it]

Total number of tokens: 8,424,516
- Analyzing dataset agent_instruct.jsonl
Number of conversations: 1866


100%|██████████| 1866/1866 [00:05<00:00, 345.95it/s]
 44%|████▍     | 4/9 [01:03<01:05, 13.10s/it]

Total number of tokens: 2,501,909
- Analyzing dataset sharegpt.n10000.jsonl
Number of conversations: 10000


100%|██████████| 10000/10000 [00:42<00:00, 234.36it/s]
 56%|█████▌    | 5/9 [01:47<01:35, 23.99s/it]

Total number of tokens: 17,932,833
- Analyzing dataset sharegpt.jsonl
Number of conversations: 39537


100%|██████████| 39537/39537 [03:04<00:00, 214.49it/s]
 67%|██████▋   | 6/9 [04:54<03:58, 79.39s/it]

Total number of tokens: 70,947,803
- Analyzing dataset sharegpt_gpt4.jsonl
Number of conversations: 4583


100%|██████████| 4583/4583 [00:51<00:00, 88.80it/s]
 78%|███████▊  | 7/9 [05:46<02:21, 70.67s/it]

Total number of tokens: 10,868,124
- Analyzing dataset nov2_gpt4hard411.jsonl
Number of conversations: 411


100%|██████████| 411/411 [00:01<00:00, 299.74it/s]
 89%|████████▉ | 8/9 [05:48<00:48, 48.64s/it]

Total number of tokens: 592,701
- Analyzing dataset oct24_full5545.jsonl
Number of conversations: 5545


100%|██████████| 5545/5545 [00:17<00:00, 325.31it/s]
100%|██████████| 9/9 [06:05<00:00, 40.63s/it]

Total number of tokens: 8,251,072





In [18]:
# Summary table: largest datasets first, thousands separators on the counts,
# two decimals on the average, and a blue gradient over the numeric columns.
(
    dataset_stats_df
    .sort_values("num_examples", ascending=False)
    .style
    .format({
        "num_examples": "{:,}",
        "num_tokens": "{:,}",
        "avg_tokens": "{:.2f}",
    })
    .background_gradient(
        subset=["num_examples", "num_tokens", "avg_tokens"],
        cmap="Blues",
    )
)


Unnamed: 0,dataset,num_examples,num_tokens,avg_tokens
0,openorca.n50000.jsonl,50000,14028347,280.57
5,sharegpt.jsonl,39537,70947803,1794.47
2,openorca.n30000.jsonl,30000,8424516,280.82
1,openorca.n10000.jsonl,10000,2813879,281.39
4,sharegpt.n10000.jsonl,10000,17932833,1793.28
8,oct24_full5545.jsonl,5545,8251072,1488.02
6,sharegpt_gpt4.jsonl,4583,10868124,2371.4
3,agent_instruct.jsonl,1866,2501909,1340.79
7,nov2_gpt4hard411.jsonl,411,592701,1442.09
