# Getting started with OpenAssistant OASST1 data

- https://huggingface.co/datasets/OpenAssistant/oasst1

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/master/notebooks/openassistant-oasst1/getting-started.ipynb)

In [19]:
# uncomment and run below lines to set up if running in colab
# %pip install datasets pandas treelib

# Imports

In [20]:
import pandas as pd
from datasets import load_dataset
from treelib import Tree

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)


def add_tree_level(df):
    """helper function to add tree level to a df"""

    # if tree level already exists, return df
    if "tree_level" in df.columns:
        return df

    else:
        tree_level_map = {}

        # iterate over rows in df
        for i, row in df.iterrows():
            message_id = row["message_id"]
            parent_id = row["parent_id"]

            # if parent_id is None, then it is a root message
            if parent_id is None:
                tree_level_map[message_id] = 0
            # if parent_id is the same as message_tree_id, then it is a direct reply to the root message
            elif parent_id == row["message_tree_id"]:
                tree_level_map[message_id] = 1
            # else just look up the tree level of the parent_id and add 1
            else:
                tree_level_map[message_id] = tree_level_map[parent_id] + 1

        # create a df from the tree_level_map and merge it with the original df
        df_tree_level_map = (
            pd.DataFrame.from_dict(
                tree_level_map, orient="index", columns=["tree_level"]
            )
            .reset_index()
            .rename(columns={"index": "message_id"})
        )

        return df.merge(df_tree_level_map, on="message_id")

# Load Dataset

In [21]:
# load dataset from huggingface datasets
ds = load_dataset("OpenAssistant/oasst1")
print(ds)

Found cached dataset parquet (/home/passaglia/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 125.36it/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})





# Create Pandas Dataframe

In [22]:
# lets convert the train dataset to a pandas df
# df = ds["train"].to_pandas()
# lets convert the val dataset to a pandas df
#df = ds["validation"].to_pandas()
# lets convert the entire dataset to a pandas df
df = pd.concat([ds["train"].to_pandas(), ds['validation'].to_pandas()])

In [23]:
# look at the df info
df.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 88838 entries, 0 to 4400
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       88838 non-null  object 
 1   parent_id        78474 non-null  object 
 2   user_id          88838 non-null  object 
 3   created_date     88838 non-null  object 
 4   text             88838 non-null  object 
 5   role             88838 non-null  object 
 6   lang             88838 non-null  object 
 7   review_count     88838 non-null  int32  
 8   review_result    88106 non-null  object 
 9   deleted          88838 non-null  bool   
 10  rank             51263 non-null  float64
 11  synthetic        88838 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         76093 non-null  object 
 14  message_tree_id  88838 non-null  object 
 15  tree_state       88838 non-null  object 
 16  emojis           75315 non-null  object 
 17  labels           8

In [24]:
# look at a sample row in a json format we can easily read
df.sample(1).transpose().to_dict()

{49686: {'message_id': '7b933c65-ee07-4478-acbb-b9bb9353b2ff',
  'parent_id': '22237a3e-2650-4328-86d9-14bb6a88435e',
  'user_id': '33d896a1-47c3-4cb7-99f7-9a5c97a98c8d',
  'created_date': '2023-02-10T17:33:35.284937+00:00',
  'text': 'Para poder crear un men√∫ teniendo en cuenta costos, ingrediente y preparaciones necesito saber costos, ingredientes y preparaciones',
  'role': 'assistant',
  'lang': 'es',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': 2.0,
  'synthetic': False,
  'model_name': None,
  'detoxify': {'toxicity': 0.0005959299742244184,
   'severe_toxicity': 0.002321615582332015,
   'obscene': 0.01839069463312626,
   'identity_attack': 0.0021741921082139015,
   'insult': 0.015613683499395847,
   'threat': 0.0011323046637699008,
   'sexual_explicit': 0.0006280389497987926},
  'message_tree_id': '22237a3e-2650-4328-86d9-14bb6a88435e',
  'tree_state': 'ready_for_export',
  'emojis': {'name': array(['-1'], dtype=object),
   'count': array([1], dtyp

In [25]:
from numpy.random import choice

# lets grab a random message tree
message_tree_id = choice(df.query("lang == 'ja'")["message_tree_id"].unique())
print(message_tree_id)

762fa281-e397-474e-b6e0-4b475b259f54


In [26]:
len(df.query("lang == 'ja'")["message_tree_id"].unique())

50

In [27]:
message_tree_id = 'a7ed3aef-5112-4277-bba8-3393f9d0002d'

In [28]:
# look at all data for this message tree
df_message_tree = df.query(f"message_tree_id == '{message_tree_id}'").sort_values(
    "created_date"
)

# add tree level to df
df_message_tree = add_tree_level(df_message_tree)

df_message_tree.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,tree_level
0,a7ed3aef-5112-4277-bba8-3393f9d0002d,,03a73c05-ce1d-4348-b305-ee5fa58eacc3,2023-02-09T17:21:09.537864+00:00,Implementa el algoritmo `bubble sort` en C.,prompter,es,4,True,False,,False,,"{'toxicity': 0.0006991538684815168, 'severe_to...",a7ed3aef-5112-4277-bba8-3393f9d0002d,ready_for_export,"{'name': ['+1', 'red_flag', '_skip_reply', '_s...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",0
1,ac99b9fb-fc98-4f8a-8fb5-dad2947c470b,a7ed3aef-5112-4277-bba8-3393f9d0002d,3050f574-1316-449c-9723-ee73eb211245,2023-02-10T04:31:06.704280+00:00,xdxdxd,assistant,es,3,True,False,2.0,False,,"{'toxicity': 0.000538912252523005, 'severe_tox...",a7ed3aef-5112-4277-bba8-3393f9d0002d,ready_for_export,"{'name': ['-1'], 'count': [3]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
2,6f5eee6f-57c9-47f1-8c86-69aa87793c9d,a7ed3aef-5112-4277-bba8-3393f9d0002d,bc65a4e8-9a92-4978-96d8-b3b25df987f2,2023-02-10T04:39:30.409478+00:00,#include <stdio.h>\n\nvoid bubbleSort(int arr[...,assistant,es,4,True,False,1.0,False,,"{'toxicity': 0.0003762982087209821, 'severe_to...",a7ed3aef-5112-4277-bba8-3393f9d0002d,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
3,2f576231-c86a-4be1-8f23-56d809f04d93,a7ed3aef-5112-4277-bba8-3393f9d0002d,53e0d057-4983-4c15-ba36-5dcf78c711b6,2023-02-10T04:41:03.658933+00:00,Aqu√≠ te presento una implementaci√≥n del algori...,assistant,es,3,True,False,0.0,False,,"{'toxicity': 0.0003566639788914472, 'severe_to...",a7ed3aef-5112-4277-bba8-3393f9d0002d,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch...",1


## Create Message Tree

In [29]:
# lets create a tree of message ids
id_tree = Tree()
# lets create a tree of message texts
text_tree = Tree()
# lets set a max char length for the text
max_char_len = 10000

# iterate over rows in df_message_tree
for i, row in df_message_tree.iterrows():
    # grab the message_id, parent_id, text, and parent text
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    text = row["text"]
    text_short = text[:max_char_len] if len(text) > max_char_len else text
    text_short = text_short.replace("\n", " ")
    parent_text = (
        df_message_tree.query(f"message_id == '{parent_id}'")["text"].values[0]
        if parent_id is not None
        else "ROOT"
    )
    parent_text_short = (
        parent_text[:max_char_len] if len(parent_text) > max_char_len else parent_text
    )
    parent_text_short = parent_text_short.replace("\n", " ")

    # create a node in the id_tree and text_tree, add row as data in case want it later
    id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())

    # if parent_id is None, then it is a root message so dont add parent text as is none
    if parent_id is None:
        text_tree.create_node(text_short, text_short)
    # else use the parent text short as the parent
    else:
        text_tree.create_node(text_short, text_short, parent=parent_text_short)

print("id_tree:")
id_tree.show()

print("text_tree:")
text_tree.show()

id_tree:
a7ed3aef-5112-4277-bba8-3393f9d0002d
‚îú‚îÄ‚îÄ 2f576231-c86a-4be1-8f23-56d809f04d93
‚îú‚îÄ‚îÄ 6f5eee6f-57c9-47f1-8c86-69aa87793c9d
‚îî‚îÄ‚îÄ ac99b9fb-fc98-4f8a-8fb5-dad2947c470b

text_tree:
Implementa el algoritmo `bubble sort` en C.
‚îú‚îÄ‚îÄ #include <stdio.h>  void bubbleSort(int arr[], int n) {   int i, j;   for (i = 0; i < n-1; i++) {     for (j = 0; j < n-i-1; j++) {       if (arr[j] > arr[j+1]) {         int temp = arr[j];         arr[j] = arr[j+1];         arr[j+1] = temp;       }     }   } }  int main() {   int arr[] = {64, 34, 25, 12, 22, 11, 90};   int n = sizeof(arr)/sizeof(arr[0]);    printf("Array antes del ordenamiento:\n");   int i;   for (i = 0; i < n; i++) {     printf("%d ", arr[i]);   }   printf("\n");    bubbleSort(arr, n);    printf("Array despu√©s del ordenamiento:\n");   for (i = 0; i < n; i++) {     printf("%d ", arr[i]);   }   printf("\n");    return 0; }  El c√≥digo utiliza dos bucles for anidados para comparar cada par de elementos consecutivos en e

# Make an eval dataset

In [39]:
output = []

# pretext = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
# username = 'Human'
# assistantname = 'Assistant'
# space = ' '
# semicolon = ":"

# pretext = "Â•ΩÂ•áÂøÉÊó∫Áõõ„Å™‰∫∫Èñì„Å®AI„Ç¢„Ç∑„Çπ„Çø„É≥„Éà„Å®„ÅÆÂØæË©±„ÄÇAI„Ç¢„Ç∑„Çπ„Çø„É≥„Éà„ÅØ„ÄÅ„É¶„Éº„Ç∂„Éº„Åã„Çâ„ÅÆË≥™Âïè„Å´ÂØæ„Åó„ÄÅË©≥Á¥∞„ÅßÂΩπ„Å´Á´ã„Å§‰∏ÅÂØß„Å™ÂõûÁ≠î„Çí„Åó„Åæ„Åô„ÄÇ"
# username = '„É¶„Éº„Ç∂„Éº'
# assistantname = '„Ç¢„Ç∑„Çπ„Çø„É≥„Éà'
# space = '„ÄÄ'
# semicolon = "Ôºö"

# <|assistantlabel|> -> "### Assistant:"
# <|userlabel|> -> "### Human:"

role_label_dict = {"assistant": f"<|assistantlabel|>", "prompter": f"<|userlabel|>"}
space = "<|space|>"

subdf = df.query(f"lang == 'ja'")
#subdf = df
for message_tree_id in subdf["message_tree_id"].unique():
    print(message_tree_id)
    # look at all data for this message tree
    df_message_tree = df.query(f"message_tree_id == '{message_tree_id}'").sort_values(
        "created_date"
    )

    # add tree level to df
    df_message_tree = add_tree_level(df_message_tree)

    # lets create a tree of message ids
    id_tree = Tree()
    # lets create a tree of message texts
    text_tree = Tree()
    # lets set a max char length for the text
    max_char_len = 20000

    # iterate over rows in df_message_tree
    for i, row in df_message_tree.iterrows():
        # grab the message_id, parent_id, text, and parent text
        message_id = row["message_id"]
        parent_id = row["parent_id"]
        text = row["text"]
        text_short = text[:max_char_len] if len(text) > max_char_len else text
        text_short = text_short.replace("\n", " ")
        parent_text = (
            df_message_tree.query(f"message_id == '{parent_id}'")["text"].values[0]
            if parent_id is not None
            else "ROOT"
        )
        parent_text_short = (
            parent_text[:max_char_len]
            if len(parent_text) > max_char_len
            else parent_text
        )
        parent_text_short = parent_text_short.replace("\n", " ")

        # create a node in the id_tree and text_tree, add row as data in case want it later
        id_tree.create_node(
            message_id, message_id, parent=parent_id, data=row.to_dict()
        )

        # # if parent_id is None, then it is a root message so dont add parent text as is none
        # if parent_id is None:
        #     text_tree.create_node(text_short, text_short)
        # # else use the parent text short as the parent
        # else:
        #     text_tree.create_node(text_short, text_short, parent=parent_text_short)

    # print("id_tree:")
    # id_tree.show()

    # print("text_tree:")
    # text_tree.show()

    from treelib import Node

    # Assuming your tree is named id_tree
    def navigate_tree(tree, node_id):
        # Empty list to store the resulting dictionaries
        result = []

        # Get the current node and its parent
        node = tree.get_node(node_id)
        parent = tree.parent(node_id)

        # Get the corresponding row in the dataframe for the current node
        assert node.tag == node_id
        df_row = df_message_tree[df_message_tree["message_id"] == node_id].iloc[0]

        siblings = tree.siblings(node_id)
        sibling_ranks = [df[df['message_id'] == s.identifier]['rank'].iloc[0] for s in siblings]
        
        if df_row["rank"] == 0 and not (not siblings or all(pd.isnull(s_rank) or df_row['rank'] <= s_rank for s_rank in sibling_ranks)):
            print('big bug here')
            print(f"rank {df_row['rank']}")
            print(f'len siblings: {len(siblings)}')
            print(f"siblings ranks: {[df[df['message_id'] == s.identifier]['rank'].iloc[0] for s in siblings]}")
            
        if df_row["role"] == "assistant" and df_row["rank"] < 10 and (not siblings or all(pd.isnull(s_rank) or df_row['rank'] <= s_rank for s_rank in sibling_ranks)):
            if int(df_row['rank']) != 0:
                print(df_row['rank'])
                print([df[df['message_id'] == s.identifier]['rank'].iloc[0] for s in siblings])
            # Make a dictionary for the current node
            new_dict = {
                "output": df_row["text"],
                "message_id": df_row["message_id"],
                "parent_id": parent.identifier if parent else None,
                "rank": int(df_row["rank"]),
                "lang": df_row["lang"],
                "question_id": df_row["message_id"],
                "category": "generic",
                "parents": []
            }

            # Initialize parent text to empty string
            parent_text = ""

            # Loop through all the parents and concatenate their text
            while parent:
                # Get the corresponding row in the dataframe for the parent
                parent_row = df[df["message_id"] == parent.tag].iloc[0]

                new_dict['parents'].append({'role': parent_row["role"], 'text':parent_row['text']})

                # Move up to the next parent
                parent = tree.parent(parent.identifier)

            # Add the new dictionary to the result list
            result.append(new_dict)

        # Recurse on the children
        for child in tree.children(node_id):
            result.extend(navigate_tree(tree, child.identifier))

        return result

    # Call the function with the root node of the tree
    result = navigate_tree(id_tree, id_tree.root)
    # print(result)
    output += result

import json
with open("oa_ja.jsonl", "w") as file:
    for dict_item in output:
        file.write(json.dumps(dict_item, ensure_ascii=False)+ '\n')

a322501a-4deb-4465-9e81-8f636b182c39
88f10f38-99be-4105-b838-fe33a5d48cbf
db9585ff-9575-4bdf-87c9-718e21fffab8
1c57bc22-3a09-4cb4-8800-dfabff928a73
bcae3771-01b1-48df-9132-6e89c40a4a46
0c32d5fb-ac24-4b49-866f-a36f3d9e1384
cbf70d3f-6f35-40a0-9e37-4fde5f002223
3c336d07-add2-49ab-a48b-b6c8dd0ddc1c
c88cb316-1cda-43dc-9e7c-ea1b61e6c952
43254b25-488e-47fa-9619-9f98c748d854
d9276f90-3f36-4076-b237-5eb330ef8a95
ea8e6a92-227c-4719-be23-c4dc89b99060
8b20f71e-f12b-41d6-ab88-1bf04a718fd8
9ba4195a-060d-4ec1-82b9-9f63f4636a92
3110bdb8-4ff6-4a3c-b3b8-f199aaf6e5a0
d51855ae-b0b2-41d6-b7e4-b7d1c3604800
b50f6392-5c56-4269-a4bb-3b963f23bbe9
166adc17-4ca7-4c4c-818a-9096feb7582f
bd3cab3c-b431-45c6-8c5c-5590add7e2fc
23f65f10-33a9-4016-b1c8-d9fc3fc5ccbe
f915f92f-38b3-4ed9-8653-1bfcf0de688b
b4931f30-8578-45a4-b2e6-e6800e872049
8203b2ad-85b0-4c03-b193-121940167e17
605233f5-788d-4204-bd6f-f9763baaeaa1
253eadf4-a128-493a-9289-4a88e2f037c0
57b89777-87b4-475e-9b1a-2162997f8cf6
50e1bd1d-f9a9-4cac-a630-6f8209dca013
0

In [40]:
output[1]

{'output': '„Çè„Åã„Çä„Åæ„Åô„Çà„ÄÅ„Åù„ÅÆÊ∞óÊåÅ„Å°ÔºÅ\nÁå´„Å°„ÇÉ„Çì„ÅÆÊÑõ„Åè„Çã„Åó„ÅÑÂßø„ÅØ„ÄÅÂ§©‰Ωø„Åø„Åü„ÅÑ„Å´ÂèØÊÑõ„ÅÑ„Åß„Åô„Çà„Å≠ÔºÅ\n„ÅÇ„Å™„Åü„ÅÆ„ÅäÂÆ∂„Å´„ÄÅÁå´„Å°„ÇÉ„Çì„ÅØ„ÅÑ„Çã„Çì„Åß„Åô„ÅãÔºü',
 'message_id': 'd19c58bb-b17e-4457-a12e-18b2850c6770',
 'parent_id': 'ab2b4c74-186a-42bc-9bbf-74d8fa463a41',
 'rank': 0,
 'lang': 'ja',
 'question_id': 'd19c58bb-b17e-4457-a12e-18b2850c6770',
 'category': 'generic',
 'parents': [{'role': 'prompter', 'text': 'Áå´„Å°„ÇÉ„Çì„ÅÆÂÖ®„Å¶„ÅåÂ•Ω„Åç„Åß„ÅôÔºÅÁå´„Å°„ÇÉ„Çì„ÅØÂ§©‰Ωø„Åß„ÅôÔºÅÔºÅ'},
  {'role': 'assistant',
   'text': 'Áå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Å≠„ÄÇ\nÁßÅ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„ÇàÔºÅ\nÁå´„Å°„ÇÉ„Çì„ÅÆ„Å©„Çì„Å™„Å®„Åì„Çç„ÅåÂ•Ω„Åç„Åß„Åô„ÅãÔºü'},
  {'role': 'prompter', 'text': 'ÁßÅ„ÅØÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Åë„Å©\nË≤¥Êñπ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„Åã?'}]}

In [41]:
output

[{'output': 'Áå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Å≠„ÄÇ\nÁßÅ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„ÇàÔºÅ\nÁå´„Å°„ÇÉ„Çì„ÅÆ„Å©„Çì„Å™„Å®„Åì„Çç„ÅåÂ•Ω„Åç„Åß„Åô„ÅãÔºü',
  'message_id': 'ea9f4729-00ab-4095-bc75-080c7e8b8d8b',
  'parent_id': 'a322501a-4deb-4465-9e81-8f636b182c39',
  'rank': 0,
  'lang': 'ja',
  'question_id': 'ea9f4729-00ab-4095-bc75-080c7e8b8d8b',
  'category': 'generic',
  'parents': [{'role': 'prompter',
    'text': 'ÁßÅ„ÅØÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Åë„Å©\nË≤¥Êñπ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„Åã?'}]},
 {'output': '„Çè„Åã„Çä„Åæ„Åô„Çà„ÄÅ„Åù„ÅÆÊ∞óÊåÅ„Å°ÔºÅ\nÁå´„Å°„ÇÉ„Çì„ÅÆÊÑõ„Åè„Çã„Åó„ÅÑÂßø„ÅØ„ÄÅÂ§©‰Ωø„Åø„Åü„ÅÑ„Å´ÂèØÊÑõ„ÅÑ„Åß„Åô„Çà„Å≠ÔºÅ\n„ÅÇ„Å™„Åü„ÅÆ„ÅäÂÆ∂„Å´„ÄÅÁå´„Å°„ÇÉ„Çì„ÅØ„ÅÑ„Çã„Çì„Åß„Åô„ÅãÔºü',
  'message_id': 'd19c58bb-b17e-4457-a12e-18b2850c6770',
  'parent_id': 'ab2b4c74-186a-42bc-9bbf-74d8fa463a41',
  'rank': 0,
  'lang': 'ja',
  'question_id': 'd19c58bb-b17e-4457-a12e-18b2850c6770',
  'category': 'generic',
  'parents': [{'role': 'prompter', 'text': 'Áå´„Å°„ÇÉ„Çì„ÅÆÂÖ®„

# Make a prompt dataset

In [202]:
output = []

pretext = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
username = 'Human'
assistantname = 'Assistant'
space = ' '
semicolon = ":"

pretext = "Â•ΩÂ•áÂøÉÊó∫Áõõ„Å™‰∫∫Èñì„Å®AI„Ç¢„Ç∑„Çπ„Çø„É≥„Éà„Å®„ÅÆÂØæË©±„ÄÇAI„Ç¢„Ç∑„Çπ„Çø„É≥„Éà„ÅØ„ÄÅ„É¶„Éº„Ç∂„Éº„Åã„Çâ„ÅÆË≥™Âïè„Å´ÂØæ„Åó„ÄÅË©≥Á¥∞„ÅßÂΩπ„Å´Á´ã„Å§‰∏ÅÂØß„Å™ÂõûÁ≠î„Çí„Åó„Åæ„Åô„ÄÇ"
username = '„É¶„Éº„Ç∂„Éº'
assistantname = '„Ç¢„Ç∑„Çπ„Çø„É≥„Éà'
space = '„ÄÄ'
semicolon = "Ôºö"

role_label_dict = {"assistant": f"### {assistantname}", "prompter": f"### {username}"}

subdf = df.query(f"lang == 'ja'")
#subdf = df
for message_tree_id in subdf["message_tree_id"].unique():
    print(message_tree_id)
    # look at all data for this message tree
    df_message_tree = df.query(f"message_tree_id == '{message_tree_id}'").sort_values(
        "created_date"
    )

    # add tree level to df
    df_message_tree = add_tree_level(df_message_tree)

    # lets create a tree of message ids
    id_tree = Tree()
    # lets create a tree of message texts
    text_tree = Tree()
    # lets set a max char length for the text
    max_char_len = 20000

    # iterate over rows in df_message_tree
    for i, row in df_message_tree.iterrows():
        # grab the message_id, parent_id, text, and parent text
        message_id = row["message_id"]
        parent_id = row["parent_id"]
        text = row["text"]
        text_short = text[:max_char_len] if len(text) > max_char_len else text
        text_short = text_short.replace("\n", " ")
        parent_text = (
            df_message_tree.query(f"message_id == '{parent_id}'")["text"].values[0]
            if parent_id is not None
            else "ROOT"
        )
        parent_text_short = (
            parent_text[:max_char_len]
            if len(parent_text) > max_char_len
            else parent_text
        )
        parent_text_short = parent_text_short.replace("\n", " ")

        # create a node in the id_tree and text_tree, add row as data in case want it later
        id_tree.create_node(
            message_id, message_id, parent=parent_id, data=row.to_dict()
        )

        # # if parent_id is None, then it is a root message so dont add parent text as is none
        # if parent_id is None:
        #     text_tree.create_node(text_short, text_short)
        # # else use the parent text short as the parent
        # else:
        #     text_tree.create_node(text_short, text_short, parent=parent_text_short)

    # print("id_tree:")
    # id_tree.show()

    # print("text_tree:")
    # text_tree.show()

    from treelib import Node

    # Assuming your tree is named id_tree
    def navigate_tree(tree, node_id):
        # Empty list to store the resulting dictionaries
        result = []

        # Get the current node and its parent
        node = tree.get_node(node_id)
        parent = tree.parent(node_id)

        # Get the corresponding row in the dataframe for the current node
        assert node.tag == node_id
        df_row = df_message_tree[df_message_tree["message_id"] == node_id].iloc[0]

        siblings = tree.siblings(node_id)
        sibling_ranks = [df[df['message_id'] == s.identifier]['rank'].iloc[0] for s in siblings]
        
        if df_row["rank"] == 0 and not (not siblings or all(pd.isnull(s_rank) or df_row['rank'] <= s_rank for s_rank in sibling_ranks)):
            print('big bug here')
            print(f"rank {df_row['rank']}")
            print(f'len siblings: {len(siblings)}')
            print(f"siblings ranks: {[df[df['message_id'] == s.identifier]['rank'].iloc[0] for s in siblings]}")
            
        if df_row["role"] == "assistant" and df_row["rank"] < 10 and (not siblings or all(pd.isnull(s_rank) or df_row['rank'] <= s_rank for s_rank in sibling_ranks)):
            if int(df_row['rank']) != 0:
                print(df_row['rank'])
                print([df[df['message_id'] == s.identifier]['rank'].iloc[0] for s in siblings])
            # Make a dictionary for the current node
            new_dict = {
                "input": "",
                "output": df_row["text"],
                "message_id": df_row["message_id"],
                "parent_id": parent.identifier if parent else None,
                "rank": int(df_row["rank"]),
                "lang": df_row["lang"],
                "prompt": "",
                "question_id": df_row["message_id"],
                "category": "generic",
                "text": "",
            }

            # Initialize parent text to empty string
            parent_text = ""

            # Loop through all the parents and concatenate their text
            while parent:
                # Get the corresponding row in the dataframe for the parent
                parent_row = df[df["message_id"] == parent.tag].iloc[0]

                # Concatenate the parent's role and text
                parent_text = (
                    role_label_dict[parent_row["role"]]
                    + semicolon + space
                    + parent_row["text"]
                    + space
                    + parent_text
                )

                # Move up to the next parent
                parent = tree.parent(parent.identifier)

            # Add the current node's role to the parent text
            new_dict["input"] = parent_text + role_label_dict[df_row["role"]] + semicolon

            # Make 'prompt' and 'text' the same as 'input'
            new_dict["prompt"] = (
                pretext + space +
                new_dict["input"]
            )
            new_dict["text"] = new_dict["input"]

            # Add the new dictionary to the result list
            result.append(new_dict)

        # Recurse on the children
        for child in tree.children(node_id):
            result.extend(navigate_tree(tree, child.identifier))

        return result

    # Call the function with the root node of the tree
    result = navigate_tree(id_tree, id_tree.root)
    # print(result)
    output += result

a322501a-4deb-4465-9e81-8f636b182c39
88f10f38-99be-4105-b838-fe33a5d48cbf
db9585ff-9575-4bdf-87c9-718e21fffab8
1c57bc22-3a09-4cb4-8800-dfabff928a73
bcae3771-01b1-48df-9132-6e89c40a4a46
0c32d5fb-ac24-4b49-866f-a36f3d9e1384
cbf70d3f-6f35-40a0-9e37-4fde5f002223
3c336d07-add2-49ab-a48b-b6c8dd0ddc1c
c88cb316-1cda-43dc-9e7c-ea1b61e6c952
43254b25-488e-47fa-9619-9f98c748d854
d9276f90-3f36-4076-b237-5eb330ef8a95
ea8e6a92-227c-4719-be23-c4dc89b99060
8b20f71e-f12b-41d6-ab88-1bf04a718fd8
9ba4195a-060d-4ec1-82b9-9f63f4636a92
3110bdb8-4ff6-4a3c-b3b8-f199aaf6e5a0
d51855ae-b0b2-41d6-b7e4-b7d1c3604800
b50f6392-5c56-4269-a4bb-3b963f23bbe9
166adc17-4ca7-4c4c-818a-9096feb7582f
bd3cab3c-b431-45c6-8c5c-5590add7e2fc
23f65f10-33a9-4016-b1c8-d9fc3fc5ccbe
f915f92f-38b3-4ed9-8653-1bfcf0de688b
b4931f30-8578-45a4-b2e6-e6800e872049
8203b2ad-85b0-4c03-b193-121940167e17
605233f5-788d-4204-bd6f-f9763baaeaa1
253eadf4-a128-493a-9289-4a88e2f037c0
57b89777-87b4-475e-9b1a-2162997f8cf6
50e1bd1d-f9a9-4cac-a630-6f8209dca013
0

In [203]:
import json
with open("oa_ja.jsonl", "w") as file:
    for dict_item in output:
        file.write(json.dumps(dict_item)+ '\n')

In [204]:
output

[{'input': '### „É¶„Éº„Ç∂„ÉºÔºö\u3000ÁßÅ„ÅØÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Åë„Å©\nË≤¥Êñπ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„Åã?\u3000### „Ç¢„Ç∑„Çπ„Çø„É≥„ÉàÔºö',
  'output': 'Áå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Å≠„ÄÇ\nÁßÅ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„ÇàÔºÅ\nÁå´„Å°„ÇÉ„Çì„ÅÆ„Å©„Çì„Å™„Å®„Åì„Çç„ÅåÂ•Ω„Åç„Åß„Åô„ÅãÔºü',
  'message_id': 'ea9f4729-00ab-4095-bc75-080c7e8b8d8b',
  'parent_id': 'a322501a-4deb-4465-9e81-8f636b182c39',
  'rank': 0,
  'lang': 'ja',
  'prompt': 'Â•ΩÂ•áÂøÉÊó∫Áõõ„Å™‰∫∫Èñì„Å®AI„Ç¢„Ç∑„Çπ„Çø„É≥„Éà„Å®„ÅÆÂØæË©±„ÄÇAI„Ç¢„Ç∑„Çπ„Çø„É≥„Éà„ÅØ„ÄÅ„É¶„Éº„Ç∂„Éº„Åã„Çâ„ÅÆË≥™Âïè„Å´ÂØæ„Åó„ÄÅË©≥Á¥∞„ÅßÂΩπ„Å´Á´ã„Å§‰∏ÅÂØß„Å™ÂõûÁ≠î„Çí„Åó„Åæ„Åô„ÄÇ\u3000### „É¶„Éº„Ç∂„ÉºÔºö\u3000ÁßÅ„ÅØÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Åë„Å©\nË≤¥Êñπ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„Åã?\u3000### „Ç¢„Ç∑„Çπ„Çø„É≥„ÉàÔºö',
  'question_id': 'ea9f4729-00ab-4095-bc75-080c7e8b8d8b',
  'category': 'generic',
  'text': '### „É¶„Éº„Ç∂„ÉºÔºö\u3000ÁßÅ„ÅØÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Å™„Çì„Åß„Åô„Åë„Å©\nË≤¥Êñπ„ÇÇÁå´„Å°„ÇÉ„Çì„ÅåÂ•Ω„Åç„Åß„Åô„Åã?\u

# Validation vs qlora dataset 

In [178]:
oa_validation_data = []
with open("oa_validation_data.jsonl", "r") as file:
    for line in file:
        oa_validation_data.append(json.loads(line))

oa_questions = []
with open("oa_questions.jsonl", "r") as file:
    for line in file:
        oa_questions.append(json.loads(line))

str_validation_data = [str(x) for x in oa_validation_data]
str_questions_data = [str(x) for x in oa_questions]

In [179]:
# set(str_questions_data) - set(str_validation_data)

In [180]:
set(str_questions_data).symmetric_difference(set(str_validation_data))

set()