In [12]:
import os
from os import path
import pandas as pd
import numpy as np
import glob
from pqdm.threads import pqdm


  from .autonotebook import tqdm as notebook_tqdm


# Definitions

In [6]:
HOME=os.path.expanduser('~')
LIFE2SCENARIO_ROOT_PATH=path.join(HOME,"Documents/life2scenario_core/datasets/life2scenario_medium")
DATASET_ROOT_PATH=path.join(LIFE2SCENARIO_ROOT_PATH,"train")
print(DATASET_ROOT_PATH)

/mnt/home/yucedago/Documents/life2scenario_core/datasets/life2scenario_medium/train


In [7]:
PROMPTS_ROOT=path.join(DATASET_ROOT_PATH, "prompts")
REFERENCE_ROOT=path.join(DATASET_ROOT_PATH, "ref_scenarios")
TARGET_ROOT=path.join(DATASET_ROOT_PATH, "target_scenarios")

In [8]:
PREP_PICKLES_ROOT=path.join(LIFE2SCENARIO_ROOT_PATH, "prep_pickles")

# Utils

In [14]:
def readFile(file):
    cur_target = open(file).read()
    return [file, cur_target]
    
def load_file(filename: str):
    return np.load(filename)

def save_np_to_file(data: np.ndarray, filename: str):
    np.save(filename, data)
    print(f"Saved to {filename}")

def save_pd_to_file(data: pd.DataFrame, filename: str):
    data.to_csv(filename, index=False)  

In [15]:
def file2index(filename: str):
    return os.path.basename(filename).split(".")[0].split("_")[-1]

def file_data_pairs_to_data_dict(in_arr: np.ndarray):
    return {
                "data": [data for data in in_arr[:, 1]],
                "id": [file2index(filename) for filename in in_arr[:, 0]]
            }


# Prepare DataFrame

In [16]:
prompt_arr = []
prompt_file_list = glob.glob(f"{PROMPTS_ROOT}/*.txt", recursive=False)
prompt_read_results = pqdm(prompt_file_list, readFile, n_jobs=64)
prompt_arr = np.array(prompt_read_results)

print(f"{len(prompt_arr)} data points will be saved.")
save_np_to_file(prompt_arr, path.join(PREP_PICKLES_ROOT, "prompt_arr.npy"))

QUEUEING TASKS | : 100%|██████████| 222129/222129 [00:05<00:00, 41496.80it/s]
PROCESSING TASKS | : 100%|██████████| 222129/222129 [01:00<00:00, 3647.19it/s]
COLLECTING RESULTS | : 100%|██████████| 222129/222129 [00:00<00:00, 533370.75it/s]


222129

In [18]:
ref_arr = []
ref_file_list = glob.glob(f"{REFERENCE_ROOT}/*.xosc", recursive=False)
ref_read_results = pqdm(ref_file_list, readFile, n_jobs=64)
ref_arr = np.array(ref_read_results)

print(f"{len(ref_arr)} data points will be saved.")
save_np_to_file(ref_arr, path.join(PREP_PICKLES_ROOT, "ref_arr.npy"))

QUEUEING TASKS | : 100%|██████████| 222129/222129 [00:06<00:00, 33809.07it/s]
PROCESSING TASKS | : 100%|██████████| 222129/222129 [01:20<00:00, 2766.70it/s]
COLLECTING RESULTS | : 100%|██████████| 222129/222129 [00:00<00:00, 558956.18it/s]

: 

: 

In [None]:
target_arr = []
target_file_list = glob.glob(f"{TARGET_ROOT}/*.xosc", recursive=False)
target_read_results = pqdm(target_file_list, readFile, n_jobs=64)
target_arr = np.array(target_read_results)
len(target_arr)

In [None]:
df_train = pd.DataFrame()

## Load Array Pickles

In [19]:
target_arr = load_file(path.join(PREP_PICKLES_ROOT, "target_arr.npy"))
ref_arr = load_file(path.join(PREP_PICKLES_ROOT, "ref_arr.npy"))
prompt_arr = load_file(path.join(PREP_PICKLES_ROOT, "prompt_arr.npy"))

In [20]:
target_dict = file_data_pairs_to_data_dict(target_arr)
ref_dict = file_data_pairs_to_data_dict(ref_arr)
prompt_dict = file_data_pairs_to_data_dict(prompt_arr)

In [21]:
target_df = pd.DataFrame(target_dict)
target_df = target_df.rename(columns={"data": "target_scenario"})

ref_df = pd.DataFrame(ref_dict)
ref_df = ref_df.rename(columns={"data": "reference_scenario"})

prompt_df = pd.DataFrame(prompt_dict)
prompt_df = prompt_df.rename(columns={"data": "prompt"})

In [22]:
target_prompt_df = pd.merge(target_df, prompt_df, on="id")
train_df = pd.merge(target_prompt_df, ref_df, on="id")
train_df.head()

Unnamed: 0,target_scenario,id,prompt,reference_scenario
0,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",4122,would you add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
1,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",5383,i would like to command you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
2,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",23788,i would like to request you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
3,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",3403,i would like to tell you to add pedestrian at ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
4,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",18951,add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."


In [23]:
train_df.describe()

Unnamed: 0,target_scenario,id,prompt,reference_scenario
count,22303,22303,22303,22303
unique,22115,22303,11006,21124
top,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",4122,i want you to add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
freq,22,1,187,1049


## Format-like `Stack Exchange Instruction @ HuggingFace`

In [57]:
train_df["request"] = train_df[['prompt', 'reference_scenario']].apply(lambda x : '{}?\n```\n{}\n```'.format(x[0], x[1]), axis=1)
train_df["response"] = train_df['target_scenario'].apply(lambda x : 'Here is the result:\n```\n{}\n```'.format(x))

  train_df["request"] = train_df[['prompt', 'reference_scenario']].apply(lambda x : '{}?\n```\n{}\n```'.format(x[0], x[1]), axis=1)


In [64]:
print(train_df["request"][1])

i would like to command you to add pedestrian close to hero?
```
<?xml version="1.0" ?>
<OpenSCENARIO>
  <FileHeader revMajor="1" revMinor="0" date="2020-03-24T12:00:00" description="CARLA:LaneChangeSimple" author=""/>
  <ParameterDeclarations/>
  <CatalogLocations/>
  <RoadNetwork>
    <LogicFile filepath="Town04"/>
    <SceneGraphFile filepath=""/>
  </RoadNetwork>
  <Entities>
    <ScenarioObject name="hero">
      <Vehicle name="vehicle.tesla.model3" vehicleCategory="car">
        <ParameterDeclarations/>
        <Performance maxSpeed="69.444" maxAcceleration="200" maxDeceleration="10.0"/>
        <BoundingBox>
          <Center x="1.5" y="0.0" z="0.9"/>
          <Dimensions width="2.1" length="4.5" height="1.8"/>
        </BoundingBox>
        <Axles>
          <FrontAxle maxSteering="0.5" wheelDiameter="0.6" trackWidth="1.8" positionX="3.1" positionZ="0.3"/>
          <RearAxle maxSteering="0.0" wheelDiameter="0.6" trackWidth="1.8" positionX="0.0" positionZ="0.3"/>
        </Axl

In [63]:
print(train_df["response"][1])

Here is the result:
```
<?xml version="1.0" ?>
<OpenSCENARIO>
  <FileHeader revMajor="1" revMinor="0" date="2020-03-24T12:00:00" description="CARLA:LaneChangeSimple" author=""/>
  <ParameterDeclarations/>
  <CatalogLocations/>
  <RoadNetwork>
    <LogicFile filepath="Town04"/>
    <SceneGraphFile filepath=""/>
  </RoadNetwork>
  <Entities>
    <ScenarioObject name="hero">
      <Vehicle name="vehicle.tesla.model3" vehicleCategory="car">
        <ParameterDeclarations/>
        <Performance maxSpeed="69.444" maxAcceleration="200" maxDeceleration="10.0"/>
        <BoundingBox>
          <Center x="1.5" y="0.0" z="0.9"/>
          <Dimensions width="2.1" length="4.5" height="1.8"/>
        </BoundingBox>
        <Axles>
          <FrontAxle maxSteering="0.5" wheelDiameter="0.6" trackWidth="1.8" positionX="3.1" positionZ="0.3"/>
          <RearAxle maxSteering="0.0" wheelDiameter="0.6" trackWidth="1.8" positionX="0.0" positionZ="0.3"/>
        </Axles>
        <Properties>
          <Prope

In [65]:
train_df.head()

Unnamed: 0,target_scenario,id,prompt,reference_scenario,request,response
0,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",4122,would you add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",would you add pedestrian close to hero?\n```\n...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
1,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",5383,i would like to command you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",i would like to command you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
2,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",23788,i would like to request you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",i would like to request you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
3,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",3403,i would like to tell you to add pedestrian at ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",i would like to tell you to add pedestrian at ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
4,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",18951,add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",add pedestrian close to hero?\n```\n<?xml vers...,"Here is the result:\n```\n<?xml version=""1.0"" ..."


## Save the DataFrame

In [66]:
save_pd_to_file(train_df, path.join(PREP_PICKLES_ROOT, "train_dataset.csv"))

# Create HuggingFace Dataset

In [5]:
from datasets import load_dataset
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Train DataFrame

In [6]:
train_final = pd.read_csv(path.join(PREP_PICKLES_ROOT, "train_dataset.csv"))

train_final = train_final[["request", "response"]]
train_final.head()


Unnamed: 0,request,response
0,would you add pedestrian close to hero?\n```\n...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
1,i would like to command you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
2,i would like to request you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
3,i would like to tell you to add pedestrian at ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
4,add pedestrian close to hero?\n```\n<?xml vers...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
