In [1]:
import os
from os import path
import pandas as pd
import numpy as np
import glob

# Definitions

In [2]:
HOME=os.path.expanduser('~')
LIFE2SCENARIO_ROOT_PATH=path.join(HOME,"Documents/life2scenario/")
DATASET_ROOT_PATH=path.join(LIFE2SCENARIO_ROOT_PATH,"life2scenario_minimal/dataset/train/")

print(DATASET_ROOT_PATH)

/mnt/home/yucedago/Documents/life2scenario/life2scenario_minimal/dataset/train/


In [3]:
PROMPTS_ROOT=path.join(DATASET_ROOT_PATH, "prompts")
REFERENCE_ROOT=path.join(DATASET_ROOT_PATH, "ref_scenarios")
TARGET_ROOT=path.join(DATASET_ROOT_PATH, "target_scenarios")

In [4]:
PREP_PICKLES_ROOT=path.join(LIFE2SCENARIO_ROOT_PATH, "prep_pickles")

# Utils

In [5]:
def load_file(filename: str):
    return np.load(filename)

def save_np_to_file(data: np.ndarray, filename: str):
    np.save(filename, data)
    print(f"Saved to {filename}")

def save_pd_to_file(data: pd.DataFrame, filename: str):
    data.to_csv(filename, index=False)  

In [6]:
def file2index(filename: str):
    return os.path.basename(filename).split(".")[0].split("_")[-1]

def file_data_pairs_to_data_dict(in_arr: np.ndarray):
    return {
                "data": [data for data in in_arr[:, 1]],
                "id": [file2index(filename) for filename in in_arr[:, 0]]
            }


# Prepare DataFrame

In [None]:
prompt_arr = []
for file in glob.glob(f"{PROMPTS_ROOT}/*.txt", recursive=False):
    cur_prompt = open(file).read()
    prompt_arr.append([file, cur_prompt])

prompt_arr = np.array(prompt_arr)

In [None]:
ref_arr = []
for file in glob.glob(f"{REFERENCE_ROOT}/*.xosc", recursive=False):
    cur_ref = open(file).read()
    ref_arr.append([file, cur_ref])
ref_arr = np.array(ref_arr)

In [None]:
target_arr = []
for file in glob.glob(f"{TARGET_ROOT}/*.xosc", recursive=False):
    cur_target = open(file).read()
    target_arr.append([file, cur_target])
target_arr = np.array(target_arr)

In [None]:
save_np_to_file(target_arr, path.join(PREP_PICKLES_ROOT, "target_arr.npy"))
save_np_to_file(ref_arr, path.join(PREP_PICKLES_ROOT, "ref_arr.npy"))
save_np_to_file(prompt_arr, path.join(PREP_PICKLES_ROOT, "prompt_arr.npy"))

Saved to /mnt/home/yucedago/Documents/life2scenario/prep_pickles/target_arr.npy
Saved to /mnt/home/yucedago/Documents/life2scenario/prep_pickles/ref_arr.npy
Saved to /mnt/home/yucedago/Documents/life2scenario/prep_pickles/prompt_arr.npy


In [None]:
df_train = pd.DataFrame()

## Load Array Pickles

In [19]:
target_arr = load_file(path.join(PREP_PICKLES_ROOT, "target_arr.npy"))
ref_arr = load_file(path.join(PREP_PICKLES_ROOT, "ref_arr.npy"))
prompt_arr = load_file(path.join(PREP_PICKLES_ROOT, "prompt_arr.npy"))

In [20]:
target_dict = file_data_pairs_to_data_dict(target_arr)
ref_dict = file_data_pairs_to_data_dict(ref_arr)
prompt_dict = file_data_pairs_to_data_dict(prompt_arr)

In [21]:
target_df = pd.DataFrame(target_dict)
target_df = target_df.rename(columns={"data": "target_scenario"})

ref_df = pd.DataFrame(ref_dict)
ref_df = ref_df.rename(columns={"data": "reference_scenario"})

prompt_df = pd.DataFrame(prompt_dict)
prompt_df = prompt_df.rename(columns={"data": "prompt"})

In [22]:
target_prompt_df = pd.merge(target_df, prompt_df, on="id")
train_df = pd.merge(target_prompt_df, ref_df, on="id")
train_df.head()

Unnamed: 0,target_scenario,id,prompt,reference_scenario
0,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",4122,would you add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
1,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",5383,i would like to command you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
2,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",23788,i would like to request you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
3,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",3403,i would like to tell you to add pedestrian at ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
4,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",18951,add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."


In [23]:
train_df.describe()

Unnamed: 0,target_scenario,id,prompt,reference_scenario
count,22303,22303,22303,22303
unique,22115,22303,11006,21124
top,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",4122,i want you to add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil..."
freq,22,1,187,1049


## Format-like `Stack Exchange Instruction @ HuggingFace`

In [57]:
train_df["request"] = train_df[['prompt', 'reference_scenario']].apply(lambda x : '{}?\n```\n{}\n```'.format(x[0], x[1]), axis=1)
train_df["response"] = train_df['target_scenario'].apply(lambda x : 'Here is the result:\n```\n{}\n```'.format(x))

  train_df["request"] = train_df[['prompt', 'reference_scenario']].apply(lambda x : '{}?\n```\n{}\n```'.format(x[0], x[1]), axis=1)


In [64]:
print(train_df["request"][1])

i would like to command you to add pedestrian close to hero?
```
<?xml version="1.0" ?>
<OpenSCENARIO>
  <FileHeader revMajor="1" revMinor="0" date="2020-03-24T12:00:00" description="CARLA:LaneChangeSimple" author=""/>
  <ParameterDeclarations/>
  <CatalogLocations/>
  <RoadNetwork>
    <LogicFile filepath="Town04"/>
    <SceneGraphFile filepath=""/>
  </RoadNetwork>
  <Entities>
    <ScenarioObject name="hero">
      <Vehicle name="vehicle.tesla.model3" vehicleCategory="car">
        <ParameterDeclarations/>
        <Performance maxSpeed="69.444" maxAcceleration="200" maxDeceleration="10.0"/>
        <BoundingBox>
          <Center x="1.5" y="0.0" z="0.9"/>
          <Dimensions width="2.1" length="4.5" height="1.8"/>
        </BoundingBox>
        <Axles>
          <FrontAxle maxSteering="0.5" wheelDiameter="0.6" trackWidth="1.8" positionX="3.1" positionZ="0.3"/>
          <RearAxle maxSteering="0.0" wheelDiameter="0.6" trackWidth="1.8" positionX="0.0" positionZ="0.3"/>
        </Axl

In [63]:
print(train_df["response"][1])

Here is the result:
```
<?xml version="1.0" ?>
<OpenSCENARIO>
  <FileHeader revMajor="1" revMinor="0" date="2020-03-24T12:00:00" description="CARLA:LaneChangeSimple" author=""/>
  <ParameterDeclarations/>
  <CatalogLocations/>
  <RoadNetwork>
    <LogicFile filepath="Town04"/>
    <SceneGraphFile filepath=""/>
  </RoadNetwork>
  <Entities>
    <ScenarioObject name="hero">
      <Vehicle name="vehicle.tesla.model3" vehicleCategory="car">
        <ParameterDeclarations/>
        <Performance maxSpeed="69.444" maxAcceleration="200" maxDeceleration="10.0"/>
        <BoundingBox>
          <Center x="1.5" y="0.0" z="0.9"/>
          <Dimensions width="2.1" length="4.5" height="1.8"/>
        </BoundingBox>
        <Axles>
          <FrontAxle maxSteering="0.5" wheelDiameter="0.6" trackWidth="1.8" positionX="3.1" positionZ="0.3"/>
          <RearAxle maxSteering="0.0" wheelDiameter="0.6" trackWidth="1.8" positionX="0.0" positionZ="0.3"/>
        </Axles>
        <Properties>
          <Prope

In [65]:
train_df.head()

Unnamed: 0,target_scenario,id,prompt,reference_scenario,request,response
0,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",4122,would you add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",would you add pedestrian close to hero?\n```\n...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
1,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",5383,i would like to command you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",i would like to command you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
2,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",23788,i would like to request you to add pedestrian ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",i would like to request you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
3,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",3403,i would like to tell you to add pedestrian at ...,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",i would like to tell you to add pedestrian at ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
4,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",18951,add pedestrian close to hero,"<?xml version=""1.0"" ?>\n<OpenSCENARIO>\n <Fil...",add pedestrian close to hero?\n```\n<?xml vers...,"Here is the result:\n```\n<?xml version=""1.0"" ..."


## Save the DataFrame

In [66]:
save_pd_to_file(train_df, path.join(PREP_PICKLES_ROOT, "train_dataset.csv"))

# Create HuggingFace Dataset

In [5]:
from datasets import load_dataset
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Train DataFrame

In [6]:
train_final = pd.read_csv(path.join(PREP_PICKLES_ROOT, "train_dataset.csv"))

train_final = train_final[["request", "response"]]
train_final.head()


Unnamed: 0,request,response
0,would you add pedestrian close to hero?\n```\n...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
1,i would like to command you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
2,i would like to request you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
3,i would like to tell you to add pedestrian at ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
4,add pedestrian close to hero?\n```\n<?xml vers...,"Here is the result:\n```\n<?xml version=""1.0"" ..."


## Create Dataset

In [7]:
life2scenario_dataset = Dataset.from_pandas(train_final)

In [8]:
l2s_dataset = life2scenario_dataset.train_test_split(test_size=0.1)

In [9]:
l2s_dataset

DatasetDict({
    train: Dataset({
        features: ['request', 'response'],
        num_rows: 20072
    })
    test: Dataset({
        features: ['request', 'response'],
        num_rows: 2231
    })
})

In [10]:
from finetune import create_datasets, ConstantLengthDataset, chars_token_ratio, run_training
from transformers import AutoModelForCausalLM, AutoTokenizer

ModuleNotFoundError: No module named 'finetune'

In [None]:
class Dict2Obj(object):
  def __init__(self, dictionary):
    for key in dictionary:
        setattr(self, key, dictionary[key])
  
  def __repr__(self):
    return "<dict2obj: %s>" % self.__dict__

# Training Params
train_dict = {
    "model_path": "bigcode/starcoder",
    "subset": "data/finetune",
    "streaming": True,
    "seq_length": 2048,
    "max_steps": 1000,
    "batch_size": 1,
    "input_column_name": "request",
    "output_column_name": "response",
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-4,
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 100,
    "weight_decay": 0.05,
    "output_dir": "./checkpoints",

    "local_rank": 0,
    "eos_token_id": 49152,
    "no_gradient_checkpointing": False,
    "shuffle_buffer": 5000,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "no_fp16": False,
    "bf16":False,
    "seed": 0,
    "num_workers": None,
    "log_freq": 100,
    "eval_freq":100,
    "save_freq": 1000
  }

train_args = Dict2Obj(train_dict)
train_args

<dict2obj: {'model_path': 'bigcode/starcoder', 'subset': 'data/finetune', 'streaming': True, 'seq_length': 2048, 'max_steps': 1000, 'batch_size': 1, 'input_column_name': 'request', 'output_column_name': 'response', 'gradient_accumulation_steps': 16, 'learning_rate': 0.0001, 'lr_scheduler_type': 'cosine', 'num_warmup_steps': 100, 'weight_decay': 0.05, 'output_dir': './checkpoints', 'local_rank': 0, 'eos_token_id': 49152, 'no_gradient_checkpointing': False, 'shuffle_buffer': 5000, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'no_fp16': False, 'bf16': False, 'seed': 0, 'num_workers': None, 'log_freq': 100, 'eval_freq': 100, 'save_freq': 1000}>

In [None]:
import torch
checkpoint = "bigcode/starcoder"
device = "cuda" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir="/mnt/scratch/yucedago/.cache")
# to save memory consider using fp16 or bf16 by specifying torch_dtype=torch.float16 for example
model = AutoModelForCausalLM.from_pretrained(checkpoint, cache_dir="/mnt/scratch/yucedago/.cache", torch_dtype=torch.float16).to(device)

Loading checkpoint shards: 100%|██████████| 7/7 [00:29<00:00,  4.25s/it]


In [35]:
train_data = l2s_dataset["train"]
test_data = l2s_dataset["test"]


chars_per_token = chars_token_ratio(train_data, tokenizer, train_args.input_column_name, train_args.output_column_name)


train_dataset = ConstantLengthDataset(
    tokenizer,
    train_data,
    infinite=True,
    seq_length=train_args.seq_length,
    chars_per_token=chars_per_token,
    input_column_name=train_args.input_column_name,
    output_column_name=train_args.output_column_name
)

valid_dataset = ConstantLengthDataset(
    tokenizer,
    test_data,
    infinite=False,
    seq_length=train_args.seq_length,
    chars_per_token=chars_per_token,
    input_column_name=train_args.input_column_name,
    output_column_name=train_args.output_column_name
)

100%|██████████| 400/400 [00:07<00:00, 53.21it/s]


In [None]:
run_training(train_args, train_data, test_data)

In [None]:
list(train_dataset.__iter__())