In [102]:
from __future__ import annotations

import concurrent.futures
import pickle
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from data import DataPoint

# load generated data

In [7]:
def from_pickle(input_file: str) -> list[DataPoint]:
    """Load the pickled iterable stored at ``input_file`` and return it as a list.

    Args:
        input_file: Path of the pickle file to read (opened in binary mode).

    Returns:
        A new list holding the items of the unpickled object; per the
        annotation these are expected to be ``DataPoint`` instances.

    Note:
        ``pickle.load`` can execute arbitrary code while deserialising, so
        only call this on files from a trusted source.
    """
    source = Path(input_file)
    with source.open("rb") as handle:
        payload = pickle.load(handle)
    # Copy into a list so callers always get a concrete, indexable sequence.
    return list(payload)


# Read both pre-compiled dataset splits into memory (train first, then test).
train: list[DataPoint]
test: list[DataPoint]
train, test = (
    from_pickle("dataset/train_compiled.pkl"),
    from_pickle("dataset/test_compiled.pkl"),
)

In [8]:
# Size of the training split. The token figures in the trailing comments are
# the original author's estimates, not values computed here.
num_c_snippets: int = len(train)  # 300m tokens
num_asm_snippets: int = sum(
    len(datapoint.asm) for datapoint in train
)  # 2.5b tokens

print(
    f"training with {num_c_snippets} C snippets compiled to {num_asm_snippets} assembly snippets"
)

training with 323184 C snippets compiled to 2481055 assembly snippets


# build vocabulary

In [104]:
# Vocabulary accumulators, re-created empty every time this cell runs so that
# re-executing the cell does not carry over tokens from a previous run.

# C side: distinct token types and distinct token texts.
unique_c_types: set[str] = set()
unique_c_text: set[str] = set()

# Assembly side: distinct token types and distinct token texts.
unique_asm_types: set[str] = set()
unique_asm_text: set[str] = set()


def unique_tokens_from(dp: DataPoint) -> tuple[set[str], set[str], set[str], set[str]]:
    """Collect the distinct token types and texts found in one data point.

    Tokens are read as ``(type, text)`` pairs from ``dp.c_code.as_tokens()``
    and from ``as_tokens()`` of every entry in ``dp.asm``.

    Args:
        dp: Data point exposing ``c_code`` and an iterable ``asm``.

    Returns:
        A 4-tuple ``(c_types, c_text, asm_types, asm_text)`` of sets.
    """
    # C tokens are consumed first, then the assembly tokens of every listing.
    c_pairs = list(dp.c_code.as_tokens())
    c_types = {kind for kind, _ in c_pairs}
    c_text = {text for _, text in c_pairs}

    # Flatten the token pairs of all assembly entries into a single list.
    asm_pairs = [pair for listing in dp.asm for pair in listing.as_tokens()]
    asm_types = {kind for kind, _ in asm_pairs}
    asm_text = {text for _, text in asm_pairs}

    return c_types, c_text, asm_types, asm_text


# Extract the per-data-point token sets using a pool of worker threads.
# NOTE(review): the vocabulary is collected from `test`, not `train` — confirm
# that this is intended.
# NOTE(review): unique_tokens_from looks like pure-Python work, so threads may
# give little speed-up under the GIL — confirm before relying on it.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(unique_tokens_from, test))

# Merge every per-data-point set into the matching notebook-level accumulator.
# The tuple order mirrors the order of the sets returned by unique_tokens_from.
vocabulary_accumulators = (
    unique_c_types,
    unique_c_text,
    unique_asm_types,
    unique_asm_text,
)
for token_sets in results:
    for accumulator, found in zip(vocabulary_accumulators, token_sets):
        accumulator.update(found)

In [109]:
# Vocabulary sizes, one per line, in the order:
# C token types, C token texts, assembly token types, assembly token texts.
for vocabulary in (unique_c_types, unique_c_text, unique_asm_types, unique_asm_text):
    print(len(vocabulary))

100
21038
22
44671
