In [1]:
from src.guess.guess import ConfigType, Guess, Config
from src.domain.datasets.UnixCommandDataset import UnixCommandDataset  # Register dataset
from src.domain.datasets.ProjectEulerDataset import ProjectEulerDataset

# Load the configuration selected by ConfigType.BART_RISC2ARM and run the
# guess stage with it. `config`, `guess` and `predictions` are module-level
# names that later cells read (Sketch construction and sketch.sketch(predictions)).
config = Config(ConfigType.BART_RISC2ARM.get_path())
guess = Guess(config=config)
# NOTE(review): the recorded output of this cell warns that one tokenized
# sequence (2189 tokens) exceeds the model maximum of 2048 -- confirm that
# over-long inputs are truncated or skipped before they reach the model.
predictions = guess.guess()

  from .autonotebook import tqdm as notebook_tqdm


Loading BART model: celinelee/bartlarge_risctoarm_cloze2048


  Referenced from: <2BD1B165-EC09-3F68-BCE4-8FE4E70CA7E2> /opt/homebrew/lib/python3.11/site-packages/torchvision/image.so
  warn(
2025-04-11 06:20:08,268 - BartLargeModel - INFO - Initialized BartLargeModel on device: mps
2025-04-11 06:20:08,271 - src.helpers.dataset - INFO - Loaded 11 instances
2025-04-11 06:20:08,272 - src.helpers.data_loader - INFO - Loaded 11 samples from dataset
  test_elements = torch.tensor(test_elements)


Source JSONL: data/processed/RISCV/UnixCommands_risc.jsonl
Target JSONL: data/processed/ARM64/UnixCommands_arm.jsonl
Loading file: data/processed/RISCV/UnixCommands_risc.jsonl
Loaded 11 entries from data/processed/RISCV/UnixCommands_risc.jsonl
Loading file: data/processed/ARM64/UnixCommands_arm.jsonl
Loaded 11 entries from data/processed/ARM64/UnixCommands_arm.jsonl
Source entries: 11
Target entries: 11
Creating instance 0 with source key risc and target key arm
Source entry keys: dict_keys(['source', 'risc', 'risc_output', 'risc_verbose'])
Target entry keys: dict_keys(['source', 'arm', 'arm_output', 'arm_verbose'])
Creating instance 1 with source key risc and target key arm
Source entry keys: dict_keys(['source', 'risc', 'risc_output', 'risc_verbose'])
Target entry keys: dict_keys(['source', 'arm', 'arm_output', 'arm_verbose'])
Creating instance 2 with source key risc and target key arm
Source entry keys: dict_keys(['source', 'risc', 'risc_output', 'risc_verbose'])
Target entry keys: 

Token indices sequence length is longer than the specified maximum sequence length for this model (2189 > 2048). Running this sequence through the model will result in indexing errors


In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (except those excluded via %aimport) before each cell executes, so edits to
# the project sources are picked up without restarting the kernel.
# NOTE(review): this cell runs after the imports in the first cell; consider
# moving it to the top of the notebook so it is active from the start.
%load_ext autoreload
%autoreload 2

In [3]:
from src.sketch.sketch import Sketch

# Build the Sketch stage from the same config and the model object held by
# `guess`. `sketch` is a notebook-level name: display_blocks reads
# sketch.model.tokenizer and a later cell calls sketch.sketch(predictions).
sketch = Sketch(config, guess.model)

In [4]:
import pandas as pd
from IPython.display import HTML, display

def html_escape(text):
    """Return ``text`` rewritten so it can be placed verbatim in HTML markup.

    The characters ``&``, ``<`` and ``>`` become their HTML entities, each
    newline becomes a ``<br>`` tag and each tab becomes four non-breaking
    spaces.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    # Order matters: '&' must be rewritten first, otherwise the ampersands
    # introduced by the later entities would themselves be escaped.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\n", "<br>"),
        ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"),  # 4 spaces for a tab
    )
    escaped = text
    for needle, replacement in substitutions:
        escaped = escaped.replace(needle, replacement)
    return escaped

def display_blocks(blocks, pred):
    """Show the decoded source and predicted text of each block side by side.

    For every block, the token ranges ``pred.source[0][source_start:source_end]``
    and ``pred.pred[0][pred_start:pred_end]`` are decoded with the tokenizer of
    the notebook-level ``sketch`` object, HTML-escaped with ``html_escape`` and
    placed in the ``SOURCE`` / ``PRED`` columns of a table that is rendered
    inline in the notebook.

    Args:
        blocks: Iterable of objects exposing ``source_start``, ``source_end``,
            ``pred_start`` and ``pred_end``.
        pred: Object exposing indexable ``source`` and ``pred`` attributes;
            only element ``0`` of each is used.

    Returns:
        None. The table is emitted through ``IPython.display.display``.
    """

    def decode_span(tokens, start, end):
        # Decode one slice of a token sequence back to text.
        return sketch.model.tokenizer.decode(tokens[start:end])

    rows = [
        {
            'SOURCE': html_escape(
                decode_span(pred.source[0], block.source_start, block.source_end)
            ),
            'PRED': html_escape(
                decode_span(pred.pred[0], block.pred_start, block.pred_end)
            ),
        }
        for block in blocks
    ]

    table = pd.DataFrame(rows)

    css = """
    <style>
    table {
        table-layout: auto;
        word-wrap: break-word;
    }
    td {
        white-space: normal !important;
        font-family: monospace;
        vertical-align: top;
    }
    </style>
    """
    # escape=False: the cell contents were already escaped above and contain
    # intentional <br> / &nbsp; markup that must reach the browser unchanged.
    table_html = table.to_html(escape=False)
    display(HTML(css + table_html))


In [21]:
# Run the sketch stage over every prediction and print, per instance, the
# fraction of its blocks listed in `invalid_blocks` and in
# `non_equivalent_blocks`, relative to `total_blocks`.
results = sketch.sketch(predictions)

for pred_result, sketch_result in results:
    print(pred_result.instance_id)
    total_blocks = len(sketch_result.total_blocks)
    if total_blocks == 0:
        # With no blocks the ratios are undefined. Report that explicitly
        # instead of raising ZeroDivisionError, which would abort the loop
        # and hide the results of every remaining instance.
        print("Ratio of invalid_blocks: n/a (no blocks)")
        print("Ratio of non_equivalent_blocks: n/a (no blocks)")
        continue
    print(f"Ratio of invalid_blocks: {len(sketch_result.invalid_blocks)/total_blocks}")
    print(f"Ratio of non_equivalent_blocks: {len(sketch_result.non_equivalent_blocks)/total_blocks}")

UnixCommands/cat
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.2857142857142857
UnixCommands/cd
Ratio of invalid_blocks: 0.1111111111111111
Ratio of non_equivalent_blocks: 0.3333333333333333
UnixCommands/cp
Ratio of invalid_blocks: 0.3125
Ratio of non_equivalent_blocks: 0.5625
UnixCommands/ls
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.20833333333333334
UnixCommands/mkdir
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.0
UnixCommands/ps
Ratio of invalid_blocks: 0.2
Ratio of non_equivalent_blocks: 0.6
UnixCommands/rm
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.0
UnixCommands/rmdir
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.0
UnixCommands/tee
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.42857142857142855
UnixCommands/touch
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.0
UnixCommands/xargs
Ratio of invalid_blocks: 0.25
Ratio of non_equivalent_blocks: 0.5
