In [14]:
from src.guess.guess import ConfigType, Guess, Config
from src.domain.datasets.UnixCommandDataset import UnixCommandDataset  # Register dataset
from src.domain.datasets.ProjectEulerDataset import ProjectEulerDataset
from src.domain.datasets.BringUpDataset import BringUpDataset

prediction = None

config = Config(ConfigType.QWEN_X862ARM64.get_path())
guess = Guess(config=config)

2025-04-23 22:26:28,457 - src.domain.models.QwenModel - INFO - Initializing QwenModel with ahmedheakl/asm2asm_1.5b_armv8_o0 on device mps


Loading Qwen model: ahmedheakl/asm2asm_1.5b_armv8_o0


2025-04-23 22:26:31,264 - QwenModel - INFO - Initialized QwenModel on device: mps:0
2025-04-23 22:26:31,265 - src.domain.models.QwenModel - INFO - Model initialization completed in 2.81 seconds


Source JSONL: data/processed/X86/BringUp_x86.jsonl
Target JSONL: data/processed/ARM64/BringUp_arm.jsonl
Loading file: data/processed/X86/BringUp_x86.jsonl
Loaded 48 entries from data/processed/X86/BringUp_x86.jsonl
Loading file: data/processed/ARM64/BringUp_arm.jsonl
Loaded 48 entries from data/processed/ARM64/BringUp_arm.jsonl


In [15]:
test_instance = guess.data_loader.iter().__next__()
print(test_instance)


Source entries: 48
Target entries: 48


KeyError: 'c_files'

In [3]:
import pickle
import os

if os.path.exists("test_pred.pkl"):
    with open("test_pred.pkl", "rb") as f:
        test_pred = pickle.load(f)
else:
    test_pred = guess.model.predict(test_instance, guess.inference_cfg)
    with open("test_pred.pkl", "wb") as f:
        pickle.dump(test_pred, f)

In [4]:
from src.sketch.sketch import Sketch

sketch = Sketch(config, guess.model)

In [5]:
import pandas as pd
from IPython.display import HTML, display

def html_escape(text):
    # Convert special characters to HTML-safe versions
    return (text.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\n", "<br>")
                .replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"))  # 4 spaces for a tab

def display_blocks(blocks, pred):
    data = []

    for block in blocks:
        source_text = sketch.model.tokenizer.decode(
            pred.source[0][block.source_start:block.source_end]
        )
        pred_text = sketch.model.tokenizer.decode(
            pred.pred[0][block.pred_start:block.pred_end]
        )
        data.append({
            'SOURCE': html_escape(source_text),
            'PRED': html_escape(pred_text)
        })

    df = pd.DataFrame(data)

    styles = """
    <style>
    table {
        table-layout: auto;
        word-wrap: break-word;
    }
    td {
        white-space: normal !important;
        font-family: monospace;
        vertical-align: top;
    }
    </style>
    """
    display(HTML(styles + df.to_html(escape=False)))


In [9]:
# test_pred = predictions_test['UnixCommands/cat']
line_mappings = sketch.map_predicted_lines_to_source_lines(
    test_pred.source,
    test_pred.pred,
    test_pred.alignments
)

blocks = sketch.extract_pure_instruction_blocks(
    test_pred.source, test_pred.pred, line_mappings
)

print(blocks)

display_blocks(blocks, test_pred)

[PureInstructionBlock(source_start=45, source_end=55, pred_start=71, pred_end=80), PureInstructionBlock(source_start=118, source_end=126, pred_start=173, pred_end=182), PureInstructionBlock(source_start=138, source_end=150, pred_start=195, pred_end=206), PureInstructionBlock(source_start=162, source_end=170, pred_start=218, pred_end=227), PureInstructionBlock(source_start=235, source_end=247, pred_start=256, pred_end=266), PureInstructionBlock(source_start=257, source_end=275, pred_start=279, pred_end=298), PureInstructionBlock(source_start=288, source_end=300, pred_start=308, pred_end=318), PureInstructionBlock(source_start=313, source_end=331, pred_start=318, pred_end=337), PureInstructionBlock(source_start=408, source_end=416, pred_start=396, pred_end=405), PureInstructionBlock(source_start=456, source_end=474, pred_start=429, pred_end=448), PureInstructionBlock(source_start=526, source_end=534, pred_start=531, pred_end=540), PureInstructionBlock(source_start=565, source_end=575, pr

Unnamed: 0,SOURCE,PRED
0,"mv a5,a0","mov x29, sp"
1,"li a4, 0","mov x1, 0"
2,"addi a5,a5,8","add x0, x0, 8"
3,"li a1,0","mov w1, 0"
4,"addi a5,s0,-128","add x0, sp, 48"
5,"mv a1,a5  li a0,1","mov x1, x0  mov w0, 1"
6,"addi a4,s0,-128","add x0, sp, 48"
7,"li a2,99  mv a1,a4","mov x2, 99  mov x1, x0"
8,"li a0,10","mov w0, 10"
9,"li a5,0  mv a4,a5","mov w0, 0  mov w1, w0"


In [7]:
results = sketch.sketch({"test_pred": test_pred})

for pred_result, sketch_result in results:
    print(pred_result.instance_id)
    print(f"Ratio of invalid_blocks: {len(sketch_result.invalid_blocks)/len(sketch_result.total_blocks)}")
    print(f"Ratio of non_equivalent_blocks: {len(sketch_result.non_equivalent_blocks)/len(sketch_result.total_blocks)}")

UnixCommands/cat
Ratio of invalid_blocks: 0.0
Ratio of non_equivalent_blocks: 0.16666666666666666


In [10]:
print(sketch.model.decode(test_pred.pred))

main:
.LFB6:
	.cfi_startproc
	stp	x29, x30, [sp, -160]!
	.cfi_def_cfa_offset 160
	.cfi_offset 29, -160
	.cfi_offset 30, -152
	mov	x29, sp
	str	w0, [sp, 28]
	str	x1, [sp, 16]
	adrp	x0, :got:__stack_chk_guard
	ldr	x0, [x0, #:got_lo12:__stack_chk_guard]
	ldr	x1, [x0]
	str	x1, [sp, 152]
	mov	x1, 0
	ldr	x0, [sp, 16]
	add	x0, x0, 8
	ldr	x0, [x0]
	mov	w1, 0
	bl	open
	str	w0, [sp, 40]
	b	.L2
.L3:
	add	x0, sp, 48
	ldr	w2, [sp, 44]
	mov	x1, x0
	mov	w0, 1
	bl	write
.L2:
	add	x0, sp, 48
	mov	x2, 99
	mov	x1, x0
	ldr	w0, [sp, 40]
	bl	read
	str	w0, [sp, 44]
	ldr	w0, [sp, 44]
	cmp	w0, 0
	bne	.L3
	mov	w0, 10
	bl	putchar
	ldr	w0, [sp, 40]
	bl	close
	mov	w0, 0
	mov	w1, w0
	adrp	x0, :got:__stack_chk_guard
	ldr	x0, [x0, #:got_lo12:__stack_chk_guard]
	ldr	x3, [sp, 152]
	ldr	x2, [x0]
	subs	x3, x3, x2
	mov	x2, 0
	beq	.L5
	bl	__stack_chk_fail
.L5:
	mov	w0, w1
	ldp	x29, x30, [sp], 160
	.cfi_restore 30
	.cfi_restore 29
	.cfi_def_cfa_offset 0
	ret
	.cfi_endproc
	.arch armv8-a
	.file	"program.c"
	.c"
	.text
	.alig