In [None]:
from google.colab import drive
drive.mount('/content/drive')  # follow prompt


Mounted at /content/drive


In [None]:
import os
WORKDIR = '/content/drive/MyDrive/Translation_fpga'
os.makedirs(WORKDIR, exist_ok=True)
print("workdir:", WORKDIR)


workdir: /content/drive/MyDrive/Translation_fpga


In [None]:
from transformers import AutoConfig, AutoTokenizer
import json, os

MODEL_NAME = "facebook/nllb-200-distilled-600M"  # as used in the paper. :contentReference[oaicite:1]{index=1}
cfg = AutoConfig.from_pretrained(MODEL_NAME)
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)  # safe option

meta = {
    "model_name": MODEL_NAME,
    "architecture": cfg.model_type if hasattr(cfg, 'model_type') else str(cfg.__class__.__name__),
    "num_hidden_layers": getattr(cfg, "num_hidden_layers", None) or getattr(cfg, "encoder_layers", None),
    "encoder_layers": getattr(cfg, "encoder_layers", None),
    "decoder_layers": getattr(cfg, "decoder_layers", None),
    "hidden_size": getattr(cfg, "hidden_size", None) or getattr(cfg, "d_model", None),
    "ffn_dim": getattr(cfg, "ffn_dim", None) or getattr(cfg, "d_ff", None),
    "num_attention_heads": getattr(cfg, "num_attention_heads", None) or getattr(cfg, "num_heads", None),
    "vocab_size": getattr(cfg, "vocab_size", len(tok)),
    "tokenizer_class": tok.__class__.__name__,
    "quantization_target_proposal": "INT4 per-channel symmetric (per-out-channel scales)",
    "default_adapter_ranks_to_test": [8, 16, 32],
    "paper_reference": "/mnt/data/Bridging_the_Quality_Cliff__Efficient_Multilingual_NMT_at_the_Edge_via_Joint_Quantization_and_Dynamic_Adaptation.pdf"
}
with open(os.path.join(WORKDIR, "model_metadata.json"), "w") as f:
    json.dump(meta, f, indent=2)
print("Wrote model_metadata.json")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Wrote model_metadata.json


In [None]:
import json, statistics, os
samples = [
 "This is a test sentence in English.",
 "यह एक परीक्षण वाक्य है।",               # Hindi (Devanagari)
 "Esto es una oración de prueba.",        # Spanish
 "Galician example: Este é un exemplo.",  # Galician-ish
 "中文测试句子。"                           # Chinese
]
tok_examples = []
lengths = []
for s in samples:
    toks = tok.encode(s)
    tok_examples.append({"text": s, "tokens": toks, "len": len(toks)})
    lengths.append(len(toks))

tokenizer_sanity = {
    "samples_count": len(samples),
    "min_len": min(lengths),
    "max_len": max(lengths),
    "mean_len": statistics.mean(lengths),
    "examples": tok_examples
}
with open(os.path.join(WORKDIR, "tokenizer_sanity.json"), "w") as f:
    json.dump(tokenizer_sanity, f, indent=2, ensure_ascii=False)
print("Wrote tokenizer_sanity.json")


Wrote tokenizer_sanity.json


In [None]:
from transformers import AutoModelForSeq2SeqLM
model_empty = AutoModelForSeq2SeqLM.from_config(cfg)
params = list(model_empty.named_parameters())

print("✔ Empty model instantiated.")
print("Total parameters (architecture only):", len(params))
print("Example param:", params[0][0], params[0][1].shape)


✔ Empty model instantiated.
Total parameters (architecture only): 509
Example param: model.shared.weight torch.Size([256206, 1024])


In [None]:
csv_path = os.path.join(WORKDIR, "quantizable_layers_list.csv")

with open(csv_path, "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["module_name", "param_name", "shape", "note"])

    for name, p in params:
        shape = "x".join(map(str, p.shape))
        note = "quantize_candidate_linear" if (p.dim() == 2 and "weight" in name) else "skip_or_check"
        writer.writerow([name, name.split('.')[-1], shape, note])

print("✔ Wrote quantizable_layers_list.csv")


NameError: name 'csv' is not defined

In [None]:
import math
layer_summary_path = os.path.join(WORKDIR, "layer_sizes_summary.csv")
int4_total = 0
fp16_total = 0

with open(layer_summary_path, "w", newline='') as csvout:
    writer = csv.writer(csvout)
    writer.writerow(["module_name", "shape", "param_count", "bytes_fp16", "bytes_int4_est", "note"])

    for name, p in params:
        shape = list(p.shape)
        param_count = 1
        for d in shape:
            param_count *= d

        bytes_fp16 = param_count * 2
        bytes_int4 = math.ceil(param_count / 2)

        fp16_total += bytes_fp16
        int4_total += bytes_int4

        writer.writerow([name, "x".join(map(str, shape)), param_count, bytes_fp16, bytes_int4,
                         "quantize_candidate_linear" if (p.dim() == 2 and "weight" in name) else "skip"])

size_summary = {
    "total_bytes_fp16": fp16_total,
    "total_bytes_int4_estimate": int4_total,
    "int4_MB": int4_total / (1024**2)
}

with open(os.path.join(WORKDIR, "size_estimates.json"), "w") as f:
    json.dump(size_summary, f, indent=2)

print("✔ Wrote layer_sizes_summary.csv")
print("✔ Wrote size_estimates.json")
print(size_summary)


In [None]:
import os, json
WORKDIR = "/content/drive/MyDrive/Translation_fpga/step2_flores"
os.makedirs(WORKDIR, exist_ok=True)
print("Workspace:", WORKDIR)


Workspace: /content/drive/MyDrive/Translation_fpga/step2_flores


In [None]:
from pathlib import Path
import json


WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step2_flores")
WORKDIR.mkdir(parents=True, exist_ok=True)

# Five languages you selected
TARGET_LANGS = [
    "eng_Latn",   # English
    "rus_Cyrl",   # Russian
    "hin_Deva",   # Hindi
    "deu_Latn",   # German
    "spa_Latn"    # Spanish
]

MODEL_NAME = "facebook/nllb-200-distilled-600M"

print("Workspace:", WORKDIR)
print("Languages:", TARGET_LANGS)


Workspace: /content/drive/MyDrive/Translation_fpga/step2_flores
Languages: ['eng_Latn', 'rus_Cyrl', 'hin_Deva', 'deu_Latn', 'spa_Latn']


In [None]:
# run in Colab python cell
from pathlib import Path
import json, statistics

WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset")  # adjust if your extracted folder has different name
TARGET_LANGS = ["eng_Latn", "rus_Cyrl", "hin_Deva", "deu_Latn", "spa_Latn"]
out_dir = Path("/content/drive/MyDrive/Translation_fpga/step2_flores")
tokenizer_name = "facebook/nllb-200-distilled-600M"

# helper to read a language file
def read_lang_file(path):
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

# check files exist
for lang in TARGET_LANGS:
    dev_path = WORKDIR / "dev" / f"{lang}.dev"
    devtest_path = WORKDIR / "devtest" / f"{lang}.devtest"
    if not dev_path.exists() or not devtest_path.exists():
        raise FileNotFoundError(f"Missing files for {lang}: {dev_path} or {devtest_path} not found")

# load sentences
extracted = {}
for lang in TARGET_LANGS:
    dev_path = WORKDIR / "dev" / f"{lang}.dev"
    devtest_path = WORKDIR / "devtest" / f"{lang}.devtest"
    extracted[lang] = {
        "dev": read_lang_file(dev_path),
        "devtest": read_lang_file(devtest_path)
    }
    print(lang, "dev lines:", len(extracted[lang]["dev"]), "devtest lines:", len(extracted[lang]["devtest"]))

# tokenize stats (batching) - requires transformers; install if needed
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False)

def batched_token_lengths(sentences, batch=256):
    lengths = []
    for i in range(0, len(sentences), batch):
        batch_s = sentences[i:i+batch]
        enc = tokenizer(batch_s, add_special_tokens=True)
        lengths.extend([len(ids) for ids in enc["input_ids"]])
    return lengths

dataset_overview = {}
for lang in TARGET_LANGS:
    lang_stats = {}
    for split in ["dev", "devtest"]:
        sents = extracted[lang][split]
        lengths = batched_token_lengths(sents)
        lang_stats[split] = {
            "num_sentences": len(sents),
            "min_tokens": int(min(lengths)),
            "max_tokens": int(max(lengths)),
            "mean_tokens": float(statistics.mean(lengths)),
            "median_tokens": float(statistics.median(lengths)),
            "examples": sents[:5]
        }
    dataset_overview[lang] = lang_stats

out_path = out_dir / "dataset_overview.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(dataset_overview, f, indent=2, ensure_ascii=False)

print("Wrote dataset_overview.json ->", out_path)


eng_Latn dev lines: 997 devtest lines: 1012
rus_Cyrl dev lines: 997 devtest lines: 1012
hin_Deva dev lines: 997 devtest lines: 1012
deu_Latn dev lines: 997 devtest lines: 1012
spa_Latn dev lines: 997 devtest lines: 1012
Wrote dataset_overview.json -> /content/drive/MyDrive/Translation_fpga/step2_flores/dataset_overview.json


In [None]:
def write_jsonl(path, sentences):
    with open(path, "w", encoding="utf-8") as f:
        for s in sentences:
            f.write(json.dumps({"text": s}, ensure_ascii=False) + "\n")

for lang in TARGET_LANGS:
    for split in ["dev", "devtest"]:
        out_path = WORKDIR / f"{lang}_{split}.jsonl"
        write_jsonl(out_path, extracted[lang][split])
        print("Wrote:", out_path)


Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/eng_Latn_dev.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/eng_Latn_devtest.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/rus_Cyrl_dev.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/rus_Cyrl_devtest.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/hin_Deva_dev.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/hin_Deva_devtest.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/deu_Latn_dev.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/deu_Latn_devtest.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/spa_Latn_dev.jsonl
Wrote: /content/drive/MyDrive/Translation_fpga/step2_flores/flores200_dataset/spa_Latn_devtest.jsonl


In [None]:
# CELL 1
from pathlib import Path
import json, struct, math, hashlib, os, sys, statistics, time
from tqdm.auto import tqdm

WORKDIR = Path("/content/drive/MyDrive/Translation_fpga")
WORKDIR.mkdir(parents=True, exist_ok=True)

MODEL_NAME = "facebook/nllb-200-distilled-600M"
PAPER_PATH = "/content/drive/MyDrive/Translation_fpga/Bridging_the_Quality_Cliff__Efficient_Multilingual_NMT_at_the_Edge_via_Joint_Quantization_and_Dynamic_Adaptation.pdf"

# Quantization params
NBITS = 4
QMAX = 2**(NBITS-1) - 1  # 7 for 4-bit symmetric
QMIN = -2**(NBITS-1)     # -8

# LoftQ default rank and ridge
LOFTQ_RANK = 16
LOFTQ_RIDGE = 1e-4

print("WORKDIR:", WORKDIR)
print("Model:", MODEL_NAME)
print("Paper (local):", PAPER_PATH)
print("INT4 range:", QMIN, "to", QMAX, "rank:", LOFTQ_RANK)


WORKDIR: /content/drive/MyDrive/Translation_fpga
Model: facebook/nllb-200-distilled-600M
Paper (local): /content/drive/MyDrive/Translation_fpga/Bridging_the_Quality_Cliff__Efficient_Multilingual_NMT_at_the_Edge_via_Joint_Quantization_and_Dynamic_Adaptation.pdf
INT4 range: -8 to 7 rank: 16


In [None]:
# CELL 2: helper utilities
import torch
from contextlib import contextmanager

def atomic_write_bytes(path: Path, data: bytes):
    tmp = path.with_suffix(path.suffix + ".tmp")
    with tmp.open("wb") as f:
        f.write(data)
    tmp.replace(path)

def pack_int4_rowwise(Wq: torch.Tensor) -> bytes:
    """
    Pack a 2D int8 tensor with values in [-8..7] into bytes,
    two 4-bit signed values per byte, row-major.
    Wq: (out, in) torch.int8 on CPU
    Returns: bytes
    """
    assert Wq.dtype == torch.int8 or Wq.dtype == torch.int16 or Wq.dtype == torch.int32
    Wq_np = Wq.cpu().numpy().astype('int32')
    out, inp = Wq_np.shape
    # pad if odd columns
    pad = 0
    if inp % 2 != 0:
        Wq_np = np.pad(Wq_np, ((0,0),(0,1)), 'constant', constant_values=0)
        inp += 1
        pad = 1
    out_bytes = bytearray()
    for r in range(out):
        row = Wq_np[r]
        for i in range(0, inp, 2):
            low = int(row[i]) & 0xF
            high = int(row[i+1]) & 0xF
            b = (high << 4) | (low)
            out_bytes.append(b)
    return bytes(out_bytes)

def save_wbin(path: Path, Wq: torch.Tensor, scales: torch.Tensor):
    """
    Write .wbin: header(16 bytes) + scales(fp16) + packed int4 bytes, padded to 64B.
    Header: 4s I I I -> magic, version, out_dim, in_dim
    """
    out_dim, in_dim = Wq.shape
    magic = b"EAQ1"
    version = 1
    header = struct.pack("<4sIII", magic, version, out_dim, in_dim)
    # scales as fp16 little-endian
    scales_fp16 = scales.half().cpu().numpy().tobytes()
    packed = pack_int4_rowwise(Wq)
    data = header + scales_fp16 + packed
    # pad to 64B
    pad_len = (-len(data)) % 64
    if pad_len:
        data += b"\x00" * pad_len
    atomic_write_bytes(path, data)
    return path

def int4_to_signed(x):
    # convert nibble (0..15) to signed in [-8..7]
    if x >= 8:
        return x - 16
    return x

def compute_metrics(Wf: torch.Tensor, QW: torch.Tensor):
    diff = (Wf - QW).double()
    rmse = float(torch.sqrt(torch.mean(diff*diff)).item())
    maxabs = float(torch.max(torch.abs(diff)).item())
    rel_norm = float((diff.norm().item()) / (Wf.norm().item() + 1e-12))
    return {"rmse": rmse, "maxabs": maxabs, "rel_norm": rel_norm}


In [None]:
# CELL 3
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np

print("Loading config + tokenizer (no weights)...")
cfg = AutoConfig.from_pretrained(MODEL_NAME)
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
print("vocab size:", len(tok))

print("Instantiating empty model from config (no weights) to enumerate parameters...")
model_empty = AutoModelForSeq2SeqLM.from_config(cfg)
params = list(model_empty.named_parameters())

# collect quantize candidates (2D weight matrices)
quant_candidates = [name for name, p in params if len(p.shape) == 2 and "weight" in name]
print("Found", len(quant_candidates), "2D weight tensors; sample:")
for n in quant_candidates[:8]:
    print("  ", n)

# choose 4 sample layers (or fewer if not available)
SAMPLE_LAYERS = quant_candidates[:4] if len(quant_candidates) >=4 else quant_candidates
print("SAMPLE_LAYERS:", SAMPLE_LAYERS)

# save chosen list
with open(WORKDIR / "sample_layers.json", "w") as f:
    json.dump(SAMPLE_LAYERS, f, indent=2)
print("Saved sample_layers.json")


Loading config + tokenizer (no weights)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab size: 256204
Instantiating empty model from config (no weights) to enumerate parameters...
Found 193 2D weight tensors; sample:
   model.shared.weight
   model.encoder.layers.0.self_attn.k_proj.weight
   model.encoder.layers.0.self_attn.v_proj.weight
   model.encoder.layers.0.self_attn.q_proj.weight
   model.encoder.layers.0.self_attn.out_proj.weight
   model.encoder.layers.0.fc1.weight
   model.encoder.layers.0.fc2.weight
   model.encoder.layers.1.self_attn.k_proj.weight
SAMPLE_LAYERS: ['model.shared.weight', 'model.encoder.layers.0.self_attn.k_proj.weight', 'model.encoder.layers.0.self_attn.v_proj.weight', 'model.encoder.layers.0.self_attn.q_proj.weight']
Saved sample_layers.json


In [None]:
# CELL 4
import torch

print("Attempting to load full model weights (low_cpu_mem_usage=True).")
try:
    model_full = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, low_cpu_mem_usage=True)
    print("Model loaded. You can proceed with quantization of selected sample layers.")
except Exception as e:
    print("WARNING: full model load failed with exception:", e)
    print("""
Fallback options:
  1) If memory import fails on Colab, re-run on your college GPU host where you can load the model.
  2) For a local test-run: replace model loading with a synthetic random tensor
     (we included code later to run the same quantize/test logic on a random matrix).
  3) Try smaller model (like 'sshleifer/tiny-mbart' or other small transformer) to validate pipeline.
Continuing by attempting to extract available parameters from cache if any...""")
    # try to use huggingface cached files via from_pretrained with local_files_only? Not safe; raise for now
    raise


Attempting to load full model weights (low_cpu_mem_usage=True).
Model loaded. You can proceed with quantization of selected sample layers.


In [None]:
# CELL 8: fallback synthetic test (run this if full model load failed)
import torch, numpy as np
print("Running synthetic debug quantize+LoftQ test...")

def synthetic_run(out=128, inp=256, seed=42):
    torch.manual_seed(seed)
    Wf = torch.randn(out, inp) * 0.02  # small-scale weights
    max_abs = Wf.abs().amax(dim=1)
    scale = max_abs / QMAX
    scale[scale==0]=1e-8
    Wq = torch.round(Wf / scale.unsqueeze(1)).clamp(QMIN, QMAX).to(torch.int8)
    QW = Wq.float() * scale.unsqueeze(1)
    metrics_before = compute_metrics(Wf, QW)
    # loftq
    r = min(16, out, inp)
    U,S,Vt = torch.linalg.svd(Wf - QW, full_matrices=False)
    A = Vt[:r,:]
    AAT = A @ A.T
    inv = torch.linalg.inv(AAT + LOFTQ_RIDGE * torch.eye(AAT.shape[0]))
    B = ((Wf - QW) @ A.T) @ inv
    corrected = QW + (B @ A)
    metrics_after = compute_metrics(Wf, corrected)
    print("Synthetic test shapes:", Wf.shape)
    print("metrics before:", metrics_before)
    print("metrics after:", metrics_after)
    return True

_ = synthetic_run(128, 256)


Running synthetic debug quantize+LoftQ test...
Synthetic test shapes: torch.Size([128, 256])
metrics before: {'rmse': 0.0025228282559220908, 'maxabs': 0.00653904490172863, 'rel_norm': 0.1256090035904914}
metrics after: {'rmse': 0.0020956994492068834, 'maxabs': 0.006987506523728371, 'rel_norm': 0.10434270308416442}


In [None]:
# CELL 7
from pprint import pprint
from pathlib import Path
import json
WORKDIR = Path("/content/drive/MyDrive/Translation_fpga")
WORKDIR.mkdir(parents=True, exist_ok=True)
out_report = WORKDIR / "quant_test_report.json"
with open(out_report, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)
print("Saved report to", out_report)
pprint(report)
print("\nSample artifacts saved under", WORKDIR)


Saved report to /content/drive/MyDrive/Translation_fpga/quant_test_report.json
{}

Sample artifacts saved under /content/drive/MyDrive/Translation_fpga


In [None]:
# CELL 0: setup workspace and create demo .wbin if weights dir empty
from pathlib import Path
import numpy as np, struct, json, hashlib, shutil

WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step5_artifacts")
WEIGHTS_DIR = WORKDIR / "weights"
ADAPTERS_DIR = WORKDIR / "adapters"
EMB_DIR = WORKDIR / "embeddings"
for p in (WORKDIR, WEIGHTS_DIR, ADAPTERS_DIR, EMB_DIR):
    p.mkdir(parents=True, exist_ok=True)

print("Workspace created at:", WORKDIR)

# Create a demo small .wbin if weights dir is empty (useful to test unpacker)
if not any(WEIGHTS_DIR.iterdir()):
    print("No weight files found — creating a demo .wbin for testing.")
    out_dim, in_dim = 8, 7
    magic = b"EAQ1"                     # 4 bytes
    version = 1
    header = struct.pack("<4sIII", magic, version, out_dim, in_dim)  # little-endian

    # scales: float16 per out row
    scales = (np.linspace(0.05, 0.5, out_dim).astype(np.float16)).tobytes()

    # make random small int4 values in [-3..3]
    vals = np.random.randint(-3, 4, size=(out_dim, in_dim)).astype(np.int8)
    # prepare packing: pad in_dim to even
    padded_in = in_dim if (in_dim % 2 == 0) else (in_dim + 1)
    if padded_in != in_dim:
        padcol = np.zeros((out_dim, 1), dtype=np.int8)
        vals = np.hstack([vals, padcol])
    packed = bytearray()
    for r in range(vals.shape[0]):
        row = vals[r]
        for i in range(0, row.shape[0], 2):
            low = int(row[i]) & 0xF
            high = int(row[i+1]) & 0xF
            packed.append((high << 4) | low)
    data = header + scales + bytes(packed)
    # pad to 64 bytes
    pad_len = (-len(data)) % 64
    if pad_len:
        data += b"\x00" * pad_len
    p_demo = WEIGHTS_DIR / "demo_layer.wbin"
    p_demo.write_bytes(data)
    print("Created demo .wbin at:", p_demo)
else:
    print("Weights directory not empty; skipping demo creation.")

# Quick listing
print("Weights files:", [p.name for p in sorted(WEIGHTS_DIR.iterdir())])


Workspace created at: /content/drive/MyDrive/Translation_fpga/step5_artifacts
No weight files found — creating a demo .wbin for testing.
Created demo .wbin at: /content/drive/MyDrive/Translation_fpga/step5_artifacts/weights/demo_layer.wbin
Weights files: ['demo_layer.wbin']


In [None]:
# CELL 1: write unpack_and_dequantize.py into workspace (executable helper)
from pathlib import Path
WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step5_artifacts")
unpack_py = WORKDIR / "unpack_and_dequantize.py"

code = r'''#!/usr/bin/env python3
"""
unpack_and_dequantize.py
Reads .wbin files produced by the quant pipeline (EAQ1 header) and reconstructs float weights.

Usage:
  python unpack_and_dequantize.py /path/to/layer.wbin --print 8 --save-npy

Output:
  prints header, sample values, sha256 of reconstructed float array; optional .npy save.
"""
import struct, argparse, pathlib, numpy as np, hashlib, sys

def nibble_to_signed(x):
    return x - 16 if x >= 8 else x

def read_wbin(path: pathlib.Path):
    b = path.read_bytes()
    if len(b) < 16:
        raise ValueError("File too short")
    magic, version, out_dim, in_dim = struct.unpack_from("<4sIII", b, 0)
    if magic != b"EAQ1":
        raise ValueError(f"Unexpected magic {magic!r}")
    offset = 16
    scales_bytes = b[offset: offset + 2 * out_dim]
    if len(scales_bytes) != 2 * out_dim:
        raise ValueError("Not enough bytes for scales")
    scales = np.frombuffer(scales_bytes, dtype=np.float16).astype(np.float32)
    offset += 2 * out_dim
    packed = b[offset:]
    padded_in = ((in_dim + 1)//2) * 2
    Wq = np.zeros((out_dim, padded_in), dtype=np.int8)
    idx = 0
    for r in range(out_dim):
        for c in range(0, padded_in, 2):
            if idx >= len(packed):
                raise ValueError("Packed data shorter than expected")
            byte = packed[idx]
            low = byte & 0x0F
            high = (byte >> 4) & 0x0F
            Wq[r, c] = nibble_to_signed(low)
            Wq[r, c+1] = nibble_to_signed(high)
            idx += 1
    Wq = Wq[:, :in_dim]
    Wf = (Wq.astype(np.float32)) * scales.reshape(-1,1)
    return {"version": int(version), "out_dim": int(out_dim), "in_dim": int(in_dim), "scales": scales, "Wq": Wq, "Wf": Wf}

def sha256_of_array(arr: np.ndarray):
    return hashlib.sha256(arr.tobytes()).hexdigest()

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("wbin", type=pathlib.Path)
    ap.add_argument("--print", type=int, default=0)
    ap.add_argument("--save-npy", action="store_true")
    args = ap.parse_args()
    info = read_wbin(args.wbin)
    print("version:", info["version"])
    print("shape:", info["out_dim"], "x", info["in_dim"])
    print("scales (first 8):", info["scales"][:8])
    if args.print:
        nr = min(args.print, info["in_dim"])
        print("Wq first row (first cols):", info["Wq"][0,:nr].tolist())
        print("Wf first row (first cols):", info["Wf"][0,:nr].tolist())
    print("Wf sha256:", sha256_of_array(info["Wf"]))
    if args.save_npy:
        outp = args.wbin.with_suffix(".npy")
        np.save(str(outp), info["Wf"])
        print("Saved Wf to", outp)

if __name__ == '__main__':
    main()
'''
unpack_py.write_text(code, encoding="utf-8")
# make executable (Colab will respect)
import os
os.chmod(unpack_py, 0o755)
print("Wrote unpacker to:", unpack_py)


Wrote unpacker to: /content/drive/MyDrive/Translation_fpga/step5_artifacts/unpack_and_dequantize.py


In [None]:
# CELL 2 (FIXED): scan weights/adapters/embeddings and produce manifests with sha256 + header info
from pathlib import Path
import json, struct, hashlib

WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step5_artifacts")
WEIGHTS_DIR = WORKDIR / "weights"
ADAPTERS_DIR = WORKDIR / "adapters"
EMB_DIR = WORKDIR / "embeddings"

def sha256_file(p: Path) -> str:
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def parse_wbin_header(p: Path):
    raw = p.read_bytes()[:16]  # read header only
    if len(raw) < 16:
        return None
    magic, version, out_dim, in_dim = struct.unpack_from("<4sIII", raw, 0)
    if magic != b"EAQ1":
        return None
    return {
        "version": int(version),
        "out_dim": int(out_dim),
        "in_dim": int(in_dim)
    }

# ---- scan weights ----
weights = []
for p in sorted(WEIGHTS_DIR.glob("*.wbin")):
    header = parse_wbin_header(p)
    stats = p.stat()
    weights.append({
        "file": str(p.relative_to(WORKDIR)),
        "path": str(p),
        "size_bytes": stats.st_size,
        "sha256": sha256_file(p),
        "header": header
    })

# ---- scan adapters ----
adapters = []
for p in sorted(ADAPTERS_DIR.iterdir()):
    if p.is_file():
        stats = p.stat()
        adapters.append({
            "file": str(p.relative_to(WORKDIR)),
            "path": str(p),
            "size_bytes": stats.st_size,
            "sha256": sha256_file(p)
        })

# ---- scan embeddings ----
embs = []
for p in sorted(EMB_DIR.iterdir()):
    if p.is_file():
        stats = p.stat()
        embs.append({
            "file": str(p.relative_to(WORKDIR)),
            "path": str(p),
            "size_bytes": stats.st_size,
            "sha256": sha256_file(p)
        })

# ---- write manifests ----
( WORKDIR / "weights_manifest.json" ).write_text(json.dumps({"weights": weights}, indent=2))
( WORKDIR / "adapter_manifest.json" ).write_text(json.dumps({"adapters": adapters}, indent=2))
( WORKDIR / "emb_manifest.json" ).write_text(json.dumps({"embeddings": embs}, indent=2))

print("Wrote manifests OK!")
print(" - weights:", len(weights))
print(" - adapters:", len(adapters))
print(" - embeddings:", len(embs))


Wrote manifests OK!
 - weights: 1
 - adapters: 0
 - embeddings: 0


In [None]:
# CELL 3: unpack first up to 4 wbin files using the helper code we wrote above (imported as module)
from pathlib import Path
import subprocess, json, sys
WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step5_artifacts")
man = json.load((WORKDIR / "weights_manifest.json").open())
to_test = man["weights"][:4]

print("Validating first", len(to_test), "weights")
for entry in to_test:
    p = WORKDIR / entry["file"]
    print("\n--- unpacking", p)
    # call the script (same Python env)
    cmd = [sys.executable, str(WORKDIR / "unpack_and_dequantize.py"), str(p), "--print", "6"]
    subprocess.run(cmd, check=True)
print("\nUnpack validation done.")


Validating first 1 weights

--- unpacking /content/drive/MyDrive/Translation_fpga/step5_artifacts/weights/demo_layer.wbin

Unpack validation done.


In [None]:
# CELL 4: pad files to 64 bytes and refresh manifests
from pathlib import Path
import json, hashlib
WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step5_artifacts")
def sha256_file(p):
    import hashlib
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def pad_to_64(p: Path):
    size = p.stat().st_size
    pad = (-size) % 64
    if pad:
        with p.open("ab") as f:
            f.write(b"\x00" * pad)
        return pad
    return 0

# pad and update manifests
man_w = WORKDIR / "weights_manifest.json"
man_a = WORKDIR / "adapter_manifest.json"
man_e = WORKDIR / "emb_manifest.json"
for man_path in (man_w, man_a, man_e):
    if not man_path.exists():
        continue
    data = json.load(man_path.open())
    key = list(data.keys())[0]  # weights/adapters/embeddings
    for entry in data[key]:
        p = Path(entry["path"])
        pad = pad_to_64(p)
        if pad:
            print("Padded", p.name, "by", pad, "bytes")
        entry["size_bytes"] = p.stat().st_size
        entry["sha256"] = sha256_file(p)
    man_path.write_text(json.dumps(data, indent=2))
    print("Updated manifest:", man_path.name)
print("Padding & manifest refresh complete.")


Updated manifest: weights_manifest.json
Updated manifest: adapter_manifest.json
Updated manifest: emb_manifest.json
Padding & manifest refresh complete.


In [None]:
# CELL 5: write fpga_export_plan.md
from pathlib import Path
WORKDIR = Path("/content/drive/MyDrive/Translation_fpga/step5_artifacts")
md = WORKDIR / "fpga_export_plan.md"
content = f"""# FPGA Export Plan (generated)

## File formats

### .wbin (weights)
- Header (16 bytes, little endian):
  - magic: 4 bytes ASCII 'EAQ1'
  - version: uint32
  - out_dim: uint32
  - in_dim: uint32
- scales: out_dim values as FP16 (2 bytes each)
- packed weights: ceil(in_dim/2) * out_dim bytes (two 4-bit signed values per byte; low nibble = col0, high nibble = col1), row-major
- file padded to 64-byte boundary

### adapter_<lang>.bin
- (example binary layout)
- header: magic 'ADAP', version u32, num_layers u32
- per-layer index: layer_name_len u16, layer_name bytes, rank u16, offset u64, size u64
- payload: concatenated A (r x in fp16), followed by B (out x r fp16)

### Embeddings
- core_embeddings.bin: FP16 contiguous (core_size x emb_dim)
- shard files: emb_shard_<lang>.bin: FP16 contiguous (num_tokens x emb_dim)

## Memory map (suggested)
- HBM: store .wbin weights
- DDR: adapter bins, embedding shards
- URAM/BRAM: universal core embeddings + active adapter slots (double-buffer)

## Alignment
- All files must be 64-byte aligned for DMA efficiency.

## CSR register map (example)
- 0x0010 REG_ADAPTER_ACTIVE_PTR_LO (u32)
- 0x0014 REG_ADAPTER_ACTIVE_PTR_HI (u32)
- 0x0018 REG_ADAPTER_PREFETCH_PTR_LO (u32)
- 0x001C REG_ADAPTER_PREFETCH_PTR_HI (u32)
- 0x0020 REG_ADAPTER_CMD (u32)  1=prefetch_start 2=swap_active
- 0x0024 REG_PREFETCH_STATUS (u32): bit0=prefetch_busy bit1=prefetch_done

## Notes
- Endianness: little-endian throughout.
- If scales require FP32 instead of FP16, update header/spec and unpacker accordingly.

"""
md.write_text(content)
print("Wrote:", md)


Wrote: /content/drive/MyDrive/Translation_fpga/step5_artifacts/fpga_export_plan.md


In [None]:
# CELL 6: zip the workspace for handoff
from pathlib import Path
import zipfile, os, shutil

WORKDIR = Path("/content/step5_artifacts")
ZIPPATH = Path("/content/step5_artifacts.zip")
if ZIPPATH.exists():
    ZIPPATH.unlink()

# create zip (skips if files open)
shutil.make_archive(str(ZIPPATH.with_suffix('')), 'zip', root_dir=str(WORKDIR))
print("Packaged artifacts to:", ZIPPATH, " size:", ZIPPATH.stat().st_size, "bytes")

# list contents (few)
print("Sample files:")
for i, p in enumerate(sorted(WORKDIR.rglob("*"))):
    if i>40:
        break
    print(" -", p.relative_to(WORKDIR))


Packaged artifacts to: /content/step5_artifacts.zip  size: 2359 bytes
Sample files:
 - adapter_manifest.json
 - adapters
 - emb_manifest.json
 - embeddings
 - unpack_and_dequantize.py
 - weights
 - weights/demo_layer.wbin
 - weights_manifest.json
