<a href="https://colab.research.google.com/github/tam1444AH/Finetuning-Qwen3/blob/main/notebooks/supervised-data-preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import userdata

In [None]:
import os, re, json
from pathlib import Path

In [None]:
# Input JSONL file; main() reads it line by line and filters its rows.
SRC = Path("/content/all_hybrid.jsonl")
# Output JSONL file; main() writes one generated record per line.
OUT = Path("/content/sft_supervised_llm.jsonl")

# Model identifier passed as `model=` by call_llm().
HF_MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
# Inference provider passed as `provider=` by get_client().
HF_PROVIDER = "nebius"

# System prompt sent with every request. It fixes the section layout of the
# generated task, forbids Cryptol code other than the signature fence, and
# prescribes the closing sentence of the task.
SYSTEM = (
    "You are a data writer creating high-quality programming tasks for the Cryptol language.\n"
    "Turn a Cryptol declaration (name + type signature) into a rich task for a code model.\n"
    "Rules:\n"
    "- Include the exact type signature in a fenced block.\n"
    "- Use this format:\n"
    "  ### Instruction:\n"
    "  ### Request:\n"
    "  ### Constraints & Hints:\n"
    "- Do NOT include any Cryptol code other than the signature fence.\n"
    "- Do NOT reveal solution details.\n"
    "- Mention bytes as [8], strings as [n][8] when relevant; respect polymorphism and constraints.\n"
    "- 120–300 words.\n"
    '- End with: \"Output exactly one code block in the format ```cryptol …``` and nothing else.\"'
)

# Per-declaration user prompt. main() fills it with str.format using the
# fields `name`, `kind`, `signature` and `module`.
USER_TMPL = (
    "Write a rich task for the following Cryptol declaration.\n\n"
    "- Name: {name}\n"
    "- Kind: {kind}\n"
    "- Signature:\n"
    "```cryptol\n"
    "{name} : {signature}\n"
    "```\n"
    "- Module (if any): {module}\n"
    "- Notes: Keep the task general, do not leak implementation. Encourage properties/edge cases, but no code."
)

In [None]:
# Matches lines shaped like `<identifier> : <rest of line>`; group 1 is the
# identifier (primes allowed after the first character), group 2 the text
# after the colon.
SIG_RE = re.compile(r"^\s*([A-Za-z_][\w']*)\s*:\s*(.+)$", re.M)


def extract_decls(cryptol_src: str):
    """Collect the `name : signature` declarations found in Cryptol source text.

    Returns a list of dicts with the keys ``name``, ``signature`` and
    ``kind``, in the order the declarations appear.  ``kind`` is
    ``"property"`` when the signature contains ``Bit`` preceded by a space
    (or sitting at the very start of the signature), and ``"function"``
    otherwise.
    """
    found = []
    for ident, raw_sig in SIG_RE.findall(cryptol_src):
        signature = raw_sig.strip()
        # Padding the signature lets a leading `Bit` satisfy the " Bit" test.
        if " Bit" in f" {signature} ":
            kind = "property"
        else:
            kind = "function"
        found.append({"name": ident, "signature": signature, "kind": kind})
    return found

def get_client():
    """Build a Hugging Face ``InferenceClient`` for the configured provider.

    The API key is read from the Colab secret named ``HF_TOKEN`` through
    ``userdata.get``; the provider comes from ``HF_PROVIDER``.
    """
    # Imported lazily so the dependency is only needed when a client is built.
    from huggingface_hub import InferenceClient

    return InferenceClient(
        provider=HF_PROVIDER,
        api_key=userdata.get('HF_TOKEN'),  # the secret must be set
    )

def call_llm(client, system: str, user: str) -> str:
    """Send one system + user message pair to ``HF_MODEL`` and return the reply.

    Args:
        client: Object exposing ``chat.completions.create`` (as built by
            ``get_client``).
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = client.chat.completions.create(
        model=HF_MODEL,
        messages=conversation,
        temperature=0.4,
        max_tokens=700,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [None]:
def main():
    """Generate one LLM-written task per Cryptol declaration and save them as JSONL.

    Reads ``SRC`` line by line and keeps only rows whose ``set`` is
    ``"supervised"`` and whose ``filetype`` is ``"cry"``.  For every
    declaration that ``extract_decls`` finds in a row's ``content``, the model
    is asked (via ``call_llm``) to write a task, and one record is written to
    ``OUT`` with the keys ``task_id``, ``type``, ``task``, ``solution`` and
    ``supervised``.  ``task_id`` counts up from 1 across the whole run, and
    ``solution`` is the row's full source wrapped in a ``cryptol`` fence.

    Blank input lines are skipped, and a missing or null ``filename`` yields
    the module name ``"n/a"``.
    """
    client = get_client()
    task_id = 1
    # Explicit UTF-8 on both files: records are dumped with ensure_ascii=False,
    # so the result must not depend on the platform's default encoding.
    with SRC.open(encoding="utf-8") as fin, OUT.open("w", encoding="utf-8") as fout:
        for line in fin:
            # A blank line (e.g. a trailing newline) is not a JSON document.
            if not line.strip():
                continue
            row = json.loads(line)
            if row.get("set") != "supervised" or row.get("filetype") != "cry":
                continue
            src = (row.get("content") or "").strip()
            if not src:
                continue
            # `or ""` also covers an explicit null filename, which Path() rejects.
            module = Path(row.get("filename") or "").stem or "n/a"
            for d in extract_decls(src):
                user = USER_TMPL.format(
                    name=d["name"],
                    kind=d["kind"],
                    signature=d["signature"],
                    module=module,
                )
                task = call_llm(client, SYSTEM, user).strip()
                record = {
                    "task_id": task_id,
                    "type": d["kind"],
                    "task": task,
                    "solution": f"```cryptol\n{src}\n```",
                    "supervised": True,
                }
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                task_id += 1

# Run the generation pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
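# Optional cleanup: uncomment the two lines below to release the Colab
# runtime once the run has finished.
# from google.colab import runtime
# runtime.unassign()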