In [1]:
import os

REPO_URL="https://github.com/UH-Insure/Evaluation.git"
REPO="Colab-Training"

os.chdir("/content")

# If repo exists, update it; otherwise, clone fresh
if os.path.exists(REPO):
    print(f"Repo '{REPO}' exists, pulling latest changes...")
    os.chdir(REPO)
    !git reset --hard HEAD   # optional: discard local changes
    !git pull
else:
    print(f"Cloning repo '{REPO}'...")
    !git clone "$REPO_URL" "$REPO"
    os.chdir(REPO)

Cloning repo 'Colab-Training'...
Cloning into 'Colab-Training'...
remote: Enumerating objects: 46, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 46 (delta 10), reused 38 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (46/46), 22.24 KiB | 22.24 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [2]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import AutoPeftModelForCausalLM, PeftModel

# ----------------- USER CONFIG -----------------
BASE_MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"     # Base model; change if you fine-tuned a different Qwen3 variant
ADAPTER_REPO  = "tam2003/SFT-Qwen3-30B-dsetV3"       # Your adapter repo on Hugging Face
TRUST_REMOTE_CODE = True                     # Qwen chat template typically needs this
# NOTE(review): in the cells shown in this notebook, USE_AUTOPEFT only selects
# where the tokenizer is loaded from (adapter repo vs. base model).
USE_AUTOPEFT = True
MERGE_AND_UNLOAD = False                     # True to merge LoRA weights into the base for faster inference
# bfloat16 on GPU, float32 on CPU.
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Greedy decoding: with do_sample=False the output is already deterministic.
# `temperature` is deliberately NOT set here: it is ignored when sampling is
# off, and a value of 0.0 is rejected (it must be strictly positive) as soon as
# a caller turns sampling on via a per-call override.
GEN_KW = dict(
    max_new_tokens=1024,     # enough to hold the whole `cryptol` block
    do_sample=False,
)

# -----------------------------------------------

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device, "| dtype:", DTYPE)


Device: cuda | dtype: torch.bfloat16


In [3]:

# Choose where the tokenizer files come from: the adapter repo when
# USE_AUTOPEFT is set (adapter repos often include the correct tokenizer
# config), otherwise the base model.
tokenizer_source = ADAPTER_REPO if USE_AUTOPEFT else BASE_MODEL_ID

tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    trust_remote_code=TRUST_REMOTE_CODE,
    use_fast=True,
)

# Generation needs a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loaded tokenizer. EOS:", tokenizer.eos_token_id, "PAD:", tokenizer.pad_token_id)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Loaded tokenizer. EOS: 151645 PAD: 151643


In [4]:

# Load the base model first, then attach the LoRA adapter on top of it.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    dtype=DTYPE,  # `torch_dtype` is deprecated in favour of `dtype` (this cell previously emitted that warning)
    device_map="auto",
    trust_remote_code=TRUST_REMOTE_CODE,
)
model = PeftModel.from_pretrained(base, ADAPTER_REPO)

# Optionally merge LoRA weights for faster inference (uses more RAM/VRAM)
if MERGE_AND_UNLOAD and hasattr(model, "merge_and_unload"):
    model = model.merge_and_unload()
    print("Merged LoRA weights into the base model.")

# Make sure pad token id is set for generation; only fill it in when the
# model config does not already define one.
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Switch to eval mode; assigning to `_` keeps the model repr out of the cell output.
_ = model.eval()
print("Model ready.")


config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model-00008-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00007-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00006-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00001-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00005-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00009-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00010-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00011-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00012-of-00016.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00013-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00014-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00015-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00016-of-00016.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.38G [00:00<?, ?B/s]

Model ready.


In [8]:
from src.eval_suite import Config, run_eval_suite
import pandas as pd

# Return to the Colab content root (an earlier cell changed into the repo checkout).
os.chdir("/content")

# Settings handed to the eval suite. MODEL_ID is purely informational for local runs.
config_kwargs = {
    "MODEL_ID": ADAPTER_REPO,
    "EVALS_PATH": "/content/tests1.jsonl",
    "TEMP_FILE": "/content/Colab-Training/data/generated.txt",
    "SYSTEM_PROMPT": "Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after).",
}
config = Config(**config_kwargs)

# The eval tasks are read as JSON Lines (one JSON object per line).
eval_df = pd.read_json(config.EVALS_PATH, lines=True)

def generate_cryptol(messages) -> str:
    """Generate the assistant reply for a chat conversation with the local model.

    Args:
        messages: Conversation passed straight to
            ``tokenizer.apply_chat_template``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped), with special tokens removed.
    """
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_tensors="pt",
        add_generation_prompt=True,
    )
    prompt_ids = prompt_ids.to(model.device)
    prompt_length = prompt_ids.shape[-1]

    with torch.no_grad():
        sequences = model.generate(
            prompt_ids,
            pad_token_id=tokenizer.pad_token_id,
            **GEN_KW,
        )

    # The returned sequence starts with the prompt; keep only what follows it.
    completion_ids = sequences[0, prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

# Run the eval suite, using the locally loaded model as the generator.
run_eval_suite(
    eval_df,
    config,
    generate_fn=generate_cryptol,
    execute=False,
)


Starting eval suite at 2025-11-29_03:01:07, 3 tasks to process.

=== Task 5 ===

[PROMPT BEGIN]
[SYSTEM]
Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after).

[USER]
### Instruction:
Write a Cryptol function that implements the tasks described below.

### Request:
Task: Using the existing type `Cipher` and value `toyCipher` provided in the test setup, define a single polymorphic Cryptol function named `evktest` with the following type:

evktest : {ks, bs, n} (fin bs)
       => (Cipher ks bs, [n]([ks],[bs]), [bs]) -> [n]([bs], Bit)

The function should take a cipher implementation, a sequence of (Key, Ciphertext) test vectors, and a single plaintext block. For each (Key, CT) in the list, it must iterate over that list using a single generator in a list comprehension (no parallel generators or multiple `|` arms), compute CT' = cipher.encrypt Key PT for that entry, and produce an output list of pairs (CT', CT' == CT).

The code for `Cipher` and 

In [None]:
# Conversation in the role/content message format consumed by the chat template
system_message = {
    "role": "system",
    "content": "Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after).",
}
user_message = {
    "role": "user",
    "content": "Implement a Caesar cipher. Define the functions `encrypt` and `decrypt` with the signature: `{n} [8] -> [n][8] -> [n][8]`.",
}
messages = [system_message, user_message]

# Tokenize through the chat template; add_generation_prompt appends the
# assistant header so the model continues as the assistant.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
inputs = inputs.to(model.device)

with torch.no_grad():
    outputs = model.generate(inputs, pad_token_id=tokenizer.pad_token_id, **GEN_KW)

# Skip the prompt tokens so that only the generated reply is decoded.
prompt_length = inputs.shape[-1]
generated = outputs[0, prompt_length:]
text = tokenizer.decode(generated, skip_special_tokens=True)
print("Assistant:\n", text)

Assistant:
 ```cryptol
encrypt : {n} [8] -> [n][8] -> [n][8]
encrypt shift plaintext = [((c + shift) % 256) | c <- plaintext]

decrypt : {n} [8] -> [n][8] -> [n][8]
decrypt shift ciphertext = [((c - shift) % 256) | c <- ciphertext]
```


In [None]:
def chat(system_prompt: str, user_prompt: str, **gen_kwargs) -> str:
    """Run a single system+user turn through the model and return its reply.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        **gen_kwargs: Per-call overrides forwarded to ``model.generate``.
            They take precedence over the notebook-wide ``GEN_KW`` defaults,
            and may include ``pad_token_id``.

    Returns:
        The decoded completion (prompt tokens stripped, special tokens removed).
    """
    msgs = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    model_inputs = tokenizer.apply_chat_template(
        msgs, add_generation_prompt=True, tokenize=True, return_tensors="pt"
    ).to(model.device)

    # Merge in increasing priority: tokenizer pad id < GEN_KW < per-call overrides.
    # Keeping pad_token_id inside the merged dict (instead of passing it as a
    # separate keyword) avoids a "multiple values for keyword argument"
    # TypeError when a caller supplies its own pad_token_id.
    generate_kwargs = {"pad_token_id": tokenizer.pad_token_id, **GEN_KW, **gen_kwargs}

    with torch.no_grad():
        out = model.generate(model_inputs, **generate_kwargs)

    # Decode only the tokens that follow the prompt.
    completion = tokenizer.decode(out[0, model_inputs.shape[-1]:], skip_special_tokens=True)
    return completion

# Quick smoke test of the chat helper with a general-knowledge question.
reply = chat(
    system_prompt="You are a precise software assistant.",
    user_prompt="Explain what an Echo State Network is in two sentences.",
)
print(reply)


An Echo State Network (ESN) is a type of recurrent neural network where only the output weights are trained, while the hidden layer weights are randomly assigned and fixed. The network leverages the "echo state property" to ensure that the hidden states depend only on a finite number of recent inputs, making it suitable for temporal sequence processing tasks.


In [None]:
# Final cell: call Colab's runtime.unassign() after the work above has finished.
# NOTE(review): presumably this releases the Colab runtime (and its GPU), which
# would discard all in-memory state; confirm against the google.colab docs.
from google.colab import runtime
runtime.unassign()


## Notes
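- If `AutoPeftModelForCausalLM.from_pretrained(ADAPTER_REPO)` fails because the adapter repo doesn't declare the base,
  set `USE_AUTOPEFT=False`, load `BASE_MODEL_ID` first with `AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, ...)`, then apply the adapter with `PeftModel.from_pretrained(base, ADAPTER_REPO)`, where `base` is the loaded model object (not the model ID string).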
- Keep `trust_remote_code=True` for Qwen so the tokenizer/model expose the correct **chat template**.
- If you see CUDA OOM, consider running on CPU (`device_map=None`) or reducing `max_new_tokens` and using a smaller batch.
- `MERGE_AND_UNLOAD=True` can speed up inference at the cost of more memory.
