In [1]:
!pip install -q transformers torch

In [4]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
from datasets import Dataset
import random
import time
import sys

In [5]:
# Fixed seed so the dataset shuffle (and therefore the evaluated subset) is the same on every run.
random_seed = 42

In [6]:
# Locations of the parallel test files: line k of the C++ file is paired with line k of the Python file.
# NOTE(review): the "-tok" suffix suggests pre-tokenised sources — confirm against the dataset docs.
test_cpp_path = 'test-C++-Python-tok.cpp'
test_py_path = 'test-C++-Python-tok.py'

In [7]:
def load_test_dataset(cpp_path, py_path):
  """Load a line-aligned C++/Python parallel corpus into a `Dataset`.

  Line k of `cpp_path` is paired with line k of `py_path`. In both files the
  `NEW_LINE` and `STRNEWLINE` markers are replaced by a real newline character.
  Pairing stops at the end of the shorter file.

  Args:
    cpp_path: Path to the text file holding one C++ program per line.
    py_path: Path to the text file holding one Python program per line.

  Returns:
    The result of `Dataset.from_list` over records of the form
    `{"cpp": <str>, "py": <str>}`, in file order.
  """
  def _restore_newlines(line):
    # Both markers stand for a line break in the flattened, one-program-per-line format.
    return line.replace('NEW_LINE', '\n').replace('STRNEWLINE', '\n')

  # Pin the encoding: the platform default is not UTF-8 everywhere, and source
  # code routinely contains non-ASCII characters in strings and comments.
  with open(cpp_path, 'r', encoding='utf-8') as f_cpp, \
       open(py_path, 'r', encoding='utf-8') as f_py:
    test_data = [
      {"cpp": _restore_newlines(cpp_line), "py": _restore_newlines(py_line)}
      for cpp_line, py_line in zip(f_cpp, f_py)
    ]

  return Dataset.from_list(test_data)

In [8]:
# Load the parallel corpus from the paths configured above instead of repeating the literals.
test_dataset = load_test_dataset(test_cpp_path, test_py_path)
# Seeded shuffle so the evaluated subset is the same on every run.
test_dataset = test_dataset.shuffle(seed=random_seed)
# Evaluate on at most 100 examples; clamp so a corpus with fewer rows does not
# request out-of-range indices.
test_subset = test_dataset.select(range(min(100, len(test_dataset))))

In [7]:
# Identifier of the fine-tuned checkpoint handed to `from_pretrained` for both tokenizer and model.
fine_tuned_model = "wu7115/code-translator-2025-04-28_01.25.56"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run
# locally at load time — only keep it for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model, trust_remote_code=True)

# device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
  fine_tuned_model,
  trust_remote_code=True,
  device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Generation samples (do_sample=True), so seed torch to make reruns repeatable
# as far as the backend allows.
torch.manual_seed(random_seed)

start_time = time.time()
predictions = []

for i, example in enumerate(test_subset):
  cpp_code = example["cpp"]
  prompt = f"Translate the following C++ code into Python:\n\n{cpp_code}\n\nAnswer:"
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    output = model.generate(
      **inputs,
      max_new_tokens=256,
      temperature=0.2,
      top_p=0.95,
      do_sample=True,
      # Reuse EOS as the padding id (presumably the tokenizer defines no pad token — confirm).
      pad_token_id=tokenizer.eos_token_id
    )

  # The decoded sequence is split on the prompt's "Answer:" marker and only the
  # text after its last occurrence is kept as the translation.
  generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
  python_code = generated_text.split("Answer:")[-1].strip()
  predictions.append(python_code)

  elapsed = time.time() - start_time
  print(f"[{i+1}/{len(test_subset)}] Generated successfully — Elapsed: {elapsed:.2f}s")

# Report the total measured after the loop. The original printed the last
# per-iteration `elapsed`, which is undefined when the subset is empty.
total_elapsed = time.time() - start_time
print(f"\n✅ All {len(test_subset)} generations completed in {total_elapsed:.2f} seconds.")

[1/100] Generated successfully — Elapsed: 14.83s
[2/100] Generated successfully — Elapsed: 29.06s
[3/100] Generated successfully — Elapsed: 43.24s
[4/100] Generated successfully — Elapsed: 57.33s
[5/100] Generated successfully — Elapsed: 71.36s
[6/100] Generated successfully — Elapsed: 85.26s
[7/100] Generated successfully — Elapsed: 99.31s
[8/100] Generated successfully — Elapsed: 113.26s
[9/100] Generated successfully — Elapsed: 127.31s
[10/100] Generated successfully — Elapsed: 141.31s
[11/100] Generated successfully — Elapsed: 155.25s
[12/100] Generated successfully — Elapsed: 169.26s
[13/100] Generated successfully — Elapsed: 183.27s
[14/100] Generated successfully — Elapsed: 197.28s
[15/100] Generated successfully — Elapsed: 211.26s
[16/100] Generated successfully — Elapsed: 225.25s
[17/100] Generated successfully — Elapsed: 239.27s
[18/100] Generated successfully — Elapsed: 253.22s
[19/100] Generated successfully — Elapsed: 267.20s
[20/100] Generated successfully — Elapsed: 281.

In [10]:
def clean_code(code: str) -> str:
  """Normalise a code string for comparison.

  Removes the layout markers `INDENT`, `DEDENT` and `DED`, trims leading and
  trailing whitespace from every line, and drops lines that end up empty.

  Args:
    code: Raw reference or generated code.

  Returns:
    The remaining non-empty, stripped lines joined with "\n".
  """
  # NOTE(review): markers are removed as plain substrings, so an identifier that
  # contains one of them (e.g. "ADDED") is altered too. "DED" presumably catches
  # a DEDENT cut off by the generation length limit — confirm.
  for marker in ("INDENT", "DEDENT", "DED"):
    code = code.replace(marker, "")

  # Collapse runs of blank / whitespace-only lines into a single line break.
  code = re.sub(r"\n\s*\n", "\n", code)

  stripped_lines = (raw.strip() for raw in code.strip().splitlines())
  return "\n".join(piece for piece in stripped_lines if piece)

In [13]:
# Normalise both sides identically before scoring. `cleaned_preds` must be built
# here: later cells use it, and leaving this line commented out raises a
# NameError on a fresh Restart & Run All.
cleaned_preds = [clean_code(p) for p in predictions]
cleaned_refs = [clean_code(e["py"]) for e in test_subset]

In [11]:
def exact_match_score(pred, ref):
  """Return 1 when `pred` equals `ref`, otherwise 0."""
  is_identical = pred == ref
  return int(is_identical)

# Score each prediction/reference pair (pairing stops at the shorter list),
# then report the share of exact matches as a percentage.
exact_matches = list(map(exact_match_score, cleaned_preds, cleaned_refs))
accuracy = sum(exact_matches) / len(exact_matches)

print(f"\n✅ Exact Match Accuracy: {accuracy*100:.2f}%")


✅ Exact Match Accuracy: 0.00%


In [1]:
# import json, pathlib
# with pathlib.Path("pairs.jsonl").open("w") as f:
#   for r, g in zip(cleaned_refs, cleaned_preds):
#     json.dump({"ref": r, "gen": g}, f)
#     f.write("\n")

# 1. Install compatible stack
!pip install -q "tree-sitter==0.22.0" "tree-sitter-python==0.21.0" "codebleu==0.7.0"

# 2. Reload refs & preds
import json, pathlib
pairs = [json.loads(l) for l in pathlib.Path("pairs.jsonl").open()]
refs  = [p["ref"] for p in pairs]
preds = [p["gen"] for p in pairs]
print(f"Loaded {len(refs)} pairs")

# 3. (Optional) exact-match
exact = sum(int(p.strip() == r.strip()) for p, r in zip(preds, refs)) / len(refs)
print(f"Exact-Match Accuracy: {exact*100:.2f}%")

# 4. CodeBLEU
from codebleu import calc_codebleu
score = calc_codebleu(refs, preds, lang="python")
print(f"\nOverall CodeBLEU: {score['codebleu']*100:.2f}%")
print("Breakdown:", score)

Loaded 100 pairs
Exact-Match Accuracy: 0.00%

Overall CodeBLEU: 61.78%
Breakdown: {'codebleu': 0.6177824701461674, 'ngram_match_score': 0.0993831158167901, 'weighted_ngram_match_score': 0.7229005056259499, 'syntax_match_score': 0.795142555438226, 'dataflow_match_score': 0.8537037037037037}


In [21]:
from openai import OpenAI
import os
from google.colab import userdata
import re

# Build the OpenAI client with the key fetched through Colab's `userdata`, so the
# secret never appears in the notebook source.
client = OpenAI(api_key=userdata.get("GPT_API_KEY"))

def extract_python_code(text):
    """Pull the code out of a Markdown-fenced chat reply.

    The first fence that is untagged or tagged `python` / `py` (any case,
    optional trailing spaces) is used. If the reply ends before the closing
    fence — e.g. because it hit the token limit — everything after the opening
    fence is returned, so the fence marker does not leak into the scored code.

    Args:
        text: Raw reply text from the model.

    Returns:
        The stripped contents of the fenced block, or the whole reply stripped
        when it contains no such fence.
    """
    fenced = re.search(
        r"```(?:python|py)?[ \t]*\r?\n(.*?)(?:```|\Z)",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if fenced:
        return fenced.group(1).strip()
    return text.strip()

gpt4o_preds = []
failed_indices = []  # positions in test_subset whose request or extraction raised

for i, example in enumerate(test_subset):
    cpp_code = example["cpp"]
    prompt = f"""### C++ Code:
{cpp_code}

### Python Translation:"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            top_p=1.0,
            max_tokens=256,
        )
        reply = response.choices[0].message.content
        extracted = extract_python_code(reply)
        gpt4o_preds.append(extracted)
        print(f"[{i+1}/{len(test_subset)}] ✅ Success")
    except Exception as e:
        # Keep a placeholder so predictions stay index-aligned with the references.
        gpt4o_preds.append("ERROR")
        failed_indices.append(i)
        print(f"[{i+1}/{len(test_subset)}] ❌ Error: {e}")

# Failures are easy to miss in a long progress log, and each one is scored below
# as the literal placeholder, which lowers the metric.
if failed_indices:
    print(f"⚠️ {len(failed_indices)} request(s) failed and are scored as 'ERROR': {failed_indices}")

# Apply the same normalisation used for the references.
gpt4o_cleaned_preds = [clean_code(p) for p in gpt4o_preds]

# Save results if needed
import json, pathlib
with pathlib.Path("gpt4o_pairs.jsonl").open("w") as f:
    for r, g in zip(cleaned_refs, gpt4o_cleaned_preds):
        json.dump({"ref": r, "gen": g}, f)
        f.write("\n")

# Evaluate
from codebleu import calc_codebleu

score = calc_codebleu(cleaned_refs, gpt4o_cleaned_preds, lang="python")
print(f"\n📊 GPT-4o CodeBLEU: {score['codebleu']*100:.2f}%")
for metric, s in score.items():
    print(f"{metric:25s}: {s:.4f}")

[1/100] ✅ Success
[2/100] ✅ Success
[3/100] ✅ Success
[4/100] ✅ Success
[5/100] ✅ Success
[6/100] ✅ Success
[7/100] ✅ Success
[8/100] ✅ Success
[9/100] ✅ Success
[10/100] ✅ Success
[11/100] ✅ Success
[12/100] ✅ Success
[13/100] ✅ Success
[14/100] ✅ Success
[15/100] ✅ Success
[16/100] ✅ Success
[17/100] ✅ Success
[18/100] ✅ Success
[19/100] ✅ Success
[20/100] ✅ Success
[21/100] ✅ Success
[22/100] ✅ Success
[23/100] ✅ Success
[24/100] ✅ Success
[25/100] ✅ Success
[26/100] ✅ Success
[27/100] ✅ Success
[28/100] ✅ Success
[29/100] ✅ Success
[30/100] ✅ Success
[31/100] ✅ Success
[32/100] ✅ Success
[33/100] ✅ Success
[34/100] ✅ Success
[35/100] ✅ Success
[36/100] ✅ Success
[37/100] ✅ Success
[38/100] ✅ Success
[39/100] ✅ Success
[40/100] ✅ Success
[41/100] ✅ Success
[42/100] ✅ Success
[43/100] ✅ Success
[44/100] ✅ Success
[45/100] ✅ Success
[46/100] ✅ Success
[47/100] ✅ Success
[48/100] ✅ Success
[49/100] ✅ Success
[50/100] ✅ Success
[51/100] ✅ Success
[52/100] ✅ Success
[53/100] ✅ Success
[5