In [2]:
import zipfile
import os

zip_path = "/content/spoc.zip"
extract_dir = "/content/spoc_data"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extraction done. Files extracted to:", extract_dir)


Extraction done. Files extracted to: /content/spoc_data


In [40]:
import pandas as pd

df = pd.read_csv("/content/spoc_data/train/split/spoc-train-train.tsv", sep="\t").dropna(subset=["text", "code"])
test_df = pd.read_csv("/content/spoc_data/train/split/spoc-train-test.tsv", sep="\t").dropna(subset=["text", "code"])



(15183, 7)

In [41]:
print(df.shape)
print(test_df.shape)

(181862, 7)
(15183, 7)


In [42]:
df.sample(5)

Unnamed: 0,text,code,workerid,probid,subid,line,indent
227007,PI = const double with PI = acos on -1,const double PI = acos(-1);,5,411A,35272272,0,0
27689,continue loop,continue;,13,776A,42231613,12,3
12844,print new line,cout << endl;,5,454A,38152708,11,2
191772,create long long variable temp with value = ab...,long long temp = abs(n + i);,54,488A,40845685,4,2
181652,decrement k,k--;,10,73A,33497243,18,3


In [43]:
test_df.sample(5)

Unnamed: 0,text,code,workerid,probid,subid,line,indent
4251,set p to i,p = i;,54,176A,42484586,17,5
2877,output s,cout << s << '\n';,9,147A,45273360,17,1
546,"print ""YES"" and '\n'","cout << ""YES"" << '\n';",54,1013A,48421132,13,2
6265,"while k is decremented by 1, print s1",while (k--) { cout << s1; },2,219A,40729486,14,2
18010,if t is equal to 1 increment p by 1 else incre...,(t == '1') ? p++ : r++;,2,774A,26250910,13,2


## Data Cleaning

In [44]:
df["combined"] = df.apply(
    lambda row: f"<|pseudo|>{row.text}<|code|>{row.code}<|end|>", axis=1
)
test_df["combined"] = test_df.apply(
    lambda r: f"<|pseudo|>{r.text}<|code|>{r.code}<|end|>", axis=1
)

In [45]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|pseudo|>", "<|code|>", "<|end|>"]
})


3

In [46]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.eos_token_id

In [74]:
from datasets import Dataset

dataset = Dataset.from_pandas(df[["combined"]])
train_dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * 0.05)))

test_dataset = Dataset.from_pandas(test_df[["combined"]])
test_dataset = dataset.shuffle(seed=42).select(range(int(len(test_dataset) * 0.05)))

In [75]:
print(train_dataset.shape)
print(test_dataset.shape)

(9093, 2)
(759, 2)


## Training Arguments Setup

In [76]:
def tokenize_function(examples):
    tokenized_output = tokenizer(examples["combined"], truncation=True, padding="max_length", max_length=512)
    tokenized_output["labels"] = tokenized_output["input_ids"].copy() # Add labels
    return tokenized_output

tokenizer.pad_token = tokenizer.eos_token

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9093 [00:00<?, ? examples/s]

Map:   0%|          | 0/759 [00:00<?, ? examples/s]

In [80]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-pseudocode-finetuned",  # folder to save checkpoints
    overwrite_output_dir=True,
    eval_strategy="epoch",               # evaluate once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,             # keep small for Colab
    per_device_eval_batch_size=8,
    num_train_epochs=3,                        # you can increase later
    weight_decay=0.01,
    logging_dir="./logs",                      # for tensorboard
    logging_steps=100,
    save_strategy="epoch",
    save_total_limit=2,                        # keep last 2 checkpoints
    load_best_model_at_end=True,
    fp16=True,                                 # if using GPU with mixed precision
)


In [81]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test if "tokenized_test" in locals() else None,
)

In [82]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0502,0.042919
2,0.0434,0.039034
3,0.043,0.037725


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3411, training_loss=0.04748497707198632, metrics={'train_runtime': 1499.1022, 'train_samples_per_second': 18.197, 'train_steps_per_second': 2.275, 'total_flos': 7127785340928000.0, 'train_loss': 0.04748497707198632, 'epoch': 3.0})

In [83]:
trainer.save_model("./gpt2_pseudocode_model")
tokenizer.save_pretrained("./gpt2_pseudocode_model")


('./gpt2_pseudocode_model/tokenizer_config.json',
 './gpt2_pseudocode_model/special_tokens_map.json',
 './gpt2_pseudocode_model/vocab.json',
 './gpt2_pseudocode_model/merges.txt',
 './gpt2_pseudocode_model/added_tokens.json',
 './gpt2_pseudocode_model/tokenizer.json')

In [84]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

model_path = "./gpt2_pseudocode_model"

tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)


In [93]:
def generate_code(pseudo_code):
    prompt = f"<|pseudo|>\n{pseudo_code}\n<|code|>\n"
    encodings = tokenizer(prompt, return_tensors="pt", padding=True)

    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)



print(generate_code("for i from 0 to 5 print i"))



for i from 0 to 5 print i

for (int i = 0; i < 5; i++) { cout << i << endl; }


In [94]:
sample_rows = test_df.sample(3, random_state=42)


In [97]:
for i, row in test_df.sample(5).iterrows():
    pseudo = row["text"]
    gt_code = row["code"]

    print(f"\t--- Psudo & Generated Code ---\n{generate_code(pseudo)}")
    print(f"\t--- Ground Truth Code ---\n{gt_code}")
    print("-" * 80)

	--- Psudo & Generated Code ---

read n read p

for (int i = 0; i < n; i++) cin >> n >> p[i];
	--- Ground Truth Code ---
cin >> n >> p;
--------------------------------------------------------------------------------
	--- Psudo & Generated Code ---

s = string

string s;
	--- Ground Truth Code ---
string s;
--------------------------------------------------------------------------------
	--- Psudo & Generated Code ---

i = t - 1

i = t - 1;
	--- Ground Truth Code ---
i = t - 1;
--------------------------------------------------------------------------------
	--- Psudo & Generated Code ---

read k, b, n, t

for (int k = 0; k < k + 1; k++) { cin >> k >> b >> n >> t; }
	--- Ground Truth Code ---
cin >> k >> b >> n >> t;
--------------------------------------------------------------------------------
	--- Psudo & Generated Code ---

increase temp by (i - 1) * a

temp += (i - 1) * a;
	--- Ground Truth Code ---
temp += (i - 1) * a;
------------------------------------------------------------

## Post Evaluation

In [101]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [104]:
import evaluate

bleu = evaluate.load("bleu")

# Collect predictions
preds, refs = [], []

for i, row in test_df.sample(100).iterrows():
    pseudo = row["text"]
    gt_code = row["code"]

    gen_code = generate_code(pseudo)

    preds.append(gen_code)
    refs.append([gt_code])

# Compute BLEU
results = bleu.compute(predictions=preds, references=refs)
print(f"BLEU Score: {results['bleu']:.4f}")

BLEU Score: 0.4195
