<a href="https://colab.research.google.com/github/theantigone/Fine-Tuning-CodeT5/blob/master/notebooks/1.0-jqt-fine-tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data/metric dependencies: Hugging Face datasets + evaluate, sentencepiece, the sacrebleu metric, and pandas.
!pip install datasets evaluate sentencepiece sacrebleu pandas

!pip install transformers
# tree_sitter is pinned to 0.2.0. NOTE(review): presumably required by the CodeBLEU parser code — confirm.
!pip install tree_sitter==0.2.0
# Quiet clone of CodeXGLUE; its CodeBLEU evaluator (calc_code_bleu.py) is invoked later in this notebook.
!git clone -q https://github.com/microsoft/CodeXGLUE.git

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Co

In [None]:
import pandas as pd
import re
import os
import subprocess
import tempfile
import importlib
import torch

from transformers import T5ForConditionalGeneration, RobertaTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset, DatasetDict
import evaluate

# ------------------------------
# Step 3: Load CSV Files
# ------------------------------
# Read the train / validation / test splits in one pass, in that order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/ft_train.csv",
        "/content/ft_valid.csv",
        "/content/ft_test.csv",
    )
)

# ------------------------------
# Step 4: Mask if Conditions & Flatten Code
# ------------------------------
def flatten_code(code):
    """Return ``code`` on a single line with whitespace runs collapsed.

    Every run of whitespace (spaces, tabs, newlines) becomes one space and
    leading/trailing whitespace is dropped.
    """
    pieces = code.split()
    return " ".join(pieces)

def generate_pattern(target):
    """Build a regex that matches ``target`` regardless of inter-token spacing.

    The target is split on whitespace, each token is regex-escaped, and the
    tokens are joined with ``\\s*`` so that a spaced-out target such as
    ``if a . b ( ) :`` also matches the compact form ``if a.b():``.
    A standalone trailing ``:`` token is re-attached as ``\\s*:``.
    """
    pieces = re.findall(r'\S+', target.strip())

    # Remember whether the colon was its own token before dropping it.
    ends_with_colon_token = bool(pieces) and pieces[-1] == ":"
    if ends_with_colon_token:
        pieces.pop()

    regex = r'\s*'.join(re.escape(piece) for piece in pieces)
    if ends_with_colon_token:
        regex += r'\s*:'
    return regex

def mask_if_conditions(df, df_name="dataset"):
    """Replace each row's target if-statement with ``<mask>:`` in its flattened method.

    Args:
        df: DataFrame with a ``cleaned_method`` column (function source) and a
            ``target_block`` column (the if-statement to hide).
        df_name: Label used only in the skip/warning messages that are printed.

    Returns:
        DataFrame with the columns ``masked_input``, ``target`` and
        ``original_function``, holding one row per input row that was masked
        successfully. The columns are present even when no row qualifies, so
        downstream column selection keeps working on an empty result.
    """
    output_columns = ['masked_input', 'target', 'original_function']
    masked_data = []
    for idx, row in df.iterrows():
        function_code = row['cleaned_method']
        target_if_condition = row['target_block']

        if not isinstance(target_if_condition, str) or not target_if_condition.strip():
            print(f"[{df_name} row {idx}] Skipped: Empty or invalid target_block")
            continue

        # A missing method (e.g. NaN for an empty CSV cell) is not a string and
        # cannot be flattened; skip it instead of crashing the whole run.
        if not isinstance(function_code, str):
            print(f"[{df_name} row {idx}] Skipped: Empty or invalid cleaned_method")
            continue

        raw_condition = target_if_condition.strip()
        flattened_func = flatten_code(function_code)
        pattern = generate_pattern(raw_condition)

        # Mask only the first occurrence; rows where the condition cannot be
        # located are dropped because they would carry no <mask> token.
        masked_func, count = re.subn(pattern, "<mask>:", flattened_func, count=1)
        if count == 0:
            print(f"[{df_name} row {idx}] Warning: Condition not found or not replaced")
            continue

        masked_data.append({
            'masked_input': masked_func,
            'target': raw_condition,
            'original_function': function_code
        })
    return pd.DataFrame(masked_data, columns=output_columns)

# Mask every split; the label only tags the messages printed while masking.
masked_train_df, masked_val_df, masked_test_df = (
    mask_if_conditions(frame, df_name=label)
    for frame, label in ((train_df, "train"), (val_df, "val"), (test_df, "test"))
)

# ------------------------------
# Step 5: Convert to Hugging Face Datasets
# ------------------------------
# Only the model input and its target are carried over to the Hugging Face datasets.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame[['masked_input', 'target']])
    for frame in (masked_train_df, masked_val_df, masked_test_df)
)

dataset = DatasetDict({
    "train": hf_train,
    "validation": hf_val,
    "test": hf_test
})

# ------------------------------
# Step 6: Load Pre-trained CodeT5 Model & Tokenizer
# ------------------------------
model_checkpoint = "Salesforce/codet5-small"
# Tokenizer and model weights are both loaded from the same checkpoint name.
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# ------------------------------
# Step 7: Tokenize the Dataset
# ------------------------------
def preprocess_function(examples):
    """Tokenize masked inputs and their target conditions for seq2seq training.

    Args:
        examples: Mapping with ``masked_input`` and ``target`` entries (a batch
            of strings when used with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 256 tokens)
        with an added ``labels`` entry holding the target token ids
        (padded/truncated to 128 tokens). Padding positions in ``labels`` are
        set to -100 so the loss ignores them instead of rewarding the model
        for predicting padding.
    """
    inputs = examples["masked_input"]
    targets = examples["target"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _ignore_padding(token_ids):
        # -100 is the label value the cross-entropy loss skips.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_ignore_padding(ids) for ids in label_ids]
    else:
        # Single-example call: a flat id sequence.
        model_inputs["labels"] = _ignore_padding(label_ids)
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# ------------------------------
# Step 8: Training Arguments & Trainer Setup
# ------------------------------
# Mixed precision needs a CUDA device; enabling it unconditionally makes the
# setup fail on a CPU-only runtime, so gate it on availability.
use_cuda = torch.cuda.is_available()
print("CUDA available:", use_cuda)

training_args = TrainingArguments(
    output_dir="./codet5-finetuned",
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch once the installed version is pinned.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    # Effective train batch per device = 64 * 2 accumulation steps.
    gradient_accumulation_steps=2,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    # Keep the checkpoint with the lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
    fp16=use_cuda
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    # Stop when the validation metric has not improved for 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# ------------------------------
# Step 9: Train the Model
# ------------------------------
# Run fine-tuning with the arguments and callbacks given to the Trainer.
trainer.train()

# ------------------------------
# Step 10: Evaluate on Test Set
# ------------------------------
# Score the held-out test split with the trained model and show the metrics dict.
metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test Evaluation Metrics:", metrics)

# ------------------------------
# Step 11: Generate Predictions and Save Results with CodeBLEU Scores
# ------------------------------
sacrebleu_metric = evaluate.load("sacrebleu")

results = []
refs_list = []
hyps_list = []

# Make sure dropout is disabled while generating predictions.
model.eval()

for _, row in masked_test_df.iterrows():
    input_text = row["masked_input"]
    expected_if = row["target"]

    # Generate predicted condition
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(**inputs, max_length=128)
    predicted_if = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Reconstruct the full function with the predicted if condition inserted.
    # The training targets already end with ':' (the whole "if ...:" was masked),
    # so only append a colon when the prediction lacks one; appending it
    # unconditionally produced "::" in every hypothesis.
    predicted_stmt = predicted_if if predicted_if.endswith(":") else predicted_if + ":"

    # IMPORTANT: Flatten both sides so reference and hypothesis share the same
    # single-line format (the CodeBLEU files hold exactly one sample per line).
    reference_full = flatten_code(row['original_function'])
    hypothesis_full = flatten_code(input_text.replace('<mask>:', predicted_stmt))

    # Save to lists for CodeBLEU evaluation
    refs_list.append(reference_full.strip())
    hyps_list.append(hypothesis_full.strip())

    # Exact match and sentence-level BLEU-4 on the condition alone.
    exact_match = (predicted_if.strip() == expected_if.strip())
    bleu_result = sacrebleu_metric.compute(predictions=[predicted_if], references=[[expected_if]])
    bleu_score = bleu_result["score"]

    # The CodeBLEU column is a placeholder here; it is filled in after the
    # evaluator has been run over all samples.
    results.append({
        "Input function with masked if condition": input_text,
        "Whether the prediction is correct": exact_match,
        "Expected if condition": expected_if,
        "Predicted if condition": predicted_if,
        "CodeBLEU prediction score": None,
        "BLEU-4 prediction score": bleu_score
    })

results_df = pd.DataFrame(results)

# Write all references and hypotheses to files for CodeBLEU evaluation:
# one sample per line, in the same order in both files.
refs_path = "/content/all_targets.txt"
hyps_path = "/content/all_predictions.txt"

# Explicit UTF-8 so the files do not depend on the platform default encoding.
with open(refs_path, "w", encoding="utf-8") as f_ref, open(hyps_path, "w", encoding="utf-8") as f_pred:
    for ref, hyp in zip(refs_list, hyps_list):
        f_ref.write(ref + "\n")
        f_pred.write(hyp + "\n")

# Build the command to run the CodeBLEU evaluator from its own directory.
# The file paths come from the variables above so the writer and the evaluator
# cannot drift apart. Adjust the --lang flag as needed (Python code here).
cmd = (
    "cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && "
    f"python calc_code_bleu.py --refs {refs_path} --hyp {hyps_path} --lang python --params 0.25,0.25,0.25,0.25"
)

result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
# Fall back to stderr when the evaluator printed nothing on stdout (e.g. it failed).
output = result.stdout.strip() if result.stdout.strip() else result.stderr.strip()
print("CodeBLEU evaluator output:\n", output)

# Parse the CodeBLEU score from the evaluator output.
m = re.search(r"CodeBLEU score:\s+([0-9.]+)", output)
if m:
    global_codebleu_score = float(m.group(1))
else:
    # A missing score means the evaluator did not run to completion; record NaN
    # rather than 0.0 so the failure is not mistaken for a real result.
    print(f"Warning: could not parse a CodeBLEU score (evaluator exit code {result.returncode}); recording NaN.")
    global_codebleu_score = float("nan")

# The evaluator reports one corpus-level score, so every row receives the same value.
results_df["CodeBLEU prediction score"] = global_codebleu_score

results_df.to_csv("testset-results.csv", index=False)
print("Test set results saved as 'testset-results.csv'.")

results_df.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(


CUDA available: True


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtmtran03[0m ([33mtmtran03-college-of-william-and-mary[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0864,0.072825
2,0.0808,0.070276
3,0.0765,0.068876
4,0.073,0.068293
5,0.0723,0.068212


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Test Evaluation Metrics: {'eval_loss': 0.07172377407550812, 'eval_runtime': 18.84, 'eval_samples_per_second': 265.393, 'eval_steps_per_second': 4.193, 'epoch': 5.0}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

CodeBLEU evaluator output:
 ngram match: 0.8403698784297634, weighted ngram match: 0.9028705687141984, syntax_match: 0.9057959361283487, dataflow_match: 0.878970376226573
CodeBLEU score:  0.8820016898747209
Test set results saved as 'testset-results.csv'.


Unnamed: 0,Input function with masked if condition,Whether the prediction is correct,Expected if condition,Predicted if condition,CodeBLEU prediction score,BLEU-4 prediction score
0,"def read(self, count=True, timeout=None, ignor...",False,if ignore_timeouts and is_timeout ( e ) :,if ignore_timeouts and is_timeouts ( e ) :,0.882002,73.488892
1,"def _cache_mem(curr_out, prev_mem, mem_len, re...",True,if prev_mem is None :,if prev_mem is None :,0.882002,100.0
2,def filtered(gen): for example in gen: example...,True,if example_len > max_length :,if example_len > max_length :,0.882002,100.0
3,"def search(self, query): # ""Search.ashx?query=...",False,"if item . get ( ""type"" , """" ) == ""audio"" :","if item [ ""guide_id"" ] not in self . _stations :",0.882002,5.680778
4,"def _check_script(self, script, directive): fo...",False,"if var . must_contain ( ""/"" ) :",if not var . can_contain ( directive ) :,0.882002,19.358307


In [None]:
# Spot-check one masked training example, selected by index label (raises KeyError if that label is absent).
masked_train_df.loc[49712]

Unnamed: 0,49712
masked_input,"def _check_compact(buf): ndim = len(buf.shape) size = tvm.tir.const(1, buf.shape[0].dtype) for i in reversed(range(ndim)): <mask>: raise RuntimeError( ""Cannot prove compact: shape=%s, strides=%s"" % (buf.shape, buf.strides) ) size = size * buf.shape[i]"
target,"if not util . equal_const_int ( size - buf . strides [ i ] , 0 ) :"
original_function,"def _check_compact(buf):\n ndim = len(buf.shape)\n size = tvm.tir.const(1, buf.shape[0].dtype)\n for i in reversed(range(ndim)):\n if not util.equal_const_int(size - buf.strides[i], 0):\n raise RuntimeError(\n ""Cannot prove compact: shape=%s, strides=%s"" % (buf.shape, buf.strides)\n )\n size = size * buf.shape[i]\n"


In [None]:
# Sanity check: collect any training rows whose masked input lacks the "<mask>" marker.
has_mask_marker = masked_train_df['masked_input'].str.contains("<mask>")
df_no_mask = masked_train_df[~has_mask_marker]

df_no_mask

Unnamed: 0,masked_input,target,original_function
