In [1]:
!pip install transformers datasets sacrebleu evaluate sentence-transformers nltk rouge-score

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.

In [2]:
import io
import json
import os
import random
import re
import tokenize

import evaluate
import nltk
import pandas as pd
import torch
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
# NLTK data needed by the metric code further down.
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases make nltk.word_tokenize look up 'punkt_tab' rather than 'punkt'.
# Fetch it explicitly so BLEU tokenization does not depend on another library
# happening to download it first. On older NLTK this just reports an unknown package.
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
language = "python"
dataset = load_dataset("code_search_net", language, trust_remote_code=True)

# From every split keep only the function source and its docstring.
_kept_columns = ['func_code_string', 'func_documentation_string']
train_df, test_df, validation_df = (
    dataset[split_name].to_pandas()[_kept_columns]
    for split_name in ('train', 'test', 'validation')
)

def _remove_comments_regex(code):
    """Regex-based stripping, used only when ``code`` cannot be tokenized."""
    code = re.sub(r'#.*', '', code)  # Remove single-line comments
    code = re.sub(r'""".*?"""|\'\'\'.*?\'\'\'', '', code, flags=re.DOTALL)  # Remove multi-line comments
    return code


def _comment_spans(code):
    """Return ``(start, end)`` character spans of comments and docstrings in ``code``.

    Raises ``tokenize.TokenError``/``SyntaxError`` when the snippet cannot be
    tokenized and ``ValueError`` when a token position does not map back onto
    the source text.
    """
    tokens = list(tokenize.generate_tokens(io.StringIO(code).readline))

    # Offset of the first character of every line, split exactly as tokenize saw them.
    line_starts = [0]
    for line in io.StringIO(code):
        line_starts.append(line_starts[-1] + len(line))

    def to_offset(position):
        row, col = position
        return line_starts[row - 1] + col

    skippable = (tokenize.COMMENT, tokenize.NL)
    spans = []
    at_statement_start = True
    for index, tok in enumerate(tokens):
        removable = False
        if tok.type == tokenize.COMMENT:
            removable = True
        elif tok.type == tokenize.STRING and at_statement_start:
            # A triple-quoted string that forms a whole statement is a docstring;
            # triple-quoted strings used as values are left alone.
            is_triple = tok.string.lstrip('rRbBuUfF')[:3] in ('"""', "'''")
            look = index + 1
            while look < len(tokens) and tokens[look].type in skippable:
                look += 1
            ends_statement = (
                look >= len(tokens)
                or tokens[look].type in (tokenize.NEWLINE, tokenize.ENDMARKER)
            )
            removable = is_triple and ends_statement

        if removable:
            start, end = to_offset(tok.start), to_offset(tok.end)
            if code[start:end] != tok.string:
                raise ValueError("token position does not match source text")
            spans.append((start, end))

        if tok.type in (tokenize.NEWLINE, tokenize.INDENT, tokenize.DEDENT):
            at_statement_start = True
        elif tok.type not in skippable:
            at_statement_start = False
    return spans


def remove_comments(code):
    """Strip ``#`` comments and triple-quoted docstrings from Python source.

    The source is tokenized so that ``#`` characters and triple quotes that
    live inside ordinary string literals (URL fragments, colour codes,
    multi-line string values) are preserved; only real comment tokens and
    triple-quoted strings that stand alone as a statement are deleted.
    All other text, including whitespace, is left exactly as it was.

    Snippets that the tokenizer rejects (e.g. unbalanced brackets or
    inconsistent indentation) fall back to plain regex stripping.

    Args:
        code: Python source text.

    Returns:
        The source with comments and docstrings removed.
    """
    try:
        spans = _comment_spans(code)
    except (tokenize.TokenError, SyntaxError, ValueError, IndexError):
        return _remove_comments_regex(code)

    kept = []
    cursor = 0
    for start, end in spans:
        kept.append(code[cursor:start])
        cursor = end
    kept.append(code[cursor:])
    return ''.join(kept)

# Strip comments and docstrings from the code column of every split, so the
# reference description is not visible in the model input.
for _split_frame in (train_df, test_df, validation_df):
    _split_frame['func_code_string'] = _split_frame['func_code_string'].apply(remove_comments)

def isASCII(s):
    """Return True when every character of ``s`` is 7-bit ASCII.

    Uses ``str.isascii()`` instead of an encode/decode round trip, so strings
    that cannot be UTF-8 encoded (lone surrogates) are reported as non-ASCII
    rather than raising ``UnicodeEncodeError``. The empty string counts as ASCII.

    Args:
        s: Text to check.

    Returns:
        bool: ``True`` if ``s`` contains only ASCII characters.
    """
    return s.isascii()

def _keep_usable_docs(frame, column='func_documentation_string'):
    """Drop rows whose docstring is non-ASCII, empty, or a repeat of an earlier row."""
    frame = frame[frame[column].apply(isASCII)]
    frame = frame[frame[column] != '']
    return frame[~frame[column].duplicated()]


train_df = _keep_usable_docs(train_df)
test_df = _keep_usable_docs(test_df)
validation_df = _keep_usable_docs(validation_df)

# Evaluate on the first 1/40th of the cleaned test split (at least one row).
reduced_size = max(1, len(test_df) // 40)
test_df = test_df.iloc[:reduced_size]

# (code, reference docstring) pairs consumed by the evaluation loop.
codesearchnet_samples = list(
    zip(test_df['func_code_string'], test_df['func_documentation_string'])
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

In [5]:

!wget http://www.phontron.com/download/conala-corpus-v1.1.zip -O conala-corpus-v1.1.zip
!unzip -o conala-corpus-v1.1.zip -d conala_dataset

conala_test_path = '/content/conala_dataset/conala-corpus/conala-test.json'
with open(conala_test_path, 'r', encoding='utf-8') as f:
    conala_data = json.load(f)

conala_df = pd.DataFrame(conala_data)
conala_df = conala_df[['snippet','rewritten_intent']]
conala_df = conala_df.dropna()
conala_df = conala_df[conala_df['rewritten_intent'].apply(isASCII)]
conala_df = conala_df[~(conala_df['rewritten_intent'] == '')]
conala_df = conala_df[~conala_df['rewritten_intent'].duplicated()]

conala_samples = list(zip(conala_df['snippet'], conala_df['rewritten_intent']))

--2024-12-19 17:31:08--  http://www.phontron.com/download/conala-corpus-v1.1.zip
Resolving www.phontron.com (www.phontron.com)... 173.236.247.185
Connecting to www.phontron.com (www.phontron.com)|173.236.247.185|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 52105440 (50M) [application/zip]
Saving to: ‘conala-corpus-v1.1.zip’


2024-12-19 17:31:08 (127 MB/s) - ‘conala-corpus-v1.1.zip’ saved [52105440/52105440]

Archive:  conala-corpus-v1.1.zip
   creating: conala_dataset/conala-corpus/
  inflating: conala_dataset/conala-corpus/conala-mined.jsonl  
  inflating: conala_dataset/conala-corpus/conala-train.json  
  inflating: conala_dataset/conala-corpus/conala-test.json  


In [6]:
from huggingface_hub import snapshot_download, login

# Never commit access tokens to a notebook: read the token from the HF_TOKEN
# environment variable instead. When it is unset, login() receives None and
# falls back to its interactive prompt.
# NOTE(review): the token that was previously hardcoded here has been exposed
# and should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [7]:

# Models under evaluation, keyed by the label used in the printed results.
# Llama 3.2 1B is a gated checkpoint: access must be requested and the license accepted.
# source: https://huggingface.co/meta-llama/Llama-3.2-1B
model_names = {"Llama": "meta-llama/Llama-3.2-1B"}

# Llama is a decoder-only (causal) language model.
tokenizers = {
    "Llama": AutoTokenizer.from_pretrained(model_names["Llama"], use_fast=False),
}
models = {
    "Llama": AutoModelForCausalLM.from_pretrained(model_names["Llama"]).to(device),
}

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [8]:
def generate_comment(model_name, code_snippet):
    """Generate a natural-language description of ``code_snippet``.

    The snippet is wrapped in a triple-quoted "Explain the following code"
    prompt and passed to the model registered under ``model_name`` in the
    module-level ``models`` / ``tokenizers`` dictionaries.

    * ``"CodeT5"`` is treated as an encoder-decoder model: beam search, and the
      whole decoded output is the comment.
    * Every other model is treated as a decoder-only model: nucleus sampling,
      and only the tokens generated *after* the prompt are decoded. The
      generated text is therefore returned in full even when it contains
      ``'''`` itself, and the prompt can never leak into the result (the
      previous split-on-``'''`` approach failed in both situations).

    Args:
        model_name: Key into ``models`` and ``tokenizers``.
        code_snippet: Source code to describe.

    Returns:
        str: The generated comment with surrounding whitespace stripped.
    """
    prompt = "'''Explain the following code:\n" + code_snippet + "\n'''"
    tokenizer = tokenizers[model_name]
    model = models[model_name]

    # Ensure the tokenizer has a pad_token
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # A newly added token needs a matching row in the embedding matrix,
            # otherwise its id is out of range for the model.
            model.resize_token_embeddings(len(tokenizer))

    # Tokenize the prompt; calling the tokenizer directly replaces the
    # deprecated encode_plus() and also returns the attention mask.
    encoded_input = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,  # Add padding if necessary
        truncation=True  # Truncate long inputs
    )
    input_ids = encoded_input["input_ids"].to(device)
    attention_mask = encoded_input["attention_mask"].to(device)

    if model_name == "CodeT5":
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=100,
            num_beams=4,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id
        )
        # Encoder-decoder output does not repeat the input, so decode all of it.
        generated_ids = outputs[0]
    else:
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
        # Decoder-only output starts with the prompt tokens; drop them.
        generated_ids = outputs[0][input_ids.shape[-1]:]

    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [9]:

# Metric objects from the `evaluate` library, loaded once and reused for every model.
rouge, meteor = (evaluate.load(metric_name) for metric_name in ("rouge", "meteor"))

# For BLEU-2, we use nltk:
# weights = (0.5, 0.5) for up to 2-grams
def compute_bleu_2(hypotheses, references):
    """Average sentence-level BLEU-2 over paired hypotheses and references.

    Each pair is word-tokenized with NLTK and scored with equal unigram and
    bigram weights, smoothed with NLTK's ``method4``. Pairing uses ``zip``,
    so surplus items in the longer list are ignored.

    Returns:
        The mean score on a 0-1 scale, or ``0.0`` when there are no pairs.
    """
    smoothing = SmoothingFunction().method4
    bigram_weights = (0.5, 0.5)

    pair_scores = [
        sentence_bleu(
            [nltk.word_tokenize(reference)],
            nltk.word_tokenize(hypothesis),
            weights=bigram_weights,
            smoothing_function=smoothing,
        )
        for hypothesis, reference in zip(hypotheses, references)
    ]

    if not pair_scores:
        return 0.0
    return sum(pair_scores) / len(pair_scores)

# Embedding model used by compute_cosine_similarity() to compare predictions with references.
# microsoft/codebert-base is loaded through the SentenceTransformer wrapper.
# NOTE(review): codebert-base does not appear to be published as a sentence-transformers
# checkpoint, so the wrapper presumably adds a default pooling layer - confirm that the
# resulting embeddings are meaningful for similarity before relying on the absolute scores.
embedding_model = SentenceTransformer('microsoft/codebert-base')

def compute_cosine_similarity(hypotheses, references):
    """Mean cosine similarity between each hypothesis and its own reference.

    Both lists are embedded with the module-level ``embedding_model`` and the
    i-th hypothesis is compared with the i-th reference only. The similarity is
    computed row by row, so no N x N matrix is built just to read its diagonal.

    Args:
        hypotheses: Generated texts.
        references: Reference texts, aligned with ``hypotheses`` and of the
            same length.

    Returns:
        float: Average pairwise cosine similarity, or ``0.0`` when
        ``hypotheses`` is empty.
    """
    # Nothing to embed: return the neutral value instead of failing in the encoder.
    if len(hypotheses) == 0:
        return 0.0

    hyp_embeddings = embedding_model.encode(hypotheses, convert_to_tensor=True)
    ref_embeddings = embedding_model.encode(references, convert_to_tensor=True)

    # dim=1 compares matching rows, i.e. exactly the diagonal of the full matrix.
    pair_sims = torch.nn.functional.cosine_similarity(hyp_embeddings, ref_embeddings, dim=1)
    return pair_sims.mean().item()


def evaluate_model(model_label, samples):
    """Generate comments for ``samples`` and report text-similarity metrics.

    For every ``(code, reference)`` pair a comment is generated with
    ``generate_comment``; predictions are then scored against the references
    with BLEU-2, ROUGE-1/2/L, METEOR and embedding cosine similarity. The
    scores and the first two examples are printed, and the scores are also
    returned so that runs can be compared or tabulated.

    Args:
        model_label: Key of the model to evaluate (passed to ``generate_comment``).
        samples: Sequence of ``(code, reference_comment)`` pairs.

    Returns:
        dict: Metric name -> score on a 0-100 scale, with keys ``bleu_2``,
        ``rouge_1``, ``rouge_2``, ``rouge_l``, ``meteor`` and
        ``cosine_similarity``.

    Raises:
        ValueError: If ``samples`` is empty.
    """
    # Fail early with a clear message instead of deep inside a metric library.
    if len(samples) == 0:
        raise ValueError("samples must contain at least one (code, reference) pair")

    predictions = []
    references = []
    for (code, ref_comment) in samples:
        predictions.append(generate_comment(model_label, code))
        references.append(ref_comment)

    # BLEU-2 is returned on a 0-1 scale; all scores are reported on 0-100.
    bleu_2 = compute_bleu_2(predictions, references) * 100

    # ROUGE (rouge1, rouge2, rougeL from evaluate)
    rouge_results = rouge.compute(predictions=predictions, references=references)
    rouge_1 = rouge_results['rouge1'] * 100
    rouge_2 = rouge_results['rouge2'] * 100
    rouge_l = rouge_results['rougeL'] * 100

    # METEOR
    meteor_result = meteor.compute(predictions=predictions, references=references)
    meteor_score = meteor_result['meteor'] * 100

    # Cosine Similarity
    cosine_sim = compute_cosine_similarity(predictions, references) * 100

    print(f"\nEvaluating model: {model_label}")
    print(f"  BLEU-2: {bleu_2:.2f}")
    print(f"  ROUGE-1: {rouge_1:.2f}")
    print(f"  ROUGE-2: {rouge_2:.2f}")
    print(f"  ROUGE-L: {rouge_l:.2f}")
    print(f"  METEOR: {meteor_score:.2f}")
    print(f"  Cosine Similarity: {cosine_sim:.2f}")

    # Print some sample predictions
    for i in range(min(2, len(samples))):
        print("Code:\n", samples[i][0][:100], "...")
        print("Reference:", samples[i][1])
        print("Prediction:", predictions[i], "\n")

    return {
        "bleu_2": bleu_2,
        "rouge_1": rouge_1,
        "rouge_2": rouge_2,
        "rouge_l": rouge_l,
        "meteor": meteor_score,
        "cosine_similarity": cosine_sim,
    }

# Score every configured model on both benchmarks: first the reduced
# CodeSearchNet test set, then the CoNaLa test set.
for benchmark_samples in (codesearchnet_samples, conala_samples):
    for model_label in model_names:
        evaluate_model(model_label, benchmark_samples)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]


Evaluating model: Llama
  BLEU-2: 0.97
  ROUGE-1: 5.01
  ROUGE-2: 1.09
  ROUGE-L: 3.93
  METEOR: 2.89
  Cosine Similarity: 72.12
Code:
 def get_vid_from_url(url):
        
        return match1(url, r'youtu\.be/([^?/]+)') or \
          ...
Reference: Extracts video ID from URL.
Prediction: def get_vid_from_url(url):
    if url is None:
        return None
    try:
        from urllib.parse import parse_qs
        parsed_url = parse_qs(url)
        vid = parsed_url['v'][0]
    except Exception as e:
        print(e)
    else:
        return vid 

Code:
 def sina_xml_to_url_list(xml_data):
    
    rawurl = []
    dom = parseString(xml_data)
    for nod ...
Reference: str->list
    Convert XML to URL List.
    From Biligrab.
Prediction: # get the root node
    root = parseString(xml_data)
    # get the list of all the children nodes
    children = root.getElementsByTagName('durl')
    # get the list of all the children nodes' children nodes
    for child in children:
        url = chil