In [1]:
!pip install transformers datasets sacrebleu evaluate sentence-transformers nltk rouge-score

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from datasets import load_dataset
import random
import re
import json
import os
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
import re
from datasets import load_dataset

language = "python"
dataset = load_dataset("code_search_net", language, trust_remote_code=True)

# Convert to Pandas DataFrame and select relevant columns
train_df = dataset['train'].to_pandas()[['func_code_string', 'func_documentation_string']]
test_df = dataset['test'].to_pandas()[['func_code_string', 'func_documentation_string']]
validation_df = dataset['validation'].to_pandas()[['func_code_string', 'func_documentation_string']]

print(f"Initial dataset sizes - Train: {len(train_df)}, Test: {len(test_df)}, Validation: {len(validation_df)}")

# Function to remove comments from code
def remove_comments(code):
    code = re.sub(r'#.*', '', code)  # Remove single-line comments
    code = re.sub(r'""".*?"""|\'\'\'.*?\'\'\'', '', code, flags=re.DOTALL)  # Remove multi-line comments
    return code

# Apply comment removal
train_df['func_code_string'] = train_df['func_code_string'].apply(remove_comments)
test_df['func_code_string'] = test_df['func_code_string'].apply(remove_comments)
validation_df['func_code_string'] = validation_df['func_code_string'].apply(remove_comments)

# Function to check if a string is ASCII
def isASCII(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

# Filter for ASCII documentation strings
train_df = train_df[train_df['func_documentation_string'].apply(isASCII)]
test_df = test_df[test_df['func_documentation_string'].apply(isASCII)]
validation_df = validation_df[validation_df['func_documentation_string'].apply(isASCII)]

print(f"After filtering non-ASCII - Train: {len(train_df)}, Test: {len(test_df)}, Validation: {len(validation_df)}")

# Remove empty documentation strings
train_df = train_df[~(train_df['func_documentation_string'] == '')]
test_df = test_df[~(test_df['func_documentation_string'] == '')]
validation_df = validation_df[~(validation_df['func_documentation_string'] == '')]

print(f"After removing empty documentation - Train: {len(train_df)}, Test: {len(test_df)}, Validation: {len(validation_df)}")

# Remove duplicates in documentation strings
train_df = train_df[~train_df['func_documentation_string'].duplicated()]
test_df = test_df[~test_df['func_documentation_string'].duplicated()]
validation_df = validation_df[~validation_df['func_documentation_string'].duplicated()]

print(f"After removing duplicates - Train: {len(train_df)}, Test: {len(test_df)}, Validation: {len(validation_df)}")

# Reduce the test set size by a factor of 40
reduced_size = max(1, len(test_df) // 40)
test_df = test_df.head(reduced_size)

print(f"Final test set size after reduction: {len(test_df)}")

# Create a list of tuples from the reduced test set
codesearchnet_samples = list(zip(test_df['func_code_string'], test_df['func_documentation_string']))


Initial dataset sizes - Train: 412178, Test: 22176, Validation: 23107
After filtering non-ASCII - Train: 406508, Test: 22014, Validation: 22656
After removing empty documentation - Train: 406508, Test: 22014, Validation: 22656
After removing duplicates - Train: 385782, Test: 21342, Validation: 21905
Final test set size after reduction: 533


In [7]:
import json
import pandas as pd
import os

# Download and unzip the dataset
!wget http://www.phontron.com/download/conala-corpus-v1.1.zip -O conala-corpus-v1.1.zip
!unzip -o conala-corpus-v1.1.zip -d conala_dataset

# Load the dataset
conala_test_path = '/content/conala_dataset/conala-corpus/conala-test.json'
with open(conala_test_path, 'r', encoding='utf-8') as f:
    conala_data = json.load(f)

# Convert to DataFrame
conala_df = pd.DataFrame(conala_data)
print(f"Initial dataset size: {len(conala_df)}")

# Select relevant columns and drop missing values
conala_df = conala_df[['snippet', 'rewritten_intent']]
conala_df = conala_df.dropna()
print(f"After dropping NaN values: {len(conala_df)}")

# Function to check if a string is ASCII
def isASCII(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

# Filter for ASCII documentation strings
conala_df = conala_df[conala_df['rewritten_intent'].apply(isASCII)]
print(f"After filtering non-ASCII intents: {len(conala_df)}")

# Remove empty intents
conala_df = conala_df[~(conala_df['rewritten_intent'] == '')]
print(f"After removing empty intents: {len(conala_df)}")

# Remove duplicate intents
conala_df = conala_df[~conala_df['rewritten_intent'].duplicated()]
print(f"After removing duplicate intents: {len(conala_df)}")

# Create a list of tuples for the final dataset
conala_samples = list(zip(conala_df['snippet'], conala_df['rewritten_intent']))
print(f"Final dataset size (number of samples): {len(conala_samples)}")


--2024-12-22 15:56:57--  http://www.phontron.com/download/conala-corpus-v1.1.zip
Resolving www.phontron.com (www.phontron.com)... 173.236.247.185
Connecting to www.phontron.com (www.phontron.com)|173.236.247.185|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 52105440 (50M) [application/zip]
Saving to: ‘conala-corpus-v1.1.zip’


2024-12-22 15:56:58 (36.1 MB/s) - ‘conala-corpus-v1.1.zip’ saved [52105440/52105440]

Archive:  conala-corpus-v1.1.zip
   creating: conala_dataset/conala-corpus/
  inflating: conala_dataset/conala-corpus/conala-mined.jsonl  
  inflating: conala_dataset/conala-corpus/conala-train.json  
  inflating: conala_dataset/conala-corpus/conala-test.json  
Initial dataset size: 500
After dropping NaN values: 477
After filtering non-ASCII intents: 476
After removing empty intents: 476
After removing duplicate intents: 450
Final dataset size (number of samples): 450


In [None]:
from huggingface_hub import snapshot_download, login
login(token="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")

In [None]:

model_names = {
    "CodeT5": "Salesforce/codet5-base"
}
#source: https://huggingface.co/Salesforce/codet5-base

tokenizers = {}
models = {}

# CodeT5 (Seq2Seq)
tokenizers["CodeT5"] = AutoTokenizer.from_pretrained(model_names["CodeT5"])
models["CodeT5"] = AutoModelForSeq2SeqLM.from_pretrained(model_names["CodeT5"]).to(device)



tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
def generate_comment(model_name, code_snippet):
    prompt = "'''Explain the following code:\n" + code_snippet + "\n'''"
    if model_name == "CodeT5":
        input_ids = tokenizers[model_name].encode(prompt, return_tensors="pt").to(device)
        outputs = models[model_name].generate(
            input_ids,
            max_length=100,
            num_beams=4,
            early_stopping=True
        )
        comment = tokenizers[model_name].decode(outputs[0], skip_special_tokens=True)
        return comment.strip()
    else:
        input_ids = tokenizers[model_name].encode(prompt, return_tensors="pt").to(device)
        outputs = models[model_name].generate(
            input_ids,
            max_length=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
        full_text = tokenizers[model_name].decode(outputs[0], skip_special_tokens=True)
        parts = full_text.split("'''")
        if len(parts) > 2:
            comment = parts[-1].strip()
        else:
            comment = full_text
        return comment



In [None]:

rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# For BLEU-2, we use nltk:
# weights = (0.5, 0.5) for up to 2-grams
def compute_bleu_2(hypotheses, references):
    smoothie = SmoothingFunction().method4
    scores = []
    for hyp, ref in zip(hypotheses, references):
        ref_tokens = nltk.word_tokenize(ref)
        hyp_tokens = nltk.word_tokenize(hyp)
        # BLEU-2 weights
        score = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.5,0.5), smoothing_function=smoothie)
        scores.append(score)
    return sum(scores)/len(scores) if scores else 0.0

# For embeddings and cosine similarity:
# We'll choose microsoft/codebert-base as embedding model.
embedding_model = SentenceTransformer('microsoft/codebert-base')

def compute_cosine_similarity(hypotheses, references):
    hyp_embeddings = embedding_model.encode(hypotheses, convert_to_tensor=True)
    ref_embeddings = embedding_model.encode(references, convert_to_tensor=True)
    sims = cosine_similarity(hyp_embeddings.cpu().numpy(), ref_embeddings.cpu().numpy())
    # Compute average diagonal similarity (pairwise)
    diagonal_sims = [sims[i, i] for i in range(len(hypotheses))]
    return sum(diagonal_sims)/len(diagonal_sims) if diagonal_sims else 0.0

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
def evaluate_model(model_label, samples):
    predictions = []
    references = []
    for (code, ref_comment) in samples:
        pred_comment = generate_comment(model_label, code)
        predictions.append(pred_comment)
        references.append(ref_comment)

    # Compute metrics
    # BLEU-2:
    bleu_2 = compute_bleu_2(predictions, references)

    # ROUGE (we can get rouge1, rouge2, rougeL from evaluate)
    rouge_results = rouge.compute(predictions=predictions, references=references)
    rouge_1 = rouge_results['rouge1']
    rouge_2 = rouge_results['rouge2']
    rouge_l = rouge_results['rougeL']

    # METEOR
    meteor_result = meteor.compute(predictions=predictions, references=references)
    meteor_score = meteor_result['meteor']

    # Cosine Similarity
    cosine_sim = compute_cosine_similarity(predictions, references)

    print(f"\nEvaluating model: {model_label}")
    print(f"  BLEU-2: {bleu_2*100:.2f}")  # BLEU is normally between 0 and 1, multiply by 100 for readability
    print(f"  ROUGE-1: {rouge_1:.2f}")
    print(f"  ROUGE-2: {rouge_2:.2f}")
    print(f"  ROUGE-L: {rouge_l:.2f}")
    print(f"  METEOR: {meteor_score:.2f}")
    print(f"  Cosine Similarity: {cosine_sim:.2f}")

    # Print some sample predictions
    for i in range(min(2, len(samples))):
        print("Code:\n", samples[i][0][:100], "...")
        print("Reference:", samples[i][1])
        print("Prediction:", predictions[i], "\n")

# Evaluate on CodeSearchNet reduced test set
for model_label in model_names.keys():
    evaluate_model(model_label, codesearchnet_samples)

# Evaluate on CoNaLa test set
for model_label in model_names.keys():
    evaluate_model(model_label, conala_samples)

Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors



Evaluating model: CodeT5
  BLEU-2: 1.08
  ROUGE-1: 0.12
  ROUGE-2: 0.03
  ROUGE-L: 0.11
  METEOR: 0.03
  Cosine Similarity: 0.91
Code:
 def get_vid_from_url(url):
        
        return match1(url, r'youtu\.be/([^?/]+)') or \
          ...
Reference: Extracts video ID from URL.
Prediction: match1(url, r'youtu\.be/([^/?]+)') or \
          parse_query_param(parse_query_param(url,or \
          parse_query_param(parse_query_param(url,or \
          parse_query_param(parse_query_param(url,or \
          parse_query_param(parse_query_param(url, 'u'), 

Code:
 def sina_xml_to_url_list(xml_data):
    
    rawurl = []
    dom = parseString(xml_data)
    for nod ...
Reference: str->list
    Convert XML to URL List.
    From Biligrab.
Prediction: url_list = sina_xml_to_url_list(xml_data)url_list = sina_xml_to_url_list(xml_data)url_list.append(url_list)url_list.append(url_list.pop(0)).dataurl_list.append(url_list.pop(0))
    
   url_list.append(url_list. 


Evaluating model: CodeT5
  BLEU-2: 2