In [1]:
# JAVA

In [2]:
!pip install datasets pandas scikit-learn matplotlib torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
from datasets import load_dataset

# Step 2: Data Collection
# Specify the programming language for the dataset
language = "java"  # Choose from 'python', 'java', 'javascript', 'php', 'ruby', 'go'


# Load the dataset
dataset = load_dataset("code_search_net", language)

Generating train split:   0%|          | 0/454451 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/26909 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15328 [00:00<?, ? examples/s]

In [5]:
# Explore the dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})


In [6]:
train_df = dataset['train'].to_pandas()[['func_code_string','func_documentation_string']]
test_df = dataset['test'].to_pandas()[['func_code_string','func_documentation_string']]
validation_df = dataset['validation'].to_pandas()[['func_code_string','func_documentation_string']]

In [7]:
# Access the first example
first_example = train_df.iloc[0]
print("Docstring:\n", first_example['func_documentation_string'])
print()
print("Code:\n", first_example['func_code_string'])


Docstring:
 Parses the {@link NumberFormat} to use from the context arguments.

@param context The context.
@param args The arguments of the macro.

@return the {@link NumberFormat}.

Code:
 private NumberFormat parseFormatter(Context context, Arguments args)
    {
        final String format = args.get(FORMAT_PARAM_NAME);
        final Locale locale = context.get(LOCALE);
        if (format != null)
        {
            return new DecimalFormat(format, DecimalFormatSymbols.getInstance(locale));
        }

        final Mode mode = Mode.loadFromContext(args, this.defaultMode);
        if (Mode.INTEGER.equals(mode))
        {
            return NumberFormat.getIntegerInstance(locale);
        }
        if (Mode.CURRENCY.equals(mode))
        {
            return NumberFormat.getCurrencyInstance(locale);
        }
        if (Mode.PERCENT.equals(mode))
        {
            return NumberFormat.getPercentInstance(locale);
        }
        return NumberFormat.getInstance(locale);
    }


In [8]:
#remove any non-english comments.
# From https://stackoverflow.com/a/27084708/5768407

def isASCII(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

train_df = train_df[train_df['func_documentation_string'].apply(lambda x: isASCII(x))]
test_df = test_df[test_df['func_documentation_string'].apply(lambda x: isASCII(x))]
validation_df = validation_df[validation_df['func_documentation_string'].apply(lambda x: isASCII(x))]

In [9]:
"""
any line with an @ symbol or curly braces, as that will significantlly lessen the amount of learning your model will have to do.
This also works out well since the JavaDoc syntax can usually be autogenerated from the method's signature.
#https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-03-07-How_to_Create_an_Automatic_Code_Comment_Generator_using_Deep_Learning.ipynb#scrollTo=r3sxnOlI7Hg2

"""
import re
from tqdm import tqdm
import pandas as pd

def filter_jdocs(df):
    methods = []
    comments = []
    for i, row in tqdm(list(df.iterrows())):
        comment = row["func_documentation_string"]
        # Remove {} text in comments from https://stackoverflow.com/questions/14596884/remove-text-between-and-in-python/14598135
        comment = re.sub("([\{\[]).*?([\)\}])", '', comment)


        cleaned = []
        for line in comment.split('\n'):
            if "@" in line: break
            cleaned.append(line)
        comments.append('\n'.join(cleaned))
        methods.append(row["func_code_string"])
    new_df = pd.DataFrame(zip(methods, comments), columns = ["func_code_string", "func_documentation_string"])

    return new_df

train_df = filter_jdocs(train_df);
test_df = filter_jdocs(test_df);
validation_df = filter_jdocs(validation_df);

100%|██████████| 440332/440332 [00:04<00:00, 96129.29it/s] 
100%|██████████| 26248/26248 [00:00<00:00, 33413.26it/s]
100%|██████████| 14197/14197 [00:00<00:00, 45705.21it/s]


In [10]:
# remove any empty comments or duplicate comments
train_df = train_df[~(train_df['func_documentation_string'] == '')]
test_df = test_df[~(test_df['func_documentation_string'] == '')]
validation_df = validation_df[~(validation_df['func_documentation_string'] == '')]

In [11]:
train_df = train_df[~train_df['func_documentation_string'].duplicated()]
test_df = test_df[~test_df['func_documentation_string'].duplicated()]
validation_df = validation_df[~validation_df['func_documentation_string'].duplicated()]

In [12]:
len(train_df), len(test_df), len(validation_df)

(331917, 21291, 11101)

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [14]:
print(train_df.iloc[0]['func_code_string'])

private NumberFormat parseFormatter(Context context, Arguments args)
    {
        final String format = args.get(FORMAT_PARAM_NAME);
        final Locale locale = context.get(LOCALE);
        if (format != null)
        {
            return new DecimalFormat(format, DecimalFormatSymbols.getInstance(locale));
        }

        final Mode mode = Mode.loadFromContext(args, this.defaultMode);
        if (Mode.INTEGER.equals(mode))
        {
            return NumberFormat.getIntegerInstance(locale);
        }
        if (Mode.CURRENCY.equals(mode))
        {
            return NumberFormat.getCurrencyInstance(locale);
        }
        if (Mode.PERCENT.equals(mode))
        {
            return NumberFormat.getPercentInstance(locale);
        }
        return NumberFormat.getInstance(locale);
    }


In [15]:
from transformers import AutoTokenizer, AutoModelWithLMHead, SummarizationPipeline


# source: https://huggingface.co/SEBIS/code_trans_t5_base_code_comment_generation_java
pipeline = SummarizationPipeline(
    model=AutoModelWithLMHead.from_pretrained("SEBIS/code_trans_t5_base_code_comment_generation_java"),
    tokenizer=AutoTokenizer.from_pretrained("SEBIS/code_trans_t5_base_code_comment_generation_java", skip_special_tokens=True),
    device=0
)

tokenized_code = train_df.iloc[0]['func_code_string']
pipeline([tokenized_code])[0]['summary_text']


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



config.json:   0%|          | 0.00/621 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/797k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


'Parses the output of the context'

In [16]:
fraction = 1/40
sampled_test_df = test_df.sample(frac=fraction, random_state=42)

In [17]:
len(sampled_test_df)

532

In [18]:
generated_comments = []
reference_comments = []

# Iterate over the test set
for _, row in tqdm(sampled_test_df.iterrows(), total=sampled_test_df.shape[0]):
    code = row['func_code_string']
    reference = row['func_documentation_string']



    # Generate the comment
    generated_comment = pipeline([code])[0]['summary_text'].strip()

    generated_comments.append(generated_comment)
    reference_comments.append(reference)

  2%|▏         | 9/532 [00:02<03:04,  2.84it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 532/532 [02:42<00:00,  3.28it/s]


In [19]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=eff2a9f147edae1a474b5d0c3e4ce1753fc263b3e2d72e1f461c7bfd0627982e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [20]:
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [21]:
def evaluate_generated_comments(reference_comments, generated_comments):

    # Tokenize for BLEU
    tokenized_generated = [nltk.word_tokenize(comment.lower()) for comment in generated_comments]
    tokenized_reference = [[nltk.word_tokenize(ref.lower())] for ref in reference_comments]

    # BLEU-2
    bleu_score = corpus_bleu(tokenized_reference, tokenized_generated, weights=(0.5, 0.5))  # BLEU-2

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1, rouge2, rougeL = 0, 0, 0
    for ref, gen in zip(reference_comments, generated_comments):
        scores = scorer.score(ref, gen)
        rouge1 += scores['rouge1'].fmeasure
        rouge2 += scores['rouge2'].fmeasure
        rougeL += scores['rougeL'].fmeasure
    rouge1 /= len(reference_comments)
    rouge2 /= len(reference_comments)
    rougeL /= len(reference_comments)

    # METEOR
    meteor = 0
    for ref, gen in zip(reference_comments, generated_comments):
        # Tokenize the hypothesis and references
        ref_tokens = nltk.word_tokenize(ref.lower())
        gen_tokens = nltk.word_tokenize(gen.lower())

        # Compute METEOR for each pair
        meteor += meteor_score([ref_tokens], gen_tokens)
    meteor /= len(reference_comments)

    return {
        'BLEU-2': bleu_score,
        'ROUGE-1': rouge1,
        'ROUGE-2': rouge2,
        'ROUGE-L': rougeL,
        'METEOR': meteor
    }


In [22]:
metrics = evaluate_generated_comments(reference_comments, generated_comments)
for metric, score in metrics.items():
    print(f"{metric}: {score:.4f}")

BLEU-2: 0.0311
ROUGE-1: 0.2503
ROUGE-2: 0.0781
ROUGE-L: 0.2228
METEOR: 0.1816


In [23]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [24]:
# Load a pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
# Encode the comments to get their embeddings
embeddings_ref = model.encode(reference_comments)
embeddings_gen = model.encode(generated_comments)


In [26]:
# Compute cosine similarity for each pair
similarity_scores = cosine_similarity(embeddings_ref, embeddings_gen).diagonal()

# Calculate average similarity
average_similarity = similarity_scores.mean()



print(f"\nAverage Cosine Similarity: {average_similarity:.4f}")



Average Cosine Similarity: 0.4391
