In [1]:
!pip install transformers

import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
from data_utils import create_dataset, create_loader

from torch import Tensor

Defaulting to user installation because normal site-packages is not writeable


In [2]:
@torch.no_grad()
def get_feats(model, tokenizer, data_loader, max_length, device, desc='Get feats'):
    embeds = []

    for text in tqdm(data_loader, total=len(data_loader), desc=desc):
        text_input = tokenizer(text, padding='max_length', truncation=True, max_length=max_length,
                               return_tensors="pt").to(device)
        embed = model(text_input.input_ids, attention_mask=text_input.attention_mask)

        embeds.append(embed)

    embeds = torch.cat(embeds, dim=0)

    return embeds


@torch.no_grad()
def contrast_evaluation(text_embeds, code_embeds, img2txt):
    score_matrix_i2t = text_embeds @ code_embeds.t()
    scores_i2t = score_matrix_i2t.cpu().numpy()


    ranks = np.ones(scores_i2t.shape[0]) * -1
    for index, score in enumerate(scores_i2t):
        inds = np.argsort(score)[::-1]
        ranks[index] = np.where(inds == img2txt[index])[0][0]

    # Compute metrics
    tr1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    tr5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    tr10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    mrr = 100.0 * np.mean(1 / (ranks + 1))

    eval_result = {'r1': tr1,
                   'r5': tr5,
                   'r10': tr10,
                   'mrr': mrr}
    return eval_result

In [3]:
print("\nCreating retrieval dataset")
#change language and path to dataset here
_, _, test_dataset, code_dataset = create_dataset('dataset/CSN', 'ruby')

test_loader, code_loader = create_loader([test_dataset, code_dataset], [None, None],
                                             batch_size=[256, 256],
                                             num_workers=[4, 4], is_trains=[False, False], collate_fns=[None, None])

tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5p-110m-embedding', trust_remote_code=True)
model = AutoModel.from_pretrained('Salesforce/codet5p-110m-embedding', trust_remote_code=True)

print('\nStart zero-shot evaluation...')
device = torch.device('cuda')
model = model.to(device)
model.eval()

text_embeds = get_feats(model, tokenizer, test_loader, 64, device, desc='Get text feats')
code_embeds = get_feats(model, tokenizer, code_loader, 360, device, desc='Get code feats')
test_result = contrast_evaluation(text_embeds, code_embeds, test_loader.dataset.text2code)
print(f'\n====> zero-shot test result: ', test_result)


Creating retrieval dataset
Read 24927 data from ./dataset/CSN/ruby/train.jsonl
Read 1400 data from ./dataset/CSN/ruby/valid.jsonl
Read 4360 data from ./dataset/CSN/ruby/codebase.jsonl
Read 1261 data from ./dataset/CSN/ruby/test.jsonl
Read 4360 data from ./dataset/CSN/ruby/codebase.jsonl
Read 4360 data from ./dataset/CSN/ruby/codebase.jsonl


(…)dding/resolve/main/tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(…)p-110m-embedding/resolve/main/vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

(…)p-110m-embedding/resolve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

(…)0m-embedding/resolve/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

(…)embedding/resolve/main/added_tokens.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

(…)ing/resolve/main/special_tokens_map.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


(…)-110m-embedding/resolve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

(…)/main/configuration_codet5p_embedding.py:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- configuration_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


(…)solve/main/modeling_codet5p_embedding.py:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- modeling_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]


Start zero-shot evaluation...


AssertionError: Torch not compiled with CUDA enabled