In [1]:
from datasets import load_dataset

# load only validation dataset of python from hf
dataset = load_dataset("code_search_net", "python", split="validation")
print(dataset)

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 23107
})


In [2]:
# print(dataset[0]['whole_func_string'])
# print(dataset[0]['func_code_string'])
# print(dataset[0]['func_documentation_string'])

# Convert the dataset into pairs of docstring-code
str_pairs = [(d['func_documentation_string'], d['func_code_string']) for d in dataset]

# pairs[0]

In [3]:
# testion out roberta tokenizer
from transformers.models.roberta import RobertaTokenizer

tnizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
x = tnizer(str_pairs[0][1])

# This will give us both input_ids and attention mask
# print(f"input: {str_pairs[0][1]}\ntokenizer output: {x}") # testing for the code string of first datapoint

Token indices sequence length is longer than the specified maximum sequence length for this model (4535 > 512). Running this sequence through the model will result in indexing errors


In [4]:
# Init model
from transformers.models.roberta import RobertaModel

# Using roberta because codebert is built on top of it (roberta has the same architecture as bert, the only difference is that it is trained on much more data, and NSP task is skipped while training).
model = RobertaModel.from_pretrained("microsoft/codebert-base", output_attentions=True, attn_implementation="eager")
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [5]:
# trying out inference for one docstring-code pair
doc, code = str_pairs[0]

# setting max_length because model expects fixed length inputs
inputs = tnizer(doc, code, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
inputs

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'input_ids': tensor([[    0, 40249,    10,  1844,  1343,  1421,     4, 50140,  1437,  1437,
          1437, 47930, 50118,  1437,  1437,  1437, 48364, 50118,  1437,  1437,
          1437, 16028,    35,  6545,     4, 16040,   705, 50118,  1437,  1437,
          1437,  1437,  1437,  1437,  1437,  1737,     7,  2341,    15, 50118,
          1437,  1437,  1437,  1546,    35,  6755,    50,    10,  5043, 50118,
          1437,  1437,  1437,  1437,  1437,  1437,  1437, 26739,  1546,     7,
           304,    25,    10,  2231,  5043, 36612,   757,  2630,     4,   318,
          6755,     6,    34,     7,    28,    65,     9,     5,  2523,     9,
          3382,  3092,    11, 11909, 38630,     4, 27278,     4, 43457, 50118,
          1437,  1437,  1437,  1437,  1437,  1437,  1437,    36, 18517,   642,
             6,   740, 15688,     6, 15380,  1215,  8338,   322,   318,    10,
          5043,     6,   197,   185,    41, 15306,  7281,   368,     8,   671,
            10, 42715, 15594,  7281,  

In [7]:
import torch
with torch.no_grad():
    outputs = model(**inputs)

# This is a tuple of (layer1, layer2, ..., layer12), each shape: [1, num_heads, seq_len, seq_len]
# attentions[layer][batch][head] -> 2d array representing attention scores
print(outputs.attentions[0][0][0]) # attention scores of first attention head in first layer

tensor([[5.3649e-01, 4.0284e-03, 2.3822e-03,  ..., 4.3576e-03, 1.2357e-03,
         4.9639e-03],
        [2.7465e-01, 2.4957e-02, 2.0957e-02,  ..., 2.8823e-03, 1.1690e-03,
         2.4128e-03],
        [2.8285e-02, 1.0770e-01, 9.4607e-02,  ..., 3.7162e-03, 1.5548e-03,
         1.4725e-03],
        ...,
        [1.7232e-03, 2.3840e-04, 7.9932e-05,  ..., 8.3183e-02, 2.1781e-02,
         7.5156e-03],
        [5.7953e-04, 1.0877e-04, 4.7337e-05,  ..., 2.9940e-01, 5.0779e-02,
         1.9509e-02],
        [1.3488e-02, 4.8767e-04, 1.3569e-04,  ..., 3.5414e-01, 9.7978e-02,
         8.2096e-02]])
