# Test the Code Bert model

## Setup & Utils

In [1]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
import os
from numpy.linalg import norm
import sys
from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.to(device)
model.eval()

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

In [3]:
def similarity_score_1d(vector1, vector2):
    assert(len(vector1.shape) == 1)
    assert(len(vector2.shape) == 1)
    assert(vector1.shape[0] == vector2.shape[0])
    return np.dot(vector1,vector2)/(norm(vector1)*norm(vector2))
    
    # return 1 - cosine(vector1, vector2)

In [4]:
def reshape_3d_to_2d(vector):
    assert(len(vector.shape) == 3)
    first_dim = vector.shape[0]
    second_dim = vector.shape[1]
    new_dim = first_dim * second_dim
    return vector.reshape(new_dim, vector.shape[2])

In [5]:
def similarity_score_matrix_3d(vector1, vector2):
    if (vector1 is None or vector2 is None):
        return
    assert(len(vector1.shape) == 3)
    assert(len(vector2.shape) == 3)
    # assert(vector1.shape[1] == vector2.shape[1])
    # assert(vector1.shape[2] == vector2.shape[2])
    vec1_2d = reshape_3d_to_2d(vector1)
    vec2_2d = reshape_3d_to_2d(vector2)
    M = vec1_2d.shape[0]
    N = vec2_2d.shape[0]
    similarity_scores = np.zeros((M, N))
    # for m in range(M):
    #     v = vec1_2d[m]
    #     for n in range(m+1, N):
    #         if (similarity_scores[m, n] != 0): continue
    #         u = vec2_2d[n]
    #         sim_score = np.dot(v,u)/(norm(v)*norm(u))
    #         similarity_scores[m, n] = sim_score
            
    # for n in range(N):
    #     u = vec2_2d[n]
    #     for m in range(n+1, M):
    #         if (similarity_scores[m, n] != 0): continue
    #         v = vec1_2d[m]
    #         sim_score = np.dot(v,u)/(norm(v)*norm(u))
    #         similarity_scores[m, n] = sim_score
    
    for m in range(M):
        v = vec1_2d[m]
        for n in range(N):
            if (similarity_scores[m, n] != 0): continue
            u = vec2_2d[n]
            sim_score = np.dot(v,u)/(norm(v)*norm(u))
            similarity_scores[m, n] = sim_score

    # np.savez(file_path, similarity_scores)
    return similarity_scores

In [168]:
def get_token_ids_single(text, tokenizer):
    tokens        = tokenizer.tokenize(text)
    print(tokens)
    token_ids = tokenizer.convert_tokens_to_ids(tokens) + [3]
    return token_ids

## Example text

In [7]:
text1 = \
"""def get_max(my_list):
if not my_list:
    raise ValueError("Input list is empty")

max_element = max(my_list)
return max_element
"""

text2 = \
"""def get_largest_value(L):
largest = L[0]
for element in L:
    if element > largest:
        largest = element
return largest"""

text3 = \
"""def m(L):
    t = 0
    c = 0
    for e in L:
        t += e
        c += 1
    return t/c"""
    
text4 = \
"""int getMaxValue(const int arr[], int size) {
    if (size <= 0) {
        // Handle empty array or invalid size
        std::cerr << "Invalid array size or empty array." << std::endl;
        return -1; // You can choose a suitable default value or throw an exception
    }

    int max_value = arr[0]; // Assume the first element is the maximum

    for (int i = 1; i < size; ++i) {
        if (arr[i] > max_value) {
            // Update max_value if a larger element is found
            max_value = arr[i];
        }
    }

    return max_value;
}"""

text5 = \
"""int getLastValue(const int arr[], int size) {
    if (size <= 0) {
        // Handle empty array or invalid size
        std::cerr << "Invalid array size or empty array." << std::endl;
        return -1; // You can choose a suitable default value or throw an exception
    }

    return arr[size - 1];
}"""

text_desc1 = "return max value of a list"
text_desc2 = "return average value of a list"

sentences = [text1, text2, text3, text4, text5, text_desc1, text_desc2]
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    
# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

## Tokenization

In [8]:
text1_tokens        = [tokenizer.cls_token] + tokenizer.tokenize(text1)     + [tokenizer.sep_token]
text2_tokens        = [tokenizer.cls_token] + tokenizer.tokenize(text2)     + [tokenizer.sep_token]
text3_tokens        = [tokenizer.cls_token] + tokenizer.tokenize(text3)     + [tokenizer.sep_token]
text_desc_tokens    = [tokenizer.cls_token] + tokenizer.tokenize(text_desc) + [tokenizer.sep_token]

NameError: name 'text_desc' is not defined

In [None]:
text1_token_ids = tokenizer.convert_tokens_to_ids(text1_tokens)
text2_token_ids = tokenizer.convert_tokens_to_ids(text2_tokens)
text3_token_ids = tokenizer.convert_tokens_to_ids(text3_tokens)
text_desc_token_ids = tokenizer.convert_tokens_to_ids(text_desc_tokens)

NameError: name 'text_desc_tokens' is not defined

## Get Embedding

In [None]:
outputs = model(**tokens)
outputs

(tensor([[[-0.0985,  0.2084, -0.0147,  ..., -0.2358, -0.3665,  0.3596],
          [-1.1312, -0.1249,  0.5453,  ..., -1.1168, -0.2118,  0.5683],
          [-0.5733,  0.0711,  0.1152,  ..., -0.1341, -0.1191,  0.8507],
          ...,
          [-0.3762, -0.1106,  0.4930,  ..., -0.5234,  0.0261,  0.0174],
          [-0.3762, -0.1106,  0.4930,  ..., -0.5234,  0.0261,  0.0174],
          [-0.3762, -0.1106,  0.4930,  ..., -0.5234,  0.0261,  0.0174]],
 
         [[-0.1703,  0.1046, -0.0596,  ..., -0.2995, -0.4340,  0.4011],
          [-0.9838,  0.1748,  0.4807,  ..., -1.1601, -0.1933,  0.6104],
          [-0.6993,  0.2367,  0.3774,  ..., -0.4023, -0.1533,  0.7596],
          ...,
          [-0.3447,  0.2819,  0.3316,  ..., -0.6630, -0.0595,  0.3459],
          [-0.3447,  0.2819,  0.3316,  ..., -0.6630, -0.0595,  0.3459],
          [-0.3447,  0.2819,  0.3316,  ..., -0.6630, -0.0595,  0.3459]],
 
         [[-0.2545,  0.0844, -0.0706,  ..., -0.3226, -0.5346,  0.5019],
          [-1.1956, -0.0283,

In [None]:
embeddings = outputs[0]
embeddings

tensor([[[-0.0985,  0.2084, -0.0147,  ..., -0.2358, -0.3665,  0.3596],
         [-1.1312, -0.1249,  0.5453,  ..., -1.1168, -0.2118,  0.5683],
         [-0.5733,  0.0711,  0.1152,  ..., -0.1341, -0.1191,  0.8507],
         ...,
         [-0.3762, -0.1106,  0.4930,  ..., -0.5234,  0.0261,  0.0174],
         [-0.3762, -0.1106,  0.4930,  ..., -0.5234,  0.0261,  0.0174],
         [-0.3762, -0.1106,  0.4930,  ..., -0.5234,  0.0261,  0.0174]],

        [[-0.1703,  0.1046, -0.0596,  ..., -0.2995, -0.4340,  0.4011],
         [-0.9838,  0.1748,  0.4807,  ..., -1.1601, -0.1933,  0.6104],
         [-0.6993,  0.2367,  0.3774,  ..., -0.4023, -0.1533,  0.7596],
         ...,
         [-0.3447,  0.2819,  0.3316,  ..., -0.6630, -0.0595,  0.3459],
         [-0.3447,  0.2819,  0.3316,  ..., -0.6630, -0.0595,  0.3459],
         [-0.3447,  0.2819,  0.3316,  ..., -0.6630, -0.0595,  0.3459]],

        [[-0.2545,  0.0844, -0.0706,  ..., -0.3226, -0.5346,  0.5019],
         [-1.1956, -0.0283,  0.6454,  ..., -1

In [None]:
embeddings.shape

torch.Size([7, 128, 768])

In [None]:
text1_embedding.shape

NameError: name 'text1_embedding' is not defined

In [None]:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([7, 128])

In [None]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([7, 128, 768])

In [None]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [None]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([7, 128, 768])

In [None]:
masked_embeddings

tensor([[[-0.0985,  0.2084, -0.0147,  ..., -0.2358, -0.3665,  0.3596],
         [-1.1312, -0.1249,  0.5453,  ..., -1.1168, -0.2118,  0.5683],
         [-0.5733,  0.0711,  0.1152,  ..., -0.1341, -0.1191,  0.8507],
         ...,
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000,  0.0000,  0.0000],
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000,  0.0000,  0.0000],
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000,  0.0000,  0.0000]],

        [[-0.1703,  0.1046, -0.0596,  ..., -0.2995, -0.4340,  0.4011],
         [-0.9838,  0.1748,  0.4807,  ..., -1.1601, -0.1933,  0.6104],
         [-0.6993,  0.2367,  0.3774,  ..., -0.4023, -0.1533,  0.7596],
         ...,
         [-0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000]],

        [[-0.2545,  0.0844, -0.0706,  ..., -0.3226, -0.5346,  0.5019],
         [-1.1956, -0.0283,  0.6454,  ..., -1

In [None]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([7, 768])

In [None]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([7, 768])

In [None]:
mean_pooled = summed / summed_mask
mean_pooled

tensor([[-0.4235,  0.2881,  0.2646,  ..., -0.6353, -0.4293,  0.6803],
        [-0.3458,  0.1308,  0.2953,  ..., -0.5086, -0.4853,  0.6119],
        [-0.2817,  0.0571,  0.2189,  ..., -0.4147, -0.5097,  0.5551],
        ...,
        [-0.2065, -0.0235,  0.2284,  ..., -0.3111, -0.4478,  0.6506],
        [-0.0564,  0.1062, -0.0027,  ...,  0.0219, -0.4743,  0.5632],
        [ 0.0213,  0.1191, -0.0118,  ...,  0.0435, -0.4385,  0.4762]],
       grad_fn=<DivBackward0>)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
mean_pooled = mean_pooled.detach().numpy()


In [None]:
mean_pooled.shape

(7, 768)

In [None]:
similarity_score_1d(mean_pooled[0], mean_pooled[1])

0.98526657

In [None]:
similarity_score_1d(mean_pooled[0], mean_pooled[2])

0.9517823

In [None]:
similarity_score_1d(mean_pooled[1], mean_pooled[2])

0.97757465

In [None]:
similarity_score_1d(mean_pooled[0], mean_pooled[3])

0.9414042

In [None]:
similarity_score_1d(mean_pooled[0], mean_pooled[4])

0.9504929

In [None]:
similarity_score_1d(mean_pooled[0], mean_pooled[5])

0.88987374

In [None]:
similarity_score_1d(mean_pooled[0], mean_pooled[6])

0.8784212

In [None]:
similarity_score_1d(mean_pooled[0], mean_pooled[4])

0.9504929

In [None]:
text1_mean_embedding = np.mean(text1_embedding.detach().numpy()[0], axis=0)
text2_mean_embedding = np.mean(text2_embedding.detach().numpy()[0], axis=0)
text3_mean_embedding = np.mean(text3_embedding.detach().numpy()[0], axis=0)
text_desc_mean_embedding = np.mean(text_desc_embedding.detach().numpy()[0], axis=0)

In [None]:
similarity_score_1d(text1_mean_embedding, text2_mean_embedding)

0.98526657

In [None]:
similarity_score_1d(text1_mean_embedding, text_desc_mean_embedding)

0.878421

In [None]:
similarity_score_1d(text1_mean_embedding, text3_mean_embedding)

0.95178217

In [None]:
similarity_score_1d(text_desc_mean_embedding, text3_mean_embedding)

0.89263886

## Try out one tutorial

https://peaceful0907.medium.com/sentence-embedding-by-bert-and-sentence-similarity-759f7beccbf1

In [None]:
class BERT_classifier(nn.Module):
    def __init__(self, bertmodel, num_label):
        super(BERT_classifier, self).__init__()
        self.bertmodel = bertmodel
        self.classifier = nn.Linear(bertmodel.config.hidden_size, num_label)

    def forward(self, wrapped_input):
        hidden = self.bertmodel(**wrapped_input)
        last_hidden_state, pooler_output = hidden[0], hidden[1]
        logits = self.classifier(pooler_output)

        return logits

bert = RobertaModel.from_pretrained("microsoft/codebert-base")
model = BERT_classifier(bert, 2)

In [None]:
text1_wrapped_input = tokenizer(text1, max_length=15, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt")
text2_wrapped_input = tokenizer(text2, max_length=15, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt")
text3_wrapped_input = tokenizer(text3, max_length=15, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt")
text_desc_wrapped_input = tokenizer(text_desc, max_length=15, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt")

In [None]:
text1_pair_tokens        = [tokenizer.cls_token] + tokenizer.tokenize(text_desc) + [tokenizer.sep_token] + tokenizer.tokenize(text1) + [tokenizer.eos_token]
text2_pair_tokens        = [tokenizer.cls_token] + tokenizer.tokenize(text_desc) + [tokenizer.sep_token] + tokenizer.tokenize(text2) + [tokenizer.eos_token]
text3_pair_tokens        = [tokenizer.cls_token] + tokenizer.tokenize(text_desc) + [tokenizer.sep_token] + tokenizer.tokenize(text3) + [tokenizer.eos_token]

In [None]:
text1_pair_token_ids = tokenizer.convert_tokens_to_ids(text1_pair_tokens)
text2_pair_token_ids = tokenizer.convert_tokens_to_ids(text2_pair_tokens)
text3_pair_token_ids = tokenizer.convert_tokens_to_ids(text3_pair_tokens)

In [None]:
text1_pair_embedding = model(torch.tensor(text1_pair_token_ids)[None, :])[1]
text2_pair_embedding = model(torch.tensor(text2_pair_token_ids)[None, :])[1]
text3_pair_embedding = model(torch.tensor(text3_pair_token_ids)[None, :])[1]

In [None]:
text1_pair_embedding.shape

torch.Size([1, 768])

In [None]:
np.mean(text1_pair_embedding.detach().numpy())

0.008173173

In [169]:
tids1 = get_token_ids_single("VSCodeServer", tokenizer)
tids2 = get_token_ids_single("vscodeClient", tokenizer)
tids3 = get_token_ids_single("disneyland", tokenizer)
tids4 = get_token_ids_single("vscodeServer", tokenizer)

['V', 'SC', 'ode', 'Server']
['v', 'sc', 'ode', 'Client']
['dis', 'ney', 'land']
['v', 'sc', 'ode', 'Server']


In [150]:
context_embedding1 = model(torch.tensor(tids1)[None, :])[0]
context_embedding2 = model(torch.tensor(tids2)[None, :])[0]
context_embedding3 = model(torch.tensor(tids3)[None, :])[0]
context_embedding4 = model(torch.tensor(tids4)[None, :])[0]

In [151]:
context_embedding.shape

torch.Size([1, 6, 768])

In [152]:
last_vector1 = context_embedding1.detach().numpy()[:, 0, :]
last_vector2 = context_embedding2.detach().numpy()[:, 0, :]
last_vector3 = context_embedding3.detach().numpy()[:, 0, :]
last_vector4 = context_embedding4.detach().numpy()[:, 0, :]

In [153]:
similarity_score_1d(last_vector1[0], last_vector2[0])

0.9992478

In [154]:
similarity_score_1d(last_vector1[0], last_vector3[0])

0.99790347

In [155]:
similarity_score_1d(last_vector2[0], last_vector3[0])

0.9979418

In [156]:
matrix12 = similarity_score_matrix_3d(context_embedding1.detach().numpy(), context_embedding2.detach().numpy())

In [157]:
matrix13 = similarity_score_matrix_3d(context_embedding1.detach().numpy(), context_embedding3.detach().numpy())

In [158]:
matrix14 = similarity_score_matrix_3d(context_embedding1.detach().numpy(), context_embedding4.detach().numpy())

In [159]:
for i in matrix12:
    print(i)

[0.99924779 0.56499153 0.62505573 0.57279372]
[0.60644126 0.94427735 0.85539436 0.84050715]
[0.64857978 0.8738116  0.95530665 0.87383288]
[0.63928145 0.84324539 0.87390709 0.94365901]


In [160]:
for i in matrix13:
    print(i)

[0.99790347 0.67630619 0.64278752]
[0.60614669 0.83747607 0.81981283]
[0.6461795  0.86878538 0.8575666 ]
[0.63783824 0.89322299 0.89370477]


In [165]:
for i in matrix14:
    print(i)

[0.99945617 0.58030516 0.62634778 0.6110518 ]
[0.60835147 0.94942743 0.8691678  0.86020535]
[0.6508556  0.86774838 0.96582645 0.89685947]
[0.6420384  0.85281646 0.8901177  0.98410237]


In [162]:
matrix12.mean()

0.7912707962095737

In [163]:
matrix13.mean()

0.7814775208632151

In [164]:
matrix14.mean()

0.8034173622727394

In [120]:
last_vector1[0, :10]

array([-0.1410906 ,  0.3659888 ,  0.04457453, -0.04732981,  0.04115221,
       -0.19793634, -0.08595254,  0.01691525,  0.07735952, -0.07030746],
      dtype=float32)

In [50]:
last_vector3[0, :10]

array([-0.11602467,  0.37414128,  0.04851544, -0.05977523,  0.04996986,
       -0.18897107, -0.10482092,  0.0216094 ,  0.0481642 , -0.12973918],
      dtype=float32)

In [54]:
norm(last_vector1[0])

18.088388

In [55]:
norm(last_vector3[0])

17.934418

In [None]:
tids1_embedding = model(torch.tensor(tids1)[None, :])[0]
tids2_embedding = model(torch.tensor(tids2)[None, :])[0]

In [None]:
tids1_embedding

tensor([[[-0.1411,  0.3660,  0.0446,  ..., -0.2033, -0.3209,  0.3346],
         [-0.3140,  0.5784,  0.4030,  ...,  0.3538, -0.5442,  0.3591],
         [-0.3408,  0.6607,  0.1974,  ..., -0.3252, -0.3707,  0.4559],
         [-0.3417,  0.1081, -0.1175,  ..., -0.0371, -0.4847,  0.1521],
         [-0.0482,  0.2290,  0.0767,  ..., -0.1025, -0.6494,  0.4877],
         [-0.1407,  0.3658,  0.0450,  ..., -0.2029, -0.3208,  0.3339]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
tids2_embedding

tensor([[[-0.1075,  0.3750,  0.0401,  ..., -0.1878, -0.3041,  0.3191],
         [-0.2476,  0.4381,  0.1267,  ..., -0.5965, -0.4729,  0.1755],
         [-0.1068,  0.3740,  0.0405,  ..., -0.1877, -0.3029,  0.3171]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
similarity_score_matrix_3d(tids1_embedding, tids2_embedding)

NameError: name 'reshape_3d_to_2d' is not defined