In [1]:
from tqdm import tqdm

In [2]:
from libs.feature_extraction.vectorizers import BertVectorizer
bert = BertVectorizer("microsoft/unixcoder-base")
vectors = bert.load_vectors("data/comment_finder/vectors/bert_vectors.npy")   

Loading model(microsoft/unixcoder-base)...


In [3]:
import pandas as pd
from nltk.translate import bleu_score
from numpy.linalg import norm
import numpy as np

df = pd.read_csv("./data/comment_finder/all.csv")

chencherry = bleu_score.SmoothingFunction()

def getBleuScore(test_idx, match_idx):
    ref = df.iloc[[test_idx]].to_records()[0][2]
    hyp = df.iloc[[match_idx]].to_records()[0][2]
    bleu_score_val = bleu_score.sentence_bleu([ref], hyp, smoothing_function=chencherry.method1)
    return bleu_score_val


def find_similiar(data_vector, test_id):

    A = data_vector[test_id]

    max_cosine = 0
    max_sim_id=-1
    for i, B in enumerate(data_vector):
        cosine = np.dot(A,B)/(norm(A)*norm(B))
        if i == test_id:
            continue
        if cosine>max_cosine:
            max_sim_id = i
            max_cosine = cosine
    return max_sim_id, max_cosine

In [6]:
from annoy import AnnoyIndex

dims = 768
trees = 10000
file_index_to_file_vector = {}

# build ann index
t = AnnoyIndex(dims, metric='angular')
for i in tqdm(range(vectors.shape[0])):
    file_vector = vectors[i].reshape(768,1)
    file_index_to_file_vector[i] = file_vector
    t.add_item(i, file_vector)
t.build(trees)
t.save("v1.annoy")

100%|██████████| 150677/150677 [00:22<00:00, 6691.36it/s]


True

In [4]:
from annoy import AnnoyIndex

dims = 768
trees = 10000
file_index_to_file_vector = {}

# build ann index
t = AnnoyIndex(dims, metric='angular')
t.load("v1.annoy")

True

In [8]:
test_df = pd.read_csv("./results/bert_cosine_bleu_C1_F5/fold_1.csv")
test_series = test_df.test_idx

sum = 0
toplam = 1000 #len(test_series)
for index, test_idx in tqdm(enumerate(test_series), total=toplam):
    if index > toplam-1:
        break
    nearest_neighbours = t.get_nns_by_vector(vectors[test_idx], 2)
    max_sim_id = nearest_neighbours[1]
    bleu_score_val = getBleuScore(test_idx,max_sim_id)
    sum = sum + bleu_score_val
    
avg_bleu_score = sum / toplam
display(avg_bleu_score)

100%|██████████| 1000/1000 [00:46<00:00, 21.57it/s]


0.13481056056337995

In [4]:
test_df = pd.read_csv("./results/bert_cosine_bleu_C1_F5/fold_1.csv")
test_series = test_df.test_idx

sum = 0
toplam = 1000 #len(test_series)
for index, test_idx in tqdm(enumerate(test_series), total=toplam):
    if index > toplam-1:
        break
    max_sim_id, max_cosine = find_similiar(vectors, test_idx)
    bleu_score_val = getBleuScore(test_idx,max_sim_id)
    sum = sum + bleu_score_val
    
avg_bleu_score = sum / toplam
display(avg_bleu_score)

100%|██████████| 1000/1000 [21:22<00:00,  1.28s/it]


0.1349337927608504