In [2]:
from tqdm import tqdm

In [3]:
import pandas as pd
from nltk.translate import bleu_score
from numpy.linalg import norm
import numpy as np
from PIL import Image

# Dataset that getBleuScore reads rows from by position.
df = pd.read_csv("./data/comment_finder/all.csv")

# Smoothing helper for sentence-level BLEU; getBleuScore uses its method1.
chencherry = bleu_score.SmoothingFunction()

def getBleuScore(test_idx, match_idx):
    """Score one row of ``df`` against another with smoothed sentence BLEU.

    Parameters
    ----------
    test_idx : int
        Positional row index in the module-level ``df`` supplying the reference.
    match_idx : int
        Positional row index in ``df`` supplying the hypothesis.

    Returns
    -------
    The value returned by ``bleu_score.sentence_bleu`` when called with
    ``chencherry.method1`` smoothing.
    """
    # Column position 1 is the same cell the former
    # ``df.iloc[[i]].to_records()[0][2]`` lookup read (record field 0 is the
    # index), but ``iat`` fetches it without building a one-row frame and a
    # record array on every call.
    ref = df.iat[test_idx, 1]
    hyp = df.iat[match_idx, 1]
    # NOTE(review): the values are passed to sentence_bleu exactly as stored in
    # the frame; if they are plain strings NLTK compares them character by
    # character -- confirm that is intended rather than token-level BLEU.
    return bleu_score.sentence_bleu([ref], hyp, smoothing_function=chencherry.method1)


def find_similiar(data_vector, test_id):
    """Find the other vector with the highest positive cosine similarity.

    Parameters
    ----------
    data_vector : sequence of 1-D array-like
        Candidate vectors; ``data_vector[test_id]`` is the query.
    test_id : int
        Position of the query vector; that position is skipped as a candidate.

    Returns
    -------
    tuple
        ``(index, cosine)`` of the best match. Only strictly positive
        similarities qualify and the earliest index wins ties; when nothing
        qualifies the result is ``(-1, 0)``.
    """
    A = data_vector[test_id]
    # The query norm does not change inside the loop, so compute it once.
    norm_a = norm(A)

    max_cosine = 0
    max_sim_id = -1

    if norm_a == 0:
        # Every cosine against a zero query would be 0/0 (NaN), which never
        # beats the running maximum; return the "no match" result directly
        # instead of emitting a division warning per candidate.
        return max_sim_id, max_cosine

    for i, B in enumerate(data_vector):
        # Skip the query itself before doing any arithmetic on it.
        if i == test_id:
            continue
        denominator = norm_a * norm(B)
        if denominator == 0:
            # Zero-length candidate: similarity is undefined, so it cannot match.
            continue
        cosine = np.dot(A, B) / denominator
        if cosine > max_cosine:
            max_sim_id = i
            max_cosine = cosine
    return max_sim_id, max_cosine

def array_to_image(array):
    """Convert a numeric array into a PIL image.

    Values are clipped to the 0..255 range and cast to ``uint8``; all leading
    axes are then collapsed so the data becomes a 2-D array whose row length
    is the size of the input's last axis, and that array is passed to
    ``Image.fromarray``.
    """
    pixels = np.clip(array, 0, 255).astype(np.uint8)
    row_length = pixels.shape[-1]
    flattened = pixels.reshape(-1, row_length)
    return Image.fromarray(flattened)

In [6]:
from libs.feature_extraction.vectorizers import BertVectorizer
import numpy as np
from scipy.signal import stft
import tensorflow as tf

# Load the stored embedding vectors through the project's BertVectorizer.
bert = BertVectorizer("microsoft/unixcoder-base")
vectors = bert.load_vectors("data/comment_finder/vectors/bert_vectors.npy")

# Side length of the square patch kept from each magnitude spectrogram;
# every vector therefore becomes STFT_PATCH * STFT_PATCH features.
STFT_PATCH = 64

stft_list = []

for vector in tqdm(vectors, total=len(vectors)):
    # Only the complex spectrogram is needed. Discarding the frequency/time
    # axes also avoids rebinding `t`, which other cells use for the Annoy index.
    _, _, Zxx = stft(vector, fs=128, nperseg=128, noverlap=116, nfft=128)
    # NumPy takes the magnitude directly, so no TensorFlow tensor has to be
    # created (and converted back) for every single vector.
    Zxx_db = np.abs(Zxx)[:STFT_PATCH, :STFT_PATCH]
    if Zxx_db.shape != (STFT_PATCH, STFT_PATCH):
        # A smaller spectrogram would make the reshape below mix rows that
        # belong to different vectors, so stop instead of corrupting the output.
        raise ValueError(
            "expected a %dx%d spectrogram patch, got %r"
            % (STFT_PATCH, STFT_PATCH, Zxx_db.shape)
        )
    stft_list.append(Zxx_db)

# Stack the patches row-wise, then flatten each patch into one feature row.
stft_list = np.concatenate(stft_list)
display(stft_list.shape)
vectors = stft_list.reshape(stft_list.shape[0] // STFT_PATCH, STFT_PATCH * STFT_PATCH)
display(vectors.shape)
np.save("v4_vectors", vectors)

Loading model(microsoft/unixcoder-base)...


100%|██████████| 150677/150677 [01:55<00:00, 1300.86it/s]


(9643328, 64)

(150677, 4096)

In [None]:
from annoy import AnnoyIndex

# Dimensionality of each feature row and number of trees for the Annoy forest.
dims = 4096
trees = 10000
file_index_to_file_vector = {}

# Build the approximate-nearest-neighbour index over every row of `vectors`.
t = AnnoyIndex(dims, metric='angular')
for i in tqdm(range(vectors.shape[0])):
    row = vectors[i]
    # Stored as a (dims, 1) column exactly as before, for any code that reads
    # this mapping; the reshape also fails loudly if a row is not `dims` long.
    file_index_to_file_vector[i] = row.reshape(dims, 1)
    # Pass the flat row to Annoy: with the (dims, 1) column every component was
    # a one-element array that had to be coerced to a float, which is slow and
    # deprecated in recent NumPy releases.
    t.add_item(i, row)
t.build(trees)
t.save("v4.annoy")

100%|██████████| 150677/150677 [01:27<00:00, 1724.36it/s]


True

In [5]:
from annoy import AnnoyIndex

# Vector dimensionality for the index; same value the build cell uses for v4.annoy.
dims = 4096
# Not used by the load below.
trees = 10000
file_index_to_file_vector = {}

# Load the previously saved ANN index from disk instead of rebuilding it.
t = AnnoyIndex(dims, metric='angular')
t.load("v4.annoy")

True

In [30]:
import pandas as pd
from tqdm import tqdm

# Average BLEU over the test rows of fold 1, using the Annoy index `t` to find
# each row's nearest neighbour among the STFT feature vectors.
test_df = pd.read_csv("./results/bert_cosine_bleu_C1_F5/fold_1.csv")
vectors = np.load("v4_vectors.npy")
test_series = test_df.test_idx

# Named accumulators instead of `sum`, which would shadow the builtin for
# every cell executed afterwards.
total_bleu = 0.0
n_scored = 0
for test_idx in tqdm(test_series, total=len(test_series)):
    nearest_neighbours = t.get_nns_by_vector(vectors[test_idx], 2)
    # The query row is normally returned as its own nearest neighbour, but that
    # is not guaranteed (duplicate vectors, approximate search). Take the first
    # hit that is not the query so a row is never scored against itself.
    others = [n for n in nearest_neighbours if n != test_idx]
    if not others:
        # Only possible when the index holds nothing besides the query.
        continue
    max_sim_id = others[0]
    total_bleu += getBleuScore(test_idx, max_sim_id)
    n_scored += 1

# Guard the empty case instead of dividing by zero.
avg_bleu_score = total_bleu / n_scored if n_scored else float("nan")
display(avg_bleu_score)

  1%|          | 285/30136 [00:16<29:28, 16.88it/s]


KeyboardInterrupt: 

In [6]:
import pandas as pd
from tqdm import tqdm

# Average BLEU over every stored vector, using the Annoy index `t` to find each
# row's nearest neighbour.
# `test_df` is loaded as before; the loop below does not read it.
test_df = pd.read_csv("./results/bert_cosine_bleu_C1_F5/fold_1.csv")
vectors = np.load("v4_vectors.npy")

# Named accumulators instead of `sum`, which would shadow the builtin for
# every cell executed afterwards.
total_bleu = 0.0
n_scored = 0
tq = tqdm(range(len(vectors)))
for index in tq:
    nearest_neighbours = t.get_nns_by_vector(vectors[index], 2)
    # The query row is normally returned as its own nearest neighbour, but that
    # is not guaranteed (duplicate vectors, approximate search). Take the first
    # hit that is not the query so a row is never scored against itself.
    others = [n for n in nearest_neighbours if n != index]
    if not others:
        # Only possible when the index holds nothing besides the query.
        continue
    max_sim_id = others[0]
    total_bleu += getBleuScore(index, max_sim_id)
    n_scored += 1
    # Show the running average in the progress bar.
    tq.set_description("Bleu Score: %s" % str(total_bleu / n_scored))

# Guard the empty case instead of dividing by zero.
avg_bleu_score = total_bleu / n_scored if n_scored else float("nan")
display(avg_bleu_score)

Bleu Score: 0.13091595869126196: 100%|██████████| 150677/150677 [2:58:26<00:00, 14.07it/s]  


0.13091595869126196