In [1]:
from collections import OrderedDict
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from utils.preprocessor import Stopwords_preprocessor
from utils.markdown import beir_metrics_to_markdown_table
from IPython.display import Markdown

# from rank_bm25 import BM25Okapi as BM25
from transformers import logging, AutoTokenizer
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pathlib
import torch
from torch import nn

In [4]:
from transformers import AutoTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer; the helper functions in
# later cells use it to turn raw text into token ids.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# from beir import util
# dataset =  'trec-covid' # "nfcorpus" 
# url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
# data_path = util.download_and_unzip(url, 'data')

In [116]:
# Dataset to evaluate on. Exactly one assignment is active; the alternatives
# stay commented out so an earlier choice is never silently overwritten.
# corpus_name = 'scifact'
corpus_name = 'trec-covid'
# corpus_name = 'nfcorpus'

# Load the test split (documents, queries, relevance judgements) from
# data/<corpus_name>.
corpus, queries, qrels = GenericDataLoader(f'data/{corpus_name}').load(split="test")
# Body text of every document, in the iteration order of `corpus`.
corpus_text = [doc['text'] for doc in corpus.values()]

100%|███████████████████████████████████████████████████████████████████████████████████| 171332/171332 [00:01<00:00, 136601.12it/s]


In [7]:
def tokenize(x):
    """Split ``x`` into the tokenizer's token strings.

    The text is first encoded to token ids without special tokens, and the
    ids are then mapped back to their token strings.
    """
    token_ids = tokenizer.encode(x, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    return tokens

vectorizer = TfidfVectorizer(tokenizer=tokenize, vocabulary=tokenizer.vocab)
%time vectorizer.fit(corpus_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 6.09 s, sys: 53.3 ms, total: 6.14 s
Wall time: 6.13 s


In [8]:
# # test
# text_sample = corpus[list(corpus.keys())[0]]['text']
# res = mean_rotary_discrepency(text_sample)
# res.shape

In [62]:
def mean_vector(text):
    """Embed ``text`` as the mean of its token embeddings.

    The text is encoded to token ids (no special tokens) and the matching
    rows of the module-level ``word_reprs`` matrix are averaged.

    Args:
        text: String to embed.

    Returns:
        1-D array of length ``word_reprs.shape[1]``. Text that yields no
        tokens maps to an all-zero vector.
    """
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) == 0:
        # Use the embedding matrix's dtype so the empty-text fallback matches
        # the regular branch; a float64 fallback would make np.stack upcast
        # the whole document matrix as soon as one document is empty.
        return np.zeros(word_reprs.shape[1], dtype=word_reprs.dtype)
    return word_reprs[ids].mean(axis=0)

def sum_vector(text):
    """Embed ``text`` as the sum of its token embeddings.

    Token ids come from the module-level ``tokenizer`` (no special tokens)
    and select rows of the module-level ``word_reprs`` matrix.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    token_embeddings = word_reprs[token_ids]
    return token_embeddings.sum(axis=0)


def idf_sum_vector(text):
    """Embed ``text`` as the IDF-weighted sum of its token embeddings.

    Each token's row of ``word_reprs`` is scaled by the entry of
    ``vectorizer.idf_`` at the same token id, and the scaled rows are summed
    over the token axis.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    token_embeddings = word_reprs[token_ids]
    idf_weights = vectorizer.idf_[token_ids]
    # 'l' (token position) is contracted; 'd' (embedding dim) is kept.
    return np.einsum('ld,l', token_embeddings, idf_weights)

In [111]:
folder_path = pathlib.Path('data/limanet/')

# Training run to evaluate. Exactly one `subpath` assignment is active; every
# other run is kept as a commented-out entry so an earlier choice is never
# silently overwritten by a later line.
# subpath = '20240422.11:52:35-batch_size_1024' #trec-covid best, batch 10000
# subpath = '20240422.02:12:51-batch_size_1024' #for report
# subpath = '20240422.02:12:51-batch_size_1024' #multihead_lima_rotating_history, dim 96, head 12, depth 3
# subpath = '20240422.02:23:48-batch_size_1024' #rotator_multihead_local_coordinate, dim 96, head 12, depth 3
# subpath = '20240422.11:49:56-batch_size_1024' #rotator_lima3, dim 96, head 12, depth 3
# subpath = '20240422.18:17:15-batch_size_1024' #rotator_lima3, dim 128, head 16, depth 3
# subpath = '20240423.15:38:14-batch_size_1024' #rotator_lima3, dim 96, head 12, depth 3, rotary_denom 2
# subpath = '20240424.13:12:53-batch_size_1024' #rotator_lima3, triplet loss
# subpath = '20240424.14:29:05-batch_size_1024' #rotator_lima3, triplet+mse
subpath = '20240424.18:17:26-batch_size_1024' # rotator_lima3_5, double LN
# subpath = '20240424.18:22:33-batch_size_1024' # rotator_lima3, mse+squared_triplet

# Which saved batch of the selected run to load.
batch_num = 25000

# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only point this at checkpoints you trust.
# NOTE(review): the model is not switched to eval mode here; confirm that
# all_word_embeddings() does not depend on train-mode layers such as dropout.
model = torch.load(folder_path/subpath/f'batch_{batch_num}-model.pt').cpu()
word_reprs_complex = model.predictor.all_word_embeddings()
# Real-valued view of the complex embeddings: real and imaginary parts are
# concatenated along the last dimension.
word_reprs = torch.concat([word_reprs_complex.real, word_reprs_complex.imag], dim=-1).detach().numpy()
word_reprs_complex = word_reprs_complex.detach().numpy()

In [112]:
# for i in range(len(model.limas)):
#     lima_shape = model.limas[i].lima_shape
#     print(lima_shape)
#     print(f'{i}: {lima_shape.min()}, {lima_shape.max()}')

In [117]:
method = idf_sum_vector
method = mean_vector
# method = sum_vector

part = 'text'
# part = 'title'

%time text_vec_dict = OrderedDict({k: method(v[part]) for k, v in corpus.items()})
%time query_vec_dict = OrderedDict({k: method(v) for k, v in queries.items()})
text_vecs = np.stack(list(text_vec_dict.values()))

CPU times: user 2min 16s, sys: 125 ms, total: 2min 16s
Wall time: 2min 16s
CPU times: user 7.77 ms, sys: 0 ns, total: 7.77 ms
Wall time: 7.65 ms


In [119]:
# Distance used for retrieval; switch by swapping the commented line.
metric = 'euclidean'
# metric = 'cosine'


def score(query_vector, metric=metric):
    """Score every document against one query as the inverse distance.

    Args:
        query_vector: Query embedding; a leading axis is added so it is
            passed to ``pairwise_distances`` as a single row.
        metric: Distance name forwarded to ``pairwise_distances``. The
            default is the value of the module-level ``metric`` at the time
            this function is defined.

    Returns:
        Array with one score per row of ``text_vecs`` (larger = closer).
        A distance of exactly zero produces ``inf``.
    """
    query_row = np.expand_dims(query_vector, axis=0)
    distances = pairwise_distances(query_row, text_vecs, metric=metric)
    return 1 / distances[0]


%time results = {qid: dict(zip(text_vec_dict.keys(), score(query_vector).tolist())) \
            for qid, query_vector in query_vec_dict.items()}

metrics = EvaluateRetrieval.evaluate(qrels, results, [1, 3, 5, 10, 100, 1000])

flatten_metrics = {k: v for metric_type in metrics for k, v in metric_type.items()}
metric_names, metric_values = zip(*flatten_metrics.items())
print(*metric_names, sep='\t')
print(*metric_values, sep='\t')
print()

md = beir_metrics_to_markdown_table(*metrics)
Markdown(md)

CPU times: user 53.4 s, sys: 2min 43s, total: 3min 37s
Wall time: 8.41 s
NDCG@1	NDCG@3	NDCG@5	NDCG@10	NDCG@100	NDCG@1000	MAP@1	MAP@3	MAP@5	MAP@10	MAP@100	MAP@1000	Recall@1	Recall@3	Recall@5	Recall@10	Recall@100	Recall@1000	P@1	P@3	P@5	P@10	P@100	P@1000
0.33	0.31531	0.31267	0.2985	0.18779	0.15031	0.00083	0.00198	0.00307	0.00516	0.02045	0.04176	0.00083	0.00235	0.00386	0.00692	0.03627	0.14159	0.38	0.34667	0.34	0.322	0.1886	0.07304



||NDCG|MAP|Recall|P|
|-|-|-|-|-|
|@1|0.3300|0.0008|0.0008|0.3800|
|@3|0.3153|0.0020|0.0024|0.3467|
|@5|0.3127|0.0031|0.0039|0.3400|
|@10|0.2985|0.0052|0.0069|0.3220|
|@100|0.1878|0.0204|0.0363|0.1886|
|@1000|0.1503|0.0418|0.1416|0.0730|

In [22]:
# Inspect the `rotary_denom` attribute of the loaded model's predictor
# (displayed as the cell output).
model.predictor.rotary_denom

Parameter containing:
tensor(0.4918, requires_grad=True)

In [15]:
# # write first 10 questions and top 10 answer to file

# samples = list(results.items())[:10]
# for q_num, score_dict in samples:
#     with open(f'question_{q_num}.txt', 'w') as f:
#         f.write(f'{queries[q_num]}\n\n')
#         tokens = tokenizer.convert_ids_to_tokens(tokenizer(queries[q_num], add_special_tokens=False)['input_ids'])
#         f.write(f'{tokens}\n\n')
        
#         text_ids, text_scores = zip(*score_dict.items())
#         text_scores = np.array(text_scores)
#         top_10_idx = np.argsort(text_scores)[:-10:-1]

#         for idx in top_10_idx:
#             f.write(f'{corpus[text_ids[idx]]}\n\n')

In [16]:
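# #test: for each word, rotate it backwards by one step t along the time direction and look at which words end up nearby. In theory these should be words unrelated to that word (text independent), because this rotation cancels out the time rotation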

# inverse_metric_theta = - 1/model.predictor.rotary_denom**(model.predictor.dimension_nums/model.predictor.dim)
# inverse_pos_rotation = torch.complex(inverse_metric_theta.cos(), inverse_metric_theta.sin())
# least_effective_position_of_the_word = model.predictor.all_word_embeddings() * inverse_pos_rotation
# least_effective_position_of_the_word = torch.concat([least_effective_position_of_the_word.real, least_effective_position_of_the_word.imag], dim=-1).detach().numpy()

# least_effective_position_of_the_word.shape

# %time d = pairwise_distances(word_reprs, least_effective_position_of_the_word, metric='euclidean') # metric='cosine'

# %time pair = d.argsort(axis=1)[:, :10]

# for input_id in tokenizer.encode(text_sample):
#     print(f'{tokenizer.convert_ids_to_tokens(input_id)}: {tokenizer.convert_ids_to_tokens(pair[input_id])}')