In [None]:
import gc
import os
import pickle
import jsonlines
import torch
from tqdm import tqdm
import pandas as pd
import csv
from collections import defaultdict
import argparse
from core.models.entailment import EntailmentDeberta
from rank_eval import load_data, load_rank_results

def merge_score(rank_score, entropy_score, threshold=0.01, bonus=1.0):
    """Combine a rank score with an optional entropy score.

    The rank score is returned unchanged unless an entropy score is available
    and strictly below ``threshold``; in that case ``bonus`` is added to it.

    Args:
        rank_score: Score read from the rank results.
        entropy_score: Score read from the entropy results, or ``None`` when
            there is no entropy entry for the (query, document) pair.
        threshold: Exclusive upper bound on ``entropy_score`` for the bonus to
            apply. Defaults to ``0.01``.
        bonus: Amount added to ``rank_score`` when the bonus applies.
            Defaults to ``1.0``.

    Returns:
        ``rank_score + bonus`` when ``entropy_score`` is not ``None`` and is
        below ``threshold``; otherwise ``rank_score`` unchanged.
    """
    # A missing entropy score never earns the bonus.
    if entropy_score is not None and entropy_score < threshold:
        return rank_score + bonus
    return rank_score

# For every dataset below, join the rank results with the entropy results and
# the gold relevance scores, then write one TSV row per (query, ranked document).
dataset_names = ["trec-covid", "climate-fever", "dbpedia-entity", "fever", "hotpotqa", "nfcorpus", "nq", "scidocs"]
merge_columns = ['qid', 'query', 'docid', 'doc', 'gold_score', 'rank_index', 'rank_score', 'entropy_score', 'merge_score']
merge_output_dir = 'output/tmp'
# Make sure the output directory exists so the first open(..., 'w') cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(merge_output_dir, exist_ok=True)
for dataset_name in tqdm(dataset_names, desc='dataset'):
    dataset_path = f'/home/song/dataset/beir/{dataset_name}'
    queries1, docs1, scores = load_data(dataset_path, dataset_name)
    # Normalise ids to strings; each query record is reduced to its 'text' field.
    queries = {str(qid): query['text'] for qid, query in queries1.items()}
    docs = {str(docid): doc for docid, doc in docs1.items()}
    rank_result_path = f'dataset/rank/{dataset_name}/{dataset_name}-rank10-small.tsv'
    rank_results = load_rank_results(rank_result_path)
    entropy_result_path = f'output/rerank/{dataset_name}/entropy-small.tsv'
    entropy_results = load_rank_results(entropy_result_path)
    print(f"dataset: {dataset_name}")
    merge_results = []  # one row per (qid, docid), laid out as in merge_columns
    for qid in rank_results:
        # Per-query lookups are hoisted out of the per-document loop.
        query_key = str(qid)
        query_text = queries.get(query_key, '')
        gold_scores = scores.get(query_key, {})
        ranked_docs = rank_results[qid]
        entropy_scores = entropy_results.get(qid, {})
        # rank_index (i) is the position of docid in the iteration order of
        # rank_results[qid].
        for i, docid in enumerate(ranked_docs):
            doc_key = str(docid)
            rank_score = ranked_docs.get(docid, 0.0)
            # None marks a pair that has no entry in the entropy results.
            entropy_score = entropy_scores.get(docid, None)
            merge_results.append([query_key,
                                  query_text,
                                  doc_key,
                                  docs.get(doc_key, ''),
                                  gold_scores.get(doc_key, 0.0),
                                  i,
                                  rank_score,
                                  entropy_score,
                                  merge_score(rank_score, entropy_score)
                                  ])
    merge_output_path = f'{merge_output_dir}/merge-small-{dataset_name}.tsv'
    # newline='' is what the csv module requires; the explicit UTF-8 encoding
    # keeps non-ASCII query/document text from depending on the locale default.
    with open(merge_output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(merge_columns)
        writer.writerows(merge_results)
    print(f"output: {merge_output_path}")

dataset:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/171332 [00:00<?, ?it/s]

dataset:  12%|█▎        | 1/8 [00:01<00:09,  1.35s/it]

dataset: trec-covid
output: output/tmp/merge-small-trec-covid.tsv


  0%|          | 0/5416593 [00:00<?, ?it/s]

dataset:  25%|██▌       | 2/8 [00:26<01:31, 15.29s/it]

dataset: climate-fever
output: output/tmp/merge-small-climate-fever.tsv


  0%|          | 0/4635922 [00:00<?, ?it/s]

dataset:  38%|███▊      | 3/8 [00:49<01:33, 18.64s/it]

dataset: dbpedia-entity
output: output/tmp/merge-small-dbpedia-entity.tsv


  0%|          | 0/5416568 [00:00<?, ?it/s]

dataset: fever


dataset:  50%|█████     | 4/8 [01:16<01:28, 22.19s/it]

output: output/tmp/merge-small-fever.tsv


  0%|          | 0/5233329 [00:00<?, ?it/s]

dataset:  62%|██████▎   | 5/8 [01:41<01:09, 23.10s/it]

dataset: hotpotqa
output: output/tmp/merge-small-hotpotqa.tsv


  0%|          | 0/3633 [00:00<?, ?it/s]

dataset: nfcorpus
output: output/tmp/merge-small-nfcorpus.tsv


dataset:  75%|███████▌  | 6/8 [01:43<00:31, 15.88s/it]

  0%|          | 0/2681468 [00:00<?, ?it/s]

dataset:  88%|████████▊ | 7/8 [01:54<00:14, 14.49s/it]

dataset: nq
output: output/tmp/merge-small-nq.tsv


  0%|          | 0/25657 [00:00<?, ?it/s]

dataset: 100%|██████████| 8/8 [01:56<00:00, 14.56s/it]

dataset: scidocs
output: output/tmp/merge-small-scidocs.tsv





In [10]:
# Show the query-id -> query-text mapping built inside the merge loop above.
# When the cells are run in order it holds the last dataset iterated there
# ("scidocs", the final entry of dataset_names).
queries

{'632589828c8b9fca2c3a59e97451fde8fa7d188d': 'An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is thus called HGAPSO. In HGAPSO, individuals in a new generation are created, not only by crossover and mutation operation as in GA, but also by PSO. The concept of elite strategy is adopted in HGAPSO, where the upper-half of the best-performing individuals in a population are regarded as elites. However, instead of being reproduced directly to the next generation, these elites are first enhanced. The group constituted by the elites is regarded as a swarm, and each elite corresponds to a particle within it. In this regard, the elites are enhanced by PSO, an operation which mimics the maturing phenomenon in nature. These enhanced elites constitute 

In [11]:
# Show the document-id -> document mapping built inside the merge loop above.
# When the cells are run in order it holds the last dataset iterated there
# ("scidocs", the final entry of dataset_names).
docs

{'78495383450e02c5fe817e408726134b3084905d': 'A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect',
 '7dcb308b9292a8bc87d6f7793d2ca5e0e19dfa40': 'Bearish-Bullish Sentiment Analysis on Financial Microblogs',
 '8c872ecd87945e71fcd9fa1b6cb1133cfe805bf2': 'Predicting defects in SAP Java code: An experience report',
 '3a63667284dc8b9687ed1620406030bfe39af3c9': 'Active-Metric Learning for Classification of Remotely Sensed Hyperspectral Images',
 '071f47b7bc5830643e31dbed82e0375bf9b26559': 'Ad Hoc Retrieval Experiments Using WordNet and Automatically Constructed Thesauri',
 'ee9596725d1db17f2b1e2207dd3ea260343bfe4f': 'Underwater Acoustic Target Tracking: A Review',
 'a65196dfff31425281c690a7f2ca65247147da6b': 'Unsupervised Diverse Colorization via Generative Adversarial Networks',
 'a04b5b99f5d9d8748843e870536a4a9f65562012': 'Lane Detection ( Part I ) : Mono-Vision Based Method',
 'de8e80d409aaaa3244da4f2cb5b5bb053d453cee': 'Detection of distributed denial of ser

In [16]:
QID = '541853e747dd63d6aff41c773e21fd1e224f0680'
# Print, one per line, every query id in `queries` that ends with '80'
# (the same two-character suffix QID ends with).
for qid in queries:
    if not qid.endswith('80'):
        continue
    print(qid)

746cafc676374114198c414d6426ec2f50e0ff80
e170ca6dad1221f4bb2e4fc3d42a182e23026b80
02a808de5aa34685955fd1473433161edd20fd80
aaa9d12640ec6f9d1d37333141c761c902d2d280
1007fbc622acd3cc8f658558a3e841ea200f6880
3d425a44b54f505a5d280653a3b4f992d4836c80
c91b4b3a20a7637ecbb7e0179ac3108f3cf11880
698b8181cd613a72adeac0d75252afe7f57a5180
4ce68170f85560942ee51465e593b16560f9c580
f306c0d24a5eb338b7a577a17d8b35d78716d880
e2b7a371b7cfb5f2bdc2abeb41397aee03fd5480
11ad7734bbb81e901f2e59b73456324b299d8980
08e4982410ebaa6dbd203a953113214bc9740b80
4189eac6d7104e00323f78a8897167d50c815c80
47fdd1579f732dd6389f9342027560e385853180
c7b3f5bccb19f1a224eb87c6924f244b1511e680
409ff05931b5f252935930ecd8de4e62bc0c7d80
213cb7593934bc675c336f53dd6c61a3c799be80
fd47145321e4b34e043104c9eb21c9bc28dfd680
f407c09ae8d886fc373d3f471c97c22d3ca50580
384ac22ddf645108d085f6f9ec6d359813776a80
5c1e5e87a7cb833b046222bf631f9063c9926680
76d4f741a0321bad1f080a6c4d41996a381d3c80
50629d7d6afd7577ccfd92b35c7e15f79ad4b180
1b3b22b95ab55853