In [None]:
# !pip install -qU huggingface_hub datasets

In [1]:
import datasets
from huggingface_hub import login
from datasets import load_dataset
from tqdm.auto import tqdm
import random
import pandas as pd
import pickle
import os
import json
from pyserini.search.lucene import LuceneSearcher

random.seed(12)

In [2]:
# login()

In [3]:
# Root folder for generated artifacts; the functions below read and write files under f"{base_path}/data".
base_path = "."

In [4]:
def get_passages():
    """Return the TREC-COVID corpus as a dict keyed by document id.

    The mapping is cached as a pickle at ``{base_path}/data/passages.pickle``.
    On a cache hit the pickle is returned directly; the Hugging Face corpus is
    only loaded when the cache has to be built.

    Returns:
        dict: ``{doc_id: {"fulltext": title + " " + text}}``.
    """
    pickle_file = f"{base_path}/data/passages.pickle"

    if os.path.isfile(pickle_file):
        # NOTE: pickle.load can execute arbitrary code; only load a cache file
        # that this notebook produced itself.
        with open(pickle_file, "rb") as f:
            return pickle.load(f)

    # Cache miss: load the corpus only now, so cached runs skip it entirely.
    passages_dataset = load_dataset("BeIR/trec-covid", "corpus")
    passages = {
        item["_id"]: {"fulltext": item["title"] + " " + item["text"]}
        for item in tqdm(passages_dataset["corpus"])
    }

    # Make sure the cache folder exists before writing into it.
    os.makedirs(os.path.dirname(pickle_file), exist_ok=True)
    with open(pickle_file, "wb") as f:
        pickle.dump(passages, f)

    return passages

In [5]:
def generate_negative_samples(query, doc_ids, origin, max_samples=5, passage_lookup=None):
    """Build negative (query, passage) records for the first ``max_samples`` doc ids.

    Args:
        query: Query text copied into every record.
        doc_ids: Iterable of document ids; only the leading ``max_samples``
            entries are used.
        origin: Label stored in the ``"origin"`` field of every record.
        max_samples: Upper bound on the number of records returned.
        passage_lookup: Mapping ``doc_id -> {"fulltext": ...}``. When omitted,
            the notebook-level ``passages`` mapping is used.

    Returns:
        list[dict]: Records with the keys ``id``, ``query``, ``passage`` and
        ``origin``, in the order of ``doc_ids``.

    Raises:
        KeyError: If a used doc id is missing from the lookup.
    """
    if passage_lookup is None:
        # Fall back to the global corpus assigned from get_passages().
        passage_lookup = passages

    negative_samples = []
    for doc_id in doc_ids:
        if len(negative_samples) >= max_samples:
            break
        negative_samples.append({
            "id": doc_id,
            "query": query,
            "passage": passage_lookup[doc_id]["fulltext"],
            "origin": origin
        })

    return negative_samples

In [6]:
def get_samples():
    """Build the labelled query/passage dataset and save it as CSV.

    For every split of ``unicamp-dl/trec-covid-experiment`` whose name does not
    contain ``"example"``, each item yields one positive record (score 1) and
    negative records (score 0). Negatives come from the item's own
    ``negative_doc_ids`` when that list is non-empty, otherwise from
    ``search_with_bm25``.

    Relies on the notebook-level ``passages`` mapping and ``base_path``.

    Returns:
        pandas.DataFrame: Positive rows followed by negative rows, with the
        columns ``id``, ``query``, ``passage``, ``origin`` and ``score``. The
        same frame is written to ``{base_path}/data/data.csv``.
    """
    output_file = f"{base_path}/data/data.csv"
    # Create the output folder up front: otherwise a missing directory would
    # only surface at to_csv(), after all the work below has been done.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    trec_ds = datasets.load_dataset('unicamp-dl/trec-covid-experiment')
    positive_samples = []
    negative_samples = []

    for ds in trec_ds:
        if "example" in ds:
            continue

        for item in tqdm(trec_ds[ds]):
            positive_doc_id = item["positive_doc_id"]

            positive_samples.append({
                "id": positive_doc_id,
                "query": item["query"],
                "passage": passages[positive_doc_id]["fulltext"],
                "origin": ds
            })

            # Use the negatives shipped with the item when there are any;
            # otherwise sample documents from the BM25 results for the query.
            if len(item["negative_doc_ids"]) > 0:
                random_doc_ids = item["negative_doc_ids"]
            else:
                random_doc_ids = search_with_bm25(item["query"])

            negative_samples.extend(generate_negative_samples(item["query"], random_doc_ids, ds))

    df_data_pos = pd.DataFrame(positive_samples)
    df_data_pos["score"] = 1

    df_data_neg = pd.DataFrame(negative_samples)
    df_data_neg["score"] = 0

    df_data_merge = pd.concat([df_data_pos, df_data_neg], axis=0, ignore_index=True)

    # The row index is written as the first (unnamed) CSV column, as before.
    df_data_merge.to_csv(output_file)

    return df_data_merge

In [7]:
def generate_ramdom_numbers(max=5, k=1000):
    """Draw ``max`` distinct random integers from ``range(k)``.

    Values are drawn one at a time with ``random.randint`` and repeats are
    discarded, so for a given seed the result matches the draw order of the
    global ``random`` generator.

    Args:
        max: Number of distinct values to return.
        k: Exclusive upper bound; values lie in ``[0, k - 1]``.

    Returns:
        list[int]: ``max`` unique integers in draw order (empty if ``max <= 0``).

    Raises:
        ValueError: If more distinct values are requested than ``range(k)``
            contains. Without this check the loop would never terminate.
    """
    if max > 0 and max > k:
        raise ValueError(
            f"cannot draw {max} distinct values from a range of size {k}"
        )

    random_list = []
    seen = set()  # constant-time duplicate check; the list keeps draw order
    while len(random_list) < max:
        n = random.randint(0, k - 1)

        # Prevent duplicated index
        if n not in seen:
            seen.add(n)
            random_list.append(n)

    return random_list

In [8]:
# Searcher reused across calls so the prebuilt index is opened only once.
_BM25_SEARCHER = None


def _get_bm25_searcher():
    """Return the shared LuceneSearcher, creating it on first use."""
    global _BM25_SEARCHER
    if _BM25_SEARCHER is None:
        _BM25_SEARCHER = LuceneSearcher.from_prebuilt_index('beir-v1.0.0-trec-covid.flat')
    return _BM25_SEARCHER


def search_with_bm25(query, max=5, k=1000):
    """Pick random document ids from the top BM25 hits for ``query``.

    Args:
        query: Query text passed to the searcher.
        max: Number of document ids to return (fewer if the search returns
            fewer hits than that).
        k: Number of hits requested from the searcher.

    Returns:
        list[str]: ``_id`` values read from the raw JSON of the chosen hits.
    """
    hits = _get_bm25_searcher().search(query, k)

    # The search may return fewer than k hits, so draw positions from the hits
    # actually returned. With a full result list this is the same draw as
    # generate_ramdom_numbers(max=max, k=k).
    available = len(hits)
    random_list = generate_ramdom_numbers(max=min(max, available), k=available)

    random_ids = []
    for index in random_list:
        jsondoc = json.loads(hits[index].raw)
        random_ids.append(jsondoc["_id"])

    return random_ids

In [9]:
# Corpus lookup ({doc_id: {"fulltext": ...}}); read as a global by the sample-building functions.
passages = get_passages()

Found cached dataset trec-covid (/home/manoel/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Build the labelled samples (score 1 = positive, 0 = negative); also writes data/data.csv under base_path.
df_data = get_samples()

Found cached dataset trec-covid-experiment (/home/manoel/.cache/huggingface/datasets/unicamp-dl___trec-covid-experiment/default/0.0.0/408acea7f1921299714cb2c40d35a0c61e678a84ba7ab64fe1b521654d417ed0)


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/463 [00:00<?, ?it/s]

  0%|          | 0/1001 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/999 [00:00<?, ?it/s]

  0%|          | 0/979 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1088 [00:00<?, ?it/s]

In [11]:
# Column dtypes and non-null counts of the merged samples.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69180 entries, 0 to 69179
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       69180 non-null  object
 1   query    69180 non-null  object
 2   passage  69180 non-null  object
 3   origin   69180 non-null  object
 4   score    69180 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.6+ MB


In [12]:
# Preview the first rows of the merged samples.
df_data.head()

Unnamed: 0,id,query,passage,origin,score
0,70hskj1o,How can chatbots be designed to effectively sh...,Chatbots in the fight against the COVID-19 pan...,eduseiti_100_queries_expansion_20230501_01,1
1,70hskj1o,What strategies can be used to encourage desir...,Chatbots in the fight against the COVID-19 pan...,eduseiti_100_queries_expansion_20230501_01,1
2,70hskj1o,What are the risks associated with amplifying ...,Chatbots in the fight against the COVID-19 pan...,eduseiti_100_queries_expansion_20230501_01,1
3,70hskj1o,What research has been conducted on the effect...,Chatbots in the fight against the COVID-19 pan...,eduseiti_100_queries_expansion_20230501_01,1
4,70hskj1o,How can collaborations between healthcare work...,Chatbots in the fight against the COVID-19 pan...,eduseiti_100_queries_expansion_20230501_01,1


In [None]:
# Number of rows whose query text already appeared in an earlier row.
df_data["query"].duplicated().sum()

In [None]:
# Boolean mask (a Series, not a DataFrame): True where the query repeats an earlier row.
df_duplicated = df_data.duplicated(subset=['query'])

In [None]:
# NOTE(review): df_duplicated is a Series; Series.info() is only available in newer pandas releases — confirm the installed version.
df_duplicated.info()

In [None]:
# Preview the first values of the duplicate-query mask.
df_duplicated.head()

In [None]:
# Rows that repeat an earlier row in every column (not only in "query").
df_data.loc[df_data.duplicated(), :]

In [None]:
# Step-by-step version of get_samples() that keeps the intermediate frames
# (df_data_pos / df_data_neg) available for inspection in the cells below.
trec_ds = datasets.load_dataset('unicamp-dl/trec-covid-experiment')
positive_samples = []
negative_samples = []

for ds in trec_ds:
    # Splits whose name contains "example" are skipped.
    if "example" not in ds:
        for item in tqdm(trec_ds[ds]):
            positive_doc_id = item["positive_doc_id"]
            fulltext = passages[positive_doc_id]["fulltext"]

            sample = {
                "id": positive_doc_id,
                "query": item["query"],
                "passage": fulltext,
                "origin": ds
            }
            positive_samples.append(sample)

            # Use the negatives shipped with the item when there are any;
            # otherwise sample documents from the BM25 results for the query.
            if len(item["negative_doc_ids"]) > 0:
                random_doc_ids = item["negative_doc_ids"]
            else:
                random_doc_ids = search_with_bm25(item["query"])

            # extend, not append: generate_negative_samples returns a list of
            # records, and the DataFrame below needs one flat record per row.
            negative_samples.extend(generate_negative_samples(item["query"], random_doc_ids, ds))

df_data_pos = pd.DataFrame(positive_samples)
df_data_pos["score"] = 1

df_data_neg = pd.DataFrame(negative_samples)
df_data_neg["score"] = 0

In [None]:
# Column dtypes and non-null counts of the positive samples.
df_data_pos.info()

In [None]:
# Column dtypes and non-null counts of the negative samples.
df_data_neg.info()

In [None]:
# Stack positives on top of negatives and renumber the rows from 0.
df_data_merge = pd.concat([df_data_pos, df_data_neg], axis=0, ignore_index=True)