In [1]:
import os
import json
import numpy as np
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset card: https://huggingface.co/datasets/ms_marco
# Load only the 'train' split of the 'v1.1' configuration.
dataset = load_dataset(path="ms_marco", name="v1.1", split="train")
# Note: with split=None the call returns every split instead, keyed by name
# ('train', 'validation', 'test') and iterable via dataset.items().

In [3]:
# Materialise the loaded split as a pandas DataFrame so it can be inspected
# with the usual DataFrame tooling.
raw_df = dataset.to_pandas()

# Preview the first five rows.
raw_df.head(5)

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[]
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[]
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[]
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[]
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[]


In [6]:
# Peek at the text of the first candidate passage attached to the first query
# (row label 0). Each 'passages' cell is a mapping with a 'passage_text' entry.
raw_df.loc[0, 'passages']['passage_text'][0]

"Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site."

In [33]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82326 entries, 0 to 82325
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   answers            82326 non-null  object
 1   passages           82326 non-null  object
 2   query              82326 non-null  object
 3   query_id           82326 non-null  int32 
 4   query_type         82326 non-null  object
 5   wellFormedAnswers  82326 non-null  object
dtypes: int32(1), object(5)
memory usage: 3.5+ MB


In [37]:
# Flatten the nested dataset into one row per (query, passage) pair.
#
# Every example carries a "passages" mapping of parallel lists
# ("is_selected", "passage_text", "url"). Each passage becomes its own row and
# the query-level fields are repeated alongside it.
#
# NOTE(review): the output column "anwsers" is a misspelling of "answers". It
# is kept unchanged here because the files written later use this column name.
columns = {
    "query_id": [],
    "query_text": [],
    "query_type": [],
    "relevant": [],
    "passage_text": [],
    "passage_url": [],
    "anwsers": [],
}

# (output column, key in the dataset example) for the per-query fields.
query_level_fields = (
    ("query_id", "query_id"),
    ("query_text", "query"),
    ("query_type", "query_type"),
    ("anwsers", "answers"),
)

for example in dataset:
    candidates = example["passages"]
    relevance_flags = [bool(flag) for flag in candidates["is_selected"]]
    candidate_texts = candidates["passage_text"]
    candidate_urls = candidates["url"]

    # The three passage lists must line up element by element.
    assert len(relevance_flags) == len(candidate_texts) == len(candidate_urls)
    passage_count = len(relevance_flags)

    # Passage-level fields: one entry per passage.
    columns["relevant"].extend(relevance_flags)
    columns["passage_text"].extend(candidate_texts)
    columns["passage_url"].extend(candidate_urls)

    # Query-level fields: the same value repeated once per passage.
    for column_name, example_key in query_level_fields:
        columns[column_name].extend([example[example_key]] * passage_count)

df = pd.DataFrame(columns)

In [38]:
df.head(10)

Unnamed: 0,query_id,query_text,query_type,relevant,passage_text,passage_url,anwsers
0,19699,what is rba,description,False,"Since 2007, the RBA's outstanding reputation h...",https://en.wikipedia.org/wiki/Reserve_Bank_of_...,[Results-Based Accountability is a disciplined...
1,19699,what is rba,description,False,The Reserve Bank of Australia (RBA) came into ...,https://en.wikipedia.org/wiki/Reserve_Bank_of_...,[Results-Based Accountability is a disciplined...
2,19699,what is rba,description,False,RBA Recognized with the 2014 Microsoft US Regi...,http://acronyms.thefreedictionary.com/RBA,[Results-Based Accountability is a disciplined...
3,19699,what is rba,description,False,The inner workings of a rebuildable atomizer a...,https://www.slimvapepen.com/rebuildable-atomiz...,[Results-Based Accountability is a disciplined...
4,19699,what is rba,description,False,Results-Based Accountability® (also known as R...,http://rba-africa.com/about/what-is-rba/,[Results-Based Accountability is a disciplined...
5,19699,what is rba,description,True,Results-Based Accountability® (also known as R...,http://resultsleadership.org/what-is-results-b...,[Results-Based Accountability is a disciplined...
6,19699,what is rba,description,False,"RBA uses a data-driven, decision-making proces...",http://rba-africa.com/about/what-is-rba/,[Results-Based Accountability is a disciplined...
7,19699,what is rba,description,False,vs. NetIQ Identity Manager. Risk-based authent...,http://searchsecurity.techtarget.com/definitio...,[Results-Based Accountability is a disciplined...
8,19699,what is rba,description,False,"A rebuildable atomizer (RBA), often referred t...",https://www.slimvapepen.com/rebuildable-atomiz...,[Results-Based Accountability is a disciplined...
9,19699,what is rba,description,False,Get To Know Us. RBA is a digital and technolog...,http://www.rbaconsulting.com/,[Results-Based Accountability is a disciplined...


In [47]:
# save to CSV
csv_path = './data/msmarco-hf-v1.1/ms_marco_for_ir.csv'
# Create the output directory first: to_csv does not create missing parent
# directories, so this cell would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
# index=False: the positional RangeIndex carries no information worth storing.
# NOTE(review): the list-valued answers column is presumably written as its
# string representation in CSV; use the JSONL export if the lists must
# round-trip - TODO confirm.
df.to_csv(csv_path, index=False)

In [55]:
# save to JSONL (one JSON object per line, one line per DataFrame row)
jsonl_path = './data/msmarco-hf-v1.1/ms_marco_for_ir.jsonl'
# Create the output directory first so the cell also works on a fresh checkout
# where ./data/msmarco-hf-v1.1/ does not exist yet.
os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)
# Explicit encoding so the file does not depend on the platform default.
with open(jsonl_path, 'w', encoding='utf-8') as f:
    for record in df.to_dict(orient='records'):
        f.write(json.dumps(record) + '\n')