In [2]:
import json
import pandas as pd

In [3]:
def load_samples(path):
    """Lazily yield one decoded JSON value per non-blank line of a file.

    The file is opened as UTF-8 text and read line by line, so the whole
    file is never held in memory at once.

    Args:
        path: Location of a file holding one JSON document per line.

    Yields:
        The value returned by ``json.loads`` for each non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        for line in f:
            # A blank or whitespace-only line (for instance an extra newline
            # at the end of the file) holds no record; json.loads would raise
            # on it, so skip it instead of aborting the whole load.
            if not line.strip():
                continue
            yield json.loads(line)


def load_passages(path):
    """Read every sample from ``path`` and keep only the fields used here.

    Each record produced by ``load_samples`` is reduced to a dict with:

    * ``'query'``    - the record's ``'query'`` string, lower-cased;
    * ``'selected'`` - the ``'is_selected'`` value of each passage, in order;
    * ``'passages'`` - the ``'passage_text'`` of each passage, lower-cased,
      in the same order as ``'selected'``.

    Progress messages are printed before and after loading.

    Args:
        path: File path handed unchanged to ``load_samples``.

    Returns:
        A list with one such dict per record, in file order.
    """
    print('Loading passages from {}...'.format(path))

    samples = []
    for record in load_samples(path):
        texts, flags = [], []
        # Walk the passages once so the two lists stay index-aligned.
        for entry in record['passages']:
            texts.append(entry['passage_text'].lower())
            flags.append(entry['is_selected'])
        samples.append({
            'query': record['query'].lower(),
            'selected': flags,
            'passages': texts,
        })

    print('Loaded {} samples.'.format(len(samples)))

    return samples

In [20]:
# Load all samples from the train_v1.1.json file under datasets/msmarco.
# Each element of `train` is a dict with 'query', 'selected' and 'passages'.
train = load_passages('datasets/msmarco/train_v1.1.json')

Loading passages from datasets/msmarco/train_v1.1.json...
Loaded 82326 samples.


In [21]:
# Tabular view of the samples: one row per sample, one column per dict key.
df = pd.DataFrame(train)

In [22]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,passages,query,selected
0,"[since 2007, the rba's outstanding reputation ...",what is rba,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,"[in his younger years, ronald reagan was a mem...",was ronald reagan a democrat,"[0, 1, 0, 0, 0, 0, 0]"
2,"[sydney, new south wales, australia is located...",how long do you need for sydney and surroundin...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,"[in regards to tile installation costs, consum...",price to install tile in shower,"[0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,"[conclusions: in adult body ct, dose to an org...",why conversion observed in body,"[0, 0, 1, 0, 0, 0, 0, 0]"


In [23]:
# Mean number of passages attached to a sample.
print(
    'Average Number of Passages:',
    sum(len(sample['passages']) for sample in train) / len(train),
)

Average Number of Passages: 8.21360202123266


In [24]:
# Mean passage length in characters: total characters across all passages
# divided by the total number of passages.
print(
    'Average Passage Length:',
    sum(len(passage) for sample in train for passage in sample['passages'])
    / sum(len(sample['passages']) for sample in train),
)

Average Passage Length: 421.67736282392747


In [26]:
# Mean, over samples, of the sum of each sample's 'selected' flags.
print(
    'Average Number of Selected Passages per Sample:',
    sum(sum(sample['selected']) for sample in train) / len(train),
)

Average Number of Selected Passages per Sample: 1.0752739110366105
