In [2]:
import pandas as pd
import random

### Queries (dev)

In [3]:
# Dev queries: the TSV has no header row, so column labels are attached after loading.
queries_dev = pd.read_csv(
    "queries.dev.tsv",
    sep='\t',
    header=None,
)
queries_dev.columns = ['qid', 'query']
queries_dev.head(5)

Unnamed: 0,qid,query
0,1048578,cost of endless pools/swim spa
1,1048579,what is pcnt
2,1048580,what is pcb waste
3,1048581,what is pbis?
4,1048582,what is paysky


In [4]:
# (rows, columns) of the dev queries frame.
queries_dev.shape

(101093, 2)

In [3]:
# Dev qrels: headerless four-column TSV; labels are attached after loading.
qrels_dev = pd.read_csv(
    "qrels.dev.tsv",
    sep='\t',
    header=None,
)
qrels_dev.columns = ['qid', 'iter', 'pid', 'label']
qrels_dev.head(5)

Unnamed: 0,qid,iter,pid,label
0,1102432,0,2026790,1
1,1102431,0,7066866,1
2,1102431,0,7066867,1
3,1090282,0,7066900,1
4,39449,0,7066905,1


In [4]:
# (rows, columns) of the dev qrels frame.
qrels_dev.shape

(59273, 4)

In [5]:
# Distinct qids in queries_dev, distinct qids in qrels_dev, and how many qids they share.
print(
    len(set(queries_dev['qid'])),
    len(set(qrels_dev['qid'])),
    len(set(qrels_dev['qid']) & set(queries_dev['qid'])),
    sep='\n',
)

101093
55578
55578


### Queries (train)

In [5]:
# Train queries: headerless TSV; column labels are attached after loading.
queries_train = pd.read_csv(
    "queries.train.tsv",
    sep='\t',
    header=None,
)
queries_train.columns = ['qid', 'query']
queries_train.head(5)

Unnamed: 0,qid,query
0,121352,define extreme
1,634306,what does chattel mean on credit history
2,920825,what was the great leap forward brainly
3,510633,tattoo fixers how much does it cost
4,737889,what is decentralization process.


In [6]:
# (rows, columns) of the train queries frame.
queries_train.shape

(808731, 2)

In [7]:
# Train qrels: headerless four-column TSV; labels are attached after loading.
qrels_train = pd.read_csv(
    "qrels.train.tsv",
    sep='\t',
    header=None,
)
qrels_train.columns = ['qid', 'iter', 'pid', 'label']
qrels_train.head(5)

Unnamed: 0,qid,iter,pid,label
0,1185869,0,0,1
1,1185868,0,16,1
2,597651,0,49,1
3,403613,0,60,1
4,1183785,0,389,1


In [8]:
# (rows, columns) of the train qrels frame.
qrels_train.shape

(532761, 4)

In [9]:
# Distinct qids in queries_train, distinct qids in qrels_train, and how many qids they share.
print(
    len(set(queries_train['qid'])),
    len(set(qrels_train['qid'])),
    len(set(qrels_train['qid']) & set(queries_train['qid'])),
    sep='\n',
)

808731
502939
502939


### Passage data

In [10]:
# Full passage collection: headerless two-column TSV; labels are attached after loading.
passages = pd.read_csv(
    "collection.tsv",
    sep='\t',
    header=None,
)
passages.columns = ['pid', 'passage']

In [11]:
# (rows, columns) of the passage collection frame.
passages.shape

(8841823, 2)

In [1]:
# Preview the first rows of the passage collection.
# `passages[0]` would look up a *column* labelled 0, which no longer exists once the
# columns are renamed to 'pid'/'passage', so a row preview via head() is used instead.
passages.head()

NameError: name 'passages' is not defined

### Triplet training data

In [7]:
# Full training triplets: headerless three-column TSV of ids; labels are attached after loading.
triplets_train = pd.read_csv(
    "triplets/qidpidtriples.train.full.2.tsv",
    sep='\t',
    header=None,
)
triplets_train.columns = ['qid', 'pos_pid', 'neg_pid']
triplets_train.head(5)

Unnamed: 0,qid,pos_pid,neg_pid
0,1000094,5399011,4239068
1,1000094,5399011,271630
2,1000094,5399011,5534953
3,1000094,5399011,2608609
4,1000094,5399011,7026367


In [None]:
# (rows, columns) of the full training triplets frame.
triplets_train.shape

In [13]:
# Distinct qids in triplets_train, distinct qids in qrels_train, and how many qids they share.
print(
    len(set(triplets_train['qid'])),
    len(set(qrels_train['qid'])),
    len(set(qrels_train['qid']) & set(triplets_train['qid'])),
    sep='\n',
)

400782
502939
400782


In [14]:
# Number of distinct passage ids that appear as a positive or a negative in any triplet.
print(len(set(triplets_train['pos_pid']) | set(triplets_train['neg_pid'])))

8829047


In [15]:
# Draw four query-id samples of increasing size from the queries that have triplets.
# The RNG is seeded once and the draws happen in this fixed order (400, 4000, 12000,
# 40000), so each sample is an independent draw from the same stream rather than a
# superset of the previous one.
random.seed(1)
train_query_set = list(set(triplets_train['qid']))
tiny_query_set, small_query_set, medium_query_set, large_query_set = (
    random.sample(train_query_set, k=sample_size)
    for sample_size in (400, 4000, 12000, 40000)
)


In [16]:
# Keep every triplet whose qid is in tiny_query_set and save it as a headerless TSV.
triplets_train_tiny = triplets_train.loc[triplets_train['qid'].isin(tiny_query_set)].copy()
triplets_train_tiny.to_csv(
    "triplets/qidpidtriples.train.tiny.2.tsv",
    sep='\t',
    header=None,
    index=False,
)

In [17]:
# Size of the tiny subset and the number of distinct passages it references.
print(
    triplets_train_tiny.shape,
    len(set(triplets_train_tiny['pos_pid']) | set(triplets_train_tiny['neg_pid'])),
    sep='\n',
)

(402026, 3)
361629


In [18]:
# Keep every triplet whose qid is in small_query_set and save it as a headerless TSV.
triplets_train_small = triplets_train.loc[triplets_train['qid'].isin(small_query_set)].copy()
triplets_train_small.to_csv(
    "triplets/qidpidtriples.train.small.2.tsv",
    sep='\t',
    header=None,
    index=False,
)

In [19]:
# Size of the small subset and the number of distinct passages it references.
print(
    triplets_train_small.shape,
    len(set(triplets_train_small['pos_pid']) | set(triplets_train_small['neg_pid'])),
    sep='\n',
)

(3978527, 3)
2645096


In [20]:
# Keep every triplet whose qid is in medium_query_set and save it as a headerless TSV.
triplets_train_medium = triplets_train.loc[triplets_train['qid'].isin(medium_query_set)].copy()
triplets_train_medium.to_csv(
    "triplets/qidpidtriples.train.medium.2.tsv",
    sep='\t',
    header=None,
    index=False,
)

In [21]:
# Size of the medium subset and the number of distinct passages it references.
print(
    triplets_train_medium.shape,
    len(set(triplets_train_medium['pos_pid']) | set(triplets_train_medium['neg_pid'])),
    sep='\n',
)

(11988093, 3)
5247526


In [22]:
# Row-level sample: 1% of all triplets, drawn with a fixed random_state for repeatability.
# DataFrame.sample returns a new frame, so the full triplet frame is not copied first;
# the selected rows are the same either way.
triplets_train_small_mixed = triplets_train.sample(frac=0.01, random_state=1)
# Saved without index or header row, tab-separated.
triplets_train_small_mixed.to_csv("triplets/qidpidtriples.train.small_mixed.2.tsv", index=False, header=None, sep='\t')

In [23]:
# Size of the 1% row sample, the distinct passages it references, and its distinct qids.
print(
    triplets_train_small_mixed.shape,
    len(set(triplets_train_small_mixed['pos_pid']) | set(triplets_train_small_mixed['neg_pid'])),
    len(set(triplets_train_small_mixed['qid'])),
    sep='\n',
)

(3977687, 3)
3002423
391202


In [8]:
# Row-level sample: 3% of all triplets, drawn with a fixed random_state for repeatability.
# DataFrame.sample returns a new frame, so the full triplet frame is not copied first;
# the selected rows are the same either way.
triplets_train_medium_mixed = triplets_train.sample(frac=0.03, random_state=1)
# Saved without index or header row, tab-separated.
triplets_train_medium_mixed.to_csv("triplets/qidpidtriples.train.medium_mixed.2.tsv", index=False, header=None, sep='\t')

In [25]:
# Size of the 3% row sample and the number of distinct passages it references.
print(
    triplets_train_medium_mixed.shape,
    len(set(triplets_train_medium_mixed['pos_pid']) | set(triplets_train_medium_mixed['neg_pid'])),
    sep='\n',
)

(11933060, 3)
5464050


In [26]:
# Number of distinct qids present in the 3% row sample.
print(len(set(triplets_train_medium_mixed['qid'])))

395465


In [27]:
# Keep every triplet whose qid is in large_query_set and save it as a headerless TSV.
triplets_train_large = triplets_train.loc[triplets_train['qid'].isin(large_query_set)].copy()
triplets_train_large.to_csv(
    "triplets/qidpidtriples.train.large.2.tsv",
    sep='\t',
    header=None,
    index=False,
)

In [28]:
# Size of the large subset and the number of distinct passages it references.
print(
    triplets_train_large.shape,
    len(set(triplets_train_large['pos_pid']) | set(triplets_train_large['neg_pid'])),
    sep='\n',
)

(39724681, 3)
7761119


### First round retrieval results

In [29]:
# Top-1000 dev candidates: headerless four-column TSV; labels are attached after loading.
top1000_dev = pd.read_csv(
    "top1000.dev",
    sep='\t',
    header=None,
)
top1000_dev.columns = ['qid', 'pid', 'query', 'passage']
top1000_dev.head(5)

Unnamed: 0,qid,pid,query,passage
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...
1,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid..."
2,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...
3,199776,1000115,health benefits of eating vegetarian,The good news is that you will discover what g...
4,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...


In [30]:
# (rows, columns) of the top-1000 dev candidates frame.
top1000_dev.shape

(6668967, 4)

In [31]:
# Distinct qids in top1000_dev, distinct qids in qrels_dev, and how many qids they share.
print(
    len(set(top1000_dev['qid'])),
    len(set(qrels_dev['qid'])),
    len(set(qrels_dev['qid']) & set(top1000_dev['qid'])),
    sep='\n',
)

6980
55578
6980


### Subsample collections

In [38]:
# Passage ids used by the 3% triplet sample (positives and negatives) ...
pid_set0 = set(triplets_train_medium_mixed['pos_pid']) | set(triplets_train_medium_mixed['neg_pid'])
# ... extended with every pid referenced by the dev qrels and the dev top-1000 file.
pid_set = pid_set0 | set(qrels_dev['pid']) | set(top1000_dev['pid'])

In [39]:
# Restrict the collection to passages whose pid is in pid_set and report how many remain.
passages_medium = passages.loc[passages['pid'].isin(pid_set)].copy()
print(passages_medium.shape[0])

6517508


In [40]:
# Persist the reduced collection as a tab-separated file without index or header row.
passages_medium.to_csv(
    "collection_medium.tsv",
    sep='\t',
    header=None,
    index=False,
)

In [41]:
# Distinct passage ids that appear in the dev top-1000 file.
pid_dev = set(top1000_dev['pid'])

In [43]:
# Restrict the collection to passages whose pid is in pid_dev and report how many remain.
passages_dev_only = passages.loc[passages['pid'].isin(pid_dev)].copy()
print(passages_dev_only.shape[0])

3895239


In [44]:
# Persist the dev-only collection as a tab-separated file without index or header row.
passages_dev_only.to_csv(
    "collection_dev_only.tsv",
    sep='\t',
    header=None,
    index=False,
)