In [1]:
# Doc-to-topic relevance judgements have been repurposed for doc-to-doc relevance assessment, but is this a good assumption?
# We want to take a doc-to-topic relevance assessment corpus and run a manual doc-to-doc assessment
# to find out how well the assumption holds.
# So, from a doc-to-topic corpus, we want to select some topics and documents to create a doc-to-doc corpus.

import pandas as pd

In [2]:
# Let's read the TSV that contains information on the doc-to-topic relevance assessment. 
# It should be a TREC-like file with four columns topic, zeros, PMID, relevance assessment (0, 1 or 2) and no headers
# Data is located under data/input

# All TREC Genomics 2005 --> initial attempt
#trec = pd.DataFrame(pd.read_csv('../data/input/genomics.qrels.large.txt', header=None, sep='\t'))

# Only those from TREC Genomics 2005 for which we could retrieve title and abstract --> this is what we want
# read_csv already returns a DataFrame, so it is used as-is.
# The file has no header row, so the four TREC qrels columns are named explicitly.
trec = pd.read_csv(
    '../data/input/TREC_cleaned.tsv',
    sep='\t',
    header=None,
)
trec.columns = ['topic', 'zero', 'pmid', 'relevance']

# Preview the first rows.
trec.head()

Unnamed: 0,topic,zero,pmid,relevance
0,100,0,10051592,0
1,100,0,10066453,2
2,100,0,10071611,0
3,100,0,10081497,0
4,100,0,10099207,0


In [3]:
# Per-topic total of non-relevant judgements (relevance == 0).
# sort=False keeps the topics in order of first appearance in the qrels;
# the result has one row per topic with columns 'topic' and 'non'.
non = (
    trec.loc[trec['relevance'] == 0]
    .groupby('topic', sort=False)
    .size()
    .reset_index(name='non')
)
non.head()

Unnamed: 0,topic,non
0,100,563
1,108,863
2,142,250
3,126,1000
4,107,285


In [4]:
# Per-topic total of partially relevant judgements (relevance == 1).
# sort=False keeps the topics in order of first appearance in the qrels;
# the result has one row per topic with columns 'topic' and 'partial'.
partial = (
    trec.loc[trec['relevance'] == 1]
    .groupby('topic', sort=False)
    .size()
    .reset_index(name='partial')
)
partial.head()

Unnamed: 0,topic,partial
0,100,50
1,101,17
2,105,76
3,119,19
4,102,5


In [5]:
# Per-topic total of definitively relevant judgements (relevance == 2).
# sort=False keeps the topics in order of first appearance in the qrels;
# the result has one row per topic with columns 'topic' and 'definitive'.
definitive = (
    trec.loc[trec['relevance'] == 2]
    .groupby('topic', sort=False)
    .size()
    .reset_index(name='definitive')
)
definitive.head()

Unnamed: 0,topic,definitive
0,100,22
1,101,2
2,102,5
3,103,6
4,108,73


In [6]:
# Combine the three per-topic counts. The default inner join keeps only topics
# present in all three tables, i.e. topics with articles in every relevance
# category. Rows are then ordered by topic id.
result = (
    non
    .merge(partial, on='topic')
    .merge(definitive, on='topic')
    .sort_values(by=['topic'])
)
result

Unnamed: 0,topic,non,partial,definitive
0,100,563,50,22
7,101,613,17,2
6,102,1114,5,5
5,103,647,19,6
9,105,975,76,4
11,106,982,117,41
4,107,285,113,76
1,108,863,118,73
29,109,138,13,163
31,110,717,10,3


In [7]:
# We do not need to assess the whole collection
# We only have time to assess about 100 reference documents each with 15 documents-to-be-assessed
# 15 looks like a good number: most scientific literature repositories show 10 results at once, but a few show 25

# All the reference documents and the documents-to-be assessed will be selected from the same topic
# We want to give priority to articles relevant to the topic (definitively or partially)
# We want to use ~20% of the doc-to-topic relevant articles as reference document
# We want to avoid having relevance judges read the same article too many times (~no more than 25% reuse)

# Let's suppose that no article will be judged twice and that exactly 5 topic-relevant articles will be assessed
# against each reference article
# Example with a topic with 5 articles definitively relevant to the topic:
# reference articles = 5 * 20% = 1, the max number of docs-to-be-assessed would be 4 with no reuse
# Example with a topic with 10 articles definitively relevant to the topic:
# reference articles = 10 * 20% = 2, the max number of docs-to-be-assessed would be 8 but we need 10, so 2 will be reused, i.e., 25%
# Therefore we need topics with at least 10 definitively relevant articles

import math
import numpy as np

In [8]:
# Candidate reference articles per topic: 20% of the definitively relevant
# articles, rounded up (stored as float, as np.ceil returns floats).
result['candidates'] = np.ceil(result['definitive'] * 20 / 100)

# Keep only topics with at least 10 definitively relevant articles.
more10 = result.loc[result['definitive'] >= 10]

# Column totals; the 'candidates' total is the number of reference articles.
more10.sum()

topic          2929.0
non           14529.0
partial        1509.0
definitive     2365.0
candidates      483.0
dtype: float64

In [9]:
# 483 reference articles is far more than the ~100 we have resources for.
# Drop topics with more than 20 definitively relevant articles so the budget
# can be spread over more topics.
between10and20 = more10.loc[more10['definitive'].le(20)]
between10and20.sum()

topic         1019.0
non           5930.0
partial        156.0
definitive     123.0
candidates      27.0
dtype: float64

In [10]:
# The 20-article cap leaves room in the budget, so raise the upper bound to
# 80 definitively relevant articles per topic.
between10and80 = more10.loc[more10['definitive'].le(80)]
between10and80.sum()

topic          1944.0
non           11540.0
partial         701.0
definitive      490.0
candidates      105.0
dtype: float64

In [11]:
# Display the selected topics (10 to 80 definitively relevant articles) with
# their per-category counts and number of candidate reference articles.
between10and80

Unnamed: 0,topic,non,partial,definitive,candidates
0,100,563,50,22,5.0
11,106,982,117,41,9.0
4,107,285,113,76,16.0
1,108,863,118,73,15.0
14,113,1281,4,10,2.0
26,116,1127,28,58,12.0
24,118,873,12,20,4.0
8,119,519,19,42,9.0
20,121,361,21,17,4.0
12,122,776,26,19,4.0


In [12]:
# Persist the selected topics as a tab-separated file, without the index column.
between10and80.to_csv(
    path_or_buf='../data/output/selected_trec_topics.tsv',
    sep='\t',
    index=False,
)