In [1]:
import pandas as pd
import csv
import random

In [2]:
# Get data from TREC.tsv file
def get_trec_tsv_data(path='../data/input/TREC.tsv'):
    """Load the TREC.tsv table into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the tab-separated file. Defaults to the location this
        notebook has always read from, so a call without arguments behaves
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns ``col_1`` .. ``col_4``.
        Other cells in this notebook compare ``col_1`` against a topic id,
        read PMIDs from ``col_3`` and filter on ``col_4 == 2``.
    """
    column_names = ['col_1', 'col_2', 'col_3', 'col_4']
    # The file carries no header row (header=None), so names are supplied here.
    return pd.read_csv(path, sep='\t', names=column_names, header=None)

In [3]:
# Read TREC.tsv once; the functions defined further down use this module-level frame.
trec_tsv_data = get_trec_tsv_data()
# Preview the first ten rows as the cell output.
trec_tsv_data.head(10)

Unnamed: 0,col_1,col_2,col_3,col_4
0,100,0,10023709,0
1,100,0,10051592,0
2,100,0,10066453,2
3,100,0,10071611,0
4,100,0,10081001,1
5,100,0,10081002,1
6,100,0,10081497,0
7,100,0,10094804,0
8,100,0,10099207,0
9,100,0,10138840,0


In [4]:
# Get data from selected_trec_topics.tsv file
def get_selected_trec_topics_tsv_data(path='../data/input/selected_trec_topics.tsv'):
    """Load the selected-topics table into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the tab-separated file. Defaults to the location this
        notebook has always read from, so a call without arguments behaves
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed table; column names come from the file's first row.
        Other cells in this notebook read its ``Topic`` and ``Seed`` columns.
    """
    return pd.read_csv(path, sep='\t')

In [5]:
# Read selected_trec_topics.tsv once; later cells use this module-level frame.
selected_trec_topics_tsv_data = get_selected_trec_topics_tsv_data()
# Preview the first ten rows as the cell output.
selected_trec_topics_tsv_data.head(10)

Unnamed: 0,Topic,Non,Partial,Def,Seed
0,100,630,52,22,7
1,106,1061,125,44,14
2,113,1342,4,10,3
3,116,1179,28,58,19
4,118,905,12,20,6
5,119,528,19,42,14
6,121,380,25,17,5
7,122,815,37,19,6
8,128,880,53,21,7
9,129,949,22,16,5


In [6]:
# Get the topics from selected_trec_topics.tsv file
def get_selected_trec_topics(topics_data=None):
    """Return the values of the ``Topic`` column as a plain Python list.

    Parameters
    ----------
    topics_data : pandas.DataFrame, optional
        Table with a ``Topic`` column. When omitted, the module-level
        ``selected_trec_topics_tsv_data`` frame is used, which is what the
        original zero-argument call did.

    Returns
    -------
    list
        The topic values in row order.
    """
    if topics_data is None:
        topics_data = selected_trec_topics_tsv_data
    # Read the column directly instead of looping over iterrows(): iterrows()
    # builds one Series per row and upcasts all values to a common dtype, so a
    # single float column would silently turn integer topic ids into floats.
    return topics_data['Topic'].tolist()

In [7]:
# Topic values from the selected-topics table; a later cell iterates over this list.
selected_trec_topics = get_selected_trec_topics()

In [8]:
# Get reference PMIDs which are definitely relevant (2) for a particular topic
def get_reference_PMIDs(topic, trec_data=None, topics_data=None, rng=None):
    """Randomly draw the reference PMIDs for one topic.

    Candidates are the ``col_3`` values of all rows whose ``col_1`` equals
    ``topic`` and whose ``col_4`` equals 2 (definitely relevant). The number of
    PMIDs drawn is the ``Seed`` value of the topic's row in the topics table.

    Parameters
    ----------
    topic
        Topic value, compared against ``col_1`` and ``Topic``.
    trec_data : pandas.DataFrame, optional
        Table with ``col_1``, ``col_3`` and ``col_4`` columns. Defaults to the
        module-level ``trec_tsv_data``.
    topics_data : pandas.DataFrame, optional
        Table with ``Topic`` and ``Seed`` columns. Defaults to the module-level
        ``selected_trec_topics_tsv_data``.
    rng : random.Random, optional
        Source of randomness. Pass ``random.Random(some_seed)`` to make the
        draw reproducible; when omitted the global ``random`` module is used,
        as before.

    Returns
    -------
    list
        The sampled PMIDs, without repetition of candidate positions.

    Raises
    ------
    ValueError
        If ``topic`` does not match exactly one row of the topics table, or if
        ``Seed`` asks for more PMIDs than there are candidates.
    """
    if trec_data is None:
        trec_data = trec_tsv_data
    if topics_data is None:
        topics_data = selected_trec_topics_tsv_data

    is_definitely_relevant = (trec_data['col_1'] == topic) & (trec_data['col_4'] == 2)
    candidate_pmids = trec_data.loc[is_definitely_relevant, 'col_3'].tolist()

    seed_counts = topics_data.loc[topics_data['Topic'] == topic, 'Seed']
    if len(seed_counts) != 1:
        raise ValueError(
            'expected exactly one row for topic %r in the topics table, found %d'
            % (topic, len(seed_counts))
        )
    # Take the scalar with .iloc[0]; calling int() on a one-element Series is
    # deprecated in recent pandas releases.
    no_of_ref_candidates = int(seed_counts.iloc[0])

    if no_of_ref_candidates > len(candidate_pmids):
        raise ValueError(
            'topic %r asks for %d reference PMIDs but only %d candidates exist'
            % (topic, no_of_ref_candidates, len(candidate_pmids))
        )

    # Select reference documents at random
    sampler = random if rng is None else rng
    return sampler.sample(candidate_pmids, no_of_ref_candidates)

In [9]:
# Get reference PMIDs which are definitely relevant (2) for selected TREC topics
def get_reference_PMIDs_for_selected_topics():
    """Build a mapping from each selected topic to its reference PMIDs.

    ``get_reference_PMIDs`` is called once for every entry of the module-level
    ``selected_trec_topics`` list, in list order.

    Returns
    -------
    dict
        Topic -> list of PMIDs returned by ``get_reference_PMIDs``.
    """
    return {
        topic_id: get_reference_PMIDs(topic_id)
        for topic_id in selected_trec_topics
    }

In [10]:
# Topic -> sampled reference PMIDs. The PMIDs are drawn with random.sample and
# no seed is set in the cells shown, so re-running can give a different selection.
reference_PMIDS_for_selected_topics = get_reference_PMIDs_for_selected_topics()

In [11]:
# Create and write to topic_reference_pmid.tsv file
# newline='' hands line-ending control to the csv writer, as the csv module
# documentation requires; without it, platforms that translate '\n' on write
# end every row with '\r\r\n'.
with open('../data/output/topic_reference_pmid.tsv', 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['topic', 'reference PMID'])
    # One output row per (topic, PMID) pair, grouped by topic in dict order.
    for topic, pmids in reference_PMIDS_for_selected_topics.items():
        tsv_writer.writerows([topic, pmid] for pmid in pmids)