In [1]:
import pandas as pd
import csv
import random

In [2]:
# Get data from topic_reference_pmid.tsv file
def get_topic_reference_pmid_tsv_data(path='../data/output/topic_reference_pmid.tsv'):
    """Load the topic / reference-PMID table from a tab-separated file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the TSV file. Defaults to the location this notebook
        has always read from, so calls without arguments behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file; its first line is used as the column header.
    """
    return pd.read_csv(path, sep='\t')

In [3]:
# Load the topic / reference PMID table once and preview its first ten rows
topic_reference_pmid_tsv_data = get_topic_reference_pmid_tsv_data()
topic_reference_pmid_tsv_data.head(10)

Unnamed: 0,topic,reference PMID
0,100,12565999
1,100,11321159
2,100,11327801
3,100,12625758
4,100,8158003
5,100,10066453
6,100,9482256
7,106,14521963
8,106,12153140
9,106,9358188


In [4]:
# Get data from TREC_cleaned.tsv file
def get_trec_tsv_data(path='../data/input/TREC_cleaned.tsv'):
    """Load the header-less TREC judgement file into a DataFrame.

    The file has no header row, so the four columns are named
    ``col_1`` .. ``col_4`` on load. Elsewhere in this notebook ``col_1`` is
    matched against a topic, ``col_3`` is collected as the PMID and
    ``col_4`` is compared with the values 0, 1 and 2.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the TSV file. Defaults to the location this notebook
        has always read from, so calls without arguments behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns
        ``['col_1', 'col_2', 'col_3', 'col_4']``.
    """
    column_names = ['col_1', 'col_2', 'col_3', 'col_4']
    return pd.read_csv(path, sep='\t', names=column_names, header=None)

In [5]:
# Load the TREC judgement table once and preview its first ten rows
trec_tsv_data = get_trec_tsv_data()
trec_tsv_data.head(10)

Unnamed: 0,col_1,col_2,col_3,col_4
0,100,0,10051592,0
1,100,0,10066453,2
2,100,0,10071611,0
3,100,0,10081497,0
4,100,0,10099207,0
5,100,0,10138840,0
6,100,0,10188261,0
7,100,0,10194562,1
8,100,0,10195608,0
9,100,0,10211543,1


In [6]:
# Get data from selected_trec_topics.tsv file
def get_selected_trec_topics_tsv_data(path='../data/input/selected_trec_topics.tsv'):
    """Load the per-topic summary table from a tab-separated file.

    Later cells of this notebook read the ``Topic``, ``Def`` and ``Partial``
    columns of the returned frame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the TSV file. Defaults to the location this notebook
        has always read from, so calls without arguments behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file; its first line is used as the column header.
    """
    return pd.read_csv(path, sep='\t')

In [7]:
# Load the per-topic table once; later cells read its 'Topic', 'Def' and 'Partial' columns
selected_trec_topics_tsv_data = get_selected_trec_topics_tsv_data()

In [8]:
# Select 20 documents for each reference PMID for the same topic
def get_documents_to_be_assessed(topic, total_docs=20, min_relevant=6, max_relevant=9):
    """Randomly pick the PMIDs to be assessed for one topic.

    Rows of ``trec_tsv_data`` whose ``col_1`` equals ``topic`` are split by
    ``col_4`` into definitely relevant (2), partially relevant (1) and
    irrelevant (0) pools. A random number of documents between
    ``min_relevant`` and ``max_relevant`` is drawn from each relevant pool,
    and the selection is topped up to ``total_docs`` with irrelevant ones.

    Every draw is capped at the number of documents actually available, so
    a topic with, say, 7 definitely relevant documents never triggers a
    request for 8 or 9 of them. As a consequence fewer than ``total_docs``
    PMIDs are returned when the pools are too small to fill the quota.

    Parameters
    ----------
    topic : int
        Topic identifier, matched against ``trec_tsv_data['col_1']`` and
        ``selected_trec_topics_tsv_data['Topic']``.
    total_docs : int, optional
        Target size of the returned selection (default 20).
    min_relevant, max_relevant : int, optional
        Inclusive bounds of the random sample size used for each of the two
        relevant pools (defaults 6 and 9).

    Returns
    -------
    list
        The selected PMIDs (values of ``col_3``) in ascending order.

    Raises
    ------
    ValueError
        If ``topic`` has no row in ``selected_trec_topics_tsv_data``.
    """
    # All judgement rows for the requested topic
    topic_data = trec_tsv_data.loc[trec_tsv_data['col_1'] == topic]

    # Definitely relevant documents (col_4 == 2)
    def_rel_pool = topic_data.loc[topic_data['col_4'] == 2]
    def_rel_count = _relevant_sample_size(
        _declared_topic_count(topic, 'Def'), len(def_rel_pool), min_relevant, max_relevant
    )
    def_rel_pmids = def_rel_pool.sample(n=def_rel_count)

    # Partially relevant documents (col_4 == 1)
    part_rel_pool = topic_data.loc[topic_data['col_4'] == 1]
    part_rel_count = _relevant_sample_size(
        _declared_topic_count(topic, 'Partial'), len(part_rel_pool), min_relevant, max_relevant
    )
    part_rel_pmids = part_rel_pool.sample(n=part_rel_count)

    # Fill the remaining slots with irrelevant documents (col_4 == 0),
    # never asking for a negative amount or for more than the pool holds
    irr_pool = topic_data.loc[topic_data['col_4'] == 0]
    irr_count = min(max(total_docs - (def_rel_count + part_rel_count), 0), len(irr_pool))
    irr_pmids = irr_pool.sample(n=irr_count)

    to_be_assessed_pmids = (
        def_rel_pmids['col_3'].tolist()
        + part_rel_pmids['col_3'].tolist()
        + irr_pmids['col_3'].tolist()
    )

    # Sort the documents in ascending order
    to_be_assessed_pmids.sort()

    return to_be_assessed_pmids


def _declared_topic_count(topic, column):
    """Read one per-topic count from selected_trec_topics_tsv_data as a plain int."""
    matches = selected_trec_topics_tsv_data.loc[
        selected_trec_topics_tsv_data['Topic'] == topic, column
    ]
    if matches.empty:
        raise ValueError(
            "Topic {!r} not found in selected_trec_topics_tsv_data".format(topic)
        )
    # .iloc[0] extracts the scalar explicitly; int() on a whole Series is deprecated
    return int(matches.iloc[0])


def _relevant_sample_size(declared_count, pool_size, min_relevant, max_relevant):
    """Choose how many relevant documents to draw without exceeding what exists."""
    available = min(declared_count, pool_size)
    if available < min_relevant:
        # Too few to randomise: take every available document
        return available
    # The random draw may exceed the population (e.g. 9 requested, 7 present)
    return min(random.randint(min_relevant, max_relevant), available)

In [9]:
# Get list of tuples for the specified topic and its reference PMIDs
def get_reference_pmids_for_specified_topic(topic):
    """Return the (topic, reference PMID) pairs recorded for one topic.

    Rows are taken from ``topic_reference_pmid_tsv_data`` where the ``topic``
    column equals ``topic``. The resulting list of tuples is ordered by
    ascending reference PMID and is empty when the topic has no rows.
    """
    # Restrict topic_reference_pmid.tsv data to the requested topic
    rows_for_topic = topic_reference_pmid_tsv_data.loc[
        topic_reference_pmid_tsv_data['topic'] == topic
    ]

    pairs = [
        (topic, record['reference PMID'])
        for _, record in rows_for_topic.iterrows()
    ]

    # Order the pairs by their reference PMID (second tuple element)
    return sorted(pairs, key=lambda pair: pair[1])

In [10]:
# Get unique topics from topic_reference_pmid.tsv file
def get_topics():
    """Return each distinct topic of ``topic_reference_pmid_tsv_data`` once.

    Topics appear in the order of their first occurrence in the table.
    """
    first_row_per_topic = topic_reference_pmid_tsv_data.drop_duplicates(subset=["topic"])
    return [record['topic'] for _, record in first_row_per_topic.iterrows()]

In [11]:
def get_assessed_docs_for_reference_pmids():
    """Map every (topic, reference PMID) pair to its list of PMIDs to assess.

    The pairs are gathered topic by topic via ``get_topics`` and
    ``get_reference_pmids_for_specified_topic``; afterwards
    ``get_documents_to_be_assessed`` is called once per pair with the
    pair's topic, and its result is stored under the pair as the key.
    """
    # Gather the (topic, reference PMID) tuples of all topics in one flat list
    topic_reference_pairs = []
    for topic in get_topics():
        topic_reference_pairs.extend(get_reference_pmids_for_specified_topic(topic))

    # One selection of documents per pair, drawn for that pair's topic
    return {
        pair: get_documents_to_be_assessed(pair[0])
        for pair in topic_reference_pairs
    }

In [12]:
# Build the {(topic, reference PMID): [PMIDs to assess]} mapping.
# The selection uses random sampling and no seed is set in the visible cells,
# so re-running this cell presumably yields a different selection.
reference_assessed_pmids = get_assessed_docs_for_reference_pmids()

In [13]:
# Create and write to topic_reference_and_documents.tsv file
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' end each row with '\r\r\n'
with open('../data/output/topic_reference_and_documents.tsv', 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['TREC topic', 'PMID reference document', 'PMID to be assessed'])
    # One output row per (reference document, document to be assessed) combination
    for (topic, reference_pmid), assessed_pmids in reference_assessed_pmids.items():
        tsv_writer.writerows([topic, reference_pmid, pmid] for pmid in assessed_pmids)