In [23]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re

# load data

In [24]:
# Read the included abstracts; the first CSV column is used as the row index.
all_abstracts = pd.read_csv('data/included_abstracts.csv', index_col=0)

# Parse the publication dates so they can be compared against the study window.
all_abstracts['article_date'] = pd.to_datetime(all_abstracts['article_date'])

# Keep articles strictly inside the window.
# NOTE(review): both bounds are exclusive, so 2012-01-01 itself is dropped — confirm intended.
article_dates = all_abstracts['article_date']
in_window = (article_dates > '2012-01-01') & (article_dates < '2022-01-01')
decade_df = all_abstracts[in_window]
len(decade_df)

28703

In [25]:
# Load the scored methods table (first CSV column is the row index),
# then report its row count and column summary.
methods_df = pd.read_csv('output/methods_scored.csv', index_col=0)
print(methods_df.shape[0])
methods_df.info()

9071
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9071 entries, 0 to 9070
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 9071 non-null   int64  
 1   title                9071 non-null   object 
 2   methods              9071 non-null   object 
 3   sample_answer        9071 non-null   object 
 4   sample_score         8648 non-null   float64
 5   database_answer      9071 non-null   object 
 6   database_score       8648 non-null   float64
 7   organisation_answer  9071 non-null   object 
 8   organisation_score   8648 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 708.7+ KB


In [26]:
# Load the scored abstracts table (first CSV column is the row index),
# then report its row count and column summary.
abstracts_df = pd.read_csv('output/abstracts_scored.csv', index_col=0)
print(abstracts_df.shape[0])
abstracts_df.info()

28703
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28703 entries, 0 to 28702
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 28703 non-null  int64  
 1   title                28702 non-null  object 
 2   abstract             28703 non-null  object 
 3   disease_answer       28703 non-null  object 
 4   disease_score        28703 non-null  float64
 5   sample_answer        28703 non-null  object 
 6   sample_score         28703 non-null  float64
 7   modality_answer      28703 non-null  object 
 8   modality_score       28703 non-null  float64
 9   database_answer      28703 non-null  object 
 10  database_score       28703 non-null  float64
 11  organisation_answer  28703 non-null  object 
 12  organisation_score   28703 non-null  float64
dtypes: float64(5), int64(1), object(7)
memory usage: 3.1+ MB


# abstract sample sizes

In [27]:
# Lower-case every string answer; values that are not plain `str` pass through unchanged.
abstracts_df['sample_answer'] = abstracts_df['sample_answer'].map(
    lambda answer: answer.lower() if type(answer) is str else answer
)

In [28]:
# convert text numbers to numeric

from text_to_num import alpha2digit

# Rewrite spelled-out numbers as digits, one answer at a time.
# Any answer that alpha2digit cannot process is reported and replaced by the
# placeholder 'skip' so the column keeps its length.
templist = []

for answer in abstracts_df['sample_answer']:
    try:
        converted = alpha2digit(answer, "en")
    except Exception as err:
        print(err)
        converted = 'skip'
    templist.append(converted)

abstracts_df['sample_answer'] = templist

In [29]:
# Delete every comma (replaced with nothing, not with a space).
# The comma is matched literally, so the result does not depend on the pandas regex default.
abstracts_df['sample_answer'] = abstracts_df['sample_answer'].str.replace(',', '', regex=False)

In [30]:
# remove entries with no numeric characters
def has_numbers(inputString):
    """Return True when ``inputString`` contains at least one digit character.

    Parameters
    ----------
    inputString : object
        Value to inspect. Only ``str`` values can contain digits; anything
        else (``None``, ``NaN``, numbers, ...) is reported as ``False``
        instead of raising ``TypeError`` while iterating.

    Returns
    -------
    bool
        ``True`` if any character satisfies ``str.isdigit()``, else ``False``.
    """
    # Missing values reach this helper as float NaN / None, which are not iterable.
    if not isinstance(inputString, str):
        return False
    return any(char.isdigit() for char in inputString)

# Keep answers containing at least one digit; every other answer becomes NaN.
templist2 = [
    answer if has_numbers(answer) else np.nan
    for answer in abstracts_df['sample_answer']
]

abstracts_df['sample_answer'] = templist2

In [57]:
#abstracts_df['sample_answer'].sample(15)

# methods sample sizes

In [32]:
# Lower-case every string answer; values that are not plain `str` pass through unchanged.
methods_df['sample_answer'] = methods_df['sample_answer'].map(
    lambda answer: answer.lower() if type(answer) is str else answer
)

In [33]:
# convert text numbers to numeric

from text_to_num import alpha2digit

# Rewrite spelled-out numbers as digits, one answer at a time.
# Any answer that alpha2digit cannot process is reported and replaced by the
# placeholder 'skip' so the column keeps its length.
templist = []

for answer in methods_df['sample_answer']:
    try:
        converted = alpha2digit(answer, "en")
    except Exception as err:
        print(err)
        converted = 'skip'
    templist.append(converted)

methods_df['sample_answer'] = templist

In [34]:
# Delete every comma (replaced with nothing, not with a space).
# The comma is matched literally, so the result does not depend on the pandas regex default.
methods_df['sample_answer'] = methods_df['sample_answer'].str.replace(',', '', regex=False)

In [35]:
# Keep answers containing at least one digit; every other answer becomes NaN.
templist2 = [
    answer if has_numbers(answer) else np.nan
    for answer in methods_df['sample_answer']
]

methods_df['sample_answer'] = templist2

In [58]:
#methods_df.sample(15)

# sample size sheet and combined truth column for annotation

In [48]:
# Abstract-side columns, renamed so they do not collide with the method-side
# columns once both tables are joined on pmid.
abstract_sizes = abstracts_df[['abstract', 'pmid', 'sample_answer', 'sample_score']].rename(
    columns={"sample_answer": "abstract_size", "sample_score": "abstract_score"}
)

# Method-side columns, renamed the same way.
method_sizes = methods_df[['methods', 'pmid', 'sample_answer', 'sample_score']].rename(
    columns={"sample_answer": "method_size", "sample_score": "method_score"}
)

In [49]:
# Left join: every abstract row is kept; method columns are NaN where no
# methods row shares the pmid.
column_order = ['pmid', 'abstract', 'methods', 'abstract_size', 'abstract_score', 'method_size', 'method_score']
all_sizes = pd.merge(abstract_sizes, method_sizes, how='left', on='pmid')
all_sizes = all_sizes[column_order].astype('object')

In [50]:
## rules for generating truth column
# (1) keep only absolute numerics from abstract / method sizes
# (2) keep all method derived sizes
# (3) where the method score is below the threshold (0.0005) AND the abstract size is larger, drop the method size
# (4) where method = NaN, replace with abstract derived size

In [51]:
def _plain_integer_or_nan(value):
    """Return ``value`` if it is a string made only of ASCII digits, else NaN.

    Non-strings (for example the float NaN used for missing sizes) and strings
    holding anything besides 0-9 (ranges, units, decimals, ...) give NaN.
    Matching ASCII digits only keeps out characters such as superscript
    digits, which ``str.isdigit()`` accepts but ``pd.to_numeric`` may not be
    able to parse.
    """
    if isinstance(value, str) and re.fullmatch(r'[0-9]+', value):
        return value
    return np.nan


# Keep only answers that are a bare integer, for the abstract and method
# sizes alike; explicit type checks replace the former catch-all `except:`.
all_sizes['abstract_truth'] = [_plain_integer_or_nan(size) for size in all_sizes['abstract_size']]
all_sizes['method_truth'] = [_plain_integer_or_nan(size) for size in all_sizes['method_size']]

# Digit strings -> numbers, so the two columns can be compared numerically.
all_sizes[['abstract_truth', 'method_truth']] = all_sizes[['abstract_truth', 'method_truth']].apply(pd.to_numeric)

In [53]:
# Method-derived sizes scoring below this value are distrusted when the
# abstract reports a larger sample.
METHOD_SCORE_THRESHOLD = 0.0005

# method_score is stored as object dtype; make it numeric for the comparison.
method_score = pd.to_numeric(all_sizes['method_score'])
low_score = method_score < METHOD_SCORE_THRESHOLD
abstract_is_larger = all_sizes['abstract_truth'] > all_sizes['method_truth']

# Comparisons involving NaN are False, so rows missing a score or either size
# keep their method_truth unchanged; flagged rows are set to NaN.
all_sizes['method_truth'] = all_sizes['method_truth'].mask(low_score & abstract_is_larger)

In [54]:
# Combined truth: the method-derived size where present, else the abstract-derived size.
all_sizes['size_truth'] = all_sizes['method_truth'].fillna(all_sizes['abstract_truth'])

In [55]:
# Show 50 random rows for a manual check (no seed is set, so rows differ on each run).
all_sizes.sample(n=50)

Unnamed: 0,pmid,abstract,methods,abstract_size,abstract_score,method_size,method_score,abstract_truth,method_truth,size_truth
13203,33937859,To automate skeletal muscle segmentation in a ...,,370,0.603823,,,370.0,,370.0
18660,23313839,Little is known about the ability of natural l...,,10798,0.640644,,,10798.0,,10798.0
28508,33392576,Accurate triage in the emergency department (E...,,265572,0.659106,,,265572.0,,265572.0
4232,31957147,This study investigated whether infrared spect...,,9,0.659201,,,9.0,,9.0
12492,33897397,"Over the last few decades, electroencephalogra...","In the literature, not all previous ...",3,0.478481,,8.5e-05,3.0,,3.0
25132,34389965,In conjunction with recent advancements in mac...,Study Design and Patient Population ...,3178,0.411547,2,0.421381,3178.0,2.0,2.0
9731,34999410,Low vision rehabilitation improves quality-of-...,,5547,0.909553,,,5547.0,,5547.0
23114,26599106,Genetic profiling represents the future of neu...,We recruited primary GBM patients undergoing...,82,0.223857,,0.003104,82.0,,82.0
8401,33981950,Benign breast disease (BBD) is a strong breast...,Study Population and Design We co...,15â 395,0.712636,15 395,0.474894,,,
15459,31628932,"Concussion, also referred to as mild traumatic...",,17,0.6465,,,17.0,,17.0


In [56]:
# Write the annotation sheet; the row index is written as the first CSV column.
all_sizes.to_csv('output/_annotation_sizes.csv', index=True)