In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re

# load data

In [2]:
# Read the included abstracts and parse the article date column as datetimes.
all_abstracts = pd.read_csv('data/included_abstracts.csv', index_col=0)
all_abstracts['article_date'] = pd.to_datetime(all_abstracts['article_date'])

# Both comparisons are strict, so rows dated exactly 2012-01-01 or 2022-01-01
# are excluded from the selection.
after_start = all_abstracts['article_date'] > '2012-01-01'
before_end = all_abstracts['article_date'] < '2022-01-01'
decade_df = all_abstracts[after_start & before_end]
len(decade_df)

28703

In [3]:
# Load the scored methods table, then print its row count and column summary.
methods_df = pd.read_csv('output/methods_scored.csv', index_col=0)
print(methods_df.shape[0])
methods_df.info()

27252
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27252 entries, 12 to 3943
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 27252 non-null  int64  
 1   title                27251 non-null  object 
 2   methods              27252 non-null  object 
 3   sample_answer        27252 non-null  object 
 4   sample_score         26249 non-null  float64
 5   database_answer      27252 non-null  object 
 6   database_score       26249 non-null  float64
 7   organisation_answer  27252 non-null  object 
 8   organisation_score   26249 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 2.1+ MB


In [4]:
# Load the scored abstracts table, then print its row count and column summary.
abstracts_df = pd.read_csv('output/abstracts_scored.csv', index_col=0)
print(abstracts_df.shape[0])
abstracts_df.info()

28703
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28703 entries, 0 to 28702
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 28703 non-null  int64  
 1   title                28702 non-null  object 
 2   abstract             28703 non-null  object 
 3   disease_answer       28703 non-null  object 
 4   disease_score        28703 non-null  float64
 5   sample_answer        28703 non-null  object 
 6   sample_score         28703 non-null  float64
 7   modality_answer      28703 non-null  object 
 8   modality_score       28703 non-null  float64
 9   database_answer      28703 non-null  object 
 10  database_score       28703 non-null  float64
 11  organisation_answer  28703 non-null  object 
 12  organisation_score   28703 non-null  float64
dtypes: float64(5), int64(1), object(7)
memory usage: 3.1+ MB


# abstract sample sizes

In [5]:
# Lower-case every string answer; values that are not str are passed through unchanged.
abstracts_df['sample_answer'] = abstracts_df['sample_answer'].apply(
    lambda answer: answer.lower() if type(answer) is str else answer
)

In [6]:
# Rewrite spelled-out English numbers in each answer as digits.

from text_to_num import alpha2digit

converted_answers = []

for answer in abstracts_df['sample_answer']:
    try:
        converted = alpha2digit(answer, "en")
    except Exception as err:
        # Best-effort: report the failure and store the 'skip' placeholder for this row.
        print(err)
        converted = 'skip'
    converted_answers.append(converted)

abstracts_df['sample_answer'] = converted_answers

In [7]:
# Delete every comma outright (nothing is substituted), e.g. "1,200" -> "1200".
comma_free_answers = abstracts_df['sample_answer'].str.replace(',', '', regex=False)
abstracts_df['sample_answer'] = comma_free_answers

In [8]:
# remove entries with no numeric characters
def has_numbers(inputString):
    """Return True if ``inputString`` is a string containing at least one digit.

    Parameters
    ----------
    inputString : object
        The value to inspect. Anything that is not a ``str`` (for example
        NaN or None) is treated as having no digits.

    Returns
    -------
    bool
        True when at least one character satisfies ``str.isdigit()``,
        otherwise False.
    """
    # Non-string values cannot be iterated character by character; report
    # "no digits" for them instead of raising TypeError.
    if not isinstance(inputString, str):
        return False
    return any(char.isdigit() for char in inputString)

# Keep answers that contain at least one digit; replace all others with NaN.
abstracts_df['sample_answer'] = [
    answer if has_numbers(answer) else np.nan
    for answer in abstracts_df['sample_answer']
]

In [9]:
# Spot-check 15 randomly drawn cleaned answers. No random_state is passed,
# so the rows shown change from run to run.
abstracts_df['sample_answer'].sample(15)

7701                                                   162
17513                                                    3
5681                                                   110
8683                                                   104
28410                                                   72
24850    10<sup>4</sup> particles/î¼l for exosome) high...
13916                                                 4090
24357    one-dimensional cardiac cycle signal was conve...
7908                                                  4196
1792                                                    22
26889                                                  200
16309                                                    3
3908                                                   163
21013                                                  354
10565                                                  NaN
Name: sample_answer, dtype: object

# methods sample sizes

In [10]:
# Lower-case every string answer; values that are not str are passed through unchanged.
methods_df['sample_answer'] = methods_df['sample_answer'].apply(
    lambda answer: answer.lower() if type(answer) is str else answer
)

In [11]:
# Rewrite spelled-out English numbers in each answer as digits.

from text_to_num import alpha2digit

converted_answers = []

for answer in methods_df['sample_answer']:
    try:
        converted = alpha2digit(answer, "en")
    except Exception as err:
        # Best-effort: report the failure and store the 'skip' placeholder for this row.
        print(err)
        converted = 'skip'
    converted_answers.append(converted)

methods_df['sample_answer'] = converted_answers

In [12]:
# Delete every comma outright (nothing is substituted), e.g. "1,200" -> "1200".
comma_free_answers = methods_df['sample_answer'].str.replace(',', '', regex=False)
methods_df['sample_answer'] = comma_free_answers

In [13]:
# Keep answers that contain at least one digit; replace all others with NaN.
methods_df['sample_answer'] = [
    answer if has_numbers(answer) else np.nan
    for answer in methods_df['sample_answer']
]

In [14]:
# Spot-check 15 randomly drawn rows of the cleaned methods table. No
# random_state is passed, so the rows shown change from run to run.
methods_df.sample(15)

Unnamed: 0,pmid,title,methods,sample_answer,sample_score,database_answer,database_score,organisation_answer,organisation_score
14644,32297795,Predicting and Preventing Nocturnal Hypoglycem...,methods data sets training data set. a subset ...,13200,0.1042,tidepool big data donation data set,0.4429,"palo alto, ca",0.1849
13471,32694537,"A telescope GWAS analysis strategy, based on S...",dataset group 5a 0.68 group 1c group 9a group ...,dataset group 5a 0.68,0.0001,adni-1,0.0061,adni,0.0129
5627,34207169,A CNN-Based Autoencoder and Machine Learning M...,2. materials and methods because bq chewers al...,33 30 30,0.0072,center for cognition and brain disorders,0.001,chung shan medical university hospital,0.1097
16195,31991205,Local field potential dynamics in the primate ...,material and methods the experimental protocol...,1000,0.0004,national institutes of natural sciences,0.0101,national institutes of natural sciences,0.1683
9758,33487885,Efficient deep learning approach for augmented...,no. of covid-19 images no. of non-covid images...,covid-19 images no. of non-covid images,0.0001,ct dataset,0.0055,ct,0.0008
26505,24122488,Molecular differences between chronic and aggr...,materials & methods we used 310 affymetrix hg...,310 affymetrix hg-u133plus2.0 microarray sampl...,0.248,geo accession number gse16134,0.0835,120,0.0009
24628,27353503,Multimodal MRI features predict isocitrate deh...,methods patient enrollment this study was co...,120,0.12,medical record,0.5379,danafarber/brigham and women's cancer center,0.3319
18583,31289283,Quantifying individual differences in brain mo...,methods participants. data was obtained from t...,100,0.0387,autism brain imaging database exchange ii (abi...,0.5484,"royal childrens hospital, melbourne, victoria",0.0445
23176,28958729,A Deep Learning Solution for Automatic Fetal N...,"methods in clinical practice, a sonologist aim...",3,0.0001,imagenet,0.0031,falx in the midline,0.0064
3464,34476540,Detecting hip osteoarthritis on clinical CT: a...,materials andmethods training data computed to...,94,0.3499,picture archiving and communication system,0.3852,"oulu university hospital, oulu, finland",0.6363


# sample size sheet and combined truth column for annotation

In [15]:
# Abstract-derived answers: select the needed columns and give the answer/score
# columns source-specific names so they stay distinct after merging.
abstract_sizes = abstracts_df[['abstract', 'pmid', 'sample_answer', 'sample_score']].rename(
    columns={"sample_answer": "abstract_size", "sample_score": "abstract_score"}
)

# Methods-derived answers, renamed the same way.
method_sizes = methods_df[['methods', 'pmid', 'sample_answer', 'sample_score']].rename(
    columns={"sample_answer": "method_size", "sample_score": "method_score"}
)

In [16]:
# Left join on pmid: every abstract row is kept, and rows with no matching
# methods entry get NaN in the methods-derived columns.
size_columns = [
    'pmid', 'abstract', 'methods',
    'abstract_size', 'abstract_score',
    'method_size', 'method_score',
]
merged_sizes = pd.merge(abstract_sizes, method_sizes, how='left', on='pmid')
all_sizes = merged_sizes[size_columns].astype('object')

In [17]:
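## Rules for generating the truth column
# (1) keep only answers that are purely numeric (digits only) from the abstract / method sizes
# (2) keep all method-derived sizes
# (3) where the method score is below the threshold AND the abstract size is larger, drop the method size
# (4) where the method size is NaN, fall back to the abstract-derived size
# NOTE: only rule (1) is applied by the active code below; the cells implementing
# rules (3) and (4) are currently commented out.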

In [18]:
def _ascii_digits_or_nan(value):
    """Return ``value`` if it is a string made up only of the digits 0-9, else NaN.

    ``str.isdigit()`` is deliberately not used: it also accepts Unicode digit
    characters such as superscripts (e.g. '²'), which ``pd.to_numeric``
    cannot parse and which would make the conversion below raise.
    Non-string values (e.g. NaN from the left merge) map to NaN.
    """
    if isinstance(value, str) and re.fullmatch(r'[0-9]+', value):
        return value
    return np.nan


# Keep only purely numeric answers from each source and convert them to numbers;
# everything else becomes NaN.
for size_col, truth_col in (('abstract_size', 'abstract_truth'),
                            ('method_size', 'method_truth')):
    all_sizes[truth_col] = pd.to_numeric(all_sizes[size_col].map(_ascii_digits_or_nan))

In [19]:
#templist = []
#
#for i, row in all_sizes.iterrows():
#    if row['method_score'] < 0.0005 and row['abstract_truth'] > row['method_truth']:
#        templist.append(np.nan)
#    else:
#        templist.append(row['method_truth'])
#
#all_sizes['method_truth'] = templist

In [20]:
#all_sizes = all_sizes.assign(size_truth = all_sizes['method_truth'].fillna(all_sizes['abstract_truth'])).drop(['method_truth', 'abstract_truth'], axis=1)

In [19]:
# Spot-check 50 randomly drawn rows of the merged size table. No random_state
# is passed, so the rows shown change from run to run.
all_sizes.sample(50)

Unnamed: 0,pmid,abstract,methods,abstract_size,abstract_score,method_size,method_score,abstract_truth,method_truth
26274,31185292,Spine surgery has been identified as a risk fa...,materials and methods the following guidelines...,5413,0.760828,5,0.6147,5413.0,5.0
14245,31972347,The interrupted time-series (ITS) concept is p...,method dataset the dataset employed as an exam...,dataset were from a study of hawley etâ al. (2...,0.043455,14,0.0008,,14.0
25618,32491928,The human brain is characterized by complex st...,materials and methods this research was conduc...,19,0.579145,46,0.7861,19.0,46.0
20133,26354313,Total knee arthroplasty (TKA) patients commonl...,materials and methods ten postsurgical (19 3 m...,19,0.687666,10,0.2606,19.0,10.0
12309,34589115,Survival of patients with metastatic melanoma ...,s2. perform the svm-rfe+cbr method s3. reduc...,62,0.768254,49,0.0089,62.0,49.0
4960,30860491,The monitoring of caloric intake is an importa...,['methods: the study will capture four indivi...,20,0.130162,20,0.0054,20.0,20.0
16732,33260624,Assessing the human affective state using elec...,2. materials and methods 2.1. data collection ...,20,0.595105,20,0.0462,20.0,20.0
21243,28708848,Avoidable hospital readmissions not only contr...,methods data preparation health's epic electro...,300000,0.616757,303097,0.0132,300000.0,303097.0
27188,30347342,Metformin is the preferred first-line medicati...,materials and methods base data and standardiz...,12147,0.927348,1.4 million,0.5481,12147.0,
20660,32273902,"In recent years, asynchronous brain computer i...",data availability *e bci data used to support ...,5,0.072123,90,0.0001,5.0,90.0


In [20]:
# Write the merged size table to CSV. The DataFrame index is written as the
# first column, since index=False is not passed.
all_sizes.to_csv('output/_annotation_sizes.csv')