In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re

# Load data

In [2]:
# Read the included abstracts; the first CSV column is used as the index.
all_abstracts = pd.read_csv('data/included_abstracts.csv', index_col=0)
all_abstracts['article_date'] = pd.to_datetime(all_abstracts['article_date'])

# Restrict to the study window. Both bounds are exclusive, so an article
# dated exactly 2012-01-01 is NOT kept.
article_dates = all_abstracts['article_date']
after_start = article_dates > '2012-01-01'
before_end = article_dates < '2022-01-01'
decade_df = all_abstracts[after_start & before_end]

# Displayed as the cell output: number of abstracts inside the window.
len(decade_df)

28703

In [3]:
# Load the scored methods table (first CSV column is the index), then report
# its row count and per-column dtypes / non-null counts.
methods_df = pd.read_csv('output/methods_scored.csv', index_col=0)
n_methods = len(methods_df)
print(n_methods)
methods_df.info()

27252
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27252 entries, 0 to 27251
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pmid               27252 non-null  int64  
 1   title              27251 non-null  object 
 2   methods            27252 non-null  object 
 3   sample_answer      27252 non-null  object 
 4   sample_score       24580 non-null  float64
 5   sample_answer_2    27252 non-null  object 
 6   sample_score_2     24580 non-null  float64
 7   database_answer    27252 non-null  object 
 8   database_score     24580 non-null  float64
 9   database_answer_2  27252 non-null  object 
 10  database_score_2   24580 non-null  float64
 11  location_answer    27252 non-null  object 
 12  location_score     24580 non-null  float64
dtypes: float64(5), int64(1), object(7)
memory usage: 2.9+ MB


In [4]:
# Load the scored abstracts table (first CSV column is the index), then report
# its row count and per-column dtypes / non-null counts.
abstracts_df = pd.read_csv('output/abstracts_scored.csv', index_col=0)
n_abstracts = len(abstracts_df)
print(n_abstracts)
abstracts_df.info()

28703
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28703 entries, 0 to 28702
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 28703 non-null  int64  
 1   title                28702 non-null  object 
 2   abstract             28703 non-null  object 
 3   disease_answer       28703 non-null  object 
 4   disease_score        28703 non-null  float64
 5   sample_answer        28703 non-null  object 
 6   sample_score         28703 non-null  float64
 7   modality_answer      28703 non-null  object 
 8   modality_score       28703 non-null  float64
 9   database_answer      28703 non-null  object 
 10  database_score       28703 non-null  float64
 11  organisation_answer  28703 non-null  object 
 12  organisation_score   28703 non-null  float64
dtypes: float64(5), int64(1), object(7)
memory usage: 3.1+ MB


# Combine database / organisation columns from abstracts and methods

In [7]:
# Both source tables carry `database_answer` / `database_score` columns, so
# each field is renamed with a prefix naming its source (abstract vs. method)
# before the two tables are joined on `pmid`.
abstract_renames = {
    "database_answer": "abstract_db",
    "database_score": "abstract_db_score",
    "organisation_answer": "abstract_org",
    "organisation_score": "abstract_org_score",
}
abstract_databases = abstracts_df[
    ['pmid', 'abstract', 'database_answer', 'database_score',
     'organisation_answer', 'organisation_score']
].rename(columns=abstract_renames)

# The methods table has two database answers per paper plus a location answer;
# the location fields are stored under the `method_org*` names.
method_renames = {
    "database_answer": "method_db",
    "database_score": "method_db_score",
    "database_answer_2": "method_db_2",
    "database_score_2": "method_db_score_2",
    "location_answer": "method_org",
    "location_score": "method_org_score",
}
method_databases = methods_df[
    ['pmid', 'methods', 'database_answer', 'database_score',
     'database_answer_2', 'database_score_2',
     'location_answer', 'location_score']
].rename(columns=method_renames)

In [11]:
# Columns kept for annotation. The full `methods` text is deliberately left
# out of the export; only the extracted answers and their scores are kept.
annotation_columns = [
    'pmid', 'abstract',
    'abstract_db', 'abstract_db_score',
    'abstract_org', 'abstract_org_score',
    'method_db', 'method_db_score',
    'method_db_2', 'method_db_score_2',
    'method_org', 'method_org_score',
]

# Left join: every abstract row is kept, and abstracts without a matching
# methods row get NaN in the method_* columns. Everything is then cast to
# `object` dtype.
all_database = (
    pd.merge(abstract_databases, method_databases, how='left', on='pmid')
    .loc[:, annotation_columns]
    .astype('object')
)

In [12]:
# Preview 50 randomly chosen rows of the merged table. No `random_state` is
# passed, so the rows shown will differ every time this cell is run.
all_database.sample(50)

Unnamed: 0,pmid,abstract,abstract_db,abstract_db_score,abstract_org,abstract_org_score,method_db,method_db_score,method_db_2,method_db_score_2,method_org,method_org_score
8989,32880677,Acute kidney injury (AKI) is a deleterious com...,two hospitals,0.594497,two hospitals,0.294958,korean society of nephrology registry,0.082059,registry database,0.003598,three teaching hospitals,0.107208
2828,34334334,Parkinson's disease is heterogeneous in sympto...,Parkinson's Progression Markers Initiative (PP...,0.440794,Parkinson's Progression Markers Initiative,0.30993,parkinson's progression markers initiative (ppmi),0.444845,pdbp,0.071393,the parkinson's progression markers initiative...,0.172924
9550,30638592,Premature ventricular contraction is associate...,Holter devices,0.046926,Artificial Immune System,0.014237,mit-bih arrhythmia database (ardb) and mit-bih...,0.155041,mit-bih databases,0.000519,main lead,0.000985
19443,31571320,Machine learning has increasingly been applied...,resting-state functional magnetic resonance im...,0.099084,different sites and scanners,0.023352,rsfmri),0.00575,rsfmri,0.004149,"chaoyang hospital, beijing, china",0.310518
20254,34008178,To develop a radiomic model predicting nonresp...,LaTIM UMR 1101,0.179352,multicenter pretherapeutic contrast-enhanced c...,0.043352,french,0.005813,image biomarker standardisation initiative,0.000656,french,0.170773
8622,34927854,Colorectal cancer (CRC) is the third most comm...,discovery cohort,0.011372,plasma,0.002002,,,,,,
28343,32747203,Machine learning models used to predict postop...,Tertiary hospital,0.231173,Tertiary hospital,0.229364,"institutional electronic medical records, soci...",0.465651,social security death index,0.00212,beth israel deaconess medical center,0.591132
4828,34707347,To evaluate a deep learning-based method to au...,anterior segment optical coherence tomography,0.01934,cornea specialist ophthalmologist,0.01614,bascom palmer eye institute,0.015566,imagenet dataset,0.002699,university of miami,0.438837
28017,34307860,Medical imaging refers to visualization techni...,ImageNet,0.16953,ImageNet,0.143138,chest x-ray dataset,0.011214,chest x-ray dataset provided by the 2019 siim-...,0.00965,kaggle,0.006722
8358,33590485,To develop and validate a deep learning algori...,tissue microarrays of colorectal carcinomas,0.377676,tissue microarrays of colorectal carcinomas,0.083935,a,,a,,a,


In [21]:
# NOTE: this whole cell is disabled (commented out) and has no effect when the
# notebook is run. When enabled it would blank out missing values, add empty
# category columns, and lower-case the two database answer columns.
#all_database = all_database.fillna('')
#all_database["region_db"] = ""
#all_database["local_db"] = ""
#all_database["research_db"] = ""
#all_database["collection"] = ""
#all_database["other"] = ""

# Lower-case string answers only; non-string values are passed through as-is.
#all_database['method_db'] = all_database['method_db'].map(lambda x:x.lower() if type(x) == str else x)
#all_database['abstract_db'] = all_database['abstract_db'].map(lambda x:x.lower() if type(x) == str else x)


In [22]:
# NOTE: this whole cell is disabled (commented out); the "text counts" output
# shown beneath it comes from an earlier run and is not reproduced by the
# notebook as it stands.
# NOTE(review): the code below reads and writes all_database["extract"], but
# the disabled setup cell above never creates an "extract" column, so that
# column must be initialised (e.g. to "") before this cell can be re-enabled.
#from collections import Counter
## extract best guess
# Flag rows ("1") whose method or abstract database answer contains any of
# these substrings.
#text = ['electronic health', 'health record', 'electronic medical', 'medical record', 'clinical record', 
#        'information system', 'medical center', 'patient record', 'notes']

#for x in text:
#    all_database["extract"] = np.where(all_database['method_db'].str.contains(x), "1", all_database["extract"])
#    all_database["extract"] = np.where(all_database['abstract_db'].str.contains(x), "1", all_database["extract"])

##output
#print('text counts:')
#print(Counter(all_database["extract"]))

text counts:
Counter({'': 27342, '1': 1361})


In [23]:
# NOTE: this whole cell is disabled (commented out); the "text counts" output
# shown beneath it comes from an earlier run and is not reproduced by the
# notebook as it stands.
## research db best guess
# Flag rows ("1") in `research_db` whose method or abstract database answer
# contains any of these substrings.
#text = ['cohort', 'study', 'trial', 'research']
#
#for x in text:
#    all_database["research_db"] = np.where(all_database['method_db'].str.contains(x), "1", all_database["research_db"])
#    all_database["research_db"] = np.where(all_database['abstract_db'].str.contains(x), "1", all_database["research_db"])

##output
#print('text counts:')
#print(Counter(all_database["research_db"]))

text counts:
Counter({'': 26082, '1': 2621})


In [13]:
# Write the full merged annotation table to disk. The DataFrame index is
# written too (to_csv default), so it comes back as an unnamed first column
# when the file is re-read.
all_database.to_csv('output/_annotation_database.csv')

In [None]:
## Split the annotation table into six parts for export

In [16]:
# Cut the annotation table into six consecutive, non-overlapping parts by row
# position: five parts of SPLIT_SIZE rows followed by one part holding all
# remaining rows.
#
# The final slice is open-ended rather than stopping at a hard-coded row
# count, so rows are not silently dropped if the table grows. `.iloc` makes
# the positional slicing explicit.
SPLIT_SIZE = 5000

split_1 = all_database.iloc[0 * SPLIT_SIZE:1 * SPLIT_SIZE]
split_2 = all_database.iloc[1 * SPLIT_SIZE:2 * SPLIT_SIZE]
split_3 = all_database.iloc[2 * SPLIT_SIZE:3 * SPLIT_SIZE]
split_4 = all_database.iloc[3 * SPLIT_SIZE:4 * SPLIT_SIZE]
split_5 = all_database.iloc[4 * SPLIT_SIZE:5 * SPLIT_SIZE]
split_6 = all_database.iloc[5 * SPLIT_SIZE:]

# Sanity check: the six parts together account for every row exactly once.
assert sum(
    len(part) for part in (split_1, split_2, split_3, split_4, split_5, split_6)
) == len(all_database), "splits do not cover the whole annotation table"

In [18]:
# Write each part of the annotation table to its own CSV file. The mapping
# pairs every output path with the frame that is saved there.
split_outputs = {
    'output/split_database_annotation/split_1.csv': split_1,
    'output/split_database_annotation/split_2.csv': split_2,
    'output/split_database_annotation/split_3.csv': split_3,
    'output/split_database_annotation/split_4.csv': split_4,
    'output/split_database_annotation/split_5.csv': split_5,
    'output/split_database_annotation/split_6.csv': split_6,
}
for out_path, part in split_outputs.items():
    part.to_csv(out_path)