In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re

# load data

In [2]:
# Read the included abstracts; the first CSV column is used as the index.
all_abstracts = pd.read_csv('data/included_abstracts.csv', index_col=0)

# Parse the date column so it can be compared against the window bounds.
all_abstracts['article_date'] = pd.to_datetime(all_abstracts['article_date'])

# Both bounds are strict: rows dated exactly 2012-01-01 or 2022-01-01 are dropped.
# NOTE(review): if the window is meant to include 1 Jan 2012, the lower bound
# should be `>=` — confirm before changing, since downstream counts depend on it.
article_dates = all_abstracts['article_date']
in_window = (article_dates > '2012-01-01') & (article_dates < '2022-01-01')
decade_df = all_abstracts.loc[in_window]
len(decade_df)

28703

In [3]:
# Load output/methods_scored.csv (first CSV column becomes the index),
# then report the row count and the per-column dtype / non-null summary.
methods_df = pd.read_csv('output/methods_scored.csv', index_col=0)
print(methods_df.shape[0])
methods_df.info()

9071
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9071 entries, 0 to 9070
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 9071 non-null   int64  
 1   title                9071 non-null   object 
 2   methods              9071 non-null   object 
 3   sample_answer        9071 non-null   object 
 4   sample_score         8648 non-null   float64
 5   database_answer      9071 non-null   object 
 6   database_score       8648 non-null   float64
 7   organisation_answer  9071 non-null   object 
 8   organisation_score   8648 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 708.7+ KB


In [4]:
# Load output/abstracts_scored.csv (first CSV column becomes the index),
# then report the row count and the per-column dtype / non-null summary.
abstracts_df = pd.read_csv('output/abstracts_scored.csv', index_col=0)
print(abstracts_df.shape[0])
abstracts_df.info()

28703
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28703 entries, 0 to 28702
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 28703 non-null  int64  
 1   title                28702 non-null  object 
 2   abstract             28703 non-null  object 
 3   disease_answer       28703 non-null  object 
 4   disease_score        28703 non-null  float64
 5   sample_answer        28703 non-null  object 
 6   sample_score         28703 non-null  float64
 7   modality_answer      28703 non-null  object 
 8   modality_score       28703 non-null  float64
 9   database_answer      28703 non-null  object 
 10  database_score       28703 non-null  float64
 11  organisation_answer  28703 non-null  object 
 12  organisation_score   28703 non-null  float64
dtypes: float64(5), int64(1), object(7)
memory usage: 3.1+ MB


# combined columns

In [18]:
# Answer/score columns pulled from both scored frames. They carry the same
# names in each source, so they are renamed with a source prefix below to
# keep them distinct once the two frames are merged.
db_org_columns = ['database_answer', 'database_score', 'organisation_answer', 'organisation_score']

# Abstract-derived database / organisation answers, prefixed with "abstract_".
abstract_databases = abstracts_df[['pmid', 'abstract'] + db_org_columns].rename(
    columns={
        "database_answer": "abstract_db",
        "database_score": "abstract_db_score",
        "organisation_answer": "abstract_org",
        "organisation_score": "abstract_org_score",
    }
)

# Methods-derived database / organisation answers, prefixed with "method_".
method_databases = methods_df[['pmid', 'methods'] + db_org_columns].rename(
    columns={
        "database_answer": "method_db",
        "database_score": "method_db_score",
        "organisation_answer": "method_org",
        "organisation_score": "method_org_score",
    }
)

In [19]:
# Column layout of the combined table: identifier and free text first, then the
# abstract-derived answers/scores, then the methods-derived answers/scores.
output_columns = [
    'pmid', 'abstract', 'methods',
    'abstract_db', 'abstract_db_score', 'abstract_org', 'abstract_org_score',
    'method_db', 'method_db_score', 'method_org', 'method_org_score',
]

# Left join on pmid keeps every row of abstract_databases; rows with no match
# in method_databases get missing values in the methods columns. Every column
# is then cast to the generic object dtype.
all_database = (
    abstract_databases
    .merge(method_databases, how='left', on='pmid')
    .loc[:, output_columns]
    .astype('object')
)

In [20]:
# Spot-check 50 random rows of the combined table. A fixed random_state makes
# the same rows appear on every Restart & Run All, so the displayed sample is
# reproducible; change the seed to look at a different sample.
all_database.sample(50, random_state=42)

Unnamed: 0,pmid,abstract,methods,abstract_db,abstract_db_score,abstract_org,abstract_org_score,method_db,method_db_score,method_org,method_org_score
18190,33914464,"We developed, tested, and validated machine le...",,BREAST-Q,0.171535,North America,0.383384,,,,
8659,34482428,To develop and validate deep learning (DL) met...,,training and validation datasets,0.044365,training and validation datasets,0.001829,,,,
10351,34364492,We report on the feasibility study exploring t...,,people aged 19-88 with different tumor grades ...,0.025304,people aged 19-88 with different tumor grades ...,0.00592,,,,
8339,33927352,Glioblastoma is the most aggressive type of br...,,Cancer Genome Atlas,0.442231,Cancer Genome Atlas,0.464841,a,,a,
7911,33850515,Accurate segmentation of the optic disc (OD) d...,,our collected dataset and six public datasets,0.055616,our collected dataset and six public datasets,0.007581,,,,
5648,34457220,Most detection methods of coronavirus disease ...,,coronavirus disease 2019,0.21782,coronavirus disease 2019,0.012178,,,,
12347,34926514,The photopic negative response of the full-fie...,,109 optic neuropathy and 108 controls) of 155 ...,0.030059,clinic,0.029246,,,,
5281,24043317,The objective was to develop non-invasive pred...,A representative set of classification algori...,Monroe Carell Jr. Children's Hospital at Vande...,0.20516,Monroe Carell Jr. Children's Hospital at Vande...,0.414958,dataset 2,0.000539,neonatal intensive care unit,0.001809
11370,31877845,Bruxism is a masticatory muscle activity chara...,2.1M.OatveerriaallsTarenadtmMenetthScohdesme...,Bruxism is a masticatory muscle activity chara...,0.000726,bruxism,0.001186,occlusion stress,0.001502,"Ivoclar Vivadent, Liechtenstein",0.006605
7810,32046685,MR images (MRIs) accurate segmentation of brai...,"In this section, we describe details...",Ischemic Stroke Lesion Segmentation (ISLES) 20...,0.124099,BRATS,0.018731,3D volumetric MRIs,0.004826,ISLES,0.00014


In [21]:
# NOTE(review): every statement in this cell is commented out, so nothing here runs.
# If re-enabled as written it would:
#   - replace missing values in all_database with empty strings,
#   - add five columns (region_db, local_db, research_db, collection, other)
#     filled with empty strings,
#   - lower-case the string entries of method_db and abstract_db, leaving
#     non-string entries unchanged.
#all_database = all_database.fillna('')
#all_database["region_db"] = ""
#all_database["local_db"] = ""
#all_database["research_db"] = ""
#all_database["collection"] = ""
#all_database["other"] = ""

#all_database['method_db'] = all_database['method_db'].map(lambda x:x.lower() if type(x) == str else x)
#all_database['abstract_db'] = all_database['abstract_db'].map(lambda x:x.lower() if type(x) == str else x)


In [22]:
# NOTE(review): this cell is fully commented out; the saved "text counts" output
# beneath it presumably comes from an earlier run when the code was still active.
# As written, it would set all_database["extract"] to "1" for rows whose method_db
# or abstract_db contains any of the phrases listed in `text`.
# NOTE(review): all_database["extract"] is read here, but the commented-out setup
# cell before this one only creates region_db, local_db, research_db, collection
# and other — confirm where "extract" is initialised before re-enabling.
#from collections import Counter
## extract best guess
#text = ['electronic health', 'health record', 'electronic medical', 'medical record', 'clinical record', 
#        'information system', 'medical center', 'patient record', 'notes']

# Each phrase is passed to str.contains, which treats it as a regular expression
# by default; a row already marked "1" keeps that mark on later iterations.
#for x in text:
#    all_database["extract"] = np.where(all_database['method_db'].str.contains(x), "1", all_database["extract"])
#    all_database["extract"] = np.where(all_database['abstract_db'].str.contains(x), "1", all_database["extract"])

##output
#print('text counts:')
#print(Counter(all_database["extract"]))

text counts:
Counter({'': 27342, '1': 1361})


In [23]:
# NOTE(review): this cell is fully commented out; the saved "text counts" output
# beneath it presumably comes from an earlier run when the code was still active.
# As written, it would set all_database["research_db"] to "1" for rows whose
# method_db or abstract_db contains any of the keywords in `text`. It relies on
# the research_db column and the Counter import from the preceding cells, which
# are also commented out.
## research db best guess
#text = ['cohort', 'study', 'trial', 'research']
#
#for x in text:
#    all_database["research_db"] = np.where(all_database['method_db'].str.contains(x), "1", all_database["research_db"])
#    all_database["research_db"] = np.where(all_database['abstract_db'].str.contains(x), "1", all_database["research_db"])

##output
#print('text counts:')
#print(Counter(all_database["research_db"]))

text counts:
Counter({'': 26082, '1': 2621})


In [24]:
# Write the combined table to disk; the DataFrame index is written as the
# first CSV column (pandas' default, made explicit here).
all_database.to_csv(path_or_buf='output/_annotation_database.csv', index=True)