In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# import datasets

In [4]:
# Load every included abstract (first CSV column becomes the index) and parse
# the publication date so it can be compared as a datetime.
all_abstracts = pd.read_csv('data/included_abstracts.csv', index_col=0)
all_abstracts['article_date'] = pd.to_datetime(all_abstracts['article_date'])

# Keep articles dated strictly after 2012-01-01 and strictly before 2022-01-01.
# NOTE(review): both bounds are exclusive, so an article dated exactly
# 2012-01-01 is dropped -- confirm that this is intended.
article_dates = all_abstracts['article_date']
in_decade = (article_dates > '2012-01-01') & (article_dates < '2022-01-01')
decade_df = all_abstracts[in_decade]
len(decade_df)

28703

In [5]:
# Scored methods sections; the first CSV column is used as the index.
methods_df = pd.read_csv('output/methods_scored.csv', index_col=0)
# Row count, shown as the cell output.
methods_df.shape[0]

9071

In [6]:
# Preview the first rows of the scored methods table to check the loaded columns.
methods_df.head()

Unnamed: 0,pmid,title,methods,sample_answer,sample_score,database_answer,database_score,organisation_answer,organisation_score
0,35330977,ECG Restitution Analysis and Machine Learning ...,Subject Recruitment This study did not requi...,Two,0.395289,Rossdales Equine Hospital and Diagnostic Centre,0.076196,University of Surrey,0.352178
1,30814403,Combined machine learning and functional magne...,2.1. Subjects The protocol was approved by t...,69,0.003431,ShaanxiTibet immigrant cohort,0.141915,Xijing Hospital of Air Force Medical University,0.135661
2,34933537,The use and applicability of machine learning ...,"lected database for 153 men with BPE, treated...",153,0.058369,lected database,0.030526,"lected database for 153 men with BPE, treated ...",0.002073
3,34897147,Automation of a Rule-based Workflow to Estimat...,"Participants This retrospective, noninvasive...",908,0.643942,Juntendo University Hospital,0.019872,Juntendo University Hospital,0.608144
4,34176867,Deep Learning Algorithm to Detect Cardiac Sarc...,Study Sample Patients aged ≥18 years who und...,50,0.329474,EchoNet-Dynamic dataset,0.001419,University of Tokyo Hospital,0.691286


In [7]:
# Scored abstracts; the first CSV column is used as the index.
abstracts_df = pd.read_csv('output/abstracts_scored.csv', index_col=0)
# Row count, shown as the cell output.
abstracts_df.shape[0]

28703

In [8]:
# Preview the first rows of the scored abstracts table to check the loaded columns.
abstracts_df.head()

Unnamed: 0,pmid,title,abstract,disease_answer,disease_score,sample_answer,sample_score,modality_answer,modality_score,database_answer,database_score,organisation_answer,organisation_score
0,35309968,mTeeth: Identifying Brushing Teeth Surfaces Us...,Ensuring that all the teeth surfaces are adequ...,oral diseases,0.304024,114,0.414761,inertial sensors,0.559315,wrist-worn inertial sensor dataset collected f...,0.169088,wrist-worn inertial sensor dataset collected f...,0.169088
1,35330785,Development of a Machine Learning Algorithm fo...,Reverse total shoulder arthroplasty (rTSA) off...,Reverse total shoulder arthroplasty,0.377661,2799,0.494304,Office of Statewide Health Planning and Develo...,0.363589,Office of Statewide Health Planning and Develo...,0.689447,Office of Statewide Health Planning and Develo...,0.689447
2,35330977,ECG Restitution Analysis and Machine Learning ...,Atrial fibrillation is the most frequent arrhy...,paroxysmal atrial fibrillation,0.391101,control and horses with PAF,0.110626,normal sinus-rhythm ECGs,0.316086,normal sinus-rhythm ECGs,0.071468,normal sinus-rhythm ECGs,0.071468
3,35330920,Segmenting Thoracic Cavities with Neoplastic L...,Automatic segmentation of thoracic cavity stru...,neoplastic disease,0.441814,402,0.776861,CT images,0.502925,402 cancer patients,0.757216,402 cancer patients,0.757216
4,30814403,Combined machine learning and functional magne...,Hypoxia exposure during high-altitude expediti...,psychomotor impairment,0.347314,69,0.720279,Rs-fMRI,0.341074,Shaanxi-Tibet immigrant cohort,0.850154,Shaanxi-Tibet immigrant cohort,0.850154


# proper noun extraction

In [131]:
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

### abstracts

In [132]:
# Plain Python list of abstract texts, in the same row order as abstracts_df,
# so per-abstract results can later be assigned back as a column.
abstracts_list = abstracts_df['abstract'].to_list()

In [133]:
# Make sure the NLTK models used below are installed before first use. The only
# nltk.download calls in this notebook come in a later cell and do not include
# 'punkt', so on a fresh machine this cell would otherwise fail with LookupError.
# Resources that are already present are left alone (no network access needed).
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Sentence splitter; also reused by the methods-section cell further down.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# One list of proper nouns per abstract, in the same order as abstracts_list.
abs_nouns = []

for text in tqdm(abstracts_list):
    # Split the abstract into sentences, POS-tag each sentence's tokens and keep
    # the tokens tagged 'NNP', preserving their order of appearance.
    blocklist = [
        word
        for sentence in sent_detector.tokenize(text)
        for word, pos in pos_tag(word_tokenize(sentence))
        if pos == 'NNP'
    ]
    abs_nouns.append(blocklist)

  0%|          | 0/28703 [00:00<?, ?it/s]

In [139]:
# Attach the proper-noun lists as a new column. abs_nouns has one entry per item
# of abstracts_list, which was taken from abstracts_df['abstract'] in row order.
abstracts_df['abs_nouns'] = abs_nouns

In [140]:
# Save the abstracts table with the added 'abs_nouns' column.
# NOTE(review): 'abs_nouns' holds Python lists; presumably they are stored as
# text in the CSV and need parsing when read back -- confirm with downstream code.
abstracts_df.to_csv('output/abstracts_scored_nouns.csv')

### methods

In [135]:
# Plain Python list of methods-section texts, in the same row order as methods_df,
# so per-document results can later be assigned back as a column.
methods_list = methods_df['methods'].to_list()

In [136]:
# One list of proper nouns per methods section, in the same order as methods_list.
met_nouns = []

for text in tqdm(methods_list):
    nnp_words = []

    # Walk the text sentence by sentence, keeping tokens tagged 'NNP'.
    for sent in sent_detector.tokenize(text):
        tagged_tokens = pos_tag(word_tokenize(sent))
        nnp_words.extend(token for token, tag in tagged_tokens if tag == 'NNP')

    met_nouns.append(nnp_words)

  0%|          | 0/9071 [00:00<?, ?it/s]

In [137]:
# Attach the proper-noun lists as a new column. met_nouns has one entry per item
# of methods_list, which was taken from methods_df['methods'] in row order.
methods_df['met_nouns'] = met_nouns

In [138]:
# Save the methods table with the added 'met_nouns' column.
# NOTE(review): 'met_nouns' holds Python lists; presumably they are stored as
# text in the CSV and need parsing when read back -- confirm with downstream code.
methods_df.to_csv('output/methods_scored_nouns.csv')

# geo extraction

In [141]:
import geograpy
from geograpy import extraction
from geograpy import places
import nltk

# Download the NLTK resources for this section (each call is a no-op message when
# the package is already up to date). Presumably these are what geograpy's
# entity extraction relies on -- TODO confirm against the geograpy docs.
# NOTE(review): pos_tag is already called in the proper-noun cells earlier in
# the notebook, so on a fresh machine the tagger would be needed before this cell.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Joe Z\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Joe
[nltk_data]     Z\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Joe
[nltk_data]     Z\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [40]:
# List of abstract texts to feed to the geo extraction loop, in abstracts_df row
# order (same expression as used in the proper-noun section).
abstracts_list = abstracts_df['abstract'].to_list()

In [143]:
# Per-abstract geo extraction results, index-aligned with abstracts_list.
abs_geo_cities = []
abs_geo_countries = []
abs_geo_other = []
# Positions in abstracts_list for which extraction raised an error.
abs_geo_failures = []

for position, abstract_text in enumerate(tqdm(abstracts_list)):
    try:
        # Named geo_context (not `places`) so the imported geograpy.places
        # module is not shadowed.
        geo_context = geograpy.get_geoPlace_context(text=abstract_text)
        # Read all three attributes before appending anything, so an error
        # part-way through cannot leave the three lists with different lengths.
        found = (geo_context.cities, geo_context.countries, geo_context.other)
    except Exception:
        # Best-effort: keep the 'nan' placeholder for abstracts that cannot be
        # processed. `Exception` (rather than a bare except) lets a kernel
        # interrupt still stop this long-running loop.
        found = ('nan', 'nan', 'nan')
        abs_geo_failures.append(position)

    abs_geo_cities.append(found[0])
    abs_geo_countries.append(found[1])
    abs_geo_other.append(found[2])

abstracts_df['abs_geo_cities'] = abs_geo_cities
abstracts_df['abs_geo_countries'] = abs_geo_countries
abstracts_df['abs_geo_other'] = abs_geo_other

# Surface how many rows fell back to the placeholder instead of hiding it.
if abs_geo_failures:
    print(f"geo extraction failed for {len(abs_geo_failures)} of {len(abstracts_list)} abstracts")

  0%|          | 0/28703 [00:00<?, ?it/s]

In [None]:
# Save the abstracts table, which by now also carries the 'abs_nouns' and
# 'abs_geo_*' columns added in the cells above.
abstracts_df.to_csv('output/abstracts_scored_nouns_geo.csv')

In [147]:
# List of methods-section texts to feed to the geo extraction loop, in methods_df
# row order (same expression as used in the proper-noun section).
methods_list = methods_df['methods'].to_list()

In [None]:
# Per-document geo extraction results, index-aligned with methods_list.
met_geo_cities = []
met_geo_countries = []
met_geo_other = []
# Positions in methods_list for which extraction raised an error.
met_geo_failures = []

for position, methods_text in enumerate(tqdm(methods_list)):
    try:
        # Named geo_context (not `places`) so the imported geograpy.places
        # module is not shadowed.
        geo_context = geograpy.get_geoPlace_context(text=methods_text)
        # Read all three attributes before appending anything, so an error
        # part-way through cannot leave the three lists with different lengths.
        found = (geo_context.cities, geo_context.countries, geo_context.other)
    except Exception:
        # Best-effort: keep the 'nan' placeholder for texts that cannot be
        # processed. `Exception` (rather than a bare except) lets a kernel
        # interrupt still stop this long-running loop.
        found = ('nan', 'nan', 'nan')
        met_geo_failures.append(position)

    met_geo_cities.append(found[0])
    met_geo_countries.append(found[1])
    met_geo_other.append(found[2])

methods_df['met_geo_cities'] = met_geo_cities
methods_df['met_geo_countries'] = met_geo_countries
methods_df['met_geo_other'] = met_geo_other

# Surface how many rows fell back to the placeholder instead of hiding it.
if met_geo_failures:
    print(f"geo extraction failed for {len(met_geo_failures)} of {len(methods_list)} methods sections")

  0%|          | 0/9071 [00:00<?, ?it/s]

In [None]:
# Save the methods table, which by now also carries the 'met_nouns' and
# 'met_geo_*' columns added in the cells above.
methods_df.to_csv('output/methods_scored_nouns_geo.csv')

# detect proper nouns

In [60]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

In [61]:
# Sample sentence used to try out the proper-noun grouping in the next cell.
text = "I enjoyed the visit to Pisa where I visited Stephen at the AMIDA institute"

In [62]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# Print each run of consecutive 'NNP' tokens as one space-joined phrase.
for sent in sent_detector.tokenize(text):
    tagged = pos_tag(word_tokenize(sent))
    run = []  # consecutive NNP tokens collected so far

    # The first token of every sentence is skipped regardless of its tag.
    for token, tag in tagged[1:]:
        if tag == 'NNP':
            run.append(token)
        elif run:
            # A non-NNP token ends the current run: emit it and start over.
            print(' '.join(run))
            run = []

    # Emit a run that extends to the end of the sentence.
    if run:
        print(' '.join(run))

Pisa
Stephen
AMIDA
