# Preprocessing - Experiments

- loading the required information from the json files
- preprocessing
- extracting key phrases

## Requirements

- nltk
- python-rake

In [27]:
%%capture
!pip3 install python-rake
!pip3 install git+https://github.com/boudinfl/pke.git

!python -m nltk.downloader stopwords
!python -m nltk.downloader universal_tagset
!python -m spacy download en # download the english model

In [47]:
from __future__ import print_function

import string

import json
import os
from os import getcwd
from os import listdir
from os.path import isfile, join
from glob import glob

from literature_utils import DataLoader

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

import RAKE
from pke.unsupervised import TopicRank, MultipartiteRank
from pke.supervised import Kea

## Getting the test files

In [29]:
root_path = join(getcwd(), 'dataset', 'biorxiv_medrxiv', 'biorxiv_medrxiv', 'pdf_json')
files = glob(join(root_path, '*.json'))

In [30]:
print(files[0])

/home/tobias/Desktop/covid19-search/dataset/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/8f5c5631cca22541a8cea87fb688b17f1266b859.json


## RAKE Tests

In [31]:
rake = RAKE.Rake(join(getcwd(), 'stopwords_en.txt'))

In [32]:
phrase_blacklist = [
    'granted medrxiv', 
    'cc-by-nc-', 
    'author/funder', 
    'copyright holder', 
    'peer-reviewed', 
    'preprint', 
    'license', 
    'reuse allowed', 
    'rights reserved', 
    'international license', 
    'cc-by-nc', 
    '0 international license', 
    'medrxiv preprint',
    'cc-by-nc 4']

In [33]:
for f in files[10:20]:
    dl = DataLoader(f)
    text = dl.get_full_text()
    title = dl.get_title()
    keyphrases = rake.run(text, minCharacters = 1, maxWords = 3, minFrequency = 5)
    print(title + "\n")
    print(*[phrase for phrase in keyphrases if phrase[1] >= 1.3 and phrase[0] not in phrase_blacklist], sep='\n')
    print('\n\n')

Epidemiological Tools that Predict Partial Herd Immunity to SARS Coronavirus 2

('sars-cov-2', 5.166666666666666)
('herd immunity', 4.762237762237762)
('population', 1.6363636363636365)
('infected', 1.4444444444444444)
('type', 1.3953488372093024)
('spread', 1.3333333333333333)



Article type: Research Article Title: Under-the-radar dengue virus infections in natural populations of Aedes aegypti mosquitoes Running title: Dengue virus maintenance in mosquito vectors

('florida', 2.176470588235294)



Interferon-α2b treatment for COVID-19

('ifn-α2b', 4.987240829346092)
('covid-19 disease', 4.107843137254902)
('cases treated', 3.75)
('ifn', 2.303030303030303)
('treatment', 1.8888888888888888)
('time', 1.8181818181818181)
('il-6', 1.8125)
('treated', 1.5)
('crp', 1.4)



A Scalable Method of Applying Heat and Humidity for Decontamination of N95 Respirators During the COVID-19 Crisis

('test group masks', 6.337108013937282)
('decontamination cycles', 3.5294117647058822)
('test', 2.1785714

## Topic Rank

In [34]:
pos = {'NOUN', 'PROPN', 'ADJ'}


In [38]:
for f in files[10:20]:
    dl = DataLoader(f)
    text = dl.get_full_text()
    title = dl.get_title()

    #temporary fix
    with open('file.txt', 'w') as tf:
        tf.write(text)

    extractor = TopicRank()
   
    extractor.load_document(
        input='file.txt', 
        language='en')

    extractor.candidate_selection()
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=10)

    print(f'{title}\n')
    print(*keyphrases, sep='\n')
    print('\n\n')

#cleanup
os.remove('file.txt')

Epidemiological Tools that Predict Partial Herd Immunity to SARS Coronavirus 2

('cov-2', 0.0421434058456915)
('sars coronavirus', 0.04034294873298681)
('partial herd immunity', 0.030402656341928895)
('country', 0.023622924405299825)
('china', 0.017589947704040813)
('type', 0.01747275887832254)
('infection', 0.01735734454117719)
('undocumented spread', 0.01676018951302783)
('japan', 0.016263954946685262)
('international license', 0.015512324000143112)



Article type: Research Article Title: Under-the-radar dengue virus infections in natural populations of Aedes aegypti mosquitoes Running title: Dengue virus maintenance in mosquito vectors

('full genome', 0.023718981042713885)
('fig', 0.020826097519890674)
('manatee county', 0.020495862460642358)
('florida', 0.019941567124249597)
('denv4', 0.0148215591702564)
('denv4 strain', 0.010924012689922514)
('analysis resource', 0.010364903005669597)
('selection pressure', 0.010100197699509733)
('rights', 0.009923869105411353)
('radar dengue vi

In [42]:
stoplist = stopwords.words('english')
pos = {'NOUN', 'PROPN', 'ADJ'}
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

In [49]:
for f in files[10:20]:
    dl = DataLoader(f)
    text = dl.get_full_text()
    title = dl.get_title()

    #temporary fix
    with open('file.txt', 'w') as tf:
        tf.write(text)

    extractor = MultipartiteRank()
   
    extractor.load_document(
        input='file.txt', 
        language='en',
        normalization=None)

    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    extractor.candidate_selection(pos=pos, stoplist=stoplist)

    extractor.candidate_weighting(alpha=1.1,
                              threshold=0.74,
                              method='average')

    keyphrases = extractor.get_n_best(n=10)


    print(f'{title}\n')
    print(*[keyphrase for keyphrase in  keyphrases if keyphrase not in phrase_blacklist], sep='\n')
    print('\n\n')

#cleanup
os.remove('file.txt')

Epidemiological Tools that Predict Partial Herd Immunity to SARS Coronavirus 2

('sars coronavirus', 0.04318112718355876)
('cov-2', 0.03684793870593526)
('partial herd immunity', 0.031244088589241707)
('sars', 0.02958813510839033)
('epidemiological tools', 0.02667781861473892)
('china', 0.015084781853939407)
('undocumented spread', 0.01443684288773317)
('type', 0.013506088017896961)
('japan', 0.012597637743229147)
('herd immunity', 0.01102284189826849)



Article type: Research Article Title: Under-the-radar dengue virus infections in natural populations of Aedes aegypti mosquitoes Running title: Dengue virus maintenance in mosquito vectors

('full genome', 0.017040716034161036)
('florida', 0.014789719904302601)
('fig', 0.013261347000988787)
('manatee county', 0.01322320645479539)
('radar dengue virus infections', 0.013094079203847583)
('denv4', 0.01274465985802436)
('mosquito populations', 0.011025254666028167)
('aedes aegypti mosquitoes', 0.010138702837837892)
('analysis resource', 0