# Preprocessing - Experiments

- loading the required information from the json files
- preprocessing
- extracting key phrases

## Requirements

- nltk
- python-rake

In [1]:
%%capture
!pip3 install python-rake
!pip3 install git+https://github.com/boudinfl/pke.git

!python -m nltk.downloader stopwords
!python -m nltk.downloader universal_tagset
!python -m spacy download en # download the english model

In [2]:
from __future__ import print_function

import string

import json
import os
from os import getcwd
from os import listdir
from os.path import isfile, join
from glob import glob

from utils.literature import DataLoader

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

import RAKE
from pke.unsupervised import TopicRank, MultipartiteRank
from pke.supervised import Kea

## Getting the test files

In [3]:
root_path = join(getcwd(), 'dataset', 'biorxiv_medrxiv', 'biorxiv_medrxiv', 'pdf_json')
files = glob(join(root_path, '*.json'))

In [4]:
print(files[0])

/home/tobias/Desktop/covid19-search/dataset/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/812769f24dce35d317bc43f43b06812674632a27.json


## RAKE Tests

In [5]:
rake = RAKE.Rake(join(getcwd(), 'stopwords_en.txt'))

In [6]:
phrase_blacklist = [
    'granted medrxiv', 
    'cc-by-nc-', 
    'author/funder', 
    'copyright holder', 
    'peer-reviewed', 
    'preprint', 
    'license', 
    'reuse allowed', 
    'rights reserved', 
    'international license', 
    'cc-by-nc', 
    '0 international license', 
    'medrxiv preprint',
    'cc-by-nc 4']

In [7]:
for f in files[10:20]:
    dl = DataLoader(f)
    text = dl.get_full_text()
    title = dl.get_title()
    keyphrases = rake.run(text, minCharacters = 1, maxWords = 3, minFrequency = 5)
    print(title + "\n")
    print(*[phrase for phrase in keyphrases if phrase[1] >= 1.3 and phrase[0] not in phrase_blacklist], sep='\n')
    print('\n\n')

Title: Estimating the risk of COVID-19 death during the course of the outbreak in Korea

('covid-19', 1.5555555555555556)
('risk', 1.4285714285714286)
('cfr', 1.3)



Exploring Conformational Transition of 2019 Novel Coronavirus Spike Glycoprotein Between Its Closed and Open States Using Molecular Dynamics Simulations

('interdomain salt bridges', 8.207142857142857)
('open state', 4.788676236044657)
('md simulations', 4.703216374269006)
('sars-cov-2', 4.657289002557545)
('smd simulations', 4.447368421052632)
('host cell', 4.392857142857142)
('md trajectory', 4.361111111111111)
('open states', 4.124708624708624)
('closed state', 4.069155446756426)
('smd atoms', 3.8195488721804507)
('conformation', 1.9090909090909092)
('s2', 1.9090909090909092)
('ace2', 1.7857142857142858)
('protein', 1.7454545454545454)
('states', 1.7307692307692308)
('closed', 1.6744186046511629)
('rbd', 1.56)
('covid-19', 1.375)
('performed', 1.3529411764705883)
('selected', 1.3333333333333333)
('protomer', 1.325)





## Topic Rank

In [8]:
pos = {'NOUN', 'PROPN', 'ADJ'}


In [9]:
for f in files[10:20]:
    dl = DataLoader(f)
    text = dl.get_full_text()
    title = dl.get_title()

    #temporary fix
    with open('file.txt', 'w') as tf:
        tf.write(text)

    extractor = TopicRank()
   
    extractor.load_document(
        input='file.txt', 
        language='en')

    extractor.candidate_selection()
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=10)

    print(f'{title}\n')
    print(*keyphrases, sep='\n')
    print('\n\n')

#cleanup
os.remove('file.txt')

Title: Estimating the risk of COVID-19 death during the course of the outbreak in Korea

('korea', 0.055948897942910164)
('crude cfr', 0.046813240373552445)
('covid-19 death', 0.04574381913937533)
('risk', 0.03517373761436597)
('estimates', 0.030731131825426906)
('covid-19', 0.02876700016236938)
('delay', 0.02857153007099116)
('cases', 0.027504267422700814)
('gyeongsangbuk', 0.022186287952035408)
('geographic differences', 0.02068274759888483)



Exploring Conformational Transition of 2019 Novel Coronavirus Spike Glycoprotein Between Its Closed and Open States Using Molecular Dynamics Simulations

('rbd', 0.03195249300741924)
('simulations', 0.02854169097018119)
('state', 0.021267030746406335)
('crystal structures', 0.02094197616030536)
('open states', 0.01900352355653158)
('closed', 0.01591775296120378)
('interdomain salt bridges', 0.014152766549263049)
('sars', 0.013020187689531074)
('host cells', 0.012605913887291969)
('atoms', 0.01242718636484212)



Delivery of CPAP respiratory su

In [10]:
stoplist = stopwords.words('english')
pos = {'NOUN', 'PROPN', 'ADJ'}
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

In [11]:
for f in files[10:20]:
    dl = DataLoader(f)
    text = dl.get_full_text()
    title = dl.get_title()

    #temporary fix
    with open('file.txt', 'w') as tf:
        tf.write(text)

    extractor = MultipartiteRank()
   
    extractor.load_document(
        input='file.txt', 
        language='en',
        normalization=None)

    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    extractor.candidate_selection(pos=pos, stoplist=stoplist)

    extractor.candidate_weighting(alpha=1.1,
                              threshold=0.74,
                              method='average')

    keyphrases = extractor.get_n_best(n=10)


    print(f'{title}\n')
    print(*[keyphrase for keyphrase in  keyphrases if keyphrase not in phrase_blacklist], sep='\n')
    print('\n\n')

#cleanup
os.remove('file.txt')

Title: Estimating the risk of COVID-19 death during the course of the outbreak in Korea

('korea', 0.04951595284224889)
('crude cfr', 0.039470497188559064)
('cfr', 0.03151441267528793)
('covid-19 death', 0.02700140932670198)
('risk', 0.0269128822240534)
('delay', 0.025232214915838255)
('death', 0.023334651933478744)
('gyeongsangbuk', 0.020285029287693424)
('covid-19', 0.018825327734230096)
('cases', 0.01840464901533295)



Exploring Conformational Transition of 2019 Novel Coronavirus Spike Glycoprotein Between Its Closed and Open States Using Molecular Dynamics Simulations

('closed', 0.02312279276564334)
('rbd', 0.022493161057562883)
('simulations', 0.02192130832183498)
('open states', 0.019146931898586284)
('exploring conformational transition', 0.013551381772667796)
('sars', 0.0133770601348098)
('host cells', 0.013308851698809316)
('crystal structures', 0.01233237730546202)
('cov-2', 0.010578081016983772)
('md simulations', 0.009582382156385152)



Delivery of CPAP respiratory suppo