In [1]:
from gensim.summarization import summarize
import fse # fast sentence embeddings

from gensim.models import KeyedVectors

# Absolute, machine-specific path to the saved word-embedding model file.
# NOTE(review): the "FT" in the file name presumably means fastText — confirm.
path_w2v = '/home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_200d_5w_5i_5mc.model'

# Load the saved embeddings with gensim; this object is later handed to uSIF
# as the base word-vector model for the sentence embeddings.
covid_trained_model = KeyedVectors.load(path_w2v)

In [3]:
# Here I am using word embeddings from the CORD-19
import glob
import nltk
import logging
import os
from tqdm import tqdm


from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer

# Configure root logging at INFO level with timestamped messages, so that the
# progress messages logged by the libraries used below appear in cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)


def clean_my_text(full_text_):
    """Tokenize a text and drop English stop words and one-character tokens.

    Only the part of ``full_text_`` that precedes the first ``'---10'`` marker
    is used; that part is split with NLTK's Treebank word tokenizer.

    Args:
        full_text_: Raw input text (str), e.g. a free-text query.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    text_before_marker = full_text_.split('---10')[0]

    kept_tokens = []
    for word in TreebankWordTokenizer().tokenize(text_before_marker):
        # Skip stop words (case-sensitive match) and single-character tokens.
        if word in english_stopwords or len(word) <= 1:
            continue
        kept_tokens.append(word)
    return kept_tokens


# Hard-coded, machine-specific directory that holds the CORD-19 sentence text
# files; the loading cell globs it for '*.txt*' (note the trailing slash, which
# the string concatenation there relies on).
dir_path = '/home/santosh/Work/Datasets/CORD-19-sentences/'




In [3]:
# Read every text file under dir_path and tokenize it line by line; each line
# becomes one token list appended to `paragraphs`.
paragraphs = []

# Sorted so the order of `paragraphs` (and thus its indices) is reproducible.
all_text_files = sorted(glob.glob(dir_path + '*.txt*'))

# A single tokenizer instance is reused for all lines instead of constructing
# a new one per line.
treebank_tokenizer = TreebankWordTokenizer()

for each_text_file in tqdm(all_text_files):
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(each_text_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily rather than loading all lines at once.
        for each_line in f:
            paragraphs.append(treebank_tokenizer.tokenize(each_line))
  

100%|██████████| 12356/12356 [05:00<00:00, 41.07it/s]


In [7]:
import pickle

def dump(data,filename):
    """Pickle ``data`` to ``result_dir_path + filename + '.bin'``.

    Args:
        data: Any picklable object.
        filename: Base file name (str) without directory or extension.

    Note:
        Reads the module-level ``result_dir_path``, which must be defined
        before this function is called and is used as a plain string prefix.
    """
    # `with` guarantees the handle is closed even if pickling raises.
    with open(result_dir_path+filename+'.bin','wb') as file:
        pickle.dump(data, file)
    
def load(filename):
    """Unpickle and return the object stored at ``result_dir_path + filename + '.bin'``.

    Args:
        filename: Base file name (str) without directory or extension.

    Returns:
        The object that was pickled into the file.

    Note:
        Reads the module-level ``result_dir_path``, which must be defined
        before this function is called.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on files
    # produced by this notebook's own dump().
    # `with` guarantees the handle is closed even if unpickling raises.
    with open(result_dir_path+filename+'.bin','rb') as file:
        return pickle.load(file)

In [6]:
# Output directory used as a string prefix by dump()/load() and by the model
# save below (hard-coded, machine-specific; the trailing slash is required).
result_dir_path = '/home/santosh/Work/models/word2vec/CORD-19/'
from fse import IndexedList

# Wrap the tokenized lines in fse's IndexedList (the object later passed to
# train()) and cache the wrapper on disk so it can be reloaded with load().
paragraphs_index = IndexedList(paragraphs)
dump(paragraphs_index, 'paragraphs_index_doi')

In [8]:
# Reload the pickled IndexedList from disk instead of re-tokenizing the corpus.
# result_dir_path is (re)assigned here so this cell works without the cell
# that built and dumped the index.
result_dir_path = '/home/santosh/Work/models/word2vec/CORD-19/'
paragraphs_index = load('paragraphs_index_doi')


In [11]:
# uSIF sentence embeddings (fse) built on top of the loaded CORD-19 word vectors.
from fse.models import uSIF
# lang_freq="en": per the recorded log, word frequencies are then estimated
# with wordfreq for English. workers=6 is passed through to fse.
sif_model = uSIF(covid_trained_model, workers=6, lang_freq="en")

# Compute sentence vectors for every entry in paragraphs_index. The recorded
# run returned (486503, 71216436), matching the sentence and word counts
# reported in the log.
sif_model.train(paragraphs_index)

2020-03-31 00:11:45,806 : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2020-03-31 00:11:46,153 : INFO : scanning all indexed sentences and their word counts
2020-03-31 00:11:46,739 : INFO : finished scanning 486503 sentences with an average length of 146 and 71216436 total words
2020-03-31 00:11:46,782 : INFO : estimated memory for 486503 sentences with 200 dimensions and 196469 vocabulary: 2197 MB (2 GB)
2020-03-31 00:11:46,783 : INFO : initializing sentence vectors for 486503 sentences
2020-03-31 00:11:48,456 : INFO : pre-computing uSIF weights for 196469 words
2020-03-31 00:11:48,880 : INFO : begin training
2020-03-31 00:11:53,900 : INFO : PROGRESS : finished 5.84% with 28435 sentences and 4026179 words, 5687 sentences/s
2020-03-31 00:11:58,926 : INFO : PROGRESS : finished 11.58% with 56330 sentences and 8112200 words, 5579 sentences/s
2020-03-31 00:12:03,956 : INFO : PROGRESS : finished 17.20% with 83680 sentences and 12127774 words, 5470 se

(486503, 71216436)

In [12]:
# Persist the trained sentence-embedding model under result_dir_path; per the
# recorded log the large arrays are written to separate .npy files beside it.
sif_model.save(result_dir_path+'CORD-19-FT_p2v_sif.model')

2020-03-31 00:13:47,684 : INFO : saving uSIF object under /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_p2v_sif.model, separately None
2020-03-31 00:13:47,685 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_p2v_sif.model.wv.vectors.npy
2020-03-31 00:13:47,760 : INFO : storing np array 'vectors_vocab' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_p2v_sif.model.wv.vectors_vocab.npy
2020-03-31 00:13:47,830 : INFO : storing np array 'vectors_ngrams' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_p2v_sif.model.wv.vectors_ngrams.npy
2020-03-31 00:14:19,715 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_p2v_sif.model.sv.vectors.npy
2020-03-31 00:14:31,826 : INFO : saved /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_p2v_sif.model


# Test Embeddings

In [13]:
from gensim.summarization import summarize

In [6]:
from gensim.models import KeyedVectors
# Load a previously saved sentence-embedding model from the models directory.
# The directory comes from result_dir_path: path_w2v now points at a model
# *file*, so appending a file name to it would yield a non-existent path. The
# recorded log shows this file living directly under that directory.
# NOTE(review): this is not the file written by the save cell above
# ('CORD-19-FT_p2v_sif.model') — confirm which artifact should be loaded here.
sif_model = KeyedVectors.load(result_dir_path+'CORD-19-p2v_sif.bin')

2020-03-30 09:27:00,341 : INFO : loading Word2VecKeyedVectors object from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin
2020-03-30 09:27:03,011 : INFO : loading wv recursively from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.wv.* with mmap=None
2020-03-30 09:27:03,012 : INFO : loading vectors from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.wv.vectors.npy with mmap=None
2020-03-30 09:27:05,548 : INFO : loading sv recursively from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.sv.* with mmap=None
2020-03-30 09:27:05,550 : INFO : loading vectors from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.sv.vectors.npy with mmap=None
2020-03-30 09:27:09,928 : INFO : loading prep recursively from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.prep.* with mmap=None
2020-03-30 09:27:09,930 : INFO : loaded /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin


In [14]:
import re
def extract_query_result(sv_query_result):
    """Convert similarity-search hits back into plain-text passages.

    Args:
        sv_query_result: Iterable of hits; only the first element of each hit
            (its token list) is used, the remaining elements are ignored.

    Returns:
        list of str: one detokenized passage per hit, in input order.
    """
    return [
        TreebankWordDetokenizer().detokenize(hit[0])
        for hit in sv_query_result
    ]

In [17]:
!pip install transformers==2.2.0
!pip install bert-extractive-summarizer

Collecting transformers==2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/ec/e7/0a1babead1b79afabb654fbec0a052e0d833ba4205a6dfd98b1aeda9c82e/transformers-2.2.0-py3-none-any.whl (360kB)
[K     |████████████████████████████████| 368kB 1.4MB/s eta 0:00:01
[31mERROR: spacy-transformers 0.5.1 has requirement transformers<2.1.0,>=2.0.0, but you'll have transformers 2.2.0 which is incompatible.[0m
Installing collected packages: transformers
  Found existing installation: transformers 2.0.0
    Uninstalling transformers-2.0.0:
      Successfully uninstalled transformers-2.0.0
Successfully installed transformers-2.2.0
Collecting bert-extractive-summarizer
  Downloading https://files.pythonhosted.org/packages/14/0a/c5f1a9b798c51226e17d079e05b127c26df08545211e9f45802ddeca46fe/bert-extractive-summarizer-0.4.0.tar.gz


Building wheels for collected packages: bert-extractive-summarizer
  Building wheel for bert-extractive-summarizer (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/santosh/.cache/pip/wheels/01/68/6b/b574b0e49d621123a2e67a7ec8ae2645ab61fe9f84159947dd
Successfully built bert-extractive-summarizer
Installing collected packages: bert-extractive-summarizer
Successfully installed bert-extractive-summarizer-0.4.0


In [18]:
from summarizer import Summarizer

2020-03-31 00:17:44,712 : INFO : PyTorch version 1.3.1 available.


In [19]:
# Display the retrieved passages as plain text. `query_result` is produced by
# the similarity-search cell further down in this file, so that cell must have
# been executed before this one.
extract_query_result(query_result)

["mean incubation period was 5.2 days (95% confidence interval [CI]: 4.1-7.0) [2]. As reported 143 by Xu et al [10], the median time from exposure to onset of illness (infected) was 4 days (interquartile 144 range 3-5 days). Another study showed that the mean of incubation period was around 5 days and the 145 period falls within the range of 2-14 days [11]. We set the 5-day as the incubation period and the latent 146 period in this study. Thus, \uf077 = \uf077' = 0.2. 147---10.1101/2020.03.05.20031849---",
 'Current estimates suggest a mean incubation period of 6.4 days (95% credible interval : 5.6-7.7), with a range from 2.1 to 11.1 days (2.5th to 97.5th percentile) [19]. To date, the maximum observed incubation period was 14 days [8].---10.3390/healthcare8010051---',
 'Currently, our understanding of the incubation period for 2019-nCoV is limited. An early analysis based on 34 confirmed cases in Chinese provinces outside Wuhan, using data on known travel to and from Wuhan to estimate

In [24]:
# Extractive summary of the retrieved passages using the Summarizer class
# imported from the `summarizer` package. Requires `query_result` to exist.
model = Summarizer()
# Concatenate all retrieved passages into one document.
body = ' '.join(extract_query_result(query_result))
# NOTE(review): min_length presumably filters out short sentences before
# selection — confirm against the bert-extractive-summarizer docs.
result = model(body, min_length=200)
# Joins the elements of `result` with no separator.
# NOTE(review): if `result` is already a str this is a no-op — confirm.
full = ''.join(result)
full

2020-03-31 00:26:44,299 : INFO : loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at /home/santosh/.cache/torch/transformers/6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.fc076a4d5f1edf25ea3a2bd66e9f6f295dcd64c81dfef5b3f5a3eb2a82751ad1
2020-03-31 00:26:44,341 : INFO : Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": true,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

2020-03-31 0

'COPD patients usually have increased number of comorbitities (cardiovascular diseases, respiratory tract diseases, metabolic diseases, haematological diseases / coagulopathies, musculoskeletal diseases, gastro-intestinal diseases, renal diseases, psychiatric diseases, neoplasias) known as``COPD comorbidome"which are considered as COPD-related (e.g. respiratory failure, pulmonary heart disease cachexia) or COPD-non related (eg obesity, diabetes mellitus, arterial hypertension) [23] [24] [25]. These articles were excluded because these zoonotic pathogens, are not transmitted through direct contact between livestock and humans.---10.1016/j.onehlt.2016.03.001--- As a highly potent herb widely used to treat various diseases, AR has played an indispensable role in healthcare throughout Chinese history [1 , 2]. The capacity of Chlamydia infections to lead to infertility and blindness, their association with chronic diseases such as atherosclerosis, and the extraordinary prevalence and array 

In [43]:
# Candidate queries tried during exploration; exactly one `query = ...`
# assignment is left active below.
# query = "pre-existing pulmonary disease SARS-Cov2 Hypertension" 

# query = "What is the incubation days of SARS-CoV-2" 
# query = "incubation days coronavirus 2019-nCoV"#  COVID-19
# query = 'socio economic poverty behaviour'

# query = 'what is the comorbidities associated with death'
# query = 'public health mitigation measures that could be effective for control'
# query = 'socio-economic and behavioral factors to understand the economic impact of the SARS-CoV-2 virus and whether there were differences. '
# query = 'what are the risk factors for death in COVID-19'
# query = 'what is the basic reproductive number of SARS-CoV-2 in days'
# query = 'what is the serial interval days SARS-CoV-2'
# query = 'what do we know about the environmental factors influencing SARS-CoV-2'
# query = 'what do we know about drugs using to treat SARS-CoV-2'
# query = 'Transmission dynamics of the virus SARS-CoV-2'
query ='risk of fatality among symptomatic hospitalized patients'
# query = 'Efforts targeted at a universal coronavirus vaccine'


# Clean the query the same way as a free text (stop words and 1-char tokens
# removed), then ask the sentence-vector index for the top 20 matches among
# the indexed passages.
query_result = sif_model.sv.similar_by_sentence(clean_my_text(query), model=sif_model, indexable=paragraphs_index.items, topn=20)

# extract_query_result(query_result)
# Join the hits into one text and summarize it with gensim; split=True makes
# summarize() return a list of sentences rather than a single string.
# NOTE(review): ratio=0.2 presumably keeps about 20% of the sentences — confirm.
summarize(' '.join(extract_query_result(query_result)), ratio =0.2, split=True)

2020-03-31 00:40:21,214 : INFO : scanning all indexed sentences and their word counts
2020-03-31 00:40:21,214 : INFO : finished scanning 1 sentences with an average length of 6 and 6 total words
2020-03-31 00:40:21,215 : INFO : removing 5 principal components took 0s
2020-03-31 00:40:21,273 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-03-31 00:40:21,277 : INFO : built Dictionary(218 unique tokens: ['accord', 'admiss', 'characterist', 'fatal', 'icu']...) from 47 documents (total 613 corpus positions)
2020-03-31 00:40:21,279 : INFO : Building graph
2020-03-31 00:40:21,280 : INFO : Filling graph
2020-03-31 00:40:21,300 : INFO : Removing unreachable nodes of graph
2020-03-31 00:40:21,302 : INFO : Pagerank graph
2020-03-31 00:40:21,314 : INFO : Sorting pagerank scores


['(D) Fatality by age groups among suspected and confirmed patients.',
 'Compared with outpatient/ED patients, the hospitalized cases were more frequently male (58.3% vs.',
 'Regarding ILI Influenza A (H1N1) pdm09 negative patients, CT/CC genotype carriers had a higher risk of being hospitalized than patients with TT genotype (Adjusted OR : 2.54 (95% CI : 1.54-4.19) ).---10.1371/journal.pone.0158181--- General characteristics of hematological parameters among healthy control (HC), CAP patients and NCIP patients.---10.1101/2020.02.25.20024711--- We detected HRVs in 16% of hospitalized enrolled patients (overall and with radiographically-confirmed pneumonia), 19% of outpatients with influenza-like illness, and 9.6% of control patients (Table 1).',
 'The frequency of HRV detection was highest among enrolled patients #19 years of age, especially young children aged ,5 years.',
 'However, a substantial proportion of hospitalized adult patients in all age groups had HRVs detected.',
 'Among 