In [1]:
from gensim.summarization import summarize
import fse # fast sentence embeddings

from gensim.models import KeyedVectors

# Location of the pre-trained word2vec vectors (binary word2vec format, see binary=True below).
# NOTE(review): the file name says "CORD-10" while its directory is "CORD-19" — confirm it matches the file on disk.
path_w2v = '/home/santosh/Work/models/word2vec/CORD-19/CORD-10-w2v_200d_5w_10i_3mc.bin'

# Load the word vectors; they are passed to uSIF further down as the base word model.
covid_trained_model = KeyedVectors.load_word2vec_format(path_w2v, binary=True)

In [1]:
# Here I am using word embeddings from the CORD-19 corpus
import glob
import nltk
import logging
import os
from tqdm import tqdm

# start the log: INFO-level messages formatted as "<time> : <level> : <message>"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

# directory the per-paper paragraph .txt files are written to and later read back from
result_dir_path = '/home/santosh/Work/Datasets/CORD-19-paragraphs/'

# dataset root, and the subset directories (relative to the root) that are searched for JSON files
root_path = '/home/santosh/Work/Datasets/CORD-19-research-challenge/'
paths = ['biorxiv_medrxiv/biorxiv_medrxiv/',
        'comm_use_subset/comm_use_subset/',
        'noncomm_use_subset/noncomm_use_subset/']




In [2]:
# get all the JSON files
import json

# One flat list of JSON file paths: subsets in the order given by `paths`,
# files sorted by name within each subset.
all_jsons = [
    json_file
    for json_path in paths
    for json_file in sorted(glob.glob(root_path + json_path + '*.json*'))
]


In [12]:
# extract sentences

# Paragraphs containing any of these markers are skipped
# (presumably licence / word-count boilerplate — confirm against the data).
BOILERPLATE_MARKERS = (
    'word count',
    'All rights reserved',
    'No reuse allowed without permission',
)


def section_paragraphs(document, section):
    """Return the 'text' value of every entry in ``document[section]``, in order.

    Extraction is best-effort: if the section is missing, or an entry is
    malformed, whatever was collected up to that point is returned.

    Args:
        document: parsed JSON document.
        section: top-level key to read, e.g. 'abstract' or 'body_text'.

    Returns:
        list of the collected 'text' values.
    """
    texts = []
    try:
        for entry in document[section]:
            texts.append(entry['text'])
    except (KeyError, TypeError):
        # Only "section/field absent or wrong shape" is tolerated; anything
        # else (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        pass
    return texts


for each_json_file in tqdm(all_jsons):
    # Close the JSON handle before writing; only the parsed data is needed below.
    with open(each_json_file) as json_file:
        data = json.load(json_file)

    full_text = section_paragraphs(data, 'abstract') + section_paragraphs(data, 'body_text')

    # Output file: same base name as the JSON file, with a .txt extension.
    base_name = os.path.splitext(os.path.basename(each_json_file))[0]

    # NOTE: append mode is kept from the original, so re-running this cell
    # adds every paragraph a second time; clear the output directory first.
    with open(result_dir_path + base_name + '.txt', 'a') as writer:
        for each_para in full_text:
            if not any(marker in each_para for marker in BOILERPLATE_MARKERS):
                writer.write(each_para + '\n')

100%|██████████| 12356/12356 [00:59<00:00, 209.41it/s]


In [3]:
# Every line of every extracted .txt file is one paragraph; tokenize each
# line and collect the token lists in file order.
all_text_files = sorted(glob.glob(result_dir_path + '*.txt*'))

paragraphs = []
for each_text_file in tqdm(all_text_files):
    with open(each_text_file, 'r') as f:
        paragraphs.extend(nltk.word_tokenize(each_line) for each_line in f)
  

100%|██████████| 12356/12356 [07:11<00:00, 28.60it/s]


In [4]:
from fse import IndexedList
# Wrap the tokenized paragraphs in fse's IndexedList; this object is what gets
# passed to sif_model.train() and later pickled.
paragraphs_index = IndexedList(paragraphs)

In [5]:
import pickle

def dump(data, filename, directory=None):
    """Pickle ``data`` to ``<directory>/<filename>.bin``.

    Args:
        data: any picklable object.
        filename: file name without the '.bin' extension.
        directory: target directory. When omitted, the notebook-level
            ``result_dir_path`` is used, looked up at call time because the
            notebook reassigns it between cells.

    Raises:
        OSError: if the file cannot be opened for writing.
    """
    import os  # local import keeps this cell independent of the imports cell

    if directory is None:
        directory = result_dir_path

    # The context manager closes the handle even if pickling fails midway.
    with open(os.path.join(directory, filename + '.bin'), 'wb') as file:
        pickle.dump(data, file)

In [6]:
# dump() builds its target path from the global result_dir_path, so point it at the model directory.
# NOTE(review): this overwrites the paragraph-output directory set earlier; re-running the cells that
# write or glob the .txt files after this one would make them use the model directory instead.
result_dir_path = '/home/santosh/Work/models/word2vec/CORD-19/'
dump(paragraphs_index, 'paragraphs_index')

In [8]:
# SIF embeddings
from fse.models import uSIF
# Build a uSIF sentence-embedding model on top of the loaded word vectors, using 4 workers.
# With lang_freq="en" the log below reports word frequencies being estimated via wordfreq for English.
sif_model = uSIF(covid_trained_model, workers=4, lang_freq="en")

# Compute the sentence vectors for every indexed paragraph.
sif_model.train(paragraphs_index)

2020-03-29 21:48:05,006 : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2020-03-29 21:48:05,165 : INFO : scanning all indexed sentences and their word counts
2020-03-29 21:48:05,628 : INFO : finished scanning 413898 sentences with an average length of 158 and 65397441 total words
2020-03-29 21:48:05,676 : INFO : estimated memory for 413898 sentences with 200 dimensions and 241336 vocabulary: 500 MB (0 GB)
2020-03-29 21:48:05,677 : INFO : initializing sentence vectors for 413898 sentences
2020-03-29 21:48:07,008 : INFO : pre-computing uSIF weights for 241336 words
2020-03-29 21:48:07,524 : INFO : begin training
2020-03-29 21:48:12,530 : INFO : PROGRESS : finished 42.06% with 174083 sentences and 13559343 words, 34816 sentences/s
2020-03-29 21:48:17,533 : INFO : PROGRESS : finished 83.86% with 347097 sentences and 27192688 words, 34602 sentences/s
2020-03-29 21:48:19,584 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-03-29 

(413898, 32531620)

In [9]:
# Persist the trained model; result_dir_path points at the model directory here
# because it was reassigned just before the dump() call.
sif_model.save(result_dir_path+'CORD-19-p2v_sif.bin')

2020-03-29 21:48:53,520 : INFO : saving uSIF object under /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin, separately None
2020-03-29 21:48:53,523 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.wv.vectors.npy
2020-03-29 21:48:53,626 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.sv.vectors.npy
2020-03-29 21:48:54,216 : INFO : saved /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin


# Test Embeddings

In [10]:
import pickle
from gensim.summarization import summarize

def load(filename, directory=None):
    """Unpickle and return the object stored in ``<directory>/<filename>.bin``.

    Args:
        filename: file name without the '.bin' extension.
        directory: directory containing the file. When omitted, the
            notebook-level ``path_w2v`` is used, looked up at call time; it
            must hold a directory (not a model file path) by then.

    Returns:
        The unpickled object.

    Raises:
        OSError: if the file cannot be opened.

    SECURITY: ``pickle.load`` can execute arbitrary code; only load files
    that this notebook (or another trusted source) produced.
    """
    import os  # local import keeps this cell independent of the imports cell

    if directory is None:
        directory = path_w2v

    # The context manager closes the handle even if unpickling fails.
    with open(os.path.join(directory, filename + '.bin'), 'rb') as file:
        return pickle.load(file)

In [5]:
# load() prefixes the file name with the global path_w2v, so it must hold the model directory here.
path_w2v = '/home/santosh/Work/models/word2vec/CORD-19/'

# Read back the pickled paragraphs_index (<path_w2v>paragraphs_index.bin).
paragraphs_index = load('paragraphs_index')



In [6]:
from gensim.models import KeyedVectors
# Load the sentence-embedding model that was written with sif_model.save().
# NOTE(review): the object on disk is a uSIF model, not plain KeyedVectors; the log below shows this
# call loading its wv/sv/prep parts, but loading through the fse model class (uSIF.load) would state
# the intent more clearly — confirm against the fse documentation.
sif_model = KeyedVectors.load(path_w2v+'CORD-19-p2v_sif.bin')

2020-03-30 09:27:00,341 : INFO : loading Word2VecKeyedVectors object from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin
2020-03-30 09:27:03,011 : INFO : loading wv recursively from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.wv.* with mmap=None
2020-03-30 09:27:03,012 : INFO : loading vectors from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.wv.vectors.npy with mmap=None
2020-03-30 09:27:05,548 : INFO : loading sv recursively from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.sv.* with mmap=None
2020-03-30 09:27:05,550 : INFO : loading vectors from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.sv.vectors.npy with mmap=None
2020-03-30 09:27:09,928 : INFO : loading prep recursively from /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.prep.* with mmap=None
2020-03-30 09:27:09,930 : INFO : loaded /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin


In [7]:
import re
def untokenize(words):
    """
    Untokenizing a text undoes the tokenizing operation, restoring
    punctuation and spaces to the places that people expect them to be.
    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.

    Args:
        words: iterable of token strings.

    Returns:
        The re-joined text with surrounding whitespace stripped.
    """
    text = ' '.join(words)

    # Quote tokens (`` and '') become plain double quotes; spaced dots become an ellipsis.
    text = text.replace("`` ", '"').replace(" ''", '"').replace('. . .', '...')

    # Re-attach parentheses to the text they enclose.
    text = text.replace(" ( ", " (").replace(" ) ", ") ")

    # Drop the space before punctuation that is followed by a space or quote ...
    text = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", text)
    # ... or that ends the text.
    text = re.sub(r' ([.,:;?!%]+)$', r"\1", text)

    # Re-attach apostrophes and contractions.
    text = text.replace(" '", "'").replace(" n't", "n't")

    # Merge the standalone words "can not" only; a plain substring replace would
    # also corrupt text such as "American notion" -> "Americannotion".
    text = re.sub(r'\bcan not\b', 'cannot', text)

    text = text.replace(" ` ", " '")

    # Re-attach square brackets to their content.
    text = text.replace("[ ", "[").replace(" ]", "]")
    return text.strip()

def extract_query_result(sv_query_result):
    """Turn similarity-query hits back into readable text.

    The first element of each hit is passed to `untokenize`; any further
    elements of the hit are ignored.

    Returns:
        list of untokenized strings, in the order of `sv_query_result`.
    """
    return [untokenize(hit[0]) for hit in sv_query_result]

In [62]:
# Queries that were tried against the paragraph embeddings; kept as data
# rather than as commented-out assignments.
EXAMPLE_QUERIES = [
    "pre-existing pulmonary disease SARS-Cov2 Hypertension",
    "What is the incubation days of SARS-CoV-2",
    "incubation days coronavirus 2019-nCoV",  # COVID-19
    'socio economic poverty behaviour',
    'what is the influence of pre-existing diseases and comorbidities',
    'public health mitigation measures that could be effective for control',
    'socio-economic and behavioral factors to understand the economic impact of the SARS-CoV-2 virus and whether there were differences. ',
    'what are the risk factors for death in COVID-19',
    'what is the basic reproductive number of SARS-CoV-2 in days',
    'what is the serial interval days SARS-CoV-2',
    'what do we know about the environmental factors influencing SARS-CoV-2',
    'what do we know about drugs using to treat SARS-CoV-2',
    'Transmission dynamics of the virus SARS-CoV-2',
    'risk of fatality among symptomatic hospitalized patients',
    'Efforts targeted at a universal coronavirus vaccine',
]

# The saved output of this cell was logged for a 7-token query about universal
# vaccines, i.e. the last example; an empty query cannot reproduce it.
query = EXAMPLE_QUERIES[-1]

query_tokens = nltk.word_tokenize(query)
if not query_tokens:
    raise ValueError('query is empty: pick one of EXAMPLE_QUERIES or type your own')

# Ten most similar paragraphs to the query.
query_result = sif_model.sv.similar_by_sentence(query_tokens, model=sif_model, indexable=paragraphs_index.items, topn=10)

# Untokenize once and reuse; the cell displays only the summary below.
result_sentences = extract_query_result(query_result)
summarize(' '.join(result_sentences), ratio =0.2, split=True)

2020-03-30 17:38:11,432 : INFO : scanning all indexed sentences and their word counts
2020-03-30 17:38:11,432 : INFO : finished scanning 1 sentences with an average length of 7 and 7 total words
2020-03-30 17:38:11,434 : INFO : removing 5 principal components took 0s
2020-03-30 17:38:11,487 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-03-30 17:38:11,491 : INFO : built Dictionary(297 unique tokens: ['complic', 'evolut', 'futur', 'influenza', 'issu']...) from 48 documents (total 655 corpus positions)
2020-03-30 17:38:11,494 : INFO : Building graph
2020-03-30 17:38:11,495 : INFO : Filling graph
2020-03-30 17:38:11,515 : INFO : Removing unreachable nodes of graph
2020-03-30 17:38:11,516 : INFO : Pagerank graph
2020-03-30 17:38:11,529 : INFO : Sorting pagerank scores


['In regard to choosing a vaccine target and platform, the vaccine candidate must be immunogenic and immune targeting must lead to virus neutralization or potent cytotoxic responses.',
 'Much of the focus for the development of a SARS-CoV or MERS-CoV vaccine has been on the S protein since it is immunogenic and antibodies targeting it can neutralize the virus [59, 60].',
 'Although the highly conservative M2e of influenza A virus is one of the most promising target for development of universal influenza vaccines, some strategies would be required to improve immunogenicity of vaccines based on M2e containing only 24 amino acid.',
 'In this review, we discuss promising novel influenza virus vaccine targets and the use of MVA for vaccine development against various respiratory viruses.',
 'Early vaccine studies focused on leveraging strategies that had been successful for other vaccines including virus inactivation [12] [13] [14] and subunit immunogens [15] along with novel strategies suc