In [1]:
import fse # fast sentence embeddings

from gensim.models import KeyedVectors

# Location of the pre-trained word2vec vectors, stored in the binary word2vec format.
# NOTE(review): the file name says 'CORD-10' while its directory says 'CORD-19' — confirm this is
# the real file name and not a typo.
path_w2v = '/home/santosh/Work/models/word2vec/CORD-19/CORD-10-w2v_200d_5w_10i_3mc.bin'

# Load the word vectors as gensim KeyedVectors (binary=True matches the .bin format above).
covid_trained_model = KeyedVectors.load_word2vec_format(path_w2v, binary=True)

In [2]:
# Here I am using word embeddings trained on the CORD-19 corpus
import glob
import nltk
import logging
import os
from tqdm import tqdm

# Configure logging: timestamped messages at INFO level and above
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Directory that receives the extracted per-paper paragraph text files
result_dir_path = '/home/santosh/Work/Datasets/CORD-19-paragraphs/'

# Root of the CORD-19 dataset and the sub-directories (relative to the root) holding the JSON files
root_path = '/home/santosh/Work/Datasets/CORD-19-research-challenge/'
paths = [
    'biorxiv_medrxiv/biorxiv_medrxiv/',
    'comm_use_subset/comm_use_subset/',
    'noncomm_use_subset/noncomm_use_subset/',
]




In [10]:
# get all the JSON files
import json

# For each subset directory (in the order listed in `paths`), take its matching
# files in sorted order and concatenate them into one flat list.
all_jsons = [
    json_file
    for subset_dir in paths
    for json_file in sorted(glob.glob(root_path + subset_dir + '*.json*'))
]


In [12]:
# extract paragraphs: one text file per paper, one paragraph per line

# A paragraph containing any of these fragments is treated as boilerplate and dropped.
_BOILERPLATE_MARKERS = (
    'word count',
    'All rights reserved',
    'No reuse allowed without permission',
)


def _collect_paragraphs(data):
    """Return the paragraph texts of one parsed paper, abstract first, then body.

    `data` is the object produced by `json.load`. For each of the 'abstract'
    and 'body_text' sections, the 'text' value of every entry is collected.
    A section that is missing or does not have the expected structure is
    skipped (best effort); entries collected before the problem are kept.
    """
    full_text = []
    for section in ('abstract', 'body_text'):
        try:
            for each_text in data[section]:
                full_text.append(each_text['text'])
        except (KeyError, TypeError):
            # Missing key or unexpected structure: skip the rest of this section
            # instead of silently swallowing every possible exception.
            continue
    return full_text


for each_json_file in tqdm(all_jsons):
    # Close the input file as soon as it is parsed; it is not needed while writing.
    with open(each_json_file) as json_file:
        data = json.load(json_file)

    full_text = _collect_paragraphs(data)

    # <result_dir_path><input file name without extension>.txt
    out_name = os.path.splitext(os.path.basename(each_json_file))[0] + '.txt'

    # 'w' (not 'a') so that re-running this cell rewrites each file instead of
    # appending a second copy of every paragraph.
    with open(result_dir_path + out_name, 'w') as writer:
        for each_para in full_text:
            if not any(marker in each_para for marker in _BOILERPLATE_MARKERS):
                writer.write(each_para + '\n')

100%|██████████| 12356/12356 [00:59<00:00, 209.41it/s]


In [3]:
# Tokenize every line of every extracted text file; each line becomes one token list.
paragraphs = []

all_text_files = sorted(glob.glob(result_dir_path + '*.txt*'))

for text_path in tqdm(all_text_files):
    with open(text_path, 'r') as handle:
        # Iterating the handle yields the same lines as readlines()
        paragraphs.extend(nltk.word_tokenize(line) for line in handle)
  

100%|██████████| 12356/12356 [07:11<00:00, 28.60it/s]


In [4]:
from fse import IndexedList
# Wrap the tokenized paragraphs in fse's IndexedList; this is the object later
# handed to the sentence-embedding model for training and querying.
paragraphs_index = IndexedList(paragraphs)

In [5]:
import pickle

def dump(data, filename, directory=None):
    """Pickle `data` to the file `<directory><filename>.bin`.

    Parameters
    ----------
    data : object
        Any picklable object.
    filename : str
        File name without extension; '.bin' is appended.
    directory : str, optional
        Prefix prepended to the file name exactly as given (no separator is
        inserted, so it should end with a path separator). When omitted, the
        module-level `result_dir_path` is used, read at call time.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing.
    """
    base = result_dir_path if directory is None else directory
    # The context manager closes the handle even when pickling raises.
    with open(base + filename + '.bin', 'wb') as out_file:
        pickle.dump(data, out_file)

In [6]:
# NOTE: this rebinds the module-level `result_dir_path` (until now the paragraphs
# directory) to the models directory. Everything that reads `result_dir_path`
# after this cell runs — dump() and the model save — targets the models directory,
# and so would the extraction/tokenization cells if they were re-run afterwards.
result_dir_path = '/home/santosh/Work/models/word2vec/CORD-19/'
# Pickle the indexed paragraphs to <result_dir_path>paragraphs_index.bin
dump(paragraphs_index, 'paragraphs_index')

In [8]:
# uSIF sentence embeddings built on top of the loaded word vectors
from fse.models import uSIF
# lang_freq="en": per the training log, word frequencies are estimated with
# wordfreq for English. workers=4 is passed to fse (the log mentions worker threads).
sif_model = uSIF(covid_trained_model, workers=4, lang_freq="en")

# Train on the indexed paragraphs. The recorded output is the tuple (413898, 32531620);
# the first value matches the sentence count in the log, the second is presumably the
# number of effective words — TODO confirm against the fse documentation.
sif_model.train(paragraphs_index)

2020-03-29 21:48:05,006 : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2020-03-29 21:48:05,165 : INFO : scanning all indexed sentences and their word counts
2020-03-29 21:48:05,628 : INFO : finished scanning 413898 sentences with an average length of 158 and 65397441 total words
2020-03-29 21:48:05,676 : INFO : estimated memory for 413898 sentences with 200 dimensions and 241336 vocabulary: 500 MB (0 GB)
2020-03-29 21:48:05,677 : INFO : initializing sentence vectors for 413898 sentences
2020-03-29 21:48:07,008 : INFO : pre-computing uSIF weights for 241336 words
2020-03-29 21:48:07,524 : INFO : begin training
2020-03-29 21:48:12,530 : INFO : PROGRESS : finished 42.06% with 174083 sentences and 13559343 words, 34816 sentences/s
2020-03-29 21:48:17,533 : INFO : PROGRESS : finished 83.86% with 347097 sentences and 27192688 words, 34602 sentences/s
2020-03-29 21:48:19,584 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-03-29 

(413898, 32531620)

In [9]:
# Persist the trained model as <result_dir_path>CORD-19-p2v_sif.bin
# (per the save log, the vector arrays are stored as separate .npy files next to it).
sif_model.save(result_dir_path+'CORD-19-p2v_sif.bin')

2020-03-29 21:48:53,520 : INFO : saving uSIF object under /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin, separately None
2020-03-29 21:48:53,523 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.wv.vectors.npy
2020-03-29 21:48:53,626 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin.sv.vectors.npy
2020-03-29 21:48:54,216 : INFO : saved /home/santosh/Work/models/word2vec/CORD-19/CORD-19-p2v_sif.bin


In [10]:
import re
def untokenize(words):
    """
    Join a list of tokens back into readable text.

    The tokens are joined with single spaces and the spacing that a
    tokenizer introduces around quotes, brackets, punctuation and
    contractions is then removed again, so that
    `untokenize(tokenize(text))` is ideally equal to `text` apart from
    line breaks. Returns the resulting string with surrounding
    whitespace stripped.
    """
    # Literal replacements applied before the punctuation regexes (order matters).
    before_punctuation = (
        ("`` ", '"'),
        (" ''", '"'),
        ('. . .', '...'),
        (" ( ", " ("),
        (" ) ", ") "),
    )
    # Literal replacements applied after the punctuation regexes (order matters).
    after_punctuation = (
        (" '", "'"),
        (" n't", "n't"),
        ("can not", "cannot"),
        (" ` ", " '"),
        ("[ ", "["),
        (" ]", "]"),
    )

    result = ' '.join(words)
    for old, new in before_punctuation:
        result = result.replace(old, new)

    # Attach punctuation to the preceding word: first when followed by a
    # space or quote character, then when it ends the text.
    result = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", result)
    result = re.sub(r' ([.,:;?!%]+)$', r"\1", result)

    for old, new in after_punctuation:
        result = result.replace(old, new)
    return result.strip()

def extract_query_result(sv_query_result):
    """
    Convert similarity-query hits into readable sentences.

    Every hit is indexed at position 0 to obtain its token list, which is
    passed through `untokenize`. Returns the resulting strings as a list,
    in the same order as the hits.
    """
    return [untokenize(hit[0]) for hit in sv_query_result]

In [19]:
# Example queries: keep exactly one `query = ...` line uncommented.
# query = "pre-existing pulmonary disease SARS-Cov2 Hypertension" 

query = "incubation days SARS-CoV" 
# query = "incubation days coronavirus 2019-nCoV"#  COVID-19
# query = 'socio economic poverty behaviour'

# query = 'pre-existing diseases'
# query = ' basic reproductive number SARS-CoV-2 days'
# query = 'serial interval days'
# query = 'environmental factors SARS-CoV-2'

# Tokenize the query with the same tokenizer used for the paragraphs and ask the
# trained model for the paragraphs most similar to it (topn=100).
query_result = sif_model.sv.similar_by_sentence(nltk.word_tokenize(query), model=sif_model, indexable=paragraphs_index.items, topn=100)

# Display the hits as plain sentences (the last expression of the cell is shown as output).
extract_query_result(query_result)

2020-03-29 22:01:17,062 : INFO : scanning all indexed sentences and their word counts
2020-03-29 22:01:17,065 : INFO : finished scanning 1 sentences with an average length of 3 and 3 total words
2020-03-29 22:01:17,067 : INFO : removing 5 principal components took 0s


['• Incubation: 2 days.',
 '• Incubation: 14 days.',
 '• Incubation: 14 days.',
 '• Incubation: 2 days.',
 '• Incubation: 5-15 days.',
 '• Incubation: 5-15 days.',
 '• Incubation: 2-3 days.',
 '• Incubation: 2-3 days.',
 '. Distribution of Ebola virus incubation period (from exposure to symptoms), by days of incubation Notes: Adapted from Legrand et al. [3] and Eichner et al. [4]. *Frequency related to the number of patients out of a total of 5,000. Source: Meltzer et al. [1]',
 'We fit a log-normal distribution to pooled data and found the median incubation period to be 4.5 days (95% CI 3.9-5.2 days) for astrovirus, 1.2 days (95% CI 1.1-1.2 days) for norovirus genogroups I and II, 1.7 days (95% CI 1.5-1.8 days) for sapovirus, and 2.0 days (95% CI 1.4-2.4 days) for rotavirus.',
 'The incubation period of MERS-Co V ranges from 2 to 14 days. (Zaki, 2012).',
 'The incubation period of MERS-Co V ranges from 2 to 14 days. (Zaki, 2012).',
 'Tg S (t) (days )',
 '1-2 days [1], 2.62 days [2], a