In [2]:
import fse # fast sentence embeddings

from gensim.models import KeyedVectors

path_w2v = '/home/santosh/Work/models/word2vec/CORD-19/CORD-10-w2v_200d_5w_10i_3mc.bin'

covid_trained_model = KeyedVectors.load_word2vec_format(path_w2v, binary=True)

2020-03-28 15:01:23,945 : INFO : loading projection weights from /home/santosh/Work/models/word2vec/CORD-19/CORD-10-w2v_200d_5w_10i_3mc.bin
2020-03-28 15:01:29,495 : INFO : loaded (241336, 200) matrix from /home/santosh/Work/models/word2vec/CORD-19/CORD-10-w2v_200d_5w_10i_3mc.bin


In [1]:
# Here I am using word embeddings from the CORD-19
from fse import IndexedList
from glob import glob
import nltk
import logging
import os
from tqdm import tqdm

# start the log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

# get the current working directory and file
data_dir_path = '/home/santosh/Work/Datasets/CORD-19-text/'
result_dir_path = '/home/santosh/Work/models/word2vec/CORD-19/'



In [2]:
all_text_files = glob(data_dir_path+'*.txt')
len(all_text_files)

12356

In [3]:
sentences = []

for each_text_file in tqdm(all_text_files):
    with open(each_text_file, 'r') as f:
        temp_ = f.readlines()
        for each_line in temp_:
            sentences.append(nltk.word_tokenize(each_line))
  

100%|██████████| 12356/12356 [09:23<00:00, 21.93it/s]


In [4]:
sentences_index = IndexedList(sentences)

In [7]:
import pickle

def dump(data,filename):
    file = open(result_dir_path+filename+'.bin','wb')
    pickle.dump(data, file)
    file.close()
    
    

In [8]:
# save sentences list

dump(sentences_index, 'sentences_index')


In [9]:
# SIF embeddings
from fse.models import uSIF
sif_model = uSIF(covid_trained_model, workers=4, lang_freq="en")

sif_model.train(sentences_index)

2020-03-28 15:14:56,713 : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2020-03-28 15:14:56,875 : INFO : scanning all indexed sentences and their word counts
2020-03-28 15:14:59,341 : INFO : finished scanning 2306469 sentences with an average length of 28 and 65669827 total words
2020-03-28 15:14:59,387 : INFO : estimated memory for 2306469 sentences with 200 dimensions and 241336 vocabulary: 1944 MB (1 GB)
2020-03-28 15:14:59,388 : INFO : initializing sentence vectors for 2306469 sentences
2020-03-28 15:15:06,717 : INFO : pre-computing uSIF weights for 241336 words
2020-03-28 15:15:07,209 : INFO : begin training
2020-03-28 15:15:12,226 : INFO : PROGRESS : finished 37.34% with 861324 sentences and 12160051 words, 172264 sentences/s
2020-03-28 15:15:17,237 : INFO : PROGRESS : finished 74.22% with 1711908 sentences and 24211668 words, 170116 sentences/s
2020-03-28 15:15:20,680 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-

(2306469, 32665927)

In [10]:
# Average embeddings

from fse.models import Average

av_model = Average(covid_trained_model)
av_model.train(IndexedList(sentences))

# model.sv.similarity(0,1)

2020-03-28 15:16:53,656 : INFO : scanning all indexed sentences and their word counts
2020-03-28 15:16:56,174 : INFO : finished scanning 2306469 sentences with an average length of 28 and 65669827 total words
2020-03-28 15:16:56,216 : INFO : estimated memory for 2306469 sentences with 200 dimensions and 241336 vocabulary: 1944 MB (1 GB)
2020-03-28 15:16:56,217 : INFO : initializing sentence vectors for 2306469 sentences
2020-03-28 15:17:03,793 : INFO : begin training
2020-03-28 15:17:08,796 : INFO : PROGRESS : finished 45.35% with 1045989 sentences and 14781162 words, 209197 sentences/s
2020-03-28 15:17:13,797 : INFO : PROGRESS : finished 90.00% with 2075768 sentences and 29368720 words, 205955 sentences/s
2020-03-28 15:17:14,944 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-03-28 15:17:14,945 : INFO : training on 2306469 effective sentences with 32665927 effective words took 11s with 206817 sentences/s


(2306469, 32665927)

In [12]:
av_model.sv.similar_by_word("incubation", wv=covid_trained_model, indexable=sentences_index.items, topn=10)

2020-03-28 15:18:13,223 : INFO : precomputing L2-norms of sentence vectors


[(['*', 'Temperature', 'of', 'incubation', '.'], 1281523, 0.9171814322471619),
 (['Primary', 'incubation', 'was', 'for', '1', 'hr', 'at', '40°C', '.'],
  363101,
  0.8121793866157532),
 (['Saracatinib',
   'was',
   'treated',
   'during',
   'the',
   '4',
   '•',
   'C',
   'incubation',
   'only',
   'or',
   'added',
   'at',
   'specific',
   'time',
   'points',
   'during',
   'the',
   '37',
   '•',
   'C',
   'incubation',
   '.'],
  247700,
  0.7885129451751709),
 (['where', ',', 'T', '1', 'is', 'the', 'mean', 'incubation', '.'],
  1348650,
  0.7837847471237183),
 (['Oocytes',
   'were',
   'lysed',
   'by',
   'incubation',
   'in',
   '10',
   'mM',
   'CHAPS',
   'for',
   '1',
   'h',
   'at',
   '4°C',
   '.'],
  221323,
  0.7563362121582031),
 (['These',
   'are',
   'named',
   "'extended",
   'incubation',
   "'",
   ',',
   "'short",
   'incubation',
   "'",
   'and',
   "'extended",
   'absorption',
   "'",
   'assays',
   'respectively',
   '.'],
  1839815,
  0.751

In [13]:
sif_model.sv.similar_by_word("incubation", wv=covid_trained_model, indexable=sentences_index.items, topn=10)

2020-03-28 15:26:19,339 : INFO : precomputing L2-norms of sentence vectors


[(['where', ',', 'T', '1', 'is', 'the', 'mean', 'incubation', '.'],
  1348650,
  0.9460828304290771),
 (['*', 'Temperature', 'of', 'incubation', '.'], 1281523, 0.9154615998268127),
 (['per', 'incubation', ')', '.'], 1998328, 0.8970457315444946),
 (['R0',
   'set',
   'to',
   '3',
   'and',
   'mean',
   'incubation',
   'time',
   'to',
   'be',
   '7.5days',
   '.'],
  717091,
  0.888635516166687),
 (['Saracatinib',
   'was',
   'treated',
   'during',
   'the',
   '4',
   '•',
   'C',
   'incubation',
   'only',
   'or',
   'added',
   'at',
   'specific',
   'time',
   'points',
   'during',
   'the',
   '37',
   '•',
   'C',
   'incubation',
   '.'],
  247700,
  0.8793104887008667),
 (['Plaques',
   'were',
   'enumerated',
   'after',
   'incubation',
   'at',
   '35uC',
   'and',
   '5',
   '%',
   'CO',
   '2',
   'for',
   '2',
   'days',
   '.'],
  2105744,
  0.8350512981414795),
 (['Inoculum',
   'was',
   'removed',
   'after',
   'one',
   'hour',
   'of',
   'incubation',

In [16]:
av_model.sv.similar_by_sentence("incubation days coronavirus".split(), model=av_model, indexable=sentences_index.items, topn=10)

2020-03-28 16:27:28,226 : INFO : scanning all indexed sentences and their word counts
2020-03-28 16:27:28,226 : INFO : finished scanning 1 sentences with an average length of 3 and 3 total words


[(['Plaques',
   'were',
   'enumerated',
   'after',
   'incubation',
   'at',
   '35uC',
   'and',
   '5',
   '%',
   'CO',
   '2',
   'for',
   '2',
   'days',
   '.'],
  2105744,
  0.8233815431594849),
 (['In', 'C', ',', 'the', 'incubation', 'period', 'was', '5', 'days', '.'],
  2281248,
  0.8149763345718384),
 (['In',
   '(',
   'C',
   ')',
   ',',
   'the',
   'incubation',
   'period',
   'was',
   '5',
   'days',
   '.'],
  1028724,
  0.8149763345718384),
 (['In',
   '(',
   'C',
   ')',
   ',',
   'the',
   'incubation',
   'period',
   'was',
   '5',
   'days',
   '.'],
  606694,
  0.8149763345718384),
 (['The',
   'incubation',
   'period',
   'of',
   '2019-nCoV',
   'is',
   'generally',
   '3-7',
   'days',
   'but',
   'no',
   'longer',
   'than',
   '14',
   'days',
   ',',
   'and',
   'the',
   'virus',
   'is',
   'infective',
   'during',
   'the',
   'incubation',
   'period',
   '.'],
  2249114,
  0.8144031763076782),
 (['Distribution',
   'of',
   'Ebola',
   '

In [17]:
sif_model.sv.similar_by_sentence("incubation days coronavirus".split(), model=av_model, indexable=sentences_index.items, topn=10)

2020-03-28 16:27:39,822 : INFO : scanning all indexed sentences and their word counts
2020-03-28 16:27:39,822 : INFO : finished scanning 1 sentences with an average length of 3 and 3 total words


[(['In', 'C', ',', 'the', 'incubation', 'period', 'was', '5', 'days', '.'],
  2281248,
  0.8242506980895996),
 (['In',
   '(',
   'C',
   ')',
   ',',
   'the',
   'incubation',
   'period',
   'was',
   '5',
   'days',
   '.'],
  606694,
  0.8242506980895996),
 (['In',
   '(',
   'C',
   ')',
   ',',
   'the',
   'incubation',
   'period',
   'was',
   '5',
   'days',
   '.'],
  1028724,
  0.8242506980895996),
 (['The',
   'incubation',
   'period',
   'of',
   '2019-nCoV',
   'is',
   'generally',
   '3-7',
   'days',
   'but',
   'no',
   'longer',
   'than',
   '14',
   'days',
   ',',
   'and',
   'the',
   'virus',
   'is',
   'infective',
   'during',
   'the',
   'incubation',
   'period',
   '.'],
  2249114,
  0.8183196783065796),
 (['R',
   '0',
   '=',
   '2.92',
   'with',
   'incubation',
   'time',
   'τ',
   '=',
   '5.2',
   'days',
   'and',
   'τ',
   '=',
   '14',
   'days',
   'were',
   'used',
   '.'],
  854103,
  0.8084714412689209),
 (['The',
   'viral',
   'inc

In [21]:
av_model.save(result_dir_path+'CORD-19-s2v_av.bin')

2020-03-28 16:36:12,116 : INFO : saving SentenceVectors object under /home/santosh/Work/models/word2vec/CORD-19/CORD-19-s2v_sif.bin, separately None
2020-03-28 16:36:12,118 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-s2v_sif.bin.vectors.npy
2020-03-28 16:37:07,313 : INFO : not storing attribute vectors_norm
2020-03-28 16:37:07,317 : INFO : saved /home/santosh/Work/models/word2vec/CORD-19/CORD-19-s2v_sif.bin


In [20]:
sif_model.save(result_dir_path+'CORD-19-s2v_sif.bin')

2020-03-28 16:33:56,546 : INFO : saving SentenceVectors object under /home/santosh/Work/models/word2vec/CORD-19/CORD-19-s2v_sif.bin, separately None
2020-03-28 16:33:56,547 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-s2v_sif.bin.vectors.npy
2020-03-28 16:35:41,336 : INFO : not storing attribute vectors_norm
2020-03-28 16:35:41,420 : INFO : saved /home/santosh/Work/models/word2vec/CORD-19/CORD-19-s2v_sif.bin
