# Extract Data

In [2]:
root_path = '/home/santosh/Work/Datasets/CORD-19-research-challenge/'
paths = [
    'biorxiv_medrxiv/biorxiv_medrxiv/',
        'comm_use_subset/comm_use_subset/',
        'noncomm_use_subset/noncomm_use_subset/'
        ]

result_path = '/home/santosh/Work/Datasets/CORD-19-sentences/'



In [3]:
import glob
import json
from tqdm import tqdm
import pandas as pd

In [4]:
# get all the jsosn

all_jsons =[]
for json_path in paths:
     all_jsons.extend(sorted(glob.glob(root_path+json_path + '*.json*')))

# all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)

In [5]:
all_jsons[0]

'/home/santosh/Work/Datasets/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/0015023cc06b5362d332b3baf348d11567ca2fbb.json'

In [6]:
meta_df = pd.read_csv('/home/santosh/Work/Datasets/CORD-19-research-challenge/metadata.csv')

In [7]:
meta_df.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text',
       'full_text_file'],
      dtype='object')

In [8]:
selected_columns = ['sha', 'doi', 'pmcid', 'pubmed_id', 'publish_time'] # sha is paper id here

In [9]:
meta_ref_df = meta_df[selected_columns]
meta_ref_df = meta_ref_df[meta_ref_df['sha'].notna()]


In [10]:
# extract sentences with doi


for each_json_file in tqdm(all_jsons):
    with open(each_json_file) as json_file:
        data = json.load(json_file)
        
        try:
            source = meta_ref_df[meta_ref_df['sha'].str.contains(data['paper_id'])]['doi'].values.tolist()[0]
        except:
            source =''
            
        full_text = []

        try:
            for each_text in data['metadata']:
                full_text.append(each_text['title']+' ---'+source+'---')
        except:
                pass 

        
        try:
            for each_text in data['abstract']:
                full_text.append(each_text['text']+' ---'+source+'---')
        except:
                pass

        try:
            for each_text in data['body_text']:
                full_text.append(each_text['text']+' ---'+source+'---')
        except:
                pass
            
        try:
            for each_text in data['ref_entries']:
                full_text.append(data['ref_entries'][each_text]['text']+' ---'+source+'---')
        except:
                pass    
        
        with open(result_path+each_json_file.split('/')[-1][:-5]+'.txt', 'a') as writer:
            for each_para in full_text:
                    if 'doi:' not in each_para and 'word count' not in each_para and 'All rights reserved' not in each_para and 'No reuse allowed without permission' not in each_para:
                        writer.write(each_para+'\n')     

100%|██████████| 12356/12356 [03:33<00:00, 57.96it/s]


# Generate Fasttext

In [2]:
# Here I am using gensim for learning the Fast Text word embeddings from the CORD-19
import nltk
import logging
import os
# download NLTK basic stopwords
import nltk
from nltk.corpus import stopwords

from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer


# start the log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)


def clean_my_text(full_text_):
    stopset = set(stopwords.words('english')) #| set(string.punctuation)
    tokens = TreebankWordTokenizer().tokenize(full_text_.split('---10')[0])
    cleanup = [token for token in tokens if token not in stopset and len(token) > 1]
    return cleanup


# get the current working directory and file
data_dir_path = '/home/santosh/Work/Datasets/CORD-19-sentences/'
result_dir_path = '/home/santosh/Work/models/word2vec/CORD-19/'

In [3]:
class SentenceClass(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
#             print(fname)
            for line in open(os.path.join(self.dirname, fname), 'r'):
                yield clean_my_text(line)


# iterate one line at a time and learn embeddings
paragraphs = SentenceClass(data_dir_path)  # a memory-friendly iterator

In [4]:
from gensim.models.fasttext import FastText

model = FastText(size=200,
    window=5,
    min_count=5,
    workers=4, iter=5)

model.build_vocab(sentences=paragraphs)
training_examples_count = model.corpus_count







2020-03-30 23:24:12,634 : INFO : resetting layer weights
2020-03-30 23:24:22,285 : INFO : collecting all words and their counts
2020-03-30 23:24:22,323 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-30 23:24:31,662 : INFO : PROGRESS: at sentence #10000, processed 808238 words, keeping 67684 word types
2020-03-30 23:24:41,977 : INFO : PROGRESS: at sentence #20000, processed 1624424 words, keeping 105701 word types
2020-03-30 23:24:50,885 : INFO : PROGRESS: at sentence #30000, processed 2429497 words, keeping 138800 word types
2020-03-30 23:25:00,007 : INFO : PROGRESS: at sentence #40000, processed 3263053 words, keeping 172853 word types
2020-03-30 23:25:11,423 : INFO : PROGRESS: at sentence #50000, processed 4053294 words, keeping 199081 word types
2020-03-30 23:25:20,401 : INFO : PROGRESS: at sentence #60000, processed 4908065 words, keeping 230538 word types
2020-03-30 23:25:29,350 : INFO : PROGRESS: at sentence #70000, processed 5745451 words, kee

In [5]:
model.train(paragraphs, total_examples=training_examples_count, epochs=model.iter)
logging.info("Done training data files")

  """Entry point for launching an IPython kernel.
2020-03-30 23:32:19,582 : INFO : training model with 4 workers on 196469 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-03-30 23:32:20,628 : INFO : EPOCH 1 - PROGRESS: at 0.16% examples, 64252 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:32:21,665 : INFO : EPOCH 1 - PROGRESS: at 0.40% examples, 78394 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:32:22,702 : INFO : EPOCH 1 - PROGRESS: at 0.66% examples, 85655 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:32:23,712 : INFO : EPOCH 1 - PROGRESS: at 0.92% examples, 87371 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:32:24,798 : INFO : EPOCH 1 - PROGRESS: at 1.20% examples, 89206 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:32:25,826 : INFO : EPOCH 1 - PROGRESS: at 1.46% examples, 89655 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:32:26,909 : INFO : EPOCH 1 - PROGRESS: at 1.70% examples, 89462 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:32:27,99

2020-03-30 23:33:34,134 : INFO : EPOCH 1 - PROGRESS: at 18.69% examples, 95705 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:35,225 : INFO : EPOCH 1 - PROGRESS: at 18.90% examples, 95885 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:36,294 : INFO : EPOCH 1 - PROGRESS: at 19.17% examples, 95920 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:37,308 : INFO : EPOCH 1 - PROGRESS: at 19.47% examples, 96016 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:38,326 : INFO : EPOCH 1 - PROGRESS: at 19.74% examples, 96104 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:39,396 : INFO : EPOCH 1 - PROGRESS: at 20.02% examples, 96129 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:40,399 : INFO : EPOCH 1 - PROGRESS: at 20.29% examples, 96232 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:41,448 : INFO : EPOCH 1 - PROGRESS: at 20.54% examples, 96275 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:33:42,527 : INFO : EPOCH 1 - PROGRESS: at 20.84% examples, 96276 words/s, in_qsize 0, out_

2020-03-30 23:34:52,315 : INFO : EPOCH 1 - PROGRESS: at 38.71% examples, 95772 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:34:53,333 : INFO : EPOCH 1 - PROGRESS: at 39.02% examples, 95755 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:34:54,384 : INFO : EPOCH 1 - PROGRESS: at 39.29% examples, 95782 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:34:55,475 : INFO : EPOCH 1 - PROGRESS: at 39.58% examples, 95785 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:34:56,490 : INFO : EPOCH 1 - PROGRESS: at 39.84% examples, 95835 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:34:57,554 : INFO : EPOCH 1 - PROGRESS: at 40.10% examples, 95852 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:34:58,609 : INFO : EPOCH 1 - PROGRESS: at 40.39% examples, 95875 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:34:59,690 : INFO : EPOCH 1 - PROGRESS: at 40.69% examples, 95883 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:35:00,773 : INFO : EPOCH 1 - PROGRESS: at 40.98% examples, 95883 words/s, in_qsize 0, out_

2020-03-30 23:36:09,898 : INFO : EPOCH 1 - PROGRESS: at 59.09% examples, 96631 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:10,965 : INFO : EPOCH 1 - PROGRESS: at 59.33% examples, 96596 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:12,000 : INFO : EPOCH 1 - PROGRESS: at 59.57% examples, 96536 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:13,078 : INFO : EPOCH 1 - PROGRESS: at 59.79% examples, 96499 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:14,182 : INFO : EPOCH 1 - PROGRESS: at 60.11% examples, 96489 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:15,208 : INFO : EPOCH 1 - PROGRESS: at 60.39% examples, 96514 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:16,224 : INFO : EPOCH 1 - PROGRESS: at 60.67% examples, 96542 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:17,263 : INFO : EPOCH 1 - PROGRESS: at 60.98% examples, 96563 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:36:18,287 : INFO : EPOCH 1 - PROGRESS: at 61.24% examples, 96587 words/s, in_qsize 0, out_

2020-03-30 23:37:27,244 : INFO : EPOCH 1 - PROGRESS: at 79.50% examples, 96867 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:28,348 : INFO : EPOCH 1 - PROGRESS: at 79.75% examples, 96861 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:29,432 : INFO : EPOCH 1 - PROGRESS: at 80.03% examples, 96860 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:30,477 : INFO : EPOCH 1 - PROGRESS: at 80.26% examples, 96811 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:31,537 : INFO : EPOCH 1 - PROGRESS: at 80.49% examples, 96788 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:32,587 : INFO : EPOCH 1 - PROGRESS: at 80.77% examples, 96766 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:33,636 : INFO : EPOCH 1 - PROGRESS: at 81.03% examples, 96778 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:34,641 : INFO : EPOCH 1 - PROGRESS: at 81.30% examples, 96802 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:37:35,648 : INFO : EPOCH 1 - PROGRESS: at 81.54% examples, 96825 words/s, in_qsize 0, out_

2020-03-30 23:38:44,680 : INFO : EPOCH 1 - PROGRESS: at 99.77% examples, 97365 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:38:45,447 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-03-30 23:38:45,449 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-30 23:38:45,495 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-03-30 23:38:45,576 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-03-30 23:38:45,577 : INFO : EPOCH - 1 : training on 39241820 raw words (37588020 effective words) took 386.0s, 97380 effective words/s
2020-03-30 23:38:46,618 : INFO : EPOCH 2 - PROGRESS: at 0.23% examples, 92145 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:38:47,713 : INFO : EPOCH 2 - PROGRESS: at 0.53% examples, 98763 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:38:48,754 : INFO : EPOCH 2 - PROGRESS: at 0.81% examples, 101533 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:38:49,787 : INFO : EPOCH 2 - PROGRES

2020-03-30 23:39:57,210 : INFO : EPOCH 2 - PROGRESS: at 19.05% examples, 102042 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:39:58,281 : INFO : EPOCH 2 - PROGRESS: at 19.34% examples, 101988 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:39:59,339 : INFO : EPOCH 2 - PROGRESS: at 19.63% examples, 101943 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:40:00,385 : INFO : EPOCH 2 - PROGRESS: at 19.89% examples, 101914 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:40:01,485 : INFO : EPOCH 2 - PROGRESS: at 20.13% examples, 101693 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:40:02,508 : INFO : EPOCH 2 - PROGRESS: at 20.38% examples, 101580 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:40:03,551 : INFO : EPOCH 2 - PROGRESS: at 20.64% examples, 101565 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:40:04,615 : INFO : EPOCH 2 - PROGRESS: at 20.97% examples, 101514 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:40:05,637 : INFO : EPOCH 2 - PROGRESS: at 21.23% examples, 101527 words/s, in_qsiz

2020-03-30 23:41:13,492 : INFO : EPOCH 2 - PROGRESS: at 39.84% examples, 101658 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:14,503 : INFO : EPOCH 2 - PROGRESS: at 40.10% examples, 101671 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:15,521 : INFO : EPOCH 2 - PROGRESS: at 40.39% examples, 101679 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:16,604 : INFO : EPOCH 2 - PROGRESS: at 40.69% examples, 101644 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:17,652 : INFO : EPOCH 2 - PROGRESS: at 40.98% examples, 101626 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:18,691 : INFO : EPOCH 2 - PROGRESS: at 41.29% examples, 101614 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:19,784 : INFO : EPOCH 2 - PROGRESS: at 41.61% examples, 101563 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:20,833 : INFO : EPOCH 2 - PROGRESS: at 41.91% examples, 101557 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:41:21,860 : INFO : EPOCH 2 - PROGRESS: at 42.20% examples, 101557 words/s, in_qsiz

2020-03-30 23:42:29,164 : INFO : EPOCH 2 - PROGRESS: at 60.37% examples, 101665 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:30,186 : INFO : EPOCH 2 - PROGRESS: at 60.65% examples, 101669 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:31,220 : INFO : EPOCH 2 - PROGRESS: at 60.95% examples, 101669 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:32,237 : INFO : EPOCH 2 - PROGRESS: at 61.22% examples, 101674 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:33,249 : INFO : EPOCH 2 - PROGRESS: at 61.48% examples, 101683 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:34,288 : INFO : EPOCH 2 - PROGRESS: at 61.77% examples, 101682 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:35,295 : INFO : EPOCH 2 - PROGRESS: at 62.05% examples, 101687 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:36,336 : INFO : EPOCH 2 - PROGRESS: at 62.42% examples, 101642 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:42:37,375 : INFO : EPOCH 2 - PROGRESS: at 62.69% examples, 101639 words/s, in_qsiz

2020-03-30 23:43:44,515 : INFO : EPOCH 2 - PROGRESS: at 81.14% examples, 101798 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:45,599 : INFO : EPOCH 2 - PROGRESS: at 81.41% examples, 101811 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:46,702 : INFO : EPOCH 2 - PROGRESS: at 81.73% examples, 101818 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:47,751 : INFO : EPOCH 2 - PROGRESS: at 82.02% examples, 101810 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:48,770 : INFO : EPOCH 2 - PROGRESS: at 82.31% examples, 101814 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:49,807 : INFO : EPOCH 2 - PROGRESS: at 82.58% examples, 101812 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:50,807 : INFO : EPOCH 2 - PROGRESS: at 82.83% examples, 101819 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:51,841 : INFO : EPOCH 2 - PROGRESS: at 83.13% examples, 101819 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:43:52,859 : INFO : EPOCH 2 - PROGRESS: at 83.40% examples, 101823 words/s, in_qsiz

2020-03-30 23:44:57,183 : INFO : EPOCH 3 - PROGRESS: at 0.23% examples, 89287 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:44:58,253 : INFO : EPOCH 3 - PROGRESS: at 0.53% examples, 98424 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:44:59,300 : INFO : EPOCH 3 - PROGRESS: at 0.81% examples, 101135 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:45:00,336 : INFO : EPOCH 3 - PROGRESS: at 1.10% examples, 100944 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:45:01,357 : INFO : EPOCH 3 - PROGRESS: at 1.38% examples, 101287 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:45:02,374 : INFO : EPOCH 3 - PROGRESS: at 1.66% examples, 101558 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:45:03,400 : INFO : EPOCH 3 - PROGRESS: at 1.95% examples, 101686 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:45:04,408 : INFO : EPOCH 3 - PROGRESS: at 2.24% examples, 102007 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:45:05,461 : INFO : EPOCH 3 - PROGRESS: at 2.55% examples, 101800 words/s, in_qsize 0, out_qs

2020-03-30 23:46:13,018 : INFO : EPOCH 3 - PROGRESS: at 20.56% examples, 102599 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:14,040 : INFO : EPOCH 3 - PROGRESS: at 20.87% examples, 102587 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:15,060 : INFO : EPOCH 3 - PROGRESS: at 21.14% examples, 102592 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:16,075 : INFO : EPOCH 3 - PROGRESS: at 21.43% examples, 102601 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:17,085 : INFO : EPOCH 3 - PROGRESS: at 21.71% examples, 102618 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:18,095 : INFO : EPOCH 3 - PROGRESS: at 21.99% examples, 102614 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:19,119 : INFO : EPOCH 3 - PROGRESS: at 22.28% examples, 102610 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:20,152 : INFO : EPOCH 3 - PROGRESS: at 22.56% examples, 102599 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:46:21,156 : INFO : EPOCH 3 - PROGRESS: at 22.86% examples, 102595 words/s, in_qsiz

2020-03-30 23:47:28,527 : INFO : EPOCH 3 - PROGRESS: at 41.26% examples, 102019 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:29,587 : INFO : EPOCH 3 - PROGRESS: at 41.58% examples, 101986 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:30,612 : INFO : EPOCH 3 - PROGRESS: at 41.88% examples, 101993 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:31,640 : INFO : EPOCH 3 - PROGRESS: at 42.17% examples, 101991 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:32,653 : INFO : EPOCH 3 - PROGRESS: at 42.44% examples, 102002 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:33,662 : INFO : EPOCH 3 - PROGRESS: at 42.70% examples, 102016 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:34,684 : INFO : EPOCH 3 - PROGRESS: at 42.98% examples, 102016 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:35,739 : INFO : EPOCH 3 - PROGRESS: at 43.29% examples, 101997 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:47:36,782 : INFO : EPOCH 3 - PROGRESS: at 43.58% examples, 101989 words/s, in_qsiz

2020-03-30 23:48:45,295 : INFO : EPOCH 3 - PROGRESS: at 61.61% examples, 101224 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:46,353 : INFO : EPOCH 3 - PROGRESS: at 61.91% examples, 101215 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:47,402 : INFO : EPOCH 3 - PROGRESS: at 62.17% examples, 101205 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:48,461 : INFO : EPOCH 3 - PROGRESS: at 62.55% examples, 101154 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:49,488 : INFO : EPOCH 3 - PROGRESS: at 62.81% examples, 101160 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:50,545 : INFO : EPOCH 3 - PROGRESS: at 63.13% examples, 101146 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:51,632 : INFO : EPOCH 3 - PROGRESS: at 63.40% examples, 101123 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:52,701 : INFO : EPOCH 3 - PROGRESS: at 63.67% examples, 101111 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:48:53,741 : INFO : EPOCH 3 - PROGRESS: at 63.95% examples, 101109 words/s, in_qsiz

2020-03-30 23:50:01,971 : INFO : EPOCH 3 - PROGRESS: at 82.07% examples, 100646 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:03,034 : INFO : EPOCH 3 - PROGRESS: at 82.34% examples, 100640 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:04,109 : INFO : EPOCH 3 - PROGRESS: at 82.63% examples, 100630 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:05,167 : INFO : EPOCH 3 - PROGRESS: at 82.88% examples, 100623 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:06,268 : INFO : EPOCH 3 - PROGRESS: at 83.19% examples, 100603 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:07,303 : INFO : EPOCH 3 - PROGRESS: at 83.44% examples, 100606 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:08,336 : INFO : EPOCH 3 - PROGRESS: at 83.71% examples, 100608 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:09,386 : INFO : EPOCH 3 - PROGRESS: at 83.99% examples, 100605 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:50:10,412 : INFO : EPOCH 3 - PROGRESS: at 84.26% examples, 100610 words/s, in_qsiz

2020-03-30 23:51:14,810 : INFO : EPOCH 4 - PROGRESS: at 1.02% examples, 96035 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:15,856 : INFO : EPOCH 4 - PROGRESS: at 1.28% examples, 95063 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:16,946 : INFO : EPOCH 4 - PROGRESS: at 1.56% examples, 95151 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:17,969 : INFO : EPOCH 4 - PROGRESS: at 1.81% examples, 94914 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:19,056 : INFO : EPOCH 4 - PROGRESS: at 2.12% examples, 95126 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:20,146 : INFO : EPOCH 4 - PROGRESS: at 2.41% examples, 95340 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:21,176 : INFO : EPOCH 4 - PROGRESS: at 2.68% examples, 95990 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:22,244 : INFO : EPOCH 4 - PROGRESS: at 2.98% examples, 96209 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:51:23,246 : INFO : EPOCH 4 - PROGRESS: at 3.22% examples, 96795 words/s, in_qsize 0, out_qsize 0
2

2020-03-30 23:52:32,149 : INFO : EPOCH 4 - PROGRESS: at 20.87% examples, 98112 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:33,205 : INFO : EPOCH 4 - PROGRESS: at 21.14% examples, 98129 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:34,219 : INFO : EPOCH 4 - PROGRESS: at 21.43% examples, 98193 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:35,264 : INFO : EPOCH 4 - PROGRESS: at 21.71% examples, 98221 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:36,309 : INFO : EPOCH 4 - PROGRESS: at 21.99% examples, 98229 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:37,362 : INFO : EPOCH 4 - PROGRESS: at 22.28% examples, 98244 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:38,409 : INFO : EPOCH 4 - PROGRESS: at 22.52% examples, 98159 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:39,439 : INFO : EPOCH 4 - PROGRESS: at 22.78% examples, 97985 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:52:40,455 : INFO : EPOCH 4 - PROGRESS: at 23.04% examples, 98024 words/s, in_qsize 0, out_

2020-03-30 23:53:49,436 : INFO : EPOCH 4 - PROGRESS: at 41.61% examples, 98643 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:50,502 : INFO : EPOCH 4 - PROGRESS: at 41.91% examples, 98646 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:51,511 : INFO : EPOCH 4 - PROGRESS: at 42.20% examples, 98676 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:52,525 : INFO : EPOCH 4 - PROGRESS: at 42.47% examples, 98708 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:53,610 : INFO : EPOCH 4 - PROGRESS: at 42.75% examples, 98752 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:54,661 : INFO : EPOCH 4 - PROGRESS: at 43.05% examples, 98757 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:55,708 : INFO : EPOCH 4 - PROGRESS: at 43.36% examples, 98765 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:56,800 : INFO : EPOCH 4 - PROGRESS: at 43.64% examples, 98801 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:53:57,854 : INFO : EPOCH 4 - PROGRESS: at 43.94% examples, 98741 words/s, in_qsize 0, out_

2020-03-30 23:55:07,369 : INFO : EPOCH 4 - PROGRESS: at 61.80% examples, 98291 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:08,373 : INFO : EPOCH 4 - PROGRESS: at 62.05% examples, 98272 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:09,444 : INFO : EPOCH 4 - PROGRESS: at 62.42% examples, 98231 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:10,512 : INFO : EPOCH 4 - PROGRESS: at 62.69% examples, 98231 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:11,559 : INFO : EPOCH 4 - PROGRESS: at 62.98% examples, 98240 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:12,650 : INFO : EPOCH 4 - PROGRESS: at 63.28% examples, 98226 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:13,665 : INFO : EPOCH 4 - PROGRESS: at 63.53% examples, 98207 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:14,726 : INFO : EPOCH 4 - PROGRESS: at 63.80% examples, 98210 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:55:15,775 : INFO : EPOCH 4 - PROGRESS: at 64.10% examples, 98219 words/s, in_qsize 0, out_

2020-03-30 23:56:24,548 : INFO : EPOCH 4 - PROGRESS: at 82.75% examples, 98896 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:25,641 : INFO : EPOCH 4 - PROGRESS: at 83.04% examples, 98913 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:26,707 : INFO : EPOCH 4 - PROGRESS: at 83.33% examples, 98912 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:27,787 : INFO : EPOCH 4 - PROGRESS: at 83.58% examples, 98904 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:28,789 : INFO : EPOCH 4 - PROGRESS: at 83.87% examples, 98922 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:29,805 : INFO : EPOCH 4 - PROGRESS: at 84.13% examples, 98936 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:30,815 : INFO : EPOCH 4 - PROGRESS: at 84.40% examples, 98950 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:31,848 : INFO : EPOCH 4 - PROGRESS: at 84.68% examples, 98959 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:56:32,896 : INFO : EPOCH 4 - PROGRESS: at 84.98% examples, 98963 words/s, in_qsize 0, out_

2020-03-30 23:57:36,831 : INFO : EPOCH 5 - PROGRESS: at 2.21% examples, 101373 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:37,887 : INFO : EPOCH 5 - PROGRESS: at 2.52% examples, 101190 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:38,908 : INFO : EPOCH 5 - PROGRESS: at 2.79% examples, 101359 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:39,923 : INFO : EPOCH 5 - PROGRESS: at 3.07% examples, 101540 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:41,013 : INFO : EPOCH 5 - PROGRESS: at 3.34% examples, 101735 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:42,050 : INFO : EPOCH 5 - PROGRESS: at 3.65% examples, 101705 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:43,073 : INFO : EPOCH 5 - PROGRESS: at 3.91% examples, 101763 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:44,145 : INFO : EPOCH 5 - PROGRESS: at 4.19% examples, 102084 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:57:45,199 : INFO : EPOCH 5 - PROGRESS: at 4.48% examples, 101921 words/s, in_qsize 0, out_

2020-03-30 23:58:52,926 : INFO : EPOCH 5 - PROGRESS: at 22.43% examples, 101657 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:58:53,990 : INFO : EPOCH 5 - PROGRESS: at 22.72% examples, 101615 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:58:55,044 : INFO : EPOCH 5 - PROGRESS: at 23.00% examples, 101573 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:58:56,050 : INFO : EPOCH 5 - PROGRESS: at 23.23% examples, 101487 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:58:57,085 : INFO : EPOCH 5 - PROGRESS: at 23.53% examples, 101385 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:58:58,156 : INFO : EPOCH 5 - PROGRESS: at 23.79% examples, 101239 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:58:59,191 : INFO : EPOCH 5 - PROGRESS: at 24.09% examples, 101124 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:59:00,285 : INFO : EPOCH 5 - PROGRESS: at 24.42% examples, 101067 words/s, in_qsize 0, out_qsize 0
2020-03-30 23:59:01,324 : INFO : EPOCH 5 - PROGRESS: at 24.70% examples, 101072 words/s, in_qsiz

2020-03-31 00:00:08,819 : INFO : EPOCH 5 - PROGRESS: at 42.77% examples, 100478 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:09,856 : INFO : EPOCH 5 - PROGRESS: at 43.08% examples, 100481 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:10,912 : INFO : EPOCH 5 - PROGRESS: at 43.38% examples, 100473 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:12,010 : INFO : EPOCH 5 - PROGRESS: at 43.67% examples, 100494 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:13,056 : INFO : EPOCH 5 - PROGRESS: at 43.98% examples, 100487 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:14,077 : INFO : EPOCH 5 - PROGRESS: at 44.25% examples, 100501 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:15,081 : INFO : EPOCH 5 - PROGRESS: at 44.53% examples, 100525 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:16,152 : INFO : EPOCH 5 - PROGRESS: at 44.80% examples, 100532 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:00:17,233 : INFO : EPOCH 5 - PROGRESS: at 45.07% examples, 100560 words/s, in_qsiz

2020-03-31 00:01:24,663 : INFO : EPOCH 5 - PROGRESS: at 63.43% examples, 100922 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:25,666 : INFO : EPOCH 5 - PROGRESS: at 63.69% examples, 100939 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:26,672 : INFO : EPOCH 5 - PROGRESS: at 63.95% examples, 100912 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:27,752 : INFO : EPOCH 5 - PROGRESS: at 64.25% examples, 100895 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:28,838 : INFO : EPOCH 5 - PROGRESS: at 64.55% examples, 100876 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:29,878 : INFO : EPOCH 5 - PROGRESS: at 64.79% examples, 100875 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:30,963 : INFO : EPOCH 5 - PROGRESS: at 65.08% examples, 100896 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:31,984 : INFO : EPOCH 5 - PROGRESS: at 65.35% examples, 100906 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:01:33,060 : INFO : EPOCH 5 - PROGRESS: at 65.70% examples, 100887 words/s, in_qsiz

2020-03-31 00:02:41,016 : INFO : EPOCH 5 - PROGRESS: at 83.80% examples, 100631 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:42,026 : INFO : EPOCH 5 - PROGRESS: at 84.05% examples, 100641 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:43,091 : INFO : EPOCH 5 - PROGRESS: at 84.33% examples, 100634 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:44,103 : INFO : EPOCH 5 - PROGRESS: at 84.61% examples, 100642 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:45,157 : INFO : EPOCH 5 - PROGRESS: at 84.90% examples, 100639 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:46,173 : INFO : EPOCH 5 - PROGRESS: at 85.17% examples, 100647 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:47,198 : INFO : EPOCH 5 - PROGRESS: at 85.44% examples, 100651 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:48,235 : INFO : EPOCH 5 - PROGRESS: at 85.71% examples, 100652 words/s, in_qsize 0, out_qsize 0
2020-03-31 00:02:49,275 : INFO : EPOCH 5 - PROGRESS: at 85.99% examples, 100653 words/s, in_qsiz

In [6]:
model.save(result_dir_path+'CORD-19-FT_200d_5w_5i_5mc.model')


2020-03-31 00:03:50,152 : INFO : saving FastText object under /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_200d_5w_5i_5mc.model, separately None
2020-03-31 00:03:50,153 : INFO : storing np array 'vectors' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_200d_5w_5i_5mc.model.wv.vectors.npy
2020-03-31 00:03:50,228 : INFO : storing np array 'vectors_vocab' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_200d_5w_5i_5mc.model.wv.vectors_vocab.npy
2020-03-31 00:03:50,299 : INFO : storing np array 'vectors_ngrams' to /home/santosh/Work/models/word2vec/CORD-19/CORD-19-FT_200d_5w_5i_5mc.model.wv.vectors_ngrams.npy
2020-03-31 00:04:04,371 : INFO : not storing attribute vectors_norm
2020-03-31 00:04:04,373 : INFO : not storing attribute vectors_vocab_norm
2020-03-31 00:04:04,373 : INFO : not storing attribute vectors_ngrams_norm
2020-03-31 00:04:04,374 : INFO : not storing attribute buckets_word
2020-03-31 00:04:04,375 : INFO : storing np array 'syn1neg' to /home/santosh

In [7]:
model.wv.most_similar('SARS-CoV-2', topn=10)

2020-03-31 00:04:50,164 : INFO : precomputing L2-norms of word weight vectors
2020-03-31 00:04:50,396 : INFO : precomputing L2-norms of ngram weight vectors


[('SARS-CoV-2-F2', 0.9803857207298279),
 ('SARS-CoV-2-F1', 0.9775040149688721),
 ('SARS-CoV-', 0.9563692212104797),
 ('2019-nCoV/SARS-CoV-2', 0.9542135000228882),
 ('SARS-CoV-2.', 0.9538172483444214),
 ('SARS-CoV-NC', 0.9490789175033569),
 ('SARS-CoV-ΔE', 0.9482947587966919),
 ('SARS-CoV-cpsR-19', 0.9448564648628235),
 ('SARS-CoV-ΔE-8a-dup', 0.9360930919647217),
 ('SARS-CoV-1', 0.9319473505020142)]

# Generate Paragraph Embeddings