In [1]:
from vectorai import ViClient
import json
import xmltodict as xtd
import numpy as np
import os
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
from joblib import Parallel, delayed
from gensim import corpora
from gensim import models
import pandas as pd
from gensim import similarities
from operator import itemgetter
import glob
import dask.bag as db
from dask.distributed import Client, progress
from vectorai.models.deployed import ViText2Vec

In [2]:
client = Client(n_workers=6, threads_per_worker=2)

client

0,1
Client  Scheduler: tcp://127.0.0.1:51227  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 6  Cores: 12  Memory: 17.13 GB


In [3]:
username = 'cesar-cabeza'
api_key = 'amdXU0VYVUJXN1RRYmVKWGUwcG46Ni1zRW15R01SNFNmLUZ4Ry00MHFmZw'

vi_client = ViClient(username=username, api_key = api_key)
text_encoder = ViText2Vec(username, api_key)

Logged in. Welcome cesar-cabeza. To view list of available collections, call list_collections() method.


In [6]:
vi_client.delete_collection("covid-19")

{'status': 'error', 'message': 'covid-19 does not exist'}

In [7]:
path_test = "B:/document_parser/document_parses/test"

In [13]:
def load_corpus(files_path, client, text_encoder):
    files = glob.glob(files_path + "/*.json")
    stopset = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    b = db.from_sequence(files).map(preprocess_json, stopset, stemmer)
    client.insert_documents("covid-19", b.compute(), models={
                            "title": text_encoder.encode, "abstract": text_encoder.encode}, workers = 12, chunksize = 20)

In [14]:
def preprocess_json(file_path, stopset, stemmer):
    with open(file_path) as file:
        file_json = json.load(file)
    returned = preprocess_document(file_json, stopset, stemmer)
    return returned

In [15]:
def preprocess_document(file, stopset, stemmer):
    returned = {}
    returned["_id"] = file["paper_id"]
    title = file["metadata"]["title"]
    abstract = " ".join([paragraph["text"] for paragraph in file["abstract"]])
    body = " ".join([paragraph["text"] for paragraph in file["body_text"]])
    returned["title"] = title
    returned["abstract"] = preprocess_document_part(abstract, stopset, stemmer)
    returned["body"] = preprocess_document_part(body, stopset, stemmer)
    return returned

In [16]:
def preprocess_document_part(part, stopset, stemmer):
    if part != "":  # actually has content
        tokens = [word.lower() for word in wordpunct_tokenize(
            part) if word.lower() not in stopset and len(word) > 2]
        stems = [stemmer.stem(token) for token in tokens]
        return " ".join(stems)
    return "no content"

In [17]:
load_corpus(path_test, vi_client, text_encoder)

HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




In [18]:
vi_client.collection_schema("covid-19")

{'abstract': 'text',
 'abstract_vector_': 'vector',
 'body': 'text',
 'insert_date_': 'date',
 'title': 'text',
 'title_vector_': 'vector'}

In [24]:
advanced_search_query = {
    'text' : {'vector': text_encoder.encode('what is the origin coronavirus'), 'fields' : ['abstract_vector_', "title_vector_"]}
}
results = vi_client.advanced_search('covid-19', advanced_search_query, page_size=10)

In [25]:
results

{'results': [{'_id': 'a3116a1ea2e157c8855023d968cc0f0bd6c9846f',
   'insert_date_': '2020-11-05T08:31:41.743675',
   'abstract': 'concern new coronaviru 2019 ncov global public health threat articl provid preliminari evolutionari molecular epidemiolog analysi new viru phylogenet tree built use avail whole genom sequenc 2019 ncov whole genom sequenc highli similar sequenc avail gene bank sar mer bat sar like coronaviru fubar analysi show nucleocapsid spike glycoprotein site posit pressur homolog model help explain molecular structur differ virus phylogenet tree show 2019 ncov significantli cluster bat sar like coronaviru sequenc isol 2015 wherea structur analysi reveal mutat nucleocapsid protein result 2019ncov could consid coronaviru distinct sar viru probabl transmit bat anoth host mutat confer upon abil infect human author funder right reserv reus allow without permiss',
   'title': 'The 2019-new coronavirus epidemic: evidence for virus evolution',
   'body': 'famili coronavirida com

In [26]:
[(result["title"],result["_search_score"]) for result in results["results"]]

[('The 2019-new coronavirus epidemic: evidence for virus evolution',
  0.8915696),
 ('Host and infectivity prediction of Wuhan 2019 novel coronavirus using deep learning algorithm',
  0.8064966),
 ('A Chinese Case of Coronavirus Disease 2019 (COVID-19) Did Not Show Infectivity During the Incubation Period: Based on an Epidemiological Survey',
  0.7831983999999999),
 ('Dinucleotide repeats in coronavirus SARS-CoV-2 genome: evolutionary implications',
  0.7324622000000001),
 ('Coronavirus and paramyxovirus in bats from Northwest Italy',
  0.7292778000000002),
 ('Preliminary studies on feline coronavirus distribution in naturally and experimentally infected cats',
  0.7255031999999999),
 ('Immune evasion of porcine enteric coronaviruses and viral modulation of antiviral innate signaling',
  0.7171661999999999),
 ('Is a 14-day quarantine period optimal for effectively controlling coronavirus disease 2019 (COVID-19)?',
  0.6793499999999999),
 ('Relative Coronavirus Disease 2019 Mortality: A

In [50]:
import dask.bag as db
from dask.distributed import Client, progress
client = Client(n_workers=6, threads_per_worker=2)

In [51]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:60739  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 6  Cores: 12  Memory: 17.13 GB


In [55]:
files = glob.glob("B:/document_parser/document_parses/test"+"/*.json")
print(files[0])
b = db.from_sequence(files).map(preprocess_json)

B:/document_parser/document_parses/test\000a0fc8bbef80410199e690191dc3076a290117.json


In [147]:
b.count().compute()

84420

In [56]:
vi_client.insert_documents("corpus-dask", b.compute(), models ={"title": text_encoder.encode, "abstract":text_encoder.encode})

HBox(children=(FloatProgress(value=0.0, max=37.0), HTML(value='')))




{'inserted_successfully': 556, 'failed': 0, 'failed_document_ids': []}

In [27]:
advanced_search_query = {
    'text' : {'vector': text_encoder.encode('what is the origin coronavirus'), 'fields' : ['abstract_vector_', "title_vector_"]}
}
resultsb = vi_client.advanced_search('corpus-dask', advanced_search_query, page_size=20)

In [32]:
resultsb

{'results': [{'_id': 'a3116a1ea2e157c8855023d968cc0f0bd6c9846f',
   'insert_date_': '2020-11-04T18:51:15.374255',
   'abstract': 'There is concern about a new coronavirus, the 2019-nCoV, as a global public health threat. In this article, we provide a preliminary evolutionary and molecular epidemiological analysis of this new virus. A phylogenetic tree has been built using the 15 available whole genome sequence of 2019-nCoV and 12 whole genome sequences highly similar sequences available in gene bank (5 from SARS, 2 from MERS and 5 from Bat SARS-like Coronavirus). FUBAR analysis shows that the Nucleocapsid and the Spike Glycoprotein has some sites under positive pressure while homology modelling helped to explain some molecular and structural differences between the viruses. The phylogenetic tree showed that 2019.nCoV significantly clustered with Bat SARS-like Coronavirus sequence isolated in 2015, whereas structural analysis revealed mutation in S and nucleocapsid proteins. From these 

In [28]:
vi_client.collection_schema("corpus-dask")

{'abstract': 'text',
 'abstract_vector_': 'vector',
 'body': 'text',
 'insert_date_': 'date',
 'title': 'text',
 'title_vector_': 'vector'}

In [29]:
[(result["_id"],result["_search_score"]) for result in resultsb["results"]]

[('a3116a1ea2e157c8855023d968cc0f0bd6c9846f', 0.9573084999999999),
 ('a05179c57d5e1cdddc839ff3c0b70a614efbb6cb', 0.8702154000000002),
 ('a18236e9c75536c022502b9a9d0efdc6b0a3f0c2', 0.8499500000000002),
 ('a45430e796de9329f33ca9a4ea936b3c569c5296', 0.8114971999999998),
 ('a09090caa2498f6ff886f0fd7b10a7e45818751c', 0.7965499999999999),
 ('00623268d0ce12f11d6b3acb0cc012821f592b93', 0.7556745999999999),
 ('008cf2148ee4383534191b3736096878cad82c67', 0.7367054999999998),
 ('a912f654028e787673b235c9ba2eb2d161e372aa', 0.7334266),
 ('a17525ba0365d376469cb2e8faeb0bad6d6ed682', 0.7328115),
 ('007075cfdca03250c97761eb0e9c8e1bc8a94d6c', 0.7250315999999999),
 ('0022361a636d679fc3493cc0b20bb6127bf63a6a', 0.7245579000000002),
 ('a1949f80e726d045fe1cccb24a0e911cc0b912b9', 0.7034755000000001),
 ('a03517f26664be79239bcdf3dbb0966913206a86', 0.7024539000000001),
 ('a911adbaf5828c2f84558d48e2e72e00810069aa', 0.6969185000000002),
 ('a007977dad90a07b3beb9f689e3be8b3f7d2a7f6', 0.6960899999999999),
 ('a1213a3700