In [1]:
!pip install rank_bm25
!pip install transformers

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/d2/e4/38d03d6d5e2deae8d2838b81d6ba2742475ced42045f5c46aeb00c5fb79c/rank_bm25-0.2.tar.gz
Building wheels for collected packages: rank-bm25
  Building wheel for rank-bm25 (setup.py) ... [?25l[?25hdone
  Created wheel for rank-bm25: filename=rank_bm25-0.2-cp36-none-any.whl size=4162 sha256=564cfdc0fd1cd9b3396a45b39beee6a77592c04c3db8254e96dc7a667fc2f5cb
  Stored in directory: /root/.cache/pip/wheels/6f/0c/1f/78945dd6a5478bbcdb50d73ac96ae5af2ffcdfcd374fd9b1bf
Successfully built rank-bm25
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/37/ba/dda44bbf35b071441635708a3dd568a5ca6bf29f77389f7c7c6818ae9498/transformers-2.7.0-py3-none-any.whl (544kB)
[K     |████████████████████████████████| 552kB 4.6MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73

In [2]:
import os
import re
import pickle
from gensim.summarization.summarizer import summarize
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from rank_bm25 import BM25Okapi

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
from sklearn.manifold import TSNE
import torch
import scipy as sp
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from scipy.spatial import distance
from IPython.display import HTML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import plotly.express as px
sns.set()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


  import pandas.util.testing as tm


# Read files and preprocess

The data is split across four CSV files, so we read each one and merge them into a single DataFrame.

Preprocess the texts by filling missing values, lowercasing, and removing URLs.

Extract keywords from the titles and abstracts and set as new columns

In [0]:
# Location of the CSV exports and the files to load from it
data_path = "/content/drive/My Drive/Colab Notebooks/data/cord19"
data_files = [
    "biorxiv_clean.csv",
    "clean_noncomm_use.csv",
    'clean_comm_use.csv',
    'clean_pmc.csv',
]  # alternative source: ['all_sources_metadata_2020-03-13.csv']

# Load every file, then stack the frames under a fresh 0..n-1 index
dfs = [pd.read_csv(os.path.join(data_path, file)) for file in data_files]
data = pd.concat(dfs, ignore_index=True)
# Keep only the first row seen for each title
data = data.drop_duplicates(subset=["title"])
# data = data.iloc[0:20]  # uncomment to experiment on a small sample

def preprocess(data):
    """Return a copy of ``data`` with cleaned text columns added.

    The input frame is not modified. In the returned copy, missing
    ``abstract`` and ``title`` values are replaced by empty strings and the
    following columns are added:

    * ``abstract_processed``, ``title_processed``, ``text_processed`` --
      the cleaned versions of the respective columns (a missing ``text`` is
      treated as an empty string);
    * ``title_abstract_processed`` -- cleaned title + ``'. '`` + cleaned abstract;
    * ``text_abstract_processed`` -- cleaned title + ``'. '`` + cleaned text
      (despite its name, it does not contain the abstract).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``title``, ``abstract`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
    """

    def preprocess_(x):
        """Lowercase ``x`` and strip URLs, newlines and numeric citations."""
        # Lowercase
        x = x.lower()
        # Remove urls
        x = re.sub(r'https?://\S+|www\.\S+', '', x)
        # Collapse newlines to a single space; deleting them outright would
        # glue the last word of one line to the first word of the next.
        x = re.sub(r'\s*\n\s*', ' ', x)
        # Remove complete bracketed numeric citations such as "[3]",
        # "[12, 13]" or "[4-6]". The closing bracket is required, so ordinary
        # bracketed text is left untouched.
        x = re.sub(r'\[\s*\d+(?:\s*[,\-\u2013]\s*\d+)*\s*\]', '', x)
        return x.strip()

    data = data.copy()

    # Filling the empty abstracts
    data['abstract'] = data['abstract'].fillna('')
    # Filling the empty titles
    data['title'] = data['title'].fillna('')

    # Preprocess
    data['abstract_processed'] = data['abstract'].apply(preprocess_)
    data['title_processed'] = data['title'].apply(preprocess_)
    # A missing body would otherwise reach preprocess_ as a float NaN and
    # fail on .lower(); the raw 'text' column itself is left as it was.
    data['text_processed'] = data['text'].fillna('').apply(preprocess_)
    data['title_abstract_processed'] = data['title_processed'] + '. ' + data['abstract_processed']
    data['text_abstract_processed'] = data['title_processed'] + '. ' + data['text_processed']

    return data


def tokenize(text):
    """Split ``text`` on whitespace into punctuation-trimmed content tokens.

    Leading and trailing punctuation is stripped from every token first, so
    that stop words glued to punctuation (``"the,"``) are recognised, and
    tokens that consist only of punctuation are dropped instead of being
    returned as empty strings.

    Parameters
    ----------
    text : str
        Text to tokenize; expected to be lowercased already, because it is
        compared against the module-level ``stop_words`` as-is.

    Returns
    -------
    list of str
    """
    # Set membership is O(1); `stop_words` itself is a list.
    stop_set = set(stop_words)
    stripped = (word.strip(string.punctuation) for word in text.split())
    return [token for token in stripped if token and token not in stop_set]


def create_sentences_df(data):
    """Split every paper's ``text_processed`` into a table of sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``text_processed`` column of strings. Its index labels are
        used as paper identifiers.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id`` (index label of the source paper in ``data``) and
        ``Sentence``. Only sentences of 10 to 510 characters are kept, and
        every period that is not followed by a digit is removed, so decimal
        numbers such as ``3.5`` survive. The index is not reset after
        filtering.
    """
    ids = []
    sentences = []
    # Only one column is needed, so iterate over it directly instead of
    # materialising a full row Series per paper with iterrows().
    for idx, text in data['text_processed'].items():
        # Create list of sentences from the paper's text
        sents = nltk.sent_tokenize(text)
        # Repeat the paper's index label once per sentence
        ids.extend([idx] * len(sents))
        sentences.extend(sents)

    df = pd.DataFrame({"Id": ids, "Sentence": sentences})

    # Drop sentences that are too short or too long. The explicit copy makes
    # the column assignment below operate on an independent frame rather
    # than on a slice of `df`.
    lengths = df['Sentence'].map(len)
    df = df[(lengths >= 10) & (lengths <= 510)].copy()

    # Remove period at the end of the sentence and also any period not
    # followed by a digit (raw string: '\.' is not a valid str escape).
    df['Sentence'] = df['Sentence'].apply(lambda x: re.sub(r'\.(?!\d)', '', x))

    return df

# Preprocess data: `data` is rebound to the cleaned copy returned by
# preprocess(), and the sentence table is built from its `text_processed`
# column.
data = preprocess(data)
sentences_df = create_sentences_df(data)


# Create Word Embeddings

In [0]:
# Tokenize every sentence for Word2Vec. The token lists get their own name so
# that `sentences_df` stays a DataFrame: rebinding it to the token Series made
# `sentences_df['Sentence']` fail when this cell was run a second time.
tokenized_sentences = sentences_df['Sentence'].apply(tokenize)
model = Word2Vec(
    tokenized_sentences,
    size=200,
    window=5,
    min_count=10,
    sg=1,
    workers=10,
    iter=10)
# Normalize the word embeddings
model.init_sims(replace=True)

# Query a word and see similar words

In [25]:
def query_synonyms(query_term, embedding_model, top_n=5):
    """Return the ``top_n`` words closest to ``query_term`` in the embedding.

    The neighbours come from ``embedding_model.wv.most_similar``; only the
    words are returned, in the order given by that call, and the similarity
    scores are discarded.
    """
    neighbours = embedding_model.wv.most_similar(query_term, topn=top_n)
    return [term for term, *_ in neighbours]
    
def plot_tsne(word, model, random_state=None):
    """Plot ``word`` and its 30 nearest neighbours in a 2-D t-SNE projection.

    Parameters
    ----------
    word : str
        Query word. An empty string is a no-op.
    model : Word2Vec-like
        Object exposing its keyed vectors as ``model.wv``.
    random_state : int or None, optional
        Passed to ``TSNE``. Set it to get the same layout on every run; the
        default keeps the previous (unseeded) behaviour.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    if word == "":
        return

    neighbours = [term for term, *_ in model.wv.most_similar(word, topn=30)]
    words = neighbours + [word]
    # Index the keyed vectors directly: `model[words]` is deprecated and
    # scheduled for removal in gensim 4.0 (see the warning this cell emitted).
    X = model.wv[words]

    tsne = TSNE(n_components=2, random_state=random_state)
    X_tsne = tsne.fit_transform(X)

    df = pd.DataFrame(X_tsne, index=words, columns=['x', 'y'])

    # Label each point with its word
    fig = px.scatter(df, x="x", y="y", text=df.index)
    fig.update_traces(textposition='top center')
    fig.show()
    

# Show 'covid-19' and its nearest neighbours in the embedding trained above
plot_tsne('covid-19', model)


Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



Since different authors might refer to COVID-19 by different terms, we should include all the terms in our first search query, so that no papers are missed.

**Similar terms:** 2019-ncov, sars-cov-2, covid-2019



# Retrieve documents based on query

Here we can search for papers on covid-19 by using a BM25 search engine.

By keeping only the relevant documents, we can greatly reduce our search space.

In [0]:
# One document per paper: the cleaned title followed by the cleaned abstract
documents = data['title_abstract_processed'].values
# Index the corpus as one list of word tokens per document
tokenized_corpus = list(map(nltk.word_tokenize, documents))
bm25 = BM25Okapi(tokenized_corpus)

In [5]:
def display_summaries(X, top_n=10):
    """Display a summary of the first ``top_n`` abstracts in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with an ``abstract_processed`` column. This argument is what
        gets summarised; the function no longer reads the global
        ``retrieved_df``.
    top_n : int, optional
        Number of leading rows to summarise (default 10).

    Returns
    -------
    None
        Output is rendered with ``display``.
    """
    from html import escape  # stdlib; imported here so the cell is self-contained

    display(HTML("<b>SUMMARIES OF THE ABSTRACTS</b>"))
    for abstract in X['abstract_processed'].iloc[0:top_n]:
        try:
            summary = summarize(abstract, ratio=0.2)
        except Exception:
            # Best effort: abstracts the summariser rejects (presumably ones
            # that are empty or too short) are skipped. `Exception` instead
            # of a bare `except` so that KeyboardInterrupt still stops the
            # loop.
            continue
        # Escape the text so characters such as '<' or '&' in an abstract
        # cannot break the generated markup.
        display(HTML("<p><font color='#3B57BD'>" + escape(summary) + "</font></p>"))
        print('')

def process_query(query, expand=False):
    """Turn a free-text query into a list of lowercase search tokens.

    Tokens that consist only of punctuation (for example the commas that
    separate search terms) are dropped, so they do not take part in scoring.

    Parameters
    ----------
    query : str
        Free-text query.
    expand : bool, optional
        If true, the 3 nearest neighbours of every query token in the global
        word2vec ``model`` are appended. Tokens unknown to the model are
        kept but not expanded.

    Returns
    -------
    list of str
        The query tokens, followed by any expansion terms.
    """
    query_list = [
        token for token in nltk.word_tokenize(query.lower())
        if not all(char in string.punctuation for char in token)
    ]

    if expand:
        # Iterate over a snapshot: the list grows while synonyms are added.
        for word in list(query_list):
            try:
                query_list.extend(query_synonyms(word, model, 3))
            except KeyError:
                # Word is not in the embedding vocabulary: nothing to add.
                continue
    return query_list

def filter_top_docs(data, scores, top_n=300):
    """Return the ``top_n`` best-scoring rows of ``data``.

    A ``score`` column holding ``scores`` is added to a copy of ``data``;
    rows whose score is not strictly positive are discarded and the rest are
    returned in descending score order, truncated to ``top_n`` rows. The
    input frame is left untouched.
    """
    scored = data.assign(score=scores)
    matching = scored.loc[scored['score'] > 0]
    ranked = matching.sort_values('score', ascending=False)
    return ranked.iloc[:top_n]

# Search terms: the different names used for the disease / virus.
# NOTE(review): 'sars-cov-2' is listed twice -- looks unintentional; confirm.
query = "covid-19, covid19, 2019-ncov, sars-cov-2, covid-2019, sars-cov-2"
# Lowercase and tokenize the query (no synonym expansion)
query = process_query(query, expand=False)

# Score the indexed title+abstract documents against the query tokens
scores = bm25.get_scores(query)

# Keep at most the 1000 best-scoring papers that have a positive score
retrieved_df = filter_top_docs(data, scores, top_n=1000)


# Create sentences df from the retrieved_df papers.
# This has to happen before the column selection below, which drops the
# `text_processed` column that create_sentences_df reads.
sentences_df = create_sentences_df(retrieved_df)

# Keep only the columns used from here on: title and authors (shown next to
# each search result) and abstract_processed (summarised just below)
retrieved_df = retrieved_df[['title', 'abstract_processed', 'authors']]

# Display summaries of the retrieved_df papers' abstracts
display_summaries(retrieved_df)































# Create local word2vec model on the retrieved documents

In [0]:
def tokenize(text):
    """Split ``text`` on whitespace into punctuation-trimmed content tokens.

    Note: this re-defines (and shadows) the helper of the same name declared
    earlier in the notebook.

    Leading and trailing punctuation is stripped from every token first, so
    that stop words glued to punctuation (``"the,"``) are recognised, and
    tokens that consist only of punctuation are dropped instead of being
    returned as empty strings.

    Parameters
    ----------
    text : str
        Text to tokenize; expected to be lowercased already, because it is
        compared against the module-level ``stop_words`` as-is.

    Returns
    -------
    list of str
    """
    # Set membership is O(1); `stop_words` itself is a list.
    stop_set = set(stop_words)
    stripped = (word.strip(string.punctuation) for word in text.split())
    return [token for token in stripped if token and token not in stop_set]

# Token lists for every retrieved sentence, stored on the frame itself
sentences_df['Tokenized'] = sentences_df['Sentence'].map(tokenize)

# Word2Vec model trained only on the sentences of the retrieved papers
word2vec = Word2Vec(
    sentences=sentences_df['Tokenized'],
    sg=1,
    size=200,
    window=5,
    min_count=10,
    iter=100,
    workers=10,
)

# Replace the raw vectors with their unit-normalized versions
word2vec.init_sims(replace=True)


# Prepare Tfidf matrix

In [0]:
# Vectorize the sentences
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(sentences_df.Sentence.values)

# Search for answers
#### Now we're ready to start retrieving answers to COVID-19 questions.

#### Method 1: Tf-Idf and Cosine Similarity
#### Method 2: Word2Vec Embeddings and Word Mover's Distance

In [0]:
def display_results(query, sentences_df, top_n=20):
    """Display the ``top_n`` best-ranked sentences with query words highlighted.

    Parameters
    ----------
    query : str
        The search query; its word tokens are the ones highlighted.
    sentences_df : pandas.DataFrame
        Needs the columns ``Id`` (index label of the paper in the global
        ``retrieved_df``), ``Sentence`` and ``Rank`` (smaller is better).
    top_n : int, optional
        Number of sentences to show (default 20).

    Returns
    -------
    None
        For every sentence three items are rendered with ``display``: the
        paper title with its first author, the sentence, and a rule.
    """
    from html import escape  # stdlib; imported here so the cell is self-contained

    query_tokens = set(nltk.word_tokenize(query))
    to_display = sentences_df.sort_values('Rank', ascending=True).iloc[0:top_n]

    for paper_id, sentence_text in zip(to_display['Id'], to_display['Sentence']):

        # Paper metadata is looked up in the global `retrieved_df`
        paper = retrieved_df.loc[paper_id]
        title = escape(str(paper['title']))
        # Only the text before the first comma of the author string is shown
        authors = escape(str(paper['authors']).split(',')[0] + '. et al.')

        # Display title of the paper and the authors
        display(HTML("<font color='black'><b>" + title + ". </b>" + authors + "</font>"))

        # Wrap words that occur in the query in a highlighted span. Joining
        # the pieces directly (instead of printing the list and deleting
        # brackets, commas and quotes afterwards) keeps the sentence's own
        # punctuation intact.
        text = ' '.join(
            "<span style='background-color:#9ae59a'>" + escape(word) + "</span>"
            if word in query_tokens else escape(word)
            for word in nltk.word_tokenize(sentence_text)
        )

        # Display the sentence
        display(HTML("<p><font color='#3B57BD'>" + text + "</font></p>"))
        display(HTML("<hr>"))



def search(query, method='tfidf', top_n=20):
    """Rank the retrieved sentences against ``query`` and display the best.

    The distances are written to the ``Rank`` column of the global
    ``sentences_df`` (smaller means closer) and the result is rendered with
    ``display_results``.

    Parameters
    ----------
    query : str
        Free-text question; it is lowercased and stripped before use.
    method : {'tfidf', 'wmd'}, optional
        ``'tfidf'`` ranks by cosine distance between the TF-IDF vectors of
        the query and each sentence; ``'wmd'`` ranks by the Word Mover's
        Distance computed by the local ``word2vec`` model on the
        ``Tokenized`` column.
    top_n : int, optional
        Number of sentences to display (default 20).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously the call
        silently did nothing).
    """
    # Lower case and remove leading/trailing whitespace
    query = query.lower().strip()

    if method == 'tfidf':
        query_modified = vectorizer.transform([query])
        # cosine_distances returns one column per query; flatten it to 1-D
        # before it is stored as a single DataFrame column.
        distances = cosine_distances(tfidf_matrix, query_modified).ravel()
    elif method == 'wmd':
        query_modified = tokenize(query)
        distances = [word2vec.wv.wmdistance(query_modified, sentence)
                     for sentence in sentences_df['Tokenized']]
    else:
        raise ValueError(
            "Unknown search method %r; expected 'tfidf' or 'wmd'" % (method,))

    sentences_df['Rank'] = distances
    display_results(query, sentences_df, top_n=top_n)


In [23]:
# Example question, ranked with Word Mover's Distance (method='wmd')
query = 'infants of mothers with covid-19'
search(query, method='wmd')

In [22]:
# Example question, ranked with TF-IDF cosine distance (method='tfidf')
query = 'neonates of mothers with covid-19'
search(query, method='tfidf')