# CORD Semantic Search using S-Bert and Annoy

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import re
pd.options.display.max_colwidth=160

## Data Preprocessing

In [2]:
# Regex of topic terms, read by clean_title to decide whether a short title is still worth keeping.
# Every alternative is wrapped in `.*` so the term may appear anywhere in the title.
rel_terms = '.*vir.*|.*sars.*|.*mers.*|.*corona.*|.*ncov.*|.*immun.*|.*nosocomial.*|.*epidem.*|.*emerg.*|.*vacc.*|.*cytokine.*'

def start(data):
    """Return a copy of ``data`` so the later pipeline steps never mutate the caller's object."""
    working_copy = data.copy()
    return working_copy

def clean_title(data):
    """Blank out junk titles in place and return the same frame.

    A title counts as junk when it is shorter than 30 characters and does not
    match any of the terms in ``rel_terms`` (case-insensitive). Junk titles are
    replaced with the empty string; missing titles are treated as empty for
    both checks.
    """
    titles = data.title.fillna('')
    is_relevant = titles.str.match(rel_terms, case=False)
    is_short = titles.map(len) < 30
    data.loc[is_short & ~is_relevant, 'title'] = ''
    return data


def clean_abstract(data):
    """Normalise the ``abstract`` column in place and return the same frame.

    Steps, in order:
      1. abstracts equal to the placeholder 'Unknown' become missing;
      2. missing abstracts fall back to the row's title;
      3. anything still missing becomes '' and the section labels
         (OBJECTIVE:, Publisher, Abstract, ...) are removed wherever they occur.
    """
    # The literal 'Unknown' is a placeholder, not real text
    is_placeholder = data.abstract == 'Unknown'
    data.loc[is_placeholder, 'abstract'] = np.nan

    # Fall back to the title when there is no abstract
    data.abstract = data.abstract.fillna(data.title)

    # Remove section labels / publisher words from the text
    boilerplate = re.compile('(OBJECTIVE:|Publisher|Abstract|Summary|BACKGROUND|INTRODUCTION)')
    data.abstract = data.abstract.fillna('').apply(lambda text: boilerplate.sub('', text))

    return data


def drop_missing(data):
    """Drop rows that carry no usable information and renumber the index.

    A row is kept if at least one of these holds: it has a publication date,
    it has a sha, its title is not '', or its abstract is not ''.
    """
    has_content = (
        data.published.notnull()
        | data.sha.notnull()
        | (data.title != '')
        | (data.abstract != '')
    )
    kept = data[has_content]
    return kept.reset_index(drop=True)


def fill_nulls(data):
    """Replace missing values with '' in the free-text columns, in place, and return the frame.

    Only authors, doi, journal and abstract are touched; other columns keep their nulls.
    """
    for column in ('authors', 'doi', 'journal', 'abstract'):
        data[column] = data[column].fillna('')
    return data


def rename_publish_time(data):
    """Return a new frame in which the ``publish_time`` column is called ``published``."""
    column_map = {'publish_time': 'published'}
    renamed = data.rename(columns=column_map)
    return renamed

def clean_metadata(metadata):
    """Run the whole cleaning pipeline on ``metadata`` and return the cleaned frame.

    The steps run in a fixed order: ``start`` copies the input first, so the
    caller's frame is not modified, and ``rename_publish_time`` runs before
    ``drop_missing`` because the latter reads the ``published`` column.
    """
    print('Cleaning metadata')
    steps = (
        start,
        clean_title,
        clean_abstract,
        rename_publish_time,
        drop_missing,
        fill_nulls,
    )
    cleaned = metadata
    for step in steps:
        cleaned = cleaned.pipe(step)
    return cleaned

## Data Loading

In [3]:
# Path of the metadata CSV read in the next cell, relative to the working directory
metadata_path = 'data/metadata.csv'

In [4]:
# Read these identifier columns as strings instead of letting pandas infer a dtype
dtypes = {'Microsoft Academic Paper ID': 'str', 'pubmed_id': str}
# Shorter names for two of the raw columns
renames = {'source_x': 'source', 'has_full_text': 'has_text'}
# publish_time is parsed as a date on load; clean_metadata later renames it to 'published'
metadata = pd.read_csv(metadata_path, dtype=dtypes, low_memory=False,
                       parse_dates=['publish_time']).rename(columns=renames)


## Data Cleaning

In [5]:
metadata = clean_metadata(metadata)

Cleaning metadata


In [6]:
metadata.head(2)

Unnamed: 0,cord_uid,sha,source,title,doi,pmcid,pubmed_id,license,abstract,published,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at Kin...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b02786f8929fd9c900897fb.json,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in lung disease?,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependen...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cross, Carroll E",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737eb0a2f63f2dce2e5a7d.json,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/,


## Model Loading: Sentence-Transformers model built on COVID-SciBERT

In [10]:
from sentence_transformers import SentenceTransformer, models

# Step 1: load the pretrained COVID-SciBERT checkpoint as the word-embedding (token-level) model
word_embedding_model = models.Transformer('lordtt13/COVID-SciBERT')

# Step 2: add a pooling layer over the token embeddings; it is sized from the
# transformer's own embedding dimension and otherwise uses the library defaults
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Chain steps 1 and 2 into a single SentenceTransformer via the modules argument
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at lordtt13/COVID-SciBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Abstract text preprocessing

In [31]:
import contractions
import string,re
# Compiled once at definition time: data_preprocess is called for every abstract in the corpus.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
# Only tag-shaped spans (<b>, </sub>, <br/>) are removed. The previous '<.*?>' also deleted
# ordinary text sitting between a '<' and a later '>', e.g. "p < 0.05 ... >".
_HTML_TAG_PATTERN = re.compile(r'</?[A-Za-z][^<>]*>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# 'A-Z', not 'A-z': the latter also spans the ASCII characters [ \ ] ^ _ ` between 'Z' and 'a'.
_NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9]')


def data_preprocess(text):
    """Normalise a piece of text for embedding and search.

    Steps, in order:
      1. expand contractions with ``contractions.fix``;
      2. strip HTML tags, then URLs;
      3. turn underscores into spaces and delete all other ASCII punctuation
         (so hyphenated words are joined: "culture-proven" -> "cultureproven");
      4. replace every remaining character that is not an ASCII letter or
         digit (including newlines) with a space;
      5. collapse runs of whitespace and lower-case the result.

    Parameters
    ----------
    text : str
        Raw text; must be a string (missing values have to be filled beforehand).

    Returns
    -------
    str
        Lower-case text made of ASCII letters/digits separated by single spaces.
    """
    text = contractions.fix(text)
    text = _HTML_TAG_PATTERN.sub('', text)
    text = _URL_PATTERN.sub('', text)
    # Underscore becomes a word separator before the remaining punctuation is dropped
    text = text.replace('_', ' ')
    text = text.translate(_PUNCTUATION_TABLE)
    # Newlines are covered here as well, so no separate newline pass is needed
    text = _NON_ALNUM_PATTERN.sub(' ', text)
    return ' '.join(text.split()).lower()

In [33]:
from tqdm import tqdm

# Clean every abstract once, in row order, so list_clean[k] belongs to metadata row k.
# tqdm wraps the iterable only to show progress.
list_clean = [data_preprocess(abstract) for abstract in tqdm(metadata.abstract.values)]

100%|██████████████████████████████████████████████████████████████████████| 1056659/1056659 [08:58<00:00, 1961.76it/s]


In [34]:
len(list_clean)

1056659

## Clean csv file for search

In [43]:
# Number of leading rows of `metadata` used for the search corpus
N_DOCS = 5000

# .copy() makes df an independent frame, so overwriting 'abstract' below does not
# assign into a slice of `metadata` (which raises SettingWithCopyWarning).
df = metadata[['cord_uid','title','abstract','url']].iloc[:N_DOCS].copy()
# Replace the raw abstracts with their preprocessed versions (same row order as metadata)
df['abstract'] = list_clean[:N_DOCS]

In [44]:
df.head()

Unnamed: 0,cord_uid,title,abstract,url
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",this retrospective chart review describes the epidemiology and clinical features of 40 patients with cultureproven mycoplasma pneumoniae infections at king ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in lung disease?,inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide no and increased indices of no dependent oxi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,surfactant proteind spd participates in the innate response to inhaled microorganisms and organic antigens and contributes to immune and inflammatory regula...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/
3,2b73a28n,Role of endothelin-1 in lung disease,endothelin1 et1 is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases et1 is a potent mitogen regulator ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/
4,9785vg6d,Gene expression in epithelial cells in response to pneumovirus infection,respiratory syncytial virus rsv and pneumonia virus of mice pvm are viruses of the family paramyxoviridae subfamily pneumovirus which because clinically imp...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/


In [45]:
# Forward slash keeps the path portable: a backslash is a directory separator only on Windows,
# elsewhere it would become part of the file name.
df.to_csv('data/csv_for_search.csv')

## Encoding abstract text

In [None]:
# Encode the preprocessed abstracts with the SentenceTransformer model; the Annoy cell
# below adds one entry of context_embed per index item, in this order
context_embed = model.encode(df['abstract'].values)

In [None]:
len(context_embed[0])

In [None]:
len(context_embed)

## Indexing embeddings using Annoy

In [None]:
from annoy import AnnoyIndex
# Dimensionality of the vectors stored in the index; must equal len(context_embed[0])
D=768

# Number of trees to build (a chosen value, not a library default)
NUM_TREES=70

# The metric is passed explicitly: omitting it is deprecated in Annoy, and 'angular'
# is what the loading cell and the distance -> cosine conversion further down assume.
ann = AnnoyIndex(D, 'angular')
# Item ids are the row positions of the embedded abstracts
for index, embed in enumerate(context_embed):
    ann.add_item(index, embed)
ann.build(NUM_TREES)
ann.save('data/ann_full_data.ann')

In [19]:
# Re-open the index from disk with the same dimension and metric it was built with.
# The path is the file written by the build cell above; 'data/ann_index.ann' is never
# created anywhere in this notebook, so loading it fails on a fresh run.
u2 = AnnoyIndex(768, 'angular')
u2.load('data/ann_full_data.ann')

True

In [21]:
# Embed the query and fetch its 10 nearest neighbours from the index:
# x holds the item ids, dist the matching distances (include_distances=True)
x,dist = u2.get_nns_by_vector(model.encode('covid disease'), 10, search_k=-1, include_distances=True)

In [23]:
# One [title, url, score] row per neighbour. The score turns the index distance d back
# into a similarity via 1 - d**2 / 2 (for Annoy's angular metric this is the cosine
# similarity — see the Annoy documentation).
my_result_out2 = [
    list(metadata.loc[item_id, ['title','url']].values) + [1-((distance**2) / 2)]
    for item_id, distance in zip(x, dist)
]

In [25]:
# Name the columns so the result table is readable instead of showing 0, 1, 2
pd.DataFrame(my_result_out2, columns=['title', 'url', 'similarity'])

Unnamed: 0,0,1,2
0,Technical Description of RODS: A Real-time Public Health Surveillance System,https://academic.oup.com/jamia/article-pdf/10/5/399/2352016/10-5-399.pdf,0.620595
1,Gene expression in epithelial cells in response to pneumovirus infection,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/,0.567117
2,"The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC137274/,0.546412
3,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/,0.546182
4,Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis,http://europepmc.org/articles/pmc125340?pdf=render,0.519851
5,Role of endothelin-1 in lung disease,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/,0.481446
6,Nitric oxide: a pro-inflammatory mediator in lung disease?,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/,0.480547
7,Surfactant protein-D and pulmonary host defense,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/,0.449996
8,Debate: Transfusing to normal haemoglobin levels will not improve outcome,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC137267/,0.435599
9,Heme oxygenase-1 and carbon monoxide in pulmonary medicine,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC193681/,0.412586


## Ranking based on BM25

In [None]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

# `meta_df` was never defined in this notebook, so this section failed on a fresh kernel.
# BM25 is ranked over `df`, the same corpus that was embedded and indexed above, so both
# rankers search the same documents. NOTE(review): switch to `metadata` if BM25 is meant
# to cover the full corpus.
meta_df = df
meta_df_tokens = meta_df.abstract.fillna('').apply(preprocess_string)

In [None]:
from rank_bm25 import BM25Okapi
import numpy as np

# Build the BM25 index from the tokenised abstracts (one token list per document)
bm25_index = BM25Okapi(meta_df_tokens.tolist())

def search(search_string, num_results=10):
    """Return the positions of the best BM25 matches for ``search_string``.

    The query is tokenised with ``preprocess_string`` (the same tokeniser used
    for the documents) and scored against ``bm25_index``. The result is an
    array of at most ``num_results`` document positions, best score first.
    """
    query_tokens = preprocess_string(search_string)
    bm25_scores = bm25_index.get_scores(query_tokens)
    # argsort is ascending, so reverse it to put the highest score first
    ranked_positions = np.argsort(bm25_scores)[::-1]
    return ranked_positions[:num_results]

# Example query: positions of the 10 best BM25 matches (search's default), best first
indexes = search('novel coronavirus treatment')
indexes

In [None]:
# search() returns positions produced by np.argsort, so select rows positionally with
# .iloc; .loc would only give the same rows while the index happens to be 0..n-1.
meta_df.iloc[search('novel coronavirus treatment')]