# Search engine

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import pandas as pd

# load dataset

In [33]:
df = pd.read_csv('/content/drive/My Drive/DATA/df.csv')

In [34]:
df.head()

Unnamed: 0,title,description,text
0,Relation of Visual Function to Retinal Nerve F...,Purpose To examine the relation of visual func...,Relation of Visual Function to Retinal Nerve F...
1,Impaired intestinal barrier integrity in the c...,Abstract Background: Growing evidence suggests...,Impaired intestinal barrier integrity in the c...
2,A Francisella tularensis Pathogenicity Island ...,ABSTRACT Francisella tularensis is a gram-nega...,A Francisella tularensis Pathogenicity Island ...
3,High Glucose Causes Apoptosis in Cultured Huma...,Type 2 diabetes is characterized by insulin re...,High Glucose Causes Apoptosis in Cultured Huma...
4,Fine-mapping type 2 diabetes loci to single-va...,We expanded GWAS discovery for type 2 diabetes...,Fine-mapping type 2 diabetes loci to single-va...


# SK-learn

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(df.text)
#print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, vectorizer.get_feature_names())])
#print("v2i: ", vectorizer.vocabulary_)

In [38]:
q = "protein genesChlamydia"
qtf_idf = vectorizer.transform([q])
res = cosine_similarity(tf_idf, qtf_idf)
res = res.ravel().argsort()[-4:]
#print("\ntop 3 docs for '{}':\n{}".format(q, [df.text[i] for i in res[::-1]]))

In [39]:
for i in res[::-1]:
    print(df.text[i][:100],'\n')

Mapping antigenic domains expressed by Chlamydia trachomatis major outer membrane protein genesChlam 

MglA regulates transcription of virulence factors necessary for Francisella tularensis intraamoebae  

Acylated and Unacylated Ghrelin Promote Proliferation and Inhibit Apoptosis of Pancreatic β-Cells an 

Growth of Francisella spp. in rodent macrophagesWe examined the nature of the interactions between t 



# BM25

In [40]:
import spacy
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import pandas as pd

In [41]:
nlp = spacy.load("en_core_web_sm")
text_list = df.text.str.lower().values
tok_text=[] # for our tokenised corpus
#Tokenising using SpaCy:
for doc in tqdm(nlp.pipe(text_list, disable=["tagger", "parser","ner"])):
    tok = [t.text for t in doc if t.is_alpha]
    tok_text.append(tok)

10it [00:00, 29.34it/s]


In [43]:
bm25 = BM25Okapi(tok_text)
query = "protein genesChlamydia"
tokenized_query = query.lower().split(" ")
import time
t0 = time.time()
results = bm25.get_top_n(tokenized_query, df.text.values, n=4)
t1 = time.time()
print(f'Searched 50,000 records in {round(t1-t0,4) } seconds \n')
for i in results:
    print(i[:100],'\n')

Searched 50,000 records in 0.0014 seconds 

Mapping antigenic domains expressed by Chlamydia trachomatis major outer membrane protein genesChlam 

Acylated and Unacylated Ghrelin Promote Proliferation and Inhibit Apoptosis of Pancreatic β-Cells an 

MglA regulates transcription of virulence factors necessary for Francisella tularensis intraamoebae  

Growth of Francisella spp. in rodent macrophagesWe examined the nature of the interactions between t 

