# Search engine

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import pandas as pd

# load dataset

In [14]:
df = pd.read_csv('/content/drive/My Drive/DATA/df.csv')

In [15]:
df.head()

Unnamed: 0,title,description,text
0,Relation of Visual Function to Retinal Nerve F...,Purpose To examine the relation of visual func...,Relation of Visual Function to Retinal Nerve F...
1,Impaired intestinal barrier integrity in the c...,Abstract Background: Growing evidence suggests...,Impaired intestinal barrier integrity in the c...
2,A Francisella tularensis Pathogenicity Island ...,ABSTRACT Francisella tularensis is a gram-nega...,A Francisella tularensis Pathogenicity Island ...
3,High Glucose Causes Apoptosis in Cultured Huma...,Type 2 diabetes is characterized by insulin re...,High Glucose Causes Apoptosis in Cultured Huma...
4,Fine-mapping type 2 diabetes loci to single-va...,We expanded GWAS discovery for type 2 diabetes...,Fine-mapping type 2 diabetes loci to single-va...


# SK-learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(df.text)
print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, vectorizer.get_feature_names())])
print("v2i: ", vectorizer.vocabulary_)

idf:  [('001', 2.7047480922384253), ('105', 2.7047480922384253), ('109', 2.7047480922384253), ('11', 2.7047480922384253), ('123', 2.7047480922384253), ('13', 2.7047480922384253), ('130', 2.7047480922384253), ('135', 2.7047480922384253), ('14', 2.7047480922384253), ('16', 2.7047480922384253), ('17', 2.7047480922384253), ('18', 2.7047480922384253), ('180', 2.7047480922384253), ('1990', 2.7047480922384253), ('1e', 2.7047480922384253), ('20', 2.7047480922384253), ('2002', 2.7047480922384253), ('243', 2.7047480922384253), ('25', 2.7047480922384253), ('26', 2.7047480922384253), ('2d', 2.7047480922384253), ('30', 2.7047480922384253), ('36', 2.7047480922384253), ('403', 2.7047480922384253), ('48', 2.7047480922384253), ('51', 2.7047480922384253), ('72', 2.7047480922384253), ('73', 2.7047480922384253), ('78', 2.7047480922384253), ('80', 2.7047480922384253), ('85', 2.7047480922384253), ('898', 2.7047480922384253), ('90', 2.7047480922384253), ('92', 2.7047480922384253), ('abdominal', 2.70474809223

In [28]:
q = "protein genesChlamydia"
qtf_idf = vectorizer.transform([q])
res = cosine_similarity(tf_idf, qtf_idf)
res = res.ravel().argsort()[-5:]
#print("\ntop 3 docs for '{}':\n{}".format(q, [df.text[i] for i in res[::-1]]))

In [29]:
for i in res[::-1]:
    print(df.text[i],'\n')

Mapping antigenic domains expressed by Chlamydia trachomatis major outer membrane protein genesChlamydia trachomatis is an obligate prokaryotic intracellular pathogen of humans that infects mucosal epithelial cells. Exposed domains of its major outer membrane protein (MOMP) are both serotyping and protective antigenic determinants. To identify these domains, we have cloned and epitope-mapped the genes of serovars A, C (C serogroup) and L2, B (B serogroup) with a panel of monoclonal antibodies (mAbs). Predominantly conserved regions of the genes of both serogroups are interspersed with four short variable domains (I-IV). Recombinant phage clones expressing specific MOMP antigenic determinants revealed that protective serotype-specific recognized epitopes in variable domains I and II. Protective subspecies and serogroup-specific mAbs recognized overlapping determinants in variable domain IV near the C terminus. A nonprotective species-specific mAb mapped to an invariant peptide of nine r

# BM25

In [19]:
import spacy
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import pandas as pd

In [20]:
nlp = spacy.load("en_core_web_sm")
text_list = df.text.str.lower().values
tok_text=[] # for our tokenised corpus
#Tokenising using SpaCy:
for doc in tqdm(nlp.pipe(text_list, disable=["tagger", "parser","ner"])):
    tok = [t.text for t in doc if t.is_alpha]
    tok_text.append(tok)

10it [00:00, 36.73it/s]


In [30]:
bm25 = BM25Okapi(tok_text)
query = "protein genesChlamydia"
tokenized_query = query.lower().split(" ")
import time
t0 = time.time()
results = bm25.get_top_n(tokenized_query, df.text.values, n=5)
t1 = time.time()
print(f'Searched 50,000 records in {round(t1-t0,4) } seconds \n')
for i in results:
    print(i,'\n')

Searched 50,000 records in 0.0005 seconds 

Mapping antigenic domains expressed by Chlamydia trachomatis major outer membrane protein genesChlamydia trachomatis is an obligate prokaryotic intracellular pathogen of humans that infects mucosal epithelial cells. Exposed domains of its major outer membrane protein (MOMP) are both serotyping and protective antigenic determinants. To identify these domains, we have cloned and epitope-mapped the genes of serovars A, C (C serogroup) and L2, B (B serogroup) with a panel of monoclonal antibodies (mAbs). Predominantly conserved regions of the genes of both serogroups are interspersed with four short variable domains (I-IV). Recombinant phage clones expressing specific MOMP antigenic determinants revealed that protective serotype-specific recognized epitopes in variable domains I and II. Protective subspecies and serogroup-specific mAbs recognized overlapping determinants in variable domain IV near the C terminus. A nonprotective species-specific 