# Semantic Search

In [1]:
import numpy as np
import pandas as pd
import spacy
import string
import gensim
import operator
import re


# Data Cleaning and Preprocessing

In [2]:
f=open('queries.txt')
query_str = f.read()
queries = query_str.split(".I")
queries[:3]

FileNotFoundError: [Errno 2] No such file or directory: 'queries.txt'

In [None]:
queries_data = []
for t in queries:
    if t.strip() != "":
        i = t.split(".W\n")[0]
        w = t.split(".W\n")[1]
        queries_data.append({"I": i.strip(), "W": w.strip()})

df_queries = pd.DataFrame(queries_data)
df_queries.head()

In [4]:
f=open('docs.txt')
doc_str = f.read()
docs = doc_str.split(".I")

In [5]:
docs_data = []
for t in docs:
    if t.strip() != "":
        i = t.split(".W\n")[0]
        w = t.split(".W\n")[1]
        docs_data.append({"I": i.strip(), "W": w.strip()})

df_docs = pd.DataFrame(docs_data)
df_docs.head()

Unnamed: 0,I,W
0,1,correlation between maternal and fetal plasma ...
1,2,changes of the nucleic acid and phospholipid l...
2,3,surfactant in fetal lamb tracheal fluid . ...
3,4,placental and cord blood lipids.. comparison i...
4,5,free fatty acid concentration in maternal plas...


In [6]:
f=open('relevance.txt')
relevance_str = f.read()
relevance = relevance_str.strip().split("\n")

In [7]:
# Split each line into columns
rows = [list(map(float, line.strip().split())) for line in relevance]

# Create a DataFrame from the rows
df_relevance = pd.DataFrame(rows, columns=["query", "doc", "col3", "col4"])
df_relevance = df_relevance.drop(['col3', 'col4'], axis=1)

In [8]:
df_relevance = df_relevance.astype(int)
df_docs['I'] = df_docs['I'].astype(int)
df_queries['I'] = df_queries['I'].astype(int)

df_rele_doc = pd.merge(df_relevance, df_docs, left_on='doc', right_on='I')

df = pd.merge(df_rele_doc, df_queries, left_on='query', right_on='I')

df = df.rename(columns={'W_x':'docs', 'W_y':'queries'})

final_df = df[['docs', 'queries', 'doc', 'query']]

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS

spacy_nlp = spacy.load('en_core_web_sm')

#create list of punctuations and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#function for data cleaning and processing
#This can be further enhanced by adding / removing reg-exps as desired.

def spacy_tokenizer(sentence):
 
    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits adnd words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    #remove unwanted lines starting from special charcters
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)
    
    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)
    
    #remove punctunations
    sentence = re.sub(r'[^\w\s]',' ',sentence)
    
    #creating token object
    tokens = spacy_nlp(sentence)
    
    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    
    #remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    
    #return tokens
    return tokens

In [10]:
print ('Cleaning and Tokenizing...')
%time final_df['doc_tokenized'] = final_df['docs'].map(lambda x: spacy_tokenizer(x))

final_df.head()

Cleaning and Tokenizing...
Wall time: 53 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,docs,queries,doc,query,doc_tokenized
0,analysis of mammalian lens proteins by electro...,"the crystalline lens in vertebrates, including...",13,1,"[analysis, mammalian, lens, protein, electroph..."
1,an autoradiographic study on cell migration in...,"the crystalline lens in vertebrates, including...",14,1,"[autoradiographic, study, cell, migration, eye..."
2,lens development.. the differentiation of embr...,"the crystalline lens in vertebrates, including...",15,1,"[lens, development, differentiation, embryonic..."
3,studies on aging with horse crystalline lens g...,"the crystalline lens in vertebrates, including...",72,1,"[study, age, horse, crystalline, lens, gel, co..."
4,histological research on the lens in condition...,"the crystalline lens in vertebrates, including...",79,1,"[histological, research, lens, condition, hypo..."


In [11]:
doc_text = final_df['doc_tokenized']
doc_text[0:5]

0    [analysis, mammalian, lens, protein, electroph...
1    [autoradiographic, study, cell, migration, eye...
2    [lens, development, differentiation, embryonic...
3    [study, age, horse, crystalline, lens, gel, co...
4    [histological, research, lens, condition, hypo...
Name: doc_tokenized, dtype: object

# Building Word Dictionary

In [12]:
from gensim import corpora

#creating term dictionary
%time dictionary = corpora.Dictionary(doc_text)

#filter out terms which occurs in less than 4 documents and more than 20% of the documents.
#NOTE: Since we have smaller dataset, we will keep this commented for now.

#dictionary.filter_extremes(no_below=4, no_above=0.2)

#list of few which which can be further removed
stoplist = set('and if this can would should could tell stop come go')
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

Wall time: 140 ms


In [13]:
#print top 50 items from the dictionary with their unique token-id
dict_tokens = [[[dictionary[key], dictionary.token2id[dictionary[key]]] for key, value in dictionary.items() if key <= 50]]
print (dict_tokens)

[[['analysis', 0], ['analyze', 1], ['component', 2], ['crystallin', 3], ['detect', 4], ['difference', 5], ['different', 6], ['dimensional', 7], ['electrophoresis', 8], ['fraction', 9], ['fractionation', 10], ['gel', 11], ['lens', 12], ['mammalian', 13], ['mean', 14], ['method', 15], ['number', 16], ['protein', 17], ['provide', 18], ['resolve', 19], ['sensitive', 20], ['specie', 21], ['starch', 22], ['technique', 23], ['vary', 24], ['age', 25], ['alloxan', 26], ['animal', 27], ['appear', 28], ['appendix', 29], ['arc', 30], ['area', 31], ['autoradiographic', 32], ['autoradiography', 33], ['beginning', 34], ['body', 35], ['cataract', 36], ['cell', 37], ['collaboration', 38], ['control', 39], ['count', 40], ['day', 41], ['diabetic', 42], ['diagram', 43], ['distance', 44], ['effect', 45], ['epithelium', 46], ['equal', 47], ['experimental', 48], ['eye', 49], ['frequency', 50]]]


# Feature Extraction (Bag of Words)

In [14]:
corpus = [dictionary.doc2bow(desc) for desc in doc_text]

word_frequencies = [[(dictionary[id], frequency) for id, frequency in line] for line in corpus[0:3]]

print(word_frequencies)

[[('analysis', 1), ('analyze', 2), ('component', 2), ('crystallin', 3), ('detect', 1), ('difference', 1), ('different', 1), ('dimensional', 1), ('electrophoresis', 2), ('fraction', 1), ('fractionation', 1), ('gel', 1), ('lens', 3), ('mammalian', 2), ('mean', 1), ('method', 1), ('number', 1), ('protein', 3), ('provide', 1), ('resolve', 1), ('sensitive', 1), ('specie', 2), ('starch', 1), ('technique', 1), ('vary', 1)], [('lens', 3), ('number', 1), ('age', 1), ('alloxan', 2), ('animal', 1), ('appear', 1), ('appendix', 1), ('arc', 1), ('area', 1), ('autoradiographic', 1), ('autoradiography', 1), ('beginning', 1), ('body', 1), ('cataract', 1), ('cell', 3), ('collaboration', 1), ('control', 1), ('count', 2), ('day', 3), ('diabetic', 5), ('diagram', 1), ('distance', 1), ('effect', 1), ('epithelium', 2), ('equal', 1), ('experimental', 2), ('eye', 2), ('frequency', 2), ('generation', 1), ('gertraude', 1), ('grain', 2), ('histotechnique', 1), ('hour', 3), ('increase', 1), ('injection', 2), ('int


# Build Tf-Idf and LSI Model

In [None]:
%time doc_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
%time doc_lsi_model = gensim.models.LsiModel(doc_tfidf_model[corpus], id2word=dictionary, num_topics=300)

Wall time: 95.8 ms


In [None]:
%time gensim.corpora.MmCorpus.serialize('doc_tfidf_model_mm', doc_tfidf_model[corpus])
%time gensim.corpora.MmCorpus.serialize('doc_lsi_model_mm',doc_lsi_model[doc_tfidf_model[corpus]])

In [None]:
#Load the indexed corpus
doc_tfidf_corpus = gensim.corpora.MmCorpus('doc_tfidf_model_mm')
doc_lsi_corpus = gensim.corpora.MmCorpus('doc_lsi_model_mm')

print(doc_tfidf_corpus)
print(doc_lsi_corpus)

# Doc Ranking

In [None]:
from gensim.similarities import MatrixSimilarity

doc_index = MatrixSimilarity(doc_lsi_corpus, num_features = doc_lsi_corpus.num_terms)

In [None]:
from operator import itemgetter


only_queries = df_queries['W'].tolist()
doc_names = []

for query in only_queries:
    
    query_bow = dictionary.doc2bow(spacy_tokenizer(query))
    query_tfidf = doc_tfidf_model[query_bow]
    query_lsi = doc_lsi_model[query_tfidf]

    doc_index.num_best = 30

    doc_list = doc_index[query_lsi]

    doc_list.sort(key=itemgetter(1), reverse=True)

    for j, doc in enumerate(doc_list):

        doc_names.append (
            {
                'Queries': final_df['queries'][doc[0]],
                'Docs': final_df['docs'][doc[0]],
                'Query_id': final_df['query'][doc[0]],
                'Doc_id': final_df['doc'][doc[0]],
                'Relevance_Score': round((doc[1] * 100),2)
            }

        )
        if j == (doc_index.num_best-1):
            break

doc_names_df = pd.DataFrame(doc_names, columns=['Queries','Docs', 'Query_id', 'Doc_id', 'Relevance_Score'])

ranking_df = doc_names_df.sort_values(by=['Query_id', 'Relevance_Score'], ascending=[True, False])

In [None]:
ranking_df.head()

In [None]:
#ranking_df.to_csv('query_doc_ranking.csv', index=False)