Setup

In [None]:
!pip install sentence_transformers

In [None]:
import os
import operator
import numpy as np
import pandas as pd

import re
import string
import spacy
import gensim
import nltk
from gensim import corpora, models, matutils
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import spacy.lang.en
from scipy import spatial
from scipy.spatial.distance import cosine

spacy_nlp = spacy.load('en_core_web_sm')

nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from transformers import BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer
import torch

Preparing the Document Collection and User Query

In [None]:
# Directory that holds the documents to index; the loading cells below read every
# file in it whose name ends with ".txt".
# NOTE(review): the value looks like a placeholder ("file path") — replace it with your
# own folder. The /content/drive prefix presumably assumes a mounted Google Drive; confirm.
document_dir = '/content/drive/MyDrive/file path'

For keyword-based searches, the process will include tokenization, the elimination of stopwords, and the option of either lemmatization or stemming to refine the words further.

In contrast, for semantic search queries that leverage embeddings or neural models capable of grasping broader contexts, preprocessing will be more subdued. The primary focus here will be on tidying up the text to retain its original meaning as much as possible. Given that semantic search aims to grasp the query's context and underlying intent, the approach to preprocessing shifts away from significant word modifications towards cleaning the text to ensure it's in a suitable format for semantic models.

In [None]:
def lemmatize_and_stem(text, use_stemming=True):
    """
    Lemmatize ``text`` with the notebook-level spaCy pipeline and optionally stem the lemmas.

    :param text: The input text as a string.
    :param use_stemming: When True, each lemma is additionally passed through a PorterStemmer.
    :return: List of lower-cased, whitespace-stripped lemmas (stemmed if ``use_stemming`` is True).
    """
    stemmer = PorterStemmer()

    lemmas = []
    for token in spacy_nlp(text):
        # Tokens whose lemma is the "-PRON-" placeholder are dropped
        if token.lemma_ == "-PRON-":
            continue
        lemmas.append(token.lemma_.lower().strip())

    if not use_stemming:
        return lemmas
    return [stemmer.stem(lemma) for lemma in lemmas]

In [None]:
def preprocess(text, use_for='search_type'):
    """
    Clean ``text`` and prepare it for either keyword-based or semantic search.

    :param text: The input text as a string.
    :param use_for: 'keyword' for keyword-based model preprocessing,
        'semantic' for semantic-based model preprocessing.
    :return: For 'keyword', a list of lower-cased lemmas with English stopwords and
        tokens of two characters or fewer removed. For 'semantic', the cleaned text
        as a single string.
    :raises ValueError: If ``use_for`` is neither 'keyword' nor 'semantic'. The default
        value is only a placeholder, so callers must choose a mode explicitly.
    """
    if use_for not in ('keyword', 'semantic'):
        raise ValueError(f"use_for must be 'keyword' or 'semantic', got {use_for!r}")

    # Common cleaning (raw-string patterns avoid invalid-escape warnings)
    text = re.sub(r"'", '', text)  # drop single quotes / apostrophes
    text = re.sub(r'\w*\d\w*', '', text)  # drop digits and words containing digits
    text = re.sub(r'[^\w\s]', ' ', text)  # replace punctuation with a space
    # Collapse whitespace last: the substitutions above (and newlines) create new
    # runs of blanks that an earlier collapse would miss
    text = re.sub(r'\s+', ' ', text).strip()

    if use_for == 'semantic':
        return text

    # Keyword mode: lemmatize the cased text, then filter the lemmas themselves so
    # that stopword removal actually applies to the tokens that are returned
    stop_words = set(stopwords.words('english'))
    tokens = lemmatize_and_stem(text, use_stemming=False)
    return [word for word in tokens if len(word) > 2 and word not in stop_words]

For keyword search

In [None]:
# Load every .txt document and preprocess it for keyword search.
# texts[i] holds the tokens of the file named document_filenames[i].
texts = []
document_filenames = []

# os.listdir returns names in arbitrary order; sorting keeps document indices reproducible
for filename in sorted(os.listdir(document_dir)):
    if filename.endswith(".txt"):
        filepath = os.path.join(document_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        # The file is already closed while the (slower) preprocessing runs
        processed_text = preprocess(text, use_for='keyword')
        texts.append(processed_text)
        document_filenames.append(filename)

In [None]:
# Build the token/id vocabulary from the tokenized documents, then encode every
# document as a bag-of-words list of (token_id, count) pairs
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Print the first 50 dictionary entries as [token, token_id] pairs.
# Ids start at 0, so the first 50 entries are ids 0..49 (the previous `<= 50` printed 51).
dict_tokens = [[[dictionary[token_id], token_id] for token_id in dictionary.keys() if token_id < 50]]
print(dict_tokens)

[[['aberrant', 0], ['able', 1], ['abstract', 2], ['accurate', 3], ['across', 4], ['addition', 5], ['aim', 6], ['algorithm', 7], ['allele', 8], ['allelic', 9], ['allelotype', 10], ['allow', 11], ['almost', 12], ['also', 13], ['alteration', 14], ['amplification', 15], ['amplified', 16], ['amplify', 17], ['analysis', 18], ['analyze', 19], ['and', 20], ['apparent', 21], ['apply', 22], ['approach', 23], ['array', 24], ['arrive', 25], ['ascn', 26], ['available', 27], ['base', 28], ['because', 29], ['believe', 30], ['between', 31], ['both', 32], ['but', 33], ['can', 34], ['cancer', 35], ['candidate', 36], ['category', 37], ['cause', 38], ['cell', 39], ['central', 40], ['chromatid', 41], ['chromosomal', 42], ['chromosome', 43], ['citation', 44], ['classification', 45], ['classify', 46], ['collection', 47], ['combine', 48], ['complicate', 49], ['conceivably', 50]]]


In [None]:
# (token, count) pairs for the first three documents of the bag-of-words corpus
word_frequencies = [
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[:3]
]
print(word_frequencies)

[[('aberrant', 1), ('able', 4), ('abstract', 1), ('accurate', 1), ('across', 4), ('addition', 1), ('aim', 1), ('algorithm', 2), ('allele', 8), ('allelic', 1), ('allelotype', 2), ('allow', 1), ('almost', 1), ('also', 4), ('alteration', 5), ('amplification', 14), ('amplified', 1), ('amplify', 5), ('analysis', 1), ('analyze', 2), ('and', 13), ('apparent', 1), ('apply', 2), ('approach', 2), ('array', 5), ('arrive', 1), ('ascn', 1), ('available', 1), ('base', 2), ('because', 3), ('believe', 2), ('between', 1), ('both', 2), ('but', 1), ('can', 1), ('cancer', 6), ('candidate', 1), ('category', 1), ('cause', 2), ('cell', 3), ('central', 1), ('chromatid', 1), ('chromosomal', 2), ('chromosome', 5), ('citation', 14), ('classification', 1), ('classify', 1), ('collection', 1), ('combine', 1), ('complicate', 1), ('conceivably', 1), ('conclude', 1), ('confirm', 1), ('constant', 1), ('contain', 3), ('contribution', 1), ('copy', 15), ('could', 1), ('counterpart', 1), ('currently', 1), ('data', 1), ('da

For semantic search

In [None]:
# Load every .txt document and clean it for semantic search.
# texts_semantic[i] holds the cleaned text of the file named filenames[i].
filenames = []
texts_semantic = []

# os.listdir returns names in arbitrary order; sorting keeps document indices reproducible
for filename in sorted(os.listdir(document_dir)):
    if filename.endswith(".txt"):
        filepath = os.path.join(document_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        # The file is already closed while preprocessing runs
        processed_text = preprocess(text, use_for='semantic')
        texts_semantic.append(processed_text)
        filenames.append(filename)

Comparison

In [None]:
# Run one sample query through both preprocessing modes to compare the results
query = "How to preprocess text in Python? Exploring text preprocessing techniques."

# First line: keyword-mode tokens; second line: semantic-mode cleaned string
print(
    preprocess(query, use_for='keyword'),
    preprocess(query, use_for='semantic'),
    sep='\n',
)

['how', 'preprocess', 'text', 'python', 'explore', 'text', 'preprocesse', 'technique']
How to preprocess text in Python  Exploring text preprocessing techniques


Embedding

In practice we need to run the embedding model twice:

Indexing Stage: This is an offline process where the document collection is transformed into embeddings in a batch operation. While this stage demands high computational throughput, it is not latency-sensitive. The goal here is to precompute and store the embeddings for all documents in the database, facilitating efficient retrieval during the search.

Query Embedding: At the time of each search request, the query itself is converted into an embedding. This operation is online and requires prompt execution to ensure a swift response to the search inquiry. Despite the need for speed, this process benefits from the models' ability to understand and match the query's semantic context with the precomputed document embeddings.

TF-IDF Embeddings for keyword-based search

In [None]:
# Fit the TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(corpus)

# Re-weight every document with the fitted model (sparse (token_id, weight) pairs)
tfidf_corpus = [tfidf[doc] for doc in corpus]

# Dense document vectors (one per document, length = vocabulary size) that
# tfidf_search expects as its `dense_tfidf` argument. They are computed once here,
# at indexing time, so a search only has to vectorize the query.
dense_tfidf = [matutils.sparse2full(doc, len(dictionary)) for doc in tfidf_corpus]

Semantic Embeddings

Most popular transformer-based models.

The architecture of the Transformer model is built around two main components:

Encoder: This component processes textual input, which has been converted into numerical format, to generate embeddings that encapsulate the semantic essence of the input text.

Decoder: This part reverses the encoder's operation. Starting from the embeddings, it generates predictions for the subsequent text token.

Positioned at the heart of this structure, between the encoder and decoder, lies the embedding representation of the input. It's crucial to distinguish between the input vectors and the embedding vectors, despite both being numerical arrays:

Input Vectors: These are essentially sequences of term identifiers, each selected from a predefined vocabulary (for instance, BERT operates with a vocabulary of about 30,000 terms), padded to maintain a uniform length across inputs.

Embedding Vectors: Represent the model's internal interpretation of the input. They are the "lens" through which the neural network perceives your data. It's reasonable to anticipate that documents bearing semantic similarities will yield closely related internal representations.

Models like BERT are tailored to utilize solely the encoder segment of the Transformer architecture. They excel in tasks such as text classification, summarization, and entity recognition, owing to their ability to deeply understand and represent the semantics of text.

Conversely, models from the GPT family employ only the decoder component. Their strengths lie in generating coherent and contextually relevant text based on the embeddings, showcasing their proficiency in tasks that involve text creation and completion.

Semantic embedding using a sentence transformer model

In [None]:
# Load a pre-trained sentence-transformer model.
# A small model is used because of memory constraints.
# `model` is a notebook-level global: get_sentence_embeddings and semantic_search
# read it directly rather than taking it as a parameter.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def get_sentence_embeddings(documents):
    """
    Encode documents with the notebook-level Sentence Transformer ``model``.

    Parameters:
    - documents: A list of strings (documents or sentences) to be embedded.

    Returns:
    - The result of ``model.encode(documents)`` (an array of embeddings).
    """
    # Delegate straight to the model; a progress bar is displayed while encoding
    return model.encode(documents, show_progress_bar=True)

Search

Basic Information Retrieval with TF-IDF for keyword-based search

In [None]:
def tfidf_search(query, dictionary, tfidf_model, dense_tfidf):
    """
    Search the corpus using a TF-IDF representation and cosine similarity.

    Parameters:
    - query: The search query as a string.
    - dictionary: The Gensim dictionary mapping of ids to terms.
    - tfidf_model: The trained Gensim TF-IDF model.
    - dense_tfidf: The dense TF-IDF representation of the entire corpus
      (a sequence of vectors, one per document, each of length len(dictionary)).

    Returns:
    - A list of tuples (document_index, similarity_score) sorted by similarity score in
      descending order. Documents with equal scores keep their original order. A score
      of 0.0 is returned whenever the query or the document vector is all zeros (for
      example a query made only of out-of-vocabulary words) instead of NaN.
    """
    if len(dense_tfidf) == 0:
        return []

    # Tokenize the query with the same keyword pipeline that produced the corpus
    # tokens; a plain lower().split() would leave punctuation and inflected forms
    # that can never match the lemmatized dictionary entries
    query_tokens = preprocess(query, use_for='keyword')
    query_bow = dictionary.doc2bow(query_tokens)
    query_tfidf = tfidf_model[query_bow]
    query_dense = np.asarray(matutils.sparse2full(query_tfidf, len(dictionary)), dtype=float)

    # Cosine similarity of the query against all documents in one vectorized step
    doc_matrix = np.asarray(dense_tfidf, dtype=float)
    dot_products = doc_matrix @ query_dense
    norm_products = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_dense)

    # Zero-norm vectors have no direction: leave their similarity at 0.0 rather than
    # dividing by zero, which would yield NaN and make the sort order meaningless
    similarities = np.zeros(len(doc_matrix), dtype=float)
    np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)
    similarities = np.clip(similarities, -1.0, 1.0)  # guard against rounding error

    # Sort documents by their similarity to the query
    return sorted(enumerate(similarities.tolist()), key=lambda pair: pair[1], reverse=True)

Extending to Semantic Search
Now, let's implement a semantic search function using sentence transformers for more meaningful searches.

Semantic search with all-MiniLM-L6-v2

The smaller the model, the lower the search latency and the faster the indexing speed. Huge SGPT and GTR models can only run on expensive GPUs.

The larger the number of parameters in the model, the better the retrieval quality. all-MiniLM-L6-v2 is a good model, but with only about 22M parameters it is too small to catch all the semantic differences in search. (Rank #72 for retrieval on the BEIR/MTEB benchmark at the time of writing.)

Ranking: https://huggingface.co/spaces/mteb/leaderboard

In [None]:
def semantic_search(query, documents, top_k=5, document_embeddings=None):
    """
    Performs semantic search to find the most similar documents to the query.

    Parameters:
    - query: A string representing the search query.
    - documents: A list of strings representing the documents.
    - top_k: The number of top similar documents to return.
    - document_embeddings: Optional precomputed embeddings, one row per document and
      index-aligned with ``documents``. When given, the documents are not re-encoded,
      so the collection can be embedded once and reused for every query. When None
      (the default), the documents are embedded on each call.

    Returns:
    - A list of tuples (document index, similarity score) for the top_k most similar
      documents, best match first. An empty list if there is nothing to search.
    """
    if document_embeddings is None:
        if len(documents) == 0:
            # Nothing to embed or compare against
            return []
        document_embeddings = get_sentence_embeddings(documents)
    elif len(document_embeddings) == 0:
        return []

    # Only the query has to be encoded at search time
    query_embedding = model.encode([query])

    # Calculate cosine similarities (single query row -> take row 0)
    similarities = cosine_similarity(query_embedding, document_embeddings)[0]

    # Get top_k most similar documents: ascending argsort, reversed, truncated
    top_k_indices = np.argsort(similarities)[::-1][:top_k]

    return [(index, similarities[index]) for index in top_k_indices]

Run

In [None]:
# Example query
query = "what is a phylogenetic context"

# Perform semantic search
results = semantic_search(query, texts_semantic, top_k=5)

# Display the results. filenames is index-aligned with texts_semantic, so
# filenames[idx] is the file behind each hit; the bare `filename` variable only
# holds whatever the last loading loop left in it.
for idx, score in results:
    print(f"Document index: {idx}, Similarity score: {score:.4f}, Filename: {filenames[idx]}")

In [None]:
# Example usage of the tfidf_search function.
# Inputs are notebook globals: `dictionary` and `tfidf` come from the indexing cells;
# `dense_tfidf` is passed as the dense per-document TF-IDF vectors that tfidf_search
# iterates over, so it must be index-aligned with document_filenames.
query = "what is a phylogenetic context"
search_results = tfidf_search(query, dictionary, tfidf, dense_tfidf)

# Display top 5 results; each document index is resolved to its filename through
# document_filenames, the list filled alongside the keyword-preprocessed texts
for doc_index, sim_score in search_results[:5]:
    filename = document_filenames[doc_index]
    print(f"Document index: {doc_index}, Similarity score: {sim_score:.4f}, Filename: {filename}")