# Setup

## Installing Libraries

In [4]:
!pip install sentence_transformers



## Importing Libraries

In [14]:
import os
import operator

import numpy as np
import pandas as pd

import re
import string

import gensim
from gensim import corpora, models, matutils

import spacy
import spacy.lang.en
spacy_nlp = spacy.load('en_core_web_sm')

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

from scipy.spatial.distance import cosine

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preparing the Document Collection

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Set the directory where your documents are stored
document_dir = '/content/drive/MyDrive/Colab Notebooks/doc_collection'

#Data Source: https://archive.ics.uci.edu/dataset/311/sentence+classification

# Preprocessing Techniques

## preprocess Function

In [7]:
def preprocess(text, use_for='search_type'):
    """
    Function to preprocess text for either keyword-based or semantic search models.

    :param text: The input text as a string.
    :param use_for: Specify 'keyword' for keyword-based model preprocessing, 'semantic' for semantic-based model.
    :return: Preprocessed text as a list of tokens.
    """
    # Common preprocessing
    text = re.sub('\'', '', text)  # remove distracting single quotes
    text = re.sub('\w*\d\w*', '', text)  # remove digits and words containing digits
    text = re.sub(' +', ' ', text)  # replace extra spaces with single space
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuations
    text = re.sub(r'\n', ' ', text)  # remove non-breaking new line characters

    if use_for == 'keyword':
        # Keyword-based model specific preprocessing
        tokenizer = RegexpTokenizer(r'\w+')
        stop_words = set(stopwords.words('english'))
        tokens = tokenizer.tokenize(text.lower())
        tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
        tokens = lemmatize_and_stem(text, use_stemming=False)
        tokens = [word for word in tokens if len(word) > 2]
        return tokens
    elif use_for == 'semantic':
        return text.strip()

## lemmatize_and_stem Function

In [8]:
def lemmatize_and_stem(text, use_stemming=True):
    """
    Function to lemmatize and optionally stem words in the text.

    :param text: The input text as a string.
    :param use_stemming: Boolean indicating whether to use stemming.
    :return: Processed text tokens as a list.
    """
    # Initialize Spacy tokenizer and NLTK stemmer
    stemmer = PorterStemmer()

    # Lemmatization
    doc = spacy_nlp(text)
    tokens = [token.lemma_.lower().strip() for token in doc if token.lemma_ != "-PRON-"]

    # Optional stemming
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

# Keyword-based Search Implementation

## Loading and Preprocessing Documents

In [9]:
# Load and preprocess documents
texts = []
document_filenames = []

for filename in os.listdir(document_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(document_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            processed_text = preprocess(text, use_for='keyword') #preprocess()
            texts.append(processed_text)
            document_filenames.append(filename)

## Preparing the Dictionary and Corpus

In [10]:
# Create a dictionary and corpus for retrieval
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

## For Debugging and Checking
We print Dictionary Tokens:

In [11]:
#print top 50 items from the dictionary with their unique token-id
dict_tokens = [(token, id) for token, id in dictionary.token2id.items() if id < 50]
print(dict_tokens)

[('aberrant', 0), ('able', 1), ('abstract', 2), ('accurate', 3), ('across', 4), ('addition', 5), ('aim', 6), ('algorithm', 7), ('allele', 8), ('allelic', 9), ('allelotype', 10), ('allow', 11), ('almost', 12), ('also', 13), ('alteration', 14), ('amplification', 15), ('amplified', 16), ('amplify', 17), ('analysis', 18), ('analyze', 19), ('and', 20), ('apparent', 21), ('apply', 22), ('approach', 23), ('array', 24), ('arrive', 25), ('ascn', 26), ('available', 27), ('base', 28), ('because', 29), ('believe', 30), ('between', 31), ('both', 32), ('but', 33), ('can', 34), ('cancer', 35), ('candidate', 36), ('category', 37), ('cause', 38), ('cell', 39), ('central', 40), ('chromatid', 41), ('chromosomal', 42), ('chromosome', 43), ('citation', 44), ('classification', 45), ('classify', 46), ('collection', 47), ('combine', 48), ('complicate', 49)]


We print Word Frequencies:

In [12]:
# this loop will go through the first two documents in the corpus
for doc_id, doc in enumerate(corpus[:2]):
    print(f"Document {doc_id + 1} word frequencies:")
    # for each word_id, frequency pair in the document
    for id, frequency in doc:
        # print the word and its frequency
        print(f"{dictionary[id]}: {frequency}")
    print("\n")

Document 1 word frequencies:
aberrant: 1
able: 4
abstract: 1
accurate: 1
across: 4
addition: 1
aim: 1
algorithm: 2
allele: 8
allelic: 1
allelotype: 2
allow: 1
almost: 1
also: 4
alteration: 5
amplification: 14
amplified: 1
amplify: 5
analysis: 1
analyze: 2
and: 13
apparent: 1
apply: 2
approach: 2
array: 5
arrive: 1
ascn: 1
available: 1
base: 2
because: 3
believe: 2
between: 1
both: 2
but: 1
can: 1
cancer: 6
candidate: 1
category: 1
cause: 2
cell: 3
central: 1
chromatid: 1
chromosomal: 2
chromosome: 5
citation: 14
classification: 1
classify: 1
collection: 1
combine: 1
complicate: 1
conceivably: 1
conclude: 1
confirm: 1
constant: 1
contain: 3
contribution: 1
copy: 15
could: 1
counterpart: 1
currently: 1
data: 1
datum: 4
delete: 1
deleterious: 1
deletion: 4
demonstrate: 1
derive: 1
describe: 2
detection: 1
determine: 2
develop: 2
disequilibrium: 1
distinction: 1
dna: 2
due: 2
each: 4
effect: 1
either: 1
emerge: 1
employ: 1
essentially: 1
estimate: 1
event: 2
example: 1
exchange: 1
exclusiv

## TF-IDF Embeddings for Keyword-based Search

In [13]:
# Initialize and train the TF-IDF model
tfidf = models.TfidfModel(corpus)  # fit TF-IDF model

# Apply transformation to the entire corpus
tfidf_corpus = [tfidf[doc] for doc in corpus]

# Convert the sparse TF-IDF vectors to a dense format
dense_tfidf = np.array([matutils.sparse2full(doc, len(dictionary)) for doc in tfidf_corpus])

## Basic Information Retrieval with TF-IDF for Keyword-based Search

In [15]:
def tfidf_search(query, dictionary, tfidf_model, dense_tfidf):
    """
    Search the corpus using a TF-IDF representation and cosine similarity.

    Parameters:
    - query: The search query as a string.
    - dictionary: The Gensim dictionary mapping of ids to terms.
    - tfidf_model: The trained Gensim TF-IDF model.
    - dense_tfidf: The dense TF-IDF representation of the entire corpus.

    Returns:
    - A list of tuples (document_index, similarity_score) sorted by similarity score in descending order.
    """
    # Preprocess and vectorize the query in the same way as the corpus
    query_bow = dictionary.doc2bow(query.lower().split())
    query_tfidf = tfidf_model[query_bow]
    query_dense = matutils.sparse2full(query_tfidf, len(dictionary))

    # Calculate cosine similarity between the query and all documents
    similarities = []
    for doc in dense_tfidf:
      if np.count_nonzero(doc) == 0 or np.count_nonzero(query_dense) == 0:
            # One of the vectors is all zeros, meaning no terms matched
            similarities.append(0)
      else:
            sim = 1 - spatial.distance.cosine(query_dense, doc)
            similarities.append(sim)

    # Sort documents by their similarity to the query
    sorted_similarities = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

    return sorted_similarities

## Search Example: Runninng a Query

In [16]:
# Example usage of the tfidf_search function
query = "what is a phylogenetic context"
search_results = tfidf_search(query, dictionary, tfidf, dense_tfidf)

# Display top 5 results with correct filename retrieval
for doc_index, sim_score in search_results[:5]:
    filename = document_filenames[doc_index]
    print(f"Document index: {doc_index}, Similarity score: {sim_score:.4f}, Filename: {filename}")

Document index: 3, Similarity score: 0.1394, Filename: 37.txt
Document index: 7, Similarity score: 0.0628, Filename: 20.txt
Document index: 1, Similarity score: 0.0247, Filename: 66.txt
Document index: 11, Similarity score: 0.0214, Filename: 63.txt
Document index: 8, Similarity score: 0.0179, Filename: 50.txt


# Semantic Search Implementation

## Loading and Preprocessing Documents

In [17]:
# Load and preprocess documents
filenames = []
texts_semantic = []
for filename in os.listdir(document_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(document_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            processed_text = preprocess(text, use_for='semantic')
            texts_semantic.append(processed_text)
            filenames.append(filename)

## Semantic Embeddings - Pre-trained Model

In [18]:
# Load a pre-trained model # We will use a small model due to memory constraint
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Generating Sentence Embeddings

In [19]:
def get_sentence_embeddings(documents):
    """
    Generates embeddings for a list of documents using a Sentence Transformer model.

    Parameters:
    - documents: A list of strings (documents or sentences) to be embedded.

    Returns:
    - An array of embeddings.
    """
    # Generate embeddings
    embeddings = model.encode(documents, show_progress_bar=True)
    return embeddings

## Semantic Search Function

In [20]:
def semantic_search(query, documents, top_k=5):
    """
    Performs semantic search to find the most similar documents to the query.

    Parameters:
    - query: A string representing the search query.
    - documents: A list of strings representing the documents.
    - top_k: The number of top similar documents to return.

    Returns:
    - A list of tuples (document index, similarity score) for the top_k most similar documents.
    """
    query_embedding = model.encode([query])
    document_embeddings = get_sentence_embeddings(documents)

    # Calculate cosine similarities
    similarities = cosine_similarity(query_embedding, document_embeddings)[0]

    # Get top_k most similar documents
    top_k_indices = np.argsort(similarities)[::-1][:top_k]

    return [(index, similarities[index]) for index in top_k_indices]

## Search Example: Running a Query

In [21]:
# Example query
query = "what is a phylogenetic context"

# Perform semantic search
results = semantic_search(query, texts_semantic, top_k=5)

# Display the results
for idx, score in results:
    # Retrieve the filename using the document index from the search results
    specific_filename = filenames[idx]  # Corrected to use 'idx' to fetch the correct filename
    print(f"Document index: {idx}, Similarity score: {score:.4f}, Filename: {specific_filename}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document index: 7, Similarity score: 0.4318, Filename: 20.txt
Document index: 3, Similarity score: 0.4283, Filename: 37.txt
Document index: 6, Similarity score: 0.3333, Filename: 90.txt
Document index: 9, Similarity score: 0.2925, Filename: 4.txt
Document index: 11, Similarity score: 0.2308, Filename: 63.txt


# Comparing Two Search Strategies

## Comparing Two Processing Approaches

In [22]:
query = "How to preprocess text in Python? Exploring text preprocessing techniques."

print(preprocess(query, use_for='keyword'))
print(preprocess(query, use_for='semantic'))

['how', 'preprocess', 'text', 'python', 'explore', 'text', 'preprocesse', 'technique']
How to preprocess text in Python  Exploring text preprocessing techniques


## Comparing Search Strategies

In [23]:
# Example query
query = "what is a phylogenetic context"

# Perform semantic search
results = semantic_search(query, texts_semantic, top_k=5)

# Display the results
for idx, score in results:
    # Retrieve the filename using the document index from the search results
    specific_filename = filenames[idx]  # Corrected to use 'idx' to fetch the correct filename
    print(f"Document index: {idx}, Similarity score: {score:.4f}, Filename: {specific_filename}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document index: 7, Similarity score: 0.4318, Filename: 20.txt
Document index: 3, Similarity score: 0.4283, Filename: 37.txt
Document index: 6, Similarity score: 0.3333, Filename: 90.txt
Document index: 9, Similarity score: 0.2925, Filename: 4.txt
Document index: 11, Similarity score: 0.2308, Filename: 63.txt


In [24]:
# Example usage of the tfidf_search function
query = "what is a phylogenetic context"
search_results = tfidf_search(query, dictionary, tfidf, dense_tfidf)

# Display top 5 results with correct filename retrieval
for doc_index, sim_score in search_results[:5]:
    filename = document_filenames[doc_index]
    print(f"Document index: {doc_index}, Similarity score: {sim_score:.4f}, Filename: {filename}")

Document index: 3, Similarity score: 0.1394, Filename: 37.txt
Document index: 7, Similarity score: 0.0628, Filename: 20.txt
Document index: 1, Similarity score: 0.0247, Filename: 66.txt
Document index: 11, Similarity score: 0.0214, Filename: 63.txt
Document index: 8, Similarity score: 0.0179, Filename: 50.txt


Evaluation

embedding similarity
Manual inspection - domain expert

Precision recall, F1 score nDCG MAP