# Geometric Data Analysis Project (Semantic Search)

## 1. Loading Data Into DB & Embedding

In [None]:
import polars as pl  # Dataframe library for loading initial dataset
from playhouse.postgres_ext import PostgresqlExtDatabase, fn  # Peewee ORM with extras

from utilities.models import (  # ORM Classes
    database_driver, # Database driver to be used (Uses `config.yaml`)
    create_tables,
    Patent, # Main class, holding the texts
    arctic_noverlap,
    arctic_recursive,
    arctic_sliding,
    minilm_noverlap,
    minilm_recursive,
    minilm_sliding,
)


In [3]:
from utilities.setup import load_config  # Loading Database config

Loading the train and test splits of the "Nuclear Patents" dataset, which is small enough for efficient handling yet large enough for more advanced methods:

In [2]:
# "Nuclear Patents" is published as one parquet shard per split; the dataset
# is small enough for efficient handling.
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
train, test = (
    pl.read_parquet('hf://datasets/arcee-ai/nuclear_patents/' + splits[split_name])
    for split_name in ('train', 'test')
)

# Row filter: raw_text is capped at 5000 characters, and each of the three
# text columns has to be present and non-empty.
keep_row = pl.col("raw_text").str.len_chars() <= 5000
for column in ("patent_number", "section", "raw_text"):
    keep_row = keep_row & pl.col(column).is_not_null() & (pl.col(column).str.len_chars() > 0)

# Stack both splits into one frame, then apply the filter.
patent_data = pl.concat([train, test]).filter(keep_row)

patent_data.head()

patent_number,section,raw_text
str,str,str
"""062326139""","""abstract""","""An angular pumped and emitting…"
"""059600497""","""abstract""","""The operator of a nuclear stea…"
"""042499950""","""abstract""","""In a fast reactor constituted …"
"""051606950""","""abstract""","""An apparatus and method of enh…"
"""044477333""","""claims""","""1. A radiation-shielding trans…"


Creating the specified tables (Connection and ORM models are set up in `utilities/models.py`):

In [2]:
# Open the connection on the driver imported from `utilities.models`.
database_driver.connect()

True

In [10]:
# Create the tables for the ORM models imported from `utilities.models`.
create_tables()

Inserting the data from `patent_data` into the `patents` table:

In [6]:
def bulk_insert_patents(patent_data: pl.DataFrame, batch_size: int = 1000) -> None:
    """Insert every row of ``patent_data`` into the ``Patent`` table in batches.

    Each row is converted to a dict and extended with a ``search_vector``
    entry built by ``to_tsvector('english', raw_text)``. All batches run
    inside one ``database_driver.atomic()`` block.

    Args:
        patent_data: Frame whose rows become ``Patent`` rows; every row must
            carry a ``raw_text`` value.
        batch_size: Maximum number of rows per ``insert_many`` statement.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative value would
            otherwise make the loop a no-op while the function still reports
            success.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    patent_records = patent_data.to_dicts()

    with database_driver.atomic():
        for i in range(0, len(patent_records), batch_size):
            # Build fresh dicts rather than mutating the converted rows in place.
            batch = [
                {**record, "search_vector": fn.to_tsvector('english', record["raw_text"])}
                for record in patent_records[i:i + batch_size]
            ]

            # Bulk insert batch
            Patent.insert_many(batch).execute()

            print(f"Inserted records {i} to {min(i + batch_size, len(patent_records))}")

    print(f"Insertion: Successful ({len(patent_records)} records created)")


In [11]:
# Run the batched insert on the filtered `patent_data` frame.
bulk_insert_patents(patent_data)

Inserted records 0 to 1000
Inserted records 1000 to 2000
Inserted records 2000 to 3000
Inserted records 3000 to 4000
Inserted records 4000 to 5000
Inserted records 5000 to 6000
Inserted records 6000 to 6678
Insertion: Successful (6678 records created)


Loading the embedding models (MiniLM, Arctic Embed M) and the chunkers (no-overlap splitter, recursive splitter, sliding-window splitter):

In [6]:
import torch

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [7]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [18]:
class SlidingWindowSplitter:
    """Word-level sliding-window chunker with a ``split_text`` method.

    The text is split on whitespace; each chunk is ``window_size`` consecutive
    words re-joined with single spaces, and successive windows start ``step``
    words apart.

    Args:
        window_size: Number of words per chunk (must be positive).
        step: Number of words the window advances each time (must be positive).

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """

    def __init__(self, window_size=400, step=100):
        if window_size <= 0 or step <= 0:
            raise ValueError(
                f"window_size and step must be positive, got window_size={window_size}, step={step}"
            )
        self.window_size = window_size
        self.step = step

    def split_text(self, text):
        """Return the list of sliding-window chunks for ``text``.

        The window stops as soon as it has covered the last word, so no chunk
        is a pure suffix of the chunk before it. Empty or whitespace-only
        text yields an empty list.
        """
        words = text.split()  # Splits at whitespace
        chunks = []
        for start in range(0, len(words), self.step):
            chunks.append(' '.join(words[start:start + self.window_size]))
            # Once a window reaches the final word, any further window would
            # only repeat words already contained in this one.
            if start + self.window_size >= len(words):
                break
        return chunks

In [8]:
# The two embedding models compared in this notebook, loaded by checkpoint name.
# NOTE(review): the `device` chosen earlier is not passed to either constructor —
# presumably sentence-transformers picks the device itself; confirm.
# NOTE(review): the search functions encode the raw keyword with no query prompt;
# check the arctic-embed model card for whether a query prefix is expected.
mini_lm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
arctic_embed = SentenceTransformer('Snowflake/snowflake-arctic-embed-m')

In [None]:
# "No overlap" chunker: splits on single spaces, with zero overlap between chunks.
# NOTE(review): chunk_size is presumably measured in characters (langchain's
# default length function) — confirm.
fixed_size_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=400,
    chunk_overlap=0
)

In [9]:
# "Recursive" chunker: the separators are listed from coarse (blank line) to
# fine (empty string), with an overlap of 100 between neighbouring chunks.
sentence_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap  = 100,
    separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
)

In [20]:
# "Sliding" chunker: the class splits on whitespace, so both numbers are word counts.
# NOTE(review): the two langchain splitters' chunk_size=400 looks to be in
# characters, so the chunk sizes are probably not comparable — confirm intent.
window_splitter = SlidingWindowSplitter(
    window_size = 400,
    step = 100
)

In [None]:
def bulk_insert_chunks_and_embeddings(chunker, embedding_model, chunk_db, batch_size=1000):
    """Chunk every patent's ``raw_text``, embed the chunks and store them in ``chunk_db``.

    Patents are read with ``Patent.select()`` and processed ``batch_size`` at a
    time. For each batch, all chunks are embedded with one ``encode`` call and
    written with one ``insert_many`` inside ``database_driver.atomic()``.

    Args:
        chunker: Object with a ``split_text(text)`` method returning a list of strings.
        embedding_model: Object with an ``encode(list_of_texts)`` method; each
            returned vector must support ``.tolist()``.
        chunk_db: Model class for the target table, with ``patent_number``,
            ``chunk_text`` and ``embedding`` fields.
        batch_size: Number of patents per batch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    patent_data = Patent.select()
    for i in range(0, len(patent_data), batch_size):
        batch = patent_data[i:i + batch_size]

        # Parallel lists: chunk_owner_ids[k] is the patent id that chunks_to_embed[k] came from.
        chunk_owner_ids = []
        chunks_to_embed = []
        for patent in batch:
            chunks = chunker.split_text(patent.raw_text)
            chunks_to_embed.extend(chunks)
            chunk_owner_ids.extend([patent.id] * len(chunks))

        # A batch can produce no chunks at all; skip encode/insert in that case.
        if chunks_to_embed:
            # Generate embeddings in bulk
            embeddings = embedding_model.encode(chunks_to_embed)

            # Build complete rows in one pass instead of patching placeholders.
            chunk_data = [
                {
                    'patent_number': owner_id,
                    'chunk_text': chunk,
                    'embedding': embedding.tolist(),
                }
                for owner_id, chunk, embedding in zip(chunk_owner_ids, chunks_to_embed, embeddings)
            ]

            # Insert chunks in bulk
            with database_driver.atomic():
                chunk_db.insert_many(chunk_data).execute()
        print(f"Inserted chunks for patents {i} to {i + len(batch)}")

In [15]:
# MiniLM embeddings of the no-overlap chunks -> `minilm_noverlap` table.
bulk_insert_chunks_and_embeddings(
    fixed_size_splitter,
    mini_lm,
    minilm_noverlap
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [16]:
# MiniLM embeddings of the recursive-splitter chunks -> `minilm_recursive` table.
bulk_insert_chunks_and_embeddings(
    sentence_splitter,
    mini_lm,
    minilm_recursive
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [21]:
# MiniLM embeddings of the sliding-window chunks -> `minilm_sliding` table.
bulk_insert_chunks_and_embeddings(
    window_splitter,
    mini_lm,
    minilm_sliding
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [22]:
# Arctic Embed embeddings of the no-overlap chunks -> `arctic_noverlap` table.
bulk_insert_chunks_and_embeddings(
    fixed_size_splitter,
    arctic_embed,
    arctic_noverlap
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [23]:
# Arctic Embed embeddings of the recursive-splitter chunks -> `arctic_recursive` table.
bulk_insert_chunks_and_embeddings(
    sentence_splitter,
    arctic_embed,
    arctic_recursive
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [24]:
# Arctic Embed embeddings of the sliding-window chunks -> `arctic_sliding` table.
bulk_insert_chunks_and_embeddings(
    window_splitter,
    arctic_embed,
    arctic_sliding
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


## 2. Search Functions

### Search Functions:

In [16]:
def semantic_search(keyword, chunk_db, embedding_model, fetch_size=10, max_distance=0.6):
    """Return the chunks of ``chunk_db`` closest to ``keyword`` by cosine distance.

    Args:
        keyword: Query text; embedded with ``embedding_model.encode``.
        chunk_db: Chunk model class with ``patent_number``, ``chunk_text`` and
            ``embedding`` fields.
        embedding_model: Model used to embed the query. It should be the same
            model that produced the stored embeddings.
        fetch_size: Maximum number of chunks to return.
        max_distance: Chunks with cosine distance at or above this value are dropped.

    Returns:
        A polars DataFrame with ``patent_number`` and ``chunk_text`` columns,
        nearest chunk first, or an empty DataFrame when nothing matches.
    """
    embedding = embedding_model.encode(keyword)
    distance = chunk_db.embedding.cosine_distance(embedding)

    results = (
        chunk_db
        .select(chunk_db.patent_number, chunk_db.chunk_text)
        .where(distance < max_distance)
        .order_by(distance.asc())
        .limit(fetch_size)
    )

    # Materialise once: a separate exists() check would run the vector query twice.
    rows = list(results.dicts())
    return pl.DataFrame(rows) if rows else pl.DataFrame()

In [None]:
def hybrid_search(keyword, chunk_db, patent_db, embedding_model, fetch_size=10, lambda_weight=0.5,
                  candidate_limit=100, max_distance=0.6):
    """Rank patents by a weighted mix of semantic and full-text scores.

    Two candidate sets are fetched: chunks of ``chunk_db`` near the query
    embedding (score = 1 - cosine distance) and rows of ``patent_db`` whose
    ``search_vector`` matches the keyword (score = ``ts_rank_cd``). They are
    inner-joined on the patent id, so a patent must appear in both sets to be
    returned.

    Args:
        keyword: Free-text query; it may contain several words.
        chunk_db: Chunk model class whose ``patent_number`` holds the patent id.
        patent_db: Patent model class with ``id``, ``patent_number``,
            ``section``, ``raw_text`` and ``search_vector`` fields.
        embedding_model: Model used to embed the query.
        fetch_size: Maximum number of patents to return.
        lambda_weight: Weight of the semantic score; the keyword score gets
            ``1 - lambda_weight``.
        candidate_limit: Maximum number of candidates fetched per search.
        max_distance: Cosine-distance cutoff for semantic candidates.

    Returns:
        A polars DataFrame with ``patent_number``, ``section`` and ``raw_text``
        columns ordered by descending hybrid score, or an empty DataFrame.
    """
    embedding = embedding_model.encode(keyword)

    # Semantic Search
    distance = chunk_db.embedding.cosine_distance(embedding)
    semantic_query = (
        chunk_db
        .select(chunk_db.patent_number, (1 - distance).alias('semantic_score'))
        .where(distance < max_distance)
        .order_by(distance.asc())
        .limit(candidate_limit)
    )
    # Materialise once instead of exists() + dicts(), which runs the query twice.
    semantic_rows = list(semantic_query.dicts())

    # Keyword-based Full-Text Search
    # plainto_tsquery accepts raw multi-word input (to_tsquery rejects it), and
    # 'english' matches the configuration used to build search_vector.
    ts_query = fn.plainto_tsquery('english', keyword)
    keyword_rank = fn.ts_rank_cd(patent_db.search_vector, ts_query)
    keyword_query = (
        patent_db
        .select(patent_db.id, keyword_rank.alias('keyword_score'))
        .where(patent_db.search_vector.match(keyword, language='english', plain=True))
        .order_by(keyword_rank.desc())
        .limit(candidate_limit)
    )
    keyword_rows = list(keyword_query.dicts())

    # The join below is an inner join, so one empty side means no results.
    if not semantic_rows or not keyword_rows:
        return pl.DataFrame()

    # Same key dtype on both sides of the join
    keyword_df = pl.DataFrame(keyword_rows).select(
        pl.col("id").cast(pl.Utf8),
        pl.col("keyword_score").cast(pl.Float64),
    )
    # A patent can contribute several chunks; keep its best chunk score so each
    # patent takes at most one slot in the ranking.
    semantic_df = (
        pl.DataFrame(semantic_rows)
        .select(
            pl.col("patent_number").cast(pl.Utf8),
            pl.col("semantic_score").cast(pl.Float64),
        )
        .group_by("patent_number")
        .agg(pl.col("semantic_score").max())
    )

    # Merge on 'id' from keyword_df and 'patent_number' from semantic_df
    combined_df = keyword_df.join(semantic_df, left_on="id", right_on="patent_number", how="inner")

    # Compute Final Hybrid Score
    combined_df = combined_df.with_columns(
        (lambda_weight * pl.col("semantic_score")
         + (1 - lambda_weight) * pl.col("keyword_score")).alias("final_score")
    )

    # Top patent ids by hybrid score, best first
    top_patent_ids = combined_df.sort("final_score", descending=True)["id"].head(fetch_size).to_list()

    if not top_patent_ids:
        return pl.DataFrame()

    # Fetch Full Patent Data for the Top IDs
    patent_rows = list(
        patent_db
        .select(patent_db.id, patent_db.patent_number, patent_db.section, patent_db.raw_text)
        .where(patent_db.id.in_(top_patent_ids))
        .dicts()
    )

    # IN (...) returns rows in arbitrary order; restore the hybrid ranking.
    rank_of = {patent_id: position for position, patent_id in enumerate(top_patent_ids)}
    patent_rows.sort(key=lambda row: rank_of.get(str(row["id"]), len(rank_of)))

    # `id` was only needed for ordering; the output keeps the original three columns.
    return pl.DataFrame(
        [{column: value for column, value in row.items() if column != "id"} for row in patent_rows]
    )

### Keywords for testing:

In [8]:
# Queries used for the demo searches and for the retrieval benchmark below.
# NOTE(review): "nuclear stream" may be a typo for "nuclear steam" — confirm before re-running the benchmark.
keyword_list = [
    "nuclear stream",
    "reactor",
    "nuclear fusion",
    "radiation-shielding",
    "X-ray",
    "irradiation",
    "Si crystal"
]

In [17]:
# Demo: pure semantic search for keyword_list[4] ("X-ray") on the MiniLM no-overlap table.
res = semantic_search(
    keyword_list[4],
    minilm_noverlap,
    mini_lm
)

res.head()

patent_number,chunk_text
i64,str
18524,"""incident to the X-ray detector…"
13570,"""of the X-ray beam."""
14460,"""passage of X-rays correspondin…"
17716,"""rest. 13. An X-ray examination…"
14589,"""1. An X-ray examination appara…"


In [66]:
# Demo: hybrid search for the same keyword ("X-ray"); returns whole patents rather than chunks.
res = hybrid_search(
    keyword_list[4],
    minilm_noverlap,
    Patent,
    mini_lm
)

res.head()

patent_number,section,raw_text
str,str,str
"""051596210""","""claims""","""1. An X-ray transmitting windo…"
"""062755684""","""claims""","""1. An X-ray examination appara…"
"""060944714""","""claims""","""1. X-ray concentrator comprisi…"
"""046882421""","""summary""","""BACKGROUND OF THE INVENTION Th…"


## 3. Benchmark Context Relevance Using TruLens

In [9]:
from benchmark.benchmark_retrieval import evaluate_retrieval

In [33]:
def evaluate_model_for_keywords(model, embedding_model, search_function, keywords=None):
    """Return the mean ``context_relevance`` of a retrieval setup over a set of keywords.

    Args:
        model: Chunk table/model forwarded to ``evaluate_retrieval``.
        embedding_model: Embedding model forwarded to ``evaluate_retrieval``.
        search_function: Search callable forwarded to ``evaluate_retrieval``.
        keywords: Queries to evaluate. Defaults to the notebook-level
            ``keyword_list`` when omitted.

    Returns:
        The arithmetic mean of the per-keyword ``"context_relevance"`` values,
        or 0 when there are no keywords.
    """
    if keywords is None:
        keywords = keyword_list

    # One evaluate_retrieval call per keyword; only its "context_relevance" entry is used.
    relevance_scores = [
        evaluate_retrieval(keyword, model, embedding_model, search_function)["context_relevance"]
        for keyword in keywords
    ]

    # Compute the mean relevance for this model across all keywords
    return sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0

In [34]:
# Mean context relevance for the three MiniLM tables, using pure semantic search.
minilm_noverlap_res, minilm_recursive_res, minilm_sliding_res = (
    evaluate_model_for_keywords(chunk_table, mini_lm, semantic_search)
    for chunk_table in (minilm_noverlap, minilm_recursive, minilm_sliding)
)
# Same evaluation for the three Arctic Embed tables.
arctic_noverlap_res, arctic_recursive_res, arctic_sliding_res = (
    evaluate_model_for_keywords(chunk_table, arctic_embed, semantic_search)
    for chunk_table in (arctic_noverlap, arctic_recursive, arctic_sliding)
)

✅ In context_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance, input context will be set to text .
✅ In context_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance, input context will be set to text .
✅ In context_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance, input context will be set to text .
✅ In context_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance, input context will be set to text .
✅ In context_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance, input context will be set to text .
✅ In context_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance, input context will be set to text .
✅ In context_relevance, inpu

In [35]:
# One line per table: label followed by its mean context relevance.
for label, score in (
    ("minilm_noverlap Result:", minilm_noverlap_res),
    ("minilm_recursive Result:", minilm_recursive_res),
    ("minilm_sliding Result:", minilm_sliding_res),
    ("arctic_noverlap Result:", arctic_noverlap_res),
    ("arctic_recursive Result:", arctic_recursive_res),
    ("arctic_sliding Result:", arctic_sliding_res),
):
    print(label, score)

minilm_noverlap Result: 0.9047619047619048
minilm_recursive Result: 1.0
minilm_sliding Result: 1.0
arctic_noverlap Result: 0.19047619047619047
arctic_recursive Result: 0.19047619047619047
arctic_sliding Result: 0.14285714285714285


In [36]:
# Load the benchmark table from the CSV in ./assets; it feeds the plots below.
benchmark_data = pl.read_csv("./assets/benchmark.csv")
benchmark_data

model_name,elapsed_time,number_of_records,mean_context_relevance
str,f64,i64,f64
"""minilm_noverlap""",137.5,36849,0.904762
"""minilm_recursive""",201.8,50475,1.0
"""minilm_sliding""",95.1,25009,1.0
"""arctic_noverlap""",341.7,36849,0.190476
"""arctic_recursive""",511.7,50475,0.190476
"""arctic_sliding""",330.8,25009,0.142857


## 4. Plotting Results

In [41]:
import altair as alt

In [54]:
# Create the bar plot
# Bar chart of mean context relevance per table, with explicit title and axis labels.
plot = (
    benchmark_data.plot.bar(x="model_name", y="mean_context_relevance")
    .properties(
        title="Model Performance Comparison by Mean Context Relevance (higher is better)",
        width=600,
    )
    .encode(
        x=alt.X('model_name:N', title='Model Name'),
        y=alt.Y('mean_context_relevance:Q', title='Mean Context Relevance'),
    )
)

# Render the chart
plot.show()


In [51]:
# Create the bar plot
# Bar chart of elapsed time per table, with explicit title and axis labels.
plot = (
    benchmark_data.plot.bar(x="model_name", y="elapsed_time")
    .properties(
        title="Model Performance Comparison by Elapsed Time (lower is better)",
        width=600,
    )
    .encode(
        x=alt.X('model_name:N', title='Model Name'),
        y=alt.Y('elapsed_time:Q', title='Elapsed Time'),
    )
)

# Render the chart
plot.show()

In [53]:
# Create the bar plot
# Bar chart of stored record count per table, with explicit title and axis labels.
plot = (
    benchmark_data.plot.bar(x="model_name", y="number_of_records")
    .properties(
        title="Model Performance Comparison by Number of DataBase Records",
        width=600,
    )
    .encode(
        x=alt.X('model_name:N', title='Model Name'),
        y=alt.Y('number_of_records:Q', title='Number of DataBase Records'),
    )
)

# Render the chart
plot.show()