# Geometric Data Analysis Project (Semantic Search)

## 1. Loading Data Into DB & Embedding

In [1]:
import polars as pl  # Dataframe library for loading initial dataset
from playhouse.postgres_ext import PostgresqlExtDatabase, fn  # Peewee ORM with extras

from utilities.models import (  # ORM Classes
    database_driver, # Database driver to be used (Uses `config.yaml`)
    create_tables,
    Patent, # Main class, holding the texts
    arctic_noverlap,
    arctic_recursive,
    arctic_sliding,
    minilm_noverlap,
    minilm_recursive,
    minilm_sliding,
)
from utilities.setup import load_config  # Loading Database config


Loading train and test split of "Nuclear Patents", which is small enough for efficient handling, but large enough for more advanced methods:

In [8]:
# Loads train and test split of "Nuclear Patents"
# Small enough for efficient handling

splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
train = pl.read_parquet('hf://datasets/arcee-ai/nuclear_patents/' + splits['train'])
test = pl.read_parquet('hf://datasets/arcee-ai/nuclear_patents/' + splits['test'])

# Combining train and test splits and filtering out nulls
patent_data = pl.concat([train, test]).filter(
    pl.col("patent_number").is_not_null() & (pl.col("patent_number").str.len_chars() > 0) &
    pl.col("section").is_not_null() & (pl.col("section").str.len_chars() > 0) &
    pl.col("raw_text").is_not_null() & (pl.col("raw_text").str.len_chars() > 0) &
    (pl.col("raw_text").str.len_chars() <= 5000)  # Add filter for raw_text length <= 5000
)

patent_data.head()

patent_number,section,raw_text
str,str,str
"""062326139""","""abstract""","""An angular pumped and emitting…"
"""059600497""","""abstract""","""The operator of a nuclear stea…"
"""042499950""","""abstract""","""In a fast reactor constituted …"
"""051606950""","""abstract""","""An apparatus and method of enh…"
"""044477333""","""claims""","""1. A radiation-shielding trans…"


Creating the specified tables (Connection and ORM models are set up in `utilities/models.py`):

In [9]:
database_driver.connect()

OperationalError: Connection already opened.

In [10]:
create_tables()

Inserting the data from `patent_data` into `patents` table

In [6]:
def bulk_insert_patents(patent_data: pl.DataFrame) -> None:
    with database_driver.atomic():
        batch_size = 1000

        patent_records = patent_data.to_dicts()

        for i in range(0, len(patent_records), batch_size):
            batch = patent_records[i:i + batch_size]

            # Add search_vector field before inserting
            for record in batch:
                record["search_vector"] = fn.to_tsvector('english', record["raw_text"])

            # Bulk insert batch
            Patent.insert_many(batch).execute()

            print(f"Inserted records {i} to {min(i + batch_size, len(patent_records))}")

    print(f"Insertion: Successful ({len(patent_records)} records created)")


In [11]:
bulk_insert_patents(patent_data)

Inserted records 0 to 1000
Inserted records 1000 to 2000
Inserted records 2000 to 3000
Inserted records 3000 to 4000
Inserted records 4000 to 5000
Inserted records 5000 to 6000
Inserted records 6000 to 6678
Insertion: Successful (6678 records created)


Loading embedding models (MiniLM, Arctic Embed M) and chunkers (No overlap splitter, recursive splitter, sliding window splitter)

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [None]:
mini_lm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
arctic_embed = SentenceTransformer('Snowflake/snowflake-arctic-embed-m')

In [None]:
fixed_size_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=400,  # Number of characters per chunk
    chunk_overlap=0  # No overlap
)

In [None]:
sentence_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap  = 100,
    separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
)

In [None]:
def sliding_window_chunking(text, window_size=400, step=100):
    """Implements sliding window chunking with LangChain."""
    words = text.split()
    chunks = [' '.join(words[i:i+window_size]) for i in range(0, len(words), step)]
    return chunks

In [None]:
def bulk_insert(patent_data: pl.DataFrame, splitter) -> None:
    with database_driver.atomic():
        batch_size = 1000

        patent_records = patent_data.to_dicts()

        for i in range(0, len(patent_records), batch_size):
            batch = patent_records[i:i + batch_size]

            # Add search_vector field before inserting
            for record in batch:
                record["search_vector"] = fn.to_tsvector('english', record["raw_text"])

            # Bulk insert batch
            Patent.insert_many(batch).execute()

            print(f"Inserted records {i} to {min(i + batch_size, len(patent_records))}")

    print(f"Insertion: Successful ({len(patent_records)} records created)")