# Geometric Data Analysis Project (Semantic Search)

## 1. Loading Data Into DB & Embedding

In [1]:
import polars as pl  # Dataframe library for loading initial dataset
from playhouse.postgres_ext import PostgresqlExtDatabase, fn  # Peewee ORM with extras

from utilities.models import (  # ORM Classes
    database_driver, # Database driver to be used (Uses `config.yaml`)
    create_tables,
    Patent, # Main class, holding the texts
    arctic_noverlap,
    arctic_recursive,
    arctic_sliding,
    minilm_noverlap,
    minilm_recursive,
    minilm_sliding,
)
from utilities.setup import load_config  # Loading Database config


Load the train and test splits of the "Nuclear Patents" dataset, which is small enough to handle efficiently but large enough for more advanced methods:

In [2]:
# Read both parquet shards of the "Nuclear Patents" dataset through the `hf://` path.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}
train, test = (
    pl.read_parquet('hf://datasets/arcee-ai/nuclear_patents/' + splits[split_name])
    for split_name in ('train', 'test')
)

# Stack the two splits, then keep only rows where all three columns are present
# and non-empty and where raw_text is at most 5000 characters long.
# Chained filters are combined with AND, so the result equals one combined predicate.
patent_data = (
    pl.concat([train, test])
    .filter(pl.col("patent_number").is_not_null() & (pl.col("patent_number").str.len_chars() > 0))
    .filter(pl.col("section").is_not_null() & (pl.col("section").str.len_chars() > 0))
    .filter(pl.col("raw_text").is_not_null() & (pl.col("raw_text").str.len_chars() > 0))
    .filter(pl.col("raw_text").str.len_chars() <= 5000)
)

patent_data.head()

patent_number,section,raw_text
str,str,str
"""062326139""","""abstract""","""An angular pumped and emitting…"
"""059600497""","""abstract""","""The operator of a nuclear stea…"
"""042499950""","""abstract""","""In a fast reactor constituted …"
"""051606950""","""abstract""","""An apparatus and method of enh…"
"""044477333""","""claims""","""1. A radiation-shielding trans…"


Creating the specified tables (Connection and ORM models are set up in `utilities/models.py`):

In [3]:
# Open the database connection. `reuse_if_open=True` keeps this cell re-runnable:
# with an already-open connection it returns False instead of raising.
database_driver.connect(reuse_if_open=True)

True

In [10]:
# Create the database tables for the ORM models (helper imported from utilities.models).
# NOTE(review): presumably a no-op for tables that already exist — confirm in the helper.
create_tables()

Inserting the data from `patent_data` into the `patents` table:

In [6]:
def bulk_insert_patents(patent_data: pl.DataFrame, batch_size: int = 1000) -> None:
    """Insert every row of ``patent_data`` into the ``Patent`` table in batches.

    Each row is converted to a dict and extended with a ``search_vector`` entry
    built as ``to_tsvector('english', raw_text)`` before it is inserted. All
    batches are executed inside one ``database_driver.atomic()`` block.

    Args:
        patent_data: Frame whose rows become the inserted records; every row
            must provide a ``raw_text`` value.
        batch_size: Maximum number of records per ``insert_many`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive (a negative value would
            otherwise skip every record while still reporting success).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    patent_records = patent_data.to_dicts()
    total = len(patent_records)

    with database_driver.atomic():
        for start in range(0, total, batch_size):
            batch = patent_records[start:start + batch_size]

            # The search vector is computed by the database from the raw text.
            for record in batch:
                record["search_vector"] = fn.to_tsvector('english', record["raw_text"])

            Patent.insert_many(batch).execute()

            print(f"Inserted records {start} to {min(start + batch_size, total)}")

    print(f"Insertion: Successful ({total} records created)")


In [11]:
# Write the filtered patents to the database.
# NOTE(review): re-running this cell presumably inserts the same rows again
# unless the table enforces uniqueness — confirm before re-executing.
bulk_insert_patents(patent_data)

Inserted records 0 to 1000
Inserted records 1000 to 2000
Inserted records 2000 to 3000
Inserted records 3000 to 4000
Inserted records 4000 to 5000
Inserted records 5000 to 6000
Inserted records 6000 to 6678
Insertion: Successful (6678 records created)


Loading the embedding models (MiniLM, Arctic Embed M) and the chunkers (no-overlap splitter, recursive splitter, sliding-window splitter):

In [4]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [5]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [18]:
class SlidingWindowSplitter:
    """Word-based sliding-window chunker.

    ``window_size`` and ``step`` are counted in whitespace-separated words,
    not characters.
    """

    def __init__(self, window_size=400, step=100):
        """Store the window length and stride.

        Raises:
            ValueError: If ``window_size`` or ``step`` is not positive.
        """
        if window_size <= 0 or step <= 0:
            raise ValueError("window_size and step must be positive integers")
        self.window_size = window_size
        self.step = step

    def split_text(self, text):
        """Split ``text`` into overlapping chunks of at most ``window_size`` words.

        Windows start every ``step`` words. Generation stops after the first
        window that reaches the end of the text, so no chunk is merely a
        suffix of the previous one.

        Returns:
            list[str]: The chunks, each re-joined with single spaces; an empty
            list when ``text`` contains no words.
        """
        words = text.split()  # Splits at whitespace
        chunks = []
        for start in range(0, len(words), self.step):
            chunks.append(' '.join(words[start:start + self.window_size]))
            # This window already covers the tail; later ones would be redundant.
            if start + self.window_size >= len(words):
                break
        return chunks

In [7]:
# The two SentenceTransformer embedding models compared in this notebook, loaded by model name.
mini_lm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
arctic_embed = SentenceTransformer('Snowflake/snowflake-arctic-embed-m')

In [8]:
# "No overlap" chunker: splits on single spaces with a chunk size of 400 and no overlap.
fixed_size_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=400,  # Number of characters per chunk
    chunk_overlap=0  # No overlap
)

In [9]:
# Recursive chunker: chunk size 400 with an overlap of 100. The candidate
# separators are listed from coarsest (blank line) to finest (empty string).
sentence_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    chunk_size=400,
    chunk_overlap=100,
)

In [20]:
# Sliding-window chunker. Both values are counted in whitespace-separated words,
# because SlidingWindowSplitter.split_text splits the text with str.split().
# NOTE(review): 400-word windows may exceed the embedding models' maximum input
# length — confirm whether truncation matters for this comparison.
window_splitter = SlidingWindowSplitter(window_size=400, step=100)

In [14]:
def bulk_insert_chunks_and_embeddings(chunker, embedding_model, chunk_db, batch_size=1000):
    """Chunk every patent's text, embed the chunks and bulk-insert them.

    Patents are processed in batches. For each batch, all chunks are embedded
    with a single ``encode`` call and written with a single ``insert_many``
    call inside a ``database_driver.atomic()`` block.

    Args:
        chunker: Object with ``split_text(text)`` returning a list of chunk strings.
        embedding_model: Object with ``encode(list_of_texts)`` returning one
            vector per text; each vector must provide ``tolist()``.
        chunk_db: ORM model class that receives the rows through ``insert_many``.
            Rows carry the keys ``patent_number`` (filled with the patent's
            ``id``), ``chunk_text`` and ``embedding``.
        batch_size: Number of patents handled per batch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Run the query once; the batches below are slices of this list.
    patents = list(Patent.select())

    for start in range(0, len(patents), batch_size):
        batch = patents[start:start + batch_size]

        # Parallel lists: owner_ids[k] is the patent that chunk_texts[k] came from.
        owner_ids = []
        chunk_texts = []
        for patent in batch:
            for chunk in chunker.split_text(patent.raw_text):
                owner_ids.append(patent.id)
                chunk_texts.append(chunk)

        # A batch can yield no chunks (e.g. whitespace-only texts); skip the
        # encode and insert in that case instead of issuing an empty bulk insert.
        if chunk_texts:
            embeddings = embedding_model.encode(chunk_texts)

            chunk_data = [
                {
                    'patent_number': owner_id,
                    'chunk_text': chunk_text,
                    'embedding': embedding.tolist(),
                }
                for owner_id, chunk_text, embedding in zip(owner_ids, chunk_texts, embeddings)
            ]

            with database_driver.atomic():
                chunk_db.insert_many(chunk_data).execute()

        print(f"Inserted chunks for patents {start} to {start + len(batch)}")

In [15]:
# MiniLM embeddings of the no-overlap chunks, written through `minilm_noverlap`.
bulk_insert_chunks_and_embeddings(
    fixed_size_splitter,
    mini_lm,
    minilm_noverlap
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [16]:
# MiniLM embeddings of the recursive-splitter chunks, written through `minilm_recursive`.
bulk_insert_chunks_and_embeddings(
    sentence_splitter,
    mini_lm,
    minilm_recursive
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [21]:
# MiniLM embeddings of the sliding-window chunks, written through `minilm_sliding`.
bulk_insert_chunks_and_embeddings(
    window_splitter,
    mini_lm,
    minilm_sliding
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [22]:
# Arctic Embed embeddings of the no-overlap chunks, written through `arctic_noverlap`.
bulk_insert_chunks_and_embeddings(
    fixed_size_splitter,
    arctic_embed,
    arctic_noverlap
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [23]:
# Arctic Embed embeddings of the recursive-splitter chunks, written through `arctic_recursive`.
bulk_insert_chunks_and_embeddings(
    sentence_splitter,
    arctic_embed,
    arctic_recursive
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678


In [24]:
# Arctic Embed embeddings of the sliding-window chunks, written through `arctic_sliding`.
bulk_insert_chunks_and_embeddings(
    window_splitter,
    arctic_embed,
    arctic_sliding
)

Inserted chunks for patents 0 to 1000
Inserted chunks for patents 1000 to 2000
Inserted chunks for patents 2000 to 3000
Inserted chunks for patents 3000 to 4000
Inserted chunks for patents 4000 to 5000
Inserted chunks for patents 5000 to 6000
Inserted chunks for patents 6000 to 6678
