# Geometric Data Analysis Project (Semantic Search)

## 1. Loading Data Into DB & Embedding

In [None]:
import polars as pl  # Dataframe library for loading initial dataset
from playhouse.postgres_ext import PostgresqlExtDatabase  # Peewee ORM with extras

from utilities.models import (  # ORM Classes
    database_driver, # Database driver to be used (Uses `config.yaml`)
    create_tables,
    Patent, # Main class, holding the texts
    arctic_noverlap,
    arctic_recursive,
    arctic_sliding,
    minilm_noverlap,
    minilm_recursive,
    minilm_sliding,
)
from utilities.setup import load_config  # Loading Database config


Loading the train and test splits of the "Nuclear Patents" dataset, which is small enough to handle efficiently but large enough for more advanced methods:

In [None]:
# Load the train and test splits of "Nuclear Patents" via `hf://` paths.
# The dataset is small enough for efficient handling.

splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
train = pl.read_parquet('hf://datasets/arcee-ai/nuclear_patents/' + splits['train'])
test = pl.read_parquet('hf://datasets/arcee-ai/nuclear_patents/' + splits['test'])

# Combine both splits and drop every row that has a null in any required column.
# `drop_nulls(subset=...)` is used rather than `filter(pl.col([...]).is_not_null())`:
# a multi-column predicate expands to one expression per column, which recent
# Polars versions reject as ambiguous instead of AND-ing them implicitly.
required_columns = ['patent_number', 'section', 'raw_text']
patent_data = pl.concat([train, test]).drop_nulls(subset=required_columns)

patent_data.head()

Creating the specified tables (Connection and ORM models are set up in `utilities/models.py`):

In [None]:
# Create the database tables using the `create_tables` helper imported from
# `utilities.models` (which tables it creates is defined there, not in this notebook).
create_tables()

Defining a helper that inserts the data from `patent_data` into the `patents` table in batches:

In [None]:
def bulk_insert_patents(patent_data: pl.DataFrame, *, batch_size: int = 1000) -> None:
    """Insert every row of ``patent_data`` through ``Patent.insert_many`` in batches.

    All batches are executed inside a single ``database_driver.atomic()``
    block. Progress is printed after each batch and a summary once all
    batches have been executed.

    Args:
        patent_data: Rows to insert. Each row is converted to a dict with
            ``DataFrame.to_dicts()``; the keys are presumably expected to
            match the ``Patent`` model's field names (verify against
            ``utilities/models.py``).
        batch_size: Maximum number of rows sent per ``insert_many`` call.
            Defaults to 1000, the value that was previously hard-coded.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # A negative step would make `range` yield nothing, so the function would
    # report success without inserting a single row; reject it explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialise the rows before entering the atomic block so that the
    # conversion work is not done while the block is open.
    patent_records = patent_data.to_dicts()
    total = len(patent_records)

    with database_driver.atomic():
        for start in range(0, total, batch_size):
            end = min(start + batch_size, total)

            # Bulk insert one batch
            Patent.insert_many(patent_records[start:end]).execute()

            print(f"Inserted records {start} to {end}")

    print(f"Insertion: Successful ({total} records created)")

Loading embedding models (MiniLM, Arctic Embed M) and chunkers (No overlap splitter, recursive splitter, sliding window splitter)

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [None]:
# Instantiate the two sentence-embedding models by their model identifiers.
mini_lm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # MiniLM
arctic_embed = SentenceTransformer('Snowflake/snowflake-arctic-embed-m')  # Arctic Embed M