In [1]:
import os
import sys
# Put the current working directory at the front of the module search path.
# NOTE(review): presumably so that `schema_matcher` resolves to the copy that
# sits next to this notebook -- confirm the notebook is launched from that directory.
sys.path.insert(0, os.getcwd())

# Project-local matcher API used throughout the rest of the notebook.
from schema_matcher import (
    SchemaMatcher, 
    SchemaVocabulary, 
    load_schema_org_vocabulary
)

# Reaching this line means the import above succeeded.
print("Schema matcher module loaded")

Schema matcher module loaded


## Configuration

In [2]:
# Embedding backend settings.
# A matching model has to be loaded in LM Studio first, for example one of:
#   - nomic-ai/nomic-embed-text-v1.5
#   - BAAI/bge-small-en-v1.5

# Endpoint is taken from LM_STUDIO_BASE_URL when set; otherwise the default below is used
EMBED_BASE_URL = os.getenv("LM_STUDIO_BASE_URL", "http://host.docker.internal:1234/v1")
# Adjust to match whichever model you actually loaded
EMBED_MODEL = "s3dev-ai/text-embedding-nomic-embed-text-v1.5"

# Directory that holds the saved vocabulary index; created up front if missing
VOCAB_CACHE_DIR = "data/vocab_cache"
os.makedirs(VOCAB_CACHE_DIR, exist_ok=True)

# Echo the effective settings so they are recorded in the cell output
print(
    f"Embedding endpoint: {EMBED_BASE_URL}",
    f"Embedding model: {EMBED_MODEL}",
    sep="\n",
)

Embedding endpoint: http://host.docker.internal:1234/v1
Embedding model: s3dev-ai/text-embedding-nomic-embed-text-v1.5


## Load Schema.org Vocabulary

In [3]:
# Obtain the schema.org vocabulary through the project loader
# (fetched from the web when no cached copy is available)
schema_vocab = load_schema_org_vocabulary()

# Summarise what was loaded: one line per term category
print(
    "\nLoaded schema.org:",
    f"  - {len(schema_vocab.get_classes())} classes",
    f"  - {len(schema_vocab.get_properties())} properties",
    sep="\n",
)

Fetching schema.org from https://schema.org/version/latest/schemaorg-current-https.jsonld...
Processing 3187 schema.org definitions...
Loaded 930 classes and 1520 properties

Loaded schema.org:
  - 930 classes
  - 1520 properties


## Initialize Matcher and Build Index

In [4]:
# Instantiate the matcher against the configured embedding endpoint and model
matcher = SchemaMatcher(embed_base_url=EMBED_BASE_URL, embed_model=EMBED_MODEL)

# Register the schema.org vocabulary with the matcher
matcher.add_vocabulary(schema_vocab)

# Smoke-test the embedding backend with a single throwaway string before
# committing to the much longer index build
print("Testing embedding connection...")
test_embedding = matcher.embed(["test connection"])
print(
    f"  Embedding dimension: {test_embedding.shape[1]}",
    "  Connection OK!",
    sep="\n",
)

Testing embedding connection...
  Embedding dimension: 768
  Connection OK!


In [5]:
# Build embedding index for all vocabulary terms
# This takes a minute or two depending on vocabulary size and model
# NOTE(review): this runs unconditionally on every execution of the notebook;
# consider loading a previously saved index from VOCAB_CACHE_DIR when one exists.
print("Building embedding index...")
matcher.build_all_indexes()
print("Done!")

Building embedding index...
Building index for Schema.org...
  Indexed 2450 terms
Done!


In [6]:
# Save for reuse
# Writes the matcher state under VOCAB_CACHE_DIR; the same directory is later
# passed to SchemaMatcher.load() to restore the matcher.
matcher.save(VOCAB_CACHE_DIR)
print(f"Saved vocabulary index to {VOCAB_CACHE_DIR}")

Saved vocabulary index to data/vocab_cache


## Test the Matcher

In [7]:
# Test finding classes
# Prints each candidate class with its similarity score and a preview of its
# description (at most 60 characters).
print("Finding class for 'a famous scientist who made important discoveries':")
results = matcher.find_class("a famous scientist who made important discoveries")
for r in results:
    description = r['description']
    # Only add the ellipsis when the description was actually cut off;
    # previously short descriptions were shown as e.g. "Researchers....".
    suffix = "..." if len(description) > 60 else ""
    print(f"  {r['prefix']} ({r['score']:.3f}): {description[:60]}{suffix}")

Finding class for 'a famous scientist who made important discoveries':
  schema:Researcher (0.653): Researchers....
  schema:Specialty (0.562): Any branch of a field in which people typically develop spec...
  schema:DiscoverAction (0.557): The act of discovering/finding an object....
  schema:MathSolver (0.544): A math solver which is capable of solving a subset of mathem...
  schema:ChemicalSubstance (0.543): A chemical substance is 'a portion of matter of constant com...


In [8]:
# Property lookup demo: a birth-date description, with the subject type given as Person
print("Finding property for 'the date when a person was born':")
results = matcher.find_property(
    "the date when a person was born",
    subject_type="Person",
)
# Two lines per candidate: identifier with score, then its domain and range
for hit in results:
    print(
        f"  {hit['prefix']} ({hit['score']:.3f})",
        f"    domain: {hit['domain']}, range: {hit['range']}",
        sep="\n",
    )

Finding property for 'the date when a person was born':
  schema:birthDate (0.876)
    domain: Person, range: Date
  schema:birthPlace (0.795)
    domain: Person, range: Place
  schema:deathDate (0.771)
    domain: Person, range: Date
  schema:nationality (0.734)
    domain: Person, range: Country
  schema:parents (0.714)
    domain: Person, range: Person


In [9]:
# Test finding property for "worked at"
# Prints each candidate property with its similarity score and a preview of
# its description (at most 50 characters).
print("Finding property for 'where someone worked or was employed':")
results = matcher.find_property("where someone worked or was employed", subject_type="Person")
for r in results:
    description = r['description']
    # Only add the ellipsis when the description was actually cut off;
    # previously every line ended in "..." even for short descriptions.
    suffix = "..." if len(description) > 50 else ""
    print(f"  {r['prefix']} ({r['score']:.3f}): {description[:50]}{suffix}")

Finding property for 'where someone worked or was employed':
  schema:worksFor (0.765): Organizations that the person works for....
  schema:employee (0.764): Someone working for this organization....
  schema:colleagues (0.743): A colleague of the person....
  schema:colleague (0.738): A colleague of the person....
  schema:employees (0.735): People working for this organization....


In [10]:
# End-to-end demo: match subject, predicate and object descriptions in one call
print("Finding components for: 'Einstein worked at Princeton University'")
results = matcher.find_triple_components(
    subject_desc="Albert Einstein, a famous physicist",
    predicate_desc="worked at or was employed by",
    object_desc="Princeton University, an academic institution",
)

# Show the top three candidates from each result list, in a fixed order.
# Each entry pairs the heading to print with the key to read from `results`.
for heading, key in (
    ("\nSubject types:", 'subject_types'),
    ("\nPredicates:", 'predicates'),
    ("\nObject types:", 'object_types'),
):
    print(heading)
    for r in results[key][:3]:
        print(f"  {r['prefix']} ({r['score']:.3f})")

Finding components for: 'Einstein worked at Princeton University'



Subject types:
  schema:Researcher (0.580)
  schema:Dentist (0.579)
  schema:EducationalAudience (0.568)

Predicates:
  schema:employee (0.670)
  schema:worksFor (0.659)
  schema:employmentUnit (0.648)

Object types:
  schema:CollegeOrUniversity (0.712)
  schema:ResearchOrganization (0.676)
  schema:Organization (0.667)


## Load Pre-built Index

Once the index has been built and saved (see the save step above), you can load it from the cache directory instead of building it again:

In [11]:
# Example of loading pre-built index
# Restores a matcher from the directory written by matcher.save() and rebinds
# the notebook-level `matcher` name to the loaded instance.
# NOTE(review): only embed_base_url is passed here, not embed_model -- confirm
# that load() restores the model name from the cache or uses a suitable default.
matcher = SchemaMatcher.load(VOCAB_CACHE_DIR, embed_base_url=EMBED_BASE_URL)
print(f"Loaded {len(matcher.vocabularies)} vocabularies")

Loaded 1 vocabularies
