# Hybrid ScyllaDB Store - CRUD Operations with Semantic Search

This notebook demonstrates CRUD operations with the hybrid ScyllaDB store implementation that includes semantic search using Google Gemini embeddings.

## Setup

First, let's import the necessary modules and create a hybrid store instance.

In [1]:
import asyncio
import os
from datetime import datetime
from dotenv import load_dotenv
from scylladb_store_with_embeddings import (
    HybridAsyncScyllaDBStore,
    EmbeddingConfig,
    TTLConfig,
    GetOp,
    PutOp,
    SearchOp,
    ListNamespacesOp
)

# Read settings such as GOOGLE_API_KEY from the .env file
load_dotenv()

# Connection settings for the local test cluster used by the cells below
KEYSPACE = "test_hybrid_crud"
CONTACT_POINTS = ["127.0.0.1"]

print("✓ Imports successful")

✓ Imports successful


## Initialize Hybrid Store

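Create a store with Gemini embeddings enabled. Embeddings require `GOOGLE_API_KEY` to be set; without it the store is created with no embedding configuration and semantic search is unavailable.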

In [2]:
# The Gemini API key comes from the environment; without it the store is
# later created with no embedding configuration.
api_key = os.getenv("GOOGLE_API_KEY")
if api_key:
    print("✓ API key loaded")
else:
    print("⚠️  WARNING: GOOGLE_API_KEY not set in .env file")
    print("Semantic search features will not be available.")

# Placeholder; init_store() replaces it with the real store instance
store = None

async def init_store():
    global store
    from cassandra.cluster import Cluster
    
    # Embedding configuration
    embedding_config = EmbeddingConfig(
        model="models/gemini-embedding-001",
        distance="cosine",
        fields=["$"],  # Embed entire document
        api_key=api_key
    ) if api_key else None
    
    # Create cluster and session
    cluster = Cluster(CONTACT_POINTS)
    session = await asyncio.get_event_loop().run_in_executor(None, cluster.connect)
    
    # Create hybrid store
    store = HybridAsyncScyllaDBStore(
        session=session,
        keyspace=KEYSPACE,
        ttl=TTLConfig(refresh_on_read=True),
        embedding_config=embedding_config
    )
    
    # Setup database
    await store.setup()
    print(f"✓ Hybrid store initialized")
    print(f"  Keyspace: {KEYSPACE}")
    if embedding_config:
        print(f"  Model: {embedding_config['model']}")
        print(f"  Embedding dims: {store.embedding_config.get('dims', 'auto-detect')}")

await init_store()

✓ API key loaded


E0000 00:00:1759373116.221775  714714 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759373116.227611  714714 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759373116.229680  714714 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759373116.231098  714714 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
Could not add embedding column: Error from server: code=2200 [Invalid query] message="Invalid column name embedding because it conflicts with an existing column"


✓ Hybrid store initialized
  Keyspace: test_hybrid_crud
  Model: models/gemini-embedding-001
  Embedding dims: 3072


## CREATE Operations

Test creating/storing items. With embeddings enabled, each item is automatically embedded.

In [None]:
# Create tech articles with rich content for semantic search
articles = [
    {
        "namespace": ("articles", "tech"),
        "key": "ml-basics",
        "value": {
            "title": "Introduction to Machine Learning",
            "content": "Machine learning enables computers to learn from data without explicit programming. It uses algorithms to find patterns and make predictions.",
            "category": "AI",
            "views": 1200,
            "published": "2024-01-15"
        }
    },
    {
        "namespace": ("articles", "tech"),
        "key": "python-guide",
        "value": {
            "title": "Python Programming for Beginners",
            "content": "Python is a versatile programming language known for its readability and extensive libraries. Perfect for data science and web development.",
            "category": "Programming",
            "views": 2500,
            "published": "2024-02-01"
        }
    },
    {
        "namespace": ("articles", "tech"),
        "key": "db-scaling",
        "value": {
            "title": "Database Scaling Strategies",
            "content": "Learn how to scale databases horizontally and vertically. Covers sharding, replication, and NoSQL solutions like ScyllaDB.",
            "category": "Database",
            "views": 800,
            "published": "2024-01-20"
        }
    },
    {
        "namespace": ("articles", "tech"),
        "key": "react-hooks",
        "value": {
            "title": "Understanding React Hooks",
            "content": "React Hooks revolutionized state management in functional components. Learn useState, useEffect, and custom hooks.",
            "category": "Web Development",
            "views": 3200,
            "published": "2024-02-10"
        }
    },
]

for article in articles:
    await store.aput(**article)
    print(f"✓ Created: {article['value']['title']}")

print(f"\n✓ Inserted {len(articles)} articles with embeddings")

In [None]:
# Create user profiles
users = [
    {
        "namespace": ("users", "u1"),
        "key": "profile",
        "value": {
            "name": "Alice Johnson",
            "email": "alice@example.com",
            "role": "Data Scientist",
            "interests": ["machine learning", "statistics", "python"]
        }
    },
    {
        "namespace": ("users", "u2"),
        "key": "profile",
        "value": {
            "name": "Bob Smith",
            "email": "bob@example.com",
            "role": "Full Stack Developer",
            "interests": ["react", "node.js", "databases"]
        }
    },
]

for user in users:
    await store.aput(**user)
    print(f"✓ Created user: {user['value']['name']}")

## READ Operations

### 1. Key Lookup and Filter-Based Search

In [3]:
# Get a specific article
article = await store.aget(("articles", "tech"), "ml-basics")
print("Retrieved article:")
print(f"  Title: {article.value['title']}")
print(f"  Category: {article.value['category']}")
print(f"  Views: {article.value['views']}")

Retrieved article:


AttributeError: 'NoneType' object has no attribute 'value'

In [None]:
# Filter: Articles with views > 2000
popular = await store.asearch(
    ("articles", "tech"),
    filter={"views": {"$gt": 2000}},
    limit=10
)

print(f"Popular articles (views > 2000): {len(popular)}")
for article in popular:
    print(f"  - {article.value['title']} ({article.value['views']} views)")

In [None]:
# Filter: Articles by category
ai_articles = await store.asearch(
    ("articles", "tech"),
    filter={"category": "AI"},
    limit=10
)

print(f"AI category articles: {len(ai_articles)}")
for article in ai_articles:
    print(f"  - {article.value['title']}")

### 2. Semantic Search

Search using natural language queries. The store finds semantically similar content.

In [None]:
# Semantic search: Find articles about AI and learning
results = await store.asearch(
    ("articles", "tech"),
    query="artificial intelligence and learning from data",
    limit=3
)

print("Query: 'artificial intelligence and learning from data'\n")
print("Most relevant articles:")
for r in results:
    print(f"  {r.score:.3f} - {r.value['title']}")
    print(f"         {r.value['content'][:80]}...")

In [None]:
# Semantic search: Find articles about web development
results = await store.asearch(
    ("articles", "tech"),
    query="building modern web applications with JavaScript frameworks",
    limit=3
)

print("Query: 'building modern web applications with JavaScript frameworks'\n")
print("Most relevant articles:")
for r in results:
    print(f"  {r.score:.3f} - {r.value['title']}")

In [None]:
# Semantic search: Database topics
results = await store.asearch(
    ("articles", "tech"),
    query="how to handle large amounts of data efficiently",
    limit=3
)

print("Query: 'how to handle large amounts of data efficiently'\n")
print("Most relevant articles:")
for r in results:
    print(f"  {r.score:.3f} - {r.value['title']}")

### 3. Hybrid Search (Semantic + Filters)

Combine semantic search with traditional filters for powerful queries.

In [None]:
# Hybrid: Semantic search + views filter
results = await store.asearch(
    ("articles", "tech"),
    query="programming and software development",
    filter={"views": {"$gte": 1000}},
    limit=5
)

print("Hybrid Search: 'programming' + views >= 1000\n")
for r in results:
    print(f"  {r.score:.3f} - {r.value['title']} ({r.value['views']} views)")

In [None]:
# Hybrid: Semantic search + category filter
results = await store.asearch(
    ("articles", "tech"),
    query="working with data and making predictions",
    filter={"category": "AI"},
    limit=5
)

print("Hybrid Search: 'data predictions' + category=AI\n")
for r in results:
    print(f"  {r.score:.3f} - {r.value['title']}")

## UPDATE Operations

In [None]:
# Update an article (embeddings are automatically regenerated)
await store.aput(
    namespace=("articles", "tech"),
    key="ml-basics",
    value={
        "title": "Introduction to Machine Learning",
        "content": "Machine learning enables computers to learn from data without explicit programming. Deep learning uses neural networks for complex pattern recognition.",
        "category": "AI",
        "views": 1250,  # Updated views
        "published": "2024-01-15"
    }
)
print("✓ Updated ml-basics article (embeddings regenerated)")

# Verify update
article = await store.aget(("articles", "tech"), "ml-basics")
print(f"  New views: {article.value['views']}")
print(f"  Updated content: {article.value['content'][:60]}...")

## DELETE Operations

In [None]:
# Create a temporary article
await store.aput(
    namespace=("articles", "tech"),
    key="temp-article",
    value={
        "title": "Temporary Article",
        "content": "This will be deleted soon.",
        "category": "Test",
        "views": 0
    }
)
print("✓ Created temporary article")

# Delete it
await store.adelete(("articles", "tech"), "temp-article")
print("✓ Deleted temporary article")

# Verify deletion
deleted = await store.aget(("articles", "tech"), "temp-article")
print(f"  Article exists: {deleted is not None}")

## BATCH Operations

In [None]:
# Execute multiple operations in one batch
ops = [
    GetOp(namespace=("articles", "tech"), key="ml-basics"),
    GetOp(namespace=("articles", "tech"), key="python-guide"),
    SearchOp(
        namespace_prefix=("articles", "tech"),
        filter={"category": "Database"},
        limit=10
    ),
    ListNamespacesOp(
        match_conditions=(("articles",), None, None),
        limit=10
    )
]

results = await store.abatch(ops)
print(f"✓ Executed {len(ops)} operations in batch\n")

print("Results:")
print(f"  1. GET ml-basics: {results[0].value['title'] if results[0] else 'Not found'}")
print(f"  2. GET python-guide: {results[1].value['title'] if results[1] else 'Not found'}")
print(f"  3. SEARCH Database category: Found {len(results[2])} articles")
print(f"  4. LIST namespaces: Found {len(results[3])} namespaces")

## Namespace Operations

In [None]:
# List all namespaces
namespaces = await store.alist_namespaces()
print(f"All namespaces ({len(namespaces)}):")
for ns in namespaces:
    print(f"  - {ns}")

In [None]:
# List namespaces with prefix
article_namespaces = await store.alist_namespaces(prefix=("articles",))
print(f"Article namespaces:")
for ns in article_namespaces:
    print(f"  - {ns}")

## Index Operations

The vector index can be saved and loaded for persistence.

In [None]:
# Write the vector index to disk; index_path is reused by the load cell below
index_path = "./test_vector_index"
store.save_index(index_path)
print("✓ Saved index to " + index_path)

In [None]:
# Rebuild index from database
count = await store.rebuild_index()
print(f"✓ Rebuilt index with {count} items")

In [None]:
# Load index from disk
store.load_index(index_path)
print(f"✓ Loaded index from {index_path}")

# Test search after reload
results = await store.asearch(
    ("articles", "tech"),
    query="programming languages",
    limit=3
)
print("\nSearch after index reload:")
for r in results:
    print(f"  {r.score:.3f} - {r.value['title']}")

## Summary

Display all stored data.

In [None]:
# Get all articles
all_articles = await store.asearch(("articles", "tech"), limit=100)
print(f"Total articles: {len(all_articles)}\n")
for article in all_articles:
    print(f"  - {article.value['title']}")
    print(f"    Category: {article.value['category']}, Views: {article.value['views']}")

In [None]:
# Get all users
all_users = await store.asearch(("users",), limit=100)
print(f"\nTotal users: {len(all_users)}\n")
for user in all_users:
    print(f"  - {user.value['name']} ({user.value['role']})")
    print(f"    Interests: {', '.join(user.value['interests'])}")

## Comparison: Filter vs Semantic vs Hybrid

Compare different search approaches.

In [None]:
test_query = "learning to code and build software"

print(f"Query: '{test_query}'\n")
print("=" * 70)

# 1. Filter search (won't match semantic meaning)
print("\n1. Filter Search (category=Programming):")
filter_results = await store.asearch(
    ("articles", "tech"),
    filter={"category": "Programming"},
    limit=3
)
for r in filter_results:
    print(f"   - {r.value['title']}")

# 2. Pure semantic search
print("\n2. Semantic Search (embeddings):")
semantic_results = await store.asearch(
    ("articles", "tech"),
    query=test_query,
    limit=3
)
for r in semantic_results:
    print(f"   {r.score:.3f} - {r.value['title']}")

# 3. Hybrid search
print("\n3. Hybrid Search (semantic + views > 1000):")
hybrid_results = await store.asearch(
    ("articles", "tech"),
    query=test_query,
    filter={"views": {"$gt": 1000}},
    limit=3
)
for r in hybrid_results:
    print(f"   {r.score:.3f} - {r.value['title']} ({r.value['views']} views)")

## Cleanup (Optional)

Run the cell below to delete all test data from the store. Skip it if you want to keep the data for further experiments.

In [None]:
async def cleanup():
    namespaces = await store.alist_namespaces()
    for ns in namespaces:
        items = await store.asearch(ns, limit=1000)
        for item in items:
            await store.adelete(item.namespace, item.key)
            print(f"Deleted: {item.namespace} / {item.key}")
    print("\n✓ All data cleaned up")

await cleanup()