In [1]:
# Import required libraries and tools
import os
import sys
import json
from pprint import pprint

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Add tools directory to path
sys.path.append('.')

# Import all our tools
from search_talks_by_filters import search_talks_by_filters
from search_talks_semantically import search_talks_semantically
from analyze_speaker_activity import analyze_speaker_activity
from get_talk_details import get_talk_details
from find_similar_content import find_similar_content
from analyze_topics_and_trends import analyze_topics_and_trends

print("✅ All tools imported successfully!")
print(f"🔑 ApertureDB Key configured: {'Yes' if os.getenv('APERTUREDB_KEY') else 'No'}")

# Helper function for pretty printing results
def print_results(result, max_results=5):
    """Pretty print tool results with truncation"""
    if isinstance(result, dict) and 'results' in result:
        print(f"Total found: {result.get('total_found', 'Unknown')}")
        if result.get('query_summary'):
            print(f"Query: {result['query_summary']}")
        if result.get('sort_info'):
            print(f"Sorting: {result['sort_info']}")
        
        results = result['results'][:max_results]
        for i, talk in enumerate(results, 1):
            print(f"\n{i}. {talk.get('title', 'No Title')}")
            print(f"   Speaker: {talk.get('speaker', 'Unknown')}")
            print(f"   Company: {talk.get('company', 'Unknown')}")
            print(f"   Views: {talk.get('views', 0):,}")
            if talk.get('published_date'):
                print(f"   Date: {talk['published_date']}")
    else:
        pprint(result)

✅ All tools imported successfully!
🔑 ApertureDB Key configured: Yes


## Tool 2: `search_talks_semantically` - Comprehensive Semantic Search Testing

This tool performs semantic search across talk content using natural language queries and embeddings. It searches through video transcripts, talk abstracts/metadata, and speaker bios using the SentenceTransformer model to find semantically similar content.

### Test 1: General AI Agents Concept Search
**Query**: "Find talks that discuss AI agents with memory and reasoning capabilities"

This tests broad conceptual semantic search across all content types to find talks about AI agents, a popular MLOps topic. The tool will search transcripts, abstracts, and speaker bios for semantically related content.

In [2]:
# Test 1: General AI Agents Concept Search
result1 = search_talks_semantically.invoke({
    "query": "AI agents with memory and reasoning capabilities",
    "search_type": "all",
    "k_neighbors": 8
})

pprint(result1)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': 'Future trends',
              'content_type': 'abstract/metadata',
              'context_info': 'From talk abstract/metadata',
              'event': 'TMLS 2024',
              'matching_text': 'GenAI Investing in 2024 | An updated overview '
                               'on the Gen Ai market landscape and investment '
                               'activity, along with investor insights for '
                               'fundraising. | An updated overview on the Gen '
                               'Ai landscape and investment activity, as well '
                               'as some investor insights for fundraising. | '
                               'Future trends | GenAI Landsc...',
              'similarity_score': 0.439,
              'speaker': 'Margo Wu',
              'talk_id': 'dacfab95-57e8-5086-bc22-74b86cf3fbb3',
              't

### Test 2: Transcript-Specific Technical Search
**Query**: "Vector databases and embeddings in production systems"

This tests semantic search specifically within video transcripts to find technical discussions about vector databases. Using search_type="transcript" focuses only on spoken content from the videos.

In [3]:
# Test 2: Transcript-Specific Technical Search
result2 = search_talks_semantically.invoke({
    "query": "vector databases and embeddings in production systems",
    "search_type": "transcript",
    "k_neighbors": 6
})

pprint(result2)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': 'Business and stakeholder alignment',
              'content_type': 'transcript',
              'context_info': 'Timestamp: 294-319s',
              'event': 'TMLS 2023',
              'matching_text': 'like how data science interacts and provides '
                               'like value how they communicate that value can '
                               'look super different so data science reporting '
                               'into itself into engineering into Finance into '
                               'product all changes the shape of the nature of '
                               'your communication it also changes how they '
                               'interact with business ...',
              'similarity_score': 0.365,
              'speaker': 'Wendy Foster',
              'talk_id': 'a8b6ed5b-0ca2-5b23-9d9d-8afd0213f7b3',
      

### Test 3: Abstract/Metadata Search for RAG Systems
**Query**: "Retrieval Augmented Generation and knowledge retrieval"

This focuses on talk abstracts and keywords to find structured descriptions of RAG systems. Using search_type="meta" searches through curated abstracts and metadata rather than raw transcripts.

In [4]:
# Test 3: Abstract/Metadata Search for RAG Systems
result3 = search_talks_semantically.invoke({
    "query": "machine learning",
    "search_type": "meta",
    "k_neighbors": 7
})

pprint(result3)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': 'Business and stakeholder alignment',
              'content_type': 'abstract/metadata',
              'context_info': 'From talk abstract/metadata',
              'event': None,
              'matching_text': 'Business Panel: GenAI Use-cases Across '
                               'Industry Verticals. Early Trends and ROI | '
                               'Business and stakeholder alignment | GenAI '
                               'use-cases, Industry verticals, ROI',
              'similarity_score': 0.452,
              'speaker': 'Surbhi Rathore, Kamelia Aryafar, Shingai Manjengwa, '
                         'Manas Bhuyan, Laila Paszti',
              'talk_id': 'e39c7304-db05-518d-a3db-8c962214dc8f',
              'title': 'Business Panel: GenAI Use-cases Across Industry '
                       'Verticals. Early Trends and ROI',
              'you

### Test 4: Speaker Expertise Discovery
**Query**: "Machine learning deployment and containerization expertise"

This searches speaker bios to find experts in ML deployment. Using search_type="bio" focuses on speaker background information to identify domain experts and their areas of specialization.

In [5]:
# Test 4: Speaker Expertise Discovery
result4 = search_talks_semantically.invoke({
    "query": "machine learning",
    "search_type": "bio",
    "k_neighbors": 5
})

pprint(result4)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': 'Deployment and integration',
              'content_type': 'speaker bio',
              'context_info': 'From speaker bio',
              'event': 'TMLS 2024',
              'matching_text': 'Engineering Manager | I lead the Content '
                               'Mining team at Pinterest. We use ML to '
                               'understand the webpages and extract useful '
                               'information for our users.',
              'similarity_score': 0.32,
              'speaker': 'Patrick Halina',
              'talk_id': 'ec25523f-194c-588d-a930-264aaa0fa4a3',
              'title': 'Web Extraction With LLMs',
              'youtube_url': 'https://www.youtube.com/watch?v=olf7hs0c0zk'},
             {'category': 'Model dev, training, arch.',
              'content_type': 'speaker bio',
              'context_info': 'From speake

### Test 5: Time-Filtered Semantic Search
**Query**: "LangChain and LangGraph frameworks for building AI applications"

This combines semantic search with temporal filtering to find recent discussions about popular AI frameworks. The date filter helps focus on current trends and recent developments.

In [6]:
# Test 5: Time-Filtered Semantic Search
result5 = search_talks_semantically.invoke({
    "query": "LangChain and LangGraph frameworks for building AI applications",
    "search_type": "all",
    "date_from": "2024-01-01",
    "k_neighbors": 8
})

pprint(result5)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': None,
              'content_type': 'abstract/metadata',
              'context_info': 'From talk abstract/metadata',
              'event': None,
              'matching_text': 'Deploying and Evaluating RAG pipelines with '
                               'Lightning Studios | Learn how to use Lightning '
                               'Studios to quickly deploy AI agents and '
                               'accelerate your evaluation of RAG pipelines. | '
                               'Learn how to use Lightning Studios to quickly '
                               'deploy AI agents and accelerate your '
                               'evaluation of RAG pipelines. | Deployme...',
              'similarity_score': 0.405,
              'speaker': None,
              'talk_id': '0c7e727c-214e-5fc1-954e-3aa19bd8a513',
              'title': None,
           

### Test 7: Category-Filtered DevOps Search
**Query**: "Kubernetes orchestration and container scaling strategies"

This combines semantic search with category filtering to find DevOps-related content. The category filter helps narrow results to deployment and infrastructure topics.

In [7]:
# Test 7: Category-Filtered DevOps Search
result7 = search_talks_semantically.invoke({
    "query": "Kubernetes orchestration and container scaling strategies",
    "search_type": "all",
    "category": "Deployment and integration",
    "k_neighbors": 6
})


pprint(result7)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': None,
              'content_type': 'abstract/metadata',
              'context_info': 'From talk abstract/metadata',
              'event': None,
              'matching_text': 'Optimized AI Deployment Platform | Showcasing '
                               'CentMLs ability to streamline the process of '
                               'deploying and optimizing LLMs in production. | '
                               'Deployment and integration | CentML, LLM '
                               'deployment, Optimization',
              'similarity_score': 0.344,
              'speaker': None,
              'talk_id': '6b8d79ff-1417-57e1-9263-12d335800919',
              'title': None,
              'youtube_url': None},
             {'category': None,
              'content_type': 'speaker bio',
              'context_info': 'From speaker bio',
              'e

### Test 8: Event-Specific AI Search
**Query**: "Generative AI and large language models in production"

This searches for GenAI content within a specific event context. Event filtering helps focus on content from particular conferences or symposiums with relevant themes.

In [8]:
# Test 8: Event-Specific AI Search  
result8 = search_talks_semantically.invoke({
    "query": "generative AI and large language models in production",
    "search_type": "all",
    "event_name": "MLOps & GenAI World 2024",
    "k_neighbors": 7
})

pprint(result8)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': None,
              'content_type': 'speaker bio',
              'context_info': 'From speaker bio',
              'event': None,
              'matching_text': 'Senior Software Engineer | Anu is a senior '
                               'software engineer working on optimizing Google '
                               'Kubernetes Engine for techniques like RAG and '
                               'supporting popular AI/ML framework and tools '
                               'such as Ray.',
              'similarity_score': 0.352,
              'speaker': None,
              'talk_id': '3bec210f-b106-5ff7-ab2c-3b467ff012d9',
              'title': None,
              'youtube_url': None},
             {'category': None,
              'content_type': 'speaker bio',
              'context_info': 'From speaker bio',
              'event': None,
              

### Test 9: Speaker-Specific Expertise Search
**Query**: "Python applications"

This searches for technical content but only within talks by a specific speaker. This helps analyze what particular experts discuss and their areas of deep expertise. 

In [9]:
# Test 9: Speaker-Specific Expertise Search
result9 = search_talks_semantically.invoke({
    "query": "python applications",
    "search_type": "all",
    "speaker_name": "Sophia Yang",  
    "k_neighbors": 5
})

pprint(result9)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': None,
              'content_type': 'transcript',
              'context_info': 'Timestamp: 268-294s',
              'event': None,
              'matching_text': 'execute python in the web browser it actually '
                               'brings the whole ecosystem whole python '
                               'ecosystem into the browser for you when we use '
                               "Python we're not just use the standard python "
                               'libraries we actually import numpy scipy '
                               'pandas cycling learn all those python '
                               'ecosystems are can be used in the browser ...',
              'similarity_score': 0.364,
              'speaker': None,
              'talk_id': 'd915e5d8-a34e-5576-85a5-1d23c1bc41de',
              'title': None,
              'youtube_url

### Test 10: Multi-Modal Data Science Search
**Query**: "Feature stores and data versioning for machine learning"

This searches across all content types for data engineering topics. This tests how well the semantic search handles ML infrastructure concepts across different content modalities.

In [10]:
# Test 10: Multi-Modal Data Science Search
result10 = search_talks_semantically.invoke({
    "query": "feature stores and data versioning for machine learning",
    "search_type": "all",
    "k_neighbors": 8
})

pprint(result10)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': 'Business and stakeholder alignment',
              'content_type': 'abstract/metadata',
              'context_info': 'From talk abstract/metadata',
              'event': None,
              'matching_text': 'Panel: How companies can navigate and approach '
                               'the new advancements in generative AI | '
                               'Business and stakeholder alignment | '
                               'Generative AI, Business Strategy, Panel '
                               'Discussion',
              'similarity_score': 0.451,
              'speaker': 'Michel Dubois, Cameron Schuler, Alex LaPlante, Sara '
                         'Hooker',
              'talk_id': '7255dc0f-0f29-5aad-9a95-a7956c44673a',
              'title': 'Panel: How companies can navigate and approach the new '
                       'advancements in 

### Test 11: Broad Discovery with Large K
**Query**: "Python libraries and frameworks for AI development"

This uses a larger k_neighbors value to get a comprehensive overview of Python-related AI content. Useful for broad topic exploration and discovering diverse perspectives.

In [11]:
# Test 11: Broad Discovery with Large K
result11 = search_talks_semantically.invoke({
    "query": "Python libraries and frameworks for AI development",
    "search_type": "all",
    "k_neighbors": 15
})

pprint(result11)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': 'Deployment and integration',
              'content_type': 'abstract/metadata',
              'context_info': 'From talk abstract/metadata',
              'event': 'MLOps & GenAI World 2024',
              'matching_text': 'Large Language Model Training and Serving at '
                               'LinkedIn | In this talk, Dre will describe '
                               'some of the fundamental challenges and '
                               'solutions faced by the LinkedIn team as they '
                               'build innovative products based on LLMs and '
                               'agents. | How do I build scalable training and '
                               'serving solutions for large language ...',
              'similarity_score': 0.452,
              'speaker': 'Dre Olgiati',
              'talk_id': '7da06749-38e6-59c8-b00d-

### Test 12: Complex Multi-Filter Semantic Search
**Query**: "GPU acceleration and distributed training for deep learning"

This combines semantic search with multiple filters (time, category) to find recent, technical content about GPU computing. Tests the tool's ability to handle complex, multi-dimensional queries.

In [12]:
# Test 12: Complex Multi-Filter Semantic Search
result12 = search_talks_semantically.invoke({
    "query": "GPU acceleration and distributed training for deep learning",
    "search_type": "all",
    "date_from": "2024-01-01",
    "category": "Model dev, training, arch.",
    "k_neighbors": 6
})

pprint(result12)

{'query_vector_info': 'Generated 768D embedding using '
                      'google/embeddinggemma-300m',
 'results': [{'category': None,
              'content_type': 'abstract/metadata',
              'context_info': 'From talk abstract/metadata',
              'event': None,
              'matching_text': 'Supercharging Recommender Systems: Unleashing '
                               'the Power of Distributed Model Training | '
                               'Stitch Fix utilizes a sophisticated '
                               'multi-tiered recommender system stack, '
                               'encompassing feature generation, scoring, '
                               'ranking, and business policy decision-making. '
                               'This presentation delves into the training '
                               'archit...',
              'similarity_score': 0.419,
              'speaker': None,
              'talk_id': '29a3f0f3-f82a-5291-8e6e-76b201799176',
      