In [2]:

# Import required libraries and tools
import os
import sys
import json
from pprint import pprint

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Add tools directory to path
sys.path.append('.')

# Import all our tools
from search_talks_by_filters import search_talks_by_filters
from search_talks_semantically import search_talks_semantically
from analyze_speaker_activity import analyze_speaker_activity
from get_talk_details import get_talk_details
from find_similar_content import find_similar_content
from analyze_topics_and_trends import analyze_topics_and_trends

print("✅ All tools imported successfully!")
print(f"🔑 ApertureDB Key configured: {'Yes' if os.getenv('APERTUREDB_KEY') else 'No'}")

# Helper function for pretty printing results
def print_results(result, max_results=5):
    """Pretty print tool results with truncation"""
    if isinstance(result, dict) and 'results' in result:
        print(f"Total found: {result.get('total_found', 'Unknown')}")
        if result.get('query_summary'):
            print(f"Query: {result['query_summary']}")
        if result.get('sort_info'):
            print(f"Sorting: {result['sort_info']}")
        
        results = result['results'][:max_results]
        for i, talk in enumerate(results, 1):
            print(f"\n{i}. {talk.get('title', 'No Title')}")
            print(f"   Speaker: {talk.get('speaker', 'Unknown')}")
            print(f"   Company: {talk.get('company', 'Unknown')}")
            print(f"   Views: {talk.get('views', 0):,}")
            if talk.get('published_date'):
                print(f"   Date: {talk['published_date']}")
    else:
        pprint(result)

✅ All tools imported successfully!
🔑 ApertureDB Key configured: Yes


# Test Scenarios for analyze_topics_and_trends Tool

This notebook contains comprehensive test scenarios for the `analyze_topics_and_trends` tool:

**Analysis Types Covered:**
1. **Tools Analysis** - Software tools and libraries (Tests 1, 4, 8, 12)
2. **Technology Analysis** - Technologies and tech concepts (Tests 2, 6, 9, 11)  
3. **Topic Analysis** - Discussion themes and patterns (Tests 3, 7)
4. **Keyword Analysis** - Frequent terms and vocabulary (Tests 5, 10)

**Filter Dimensions Tested:**
- **Time-based**: Recent years (2023-2024), Historical (2020-2022), Single year (2023), Current (2024)
- **Category-based**: Deployment & Integration, MLOps, Business & Stakeholder Alignment
- **Event-based**: MLOps & GenAI World 2024
- **Content Source**: Transcripts only, Abstracts only, All sources combined
- **Threshold-based**: Different minimum mention requirements (2, 3, 4, 10+)

Each test demonstrates realistic scenarios where users would want to understand trends, popular tools, emerging technologies, or topic evolution over time within the MLOps ecosystem.

### Test 1: Analyze Top Software Tools Mentioned Across All Content

**Scenario**: User wants to understand which software tools and libraries are most frequently discussed in MLOps presentations  
**Query**: "What are the most popular software tools mentioned in all talks?"

This test analyzes tool mentions across all content to identify the most frequently referenced software libraries, frameworks, and platforms. The tool uses pattern matching to identify tools like TensorFlow, PyTorch, Docker, Kubernetes, etc.

In [3]:
# Test 1: Analyze Top Software Tools
result1 = analyze_topics_and_trends.invoke({
    "analysis_type": "tools",
    "top_n": 15,
    "min_mentions": 3
})
print("=== Top Software Tools Analysis ===")
print(result1)

=== Top Software Tools Analysis ===
{'analysis_results': [{'item': 'kubernetes', 'count': 21, 'percentage': 7.6, 'sample_mentions': [{'talk_title': 'Leverage Kubernetes To Optimize the Utilization of Your AI Accelerators', 'published_date': '2024-12-05'}, {'talk_title': 'Leverage Kubernetes To Optimize the Utilization of Your AI Accelerators', 'published_date': '2024-12-05'}, {'talk_title': 'Leverage Kubernetes To Optimize the Utilization of Your AI Accelerators', 'published_date': '2024-12-05'}]}, {'item': 'langchain', 'count': 20, 'percentage': 7.2, 'sample_mentions': [{'talk_title': 'Building Agentic and Multi-Agent Systems with LangGraph (Pt. 2)', 'published_date': '2024-12-06'}, {'talk_title': 'Building Agentic and Multi-Agent Systems with LangGraph (Pt. 2)', 'published_date': '2024-12-06'}, {'talk_title': 'Building Agentic and Multi-Agent Systems with LangGraph (Pt. 2)', 'published_date': '2024-12-06'}]}, {'item': 'pytorch', 'count': 19, 'percentage': 6.8, 'sample_mentions': [{'t

### Test 2: Technology Trends in Recent Years (2023-2024)

**Scenario**: User wants to understand which technologies are trending in recent MLOps discussions  
**Query**: "What technologies are most discussed in talks from 2023-2024?"

This test focuses on technology mentions (AI, ML, LLMs, Vector Databases, etc.) from recent years to identify current technological focus areas and emerging trends in the field.

In [4]:
# Test 2: Recent Technology Trends (2023-2024)
result2 = analyze_topics_and_trends.invoke({
    "analysis_type": "technologies",
    "date_from": "2023-01-01",
    "date_to": "2024-12-31",
    "top_n": 12
})
print("=== Recent Technology Trends ===")
print(result2)

=== Recent Technology Trends ===
{'analysis_results': [{'item': 'AI', 'count': 612, 'percentage': 225.0}, {'item': 'ML', 'count': 399, 'percentage': 146.7}, {'item': 'MLOps', 'count': 160, 'percentage': 58.8}, {'item': 'LLM', 'count': 151, 'percentage': 55.5}, {'item': 'machine learning', 'count': 105, 'percentage': 38.6}, {'item': 'RAG', 'count': 94, 'percentage': 34.6}, {'item': 'Machine Learning', 'count': 53, 'percentage': 19.5}, {'item': 'cloud', 'count': 37, 'percentage': 13.6}, {'item': 'real-time', 'count': 33, 'percentage': 12.1}, {'item': 'monitoring', 'count': 32, 'percentage': 11.8}, {'item': 'deep learning', 'count': 17, 'percentage': 6.2}, {'item': 'GPT', 'count': 14, 'percentage': 5.1}], 'time_trends': {}, 'analysis_summary': 'Performed technologies analysis filtered by from 2023-01-01, until 2024-12-31 from talk abstracts/metadata', 'total_items_found': 12, 'content_stats': {'total_talks': 272, 'total_text_chunks': 272, 'content_sources': ['talk abstracts/metadata']}, '

### Test 3: Topic Analysis from Video Transcripts

**Scenario**: User wants to analyze what topics are actually discussed in video content rather than just abstracts  
**Query**: "What are the main discussion topics based on actual video transcripts?"

This test analyzes topics from video transcripts to understand what speakers actually talk about during their presentations, which may differ from abstract summaries.

In [5]:
# Test 3: Topic Analysis from Transcripts
result3 = analyze_topics_and_trends.invoke({
    "analysis_type": "topics",
    "content_source": "transcripts",
    "top_n": 10,
    "min_mentions": 5
})
print("=== Topics from Video Transcripts ===")
print(result3)

=== Topics from Video Transcripts ===
{'analysis_results': [{'item': 'data', 'count': 11854, 'percentage': 70.7}, {'item': 'model', 'count': 10440, 'percentage': 62.3}, {'item': 'training', 'count': 2192, 'percentage': 13.1}, {'item': 'pipeline', 'count': 1588, 'percentage': 9.5}, {'item': 'production', 'count': 1219, 'percentage': 7.3}, {'item': 'feature', 'count': 1157, 'percentage': 6.9}, {'item': 'agent', 'count': 1092, 'percentage': 6.5}, {'item': 'performance', 'count': 985, 'percentage': 5.9}, {'item': 'search', 'count': 968, 'percentage': 5.8}, {'item': 'vector', 'count': 881, 'percentage': 5.3}], 'time_trends': {}, 'analysis_summary': 'Performed topics analysis from video transcripts', 'total_items_found': 10, 'content_stats': {'total_talks': 278, 'total_text_chunks': 16770, 'content_sources': ['video transcripts']}, 'success': True}


### Test 4: Tools Analysis for Deployment Category

**Scenario**: User wants to know which tools are most mentioned in deployment-focused talks  
**Query**: "What tools are most discussed in deployment and integration talks?"

This test filters content to the "Deployment and integration" category and analyzes tool mentions to understand the technology stack commonly used for deployment scenarios.

In [6]:
# Test 4: Tools in Deployment Category
result4 = analyze_topics_and_trends.invoke({
    "analysis_type": "tools",
    "category": "Deployment and integration",
    "top_n": 10
})
print("=== Tools in Deployment & Integration ===")
print(result4)

=== Tools in Deployment & Integration ===
{'analysis_results': [{'item': 'kubernetes', 'count': 18, 'percentage': 27.7, 'sample_mentions': [{'talk_title': 'Deploying LLMs on Kubernetes environments', 'published_date': '2024-10-31'}, {'talk_title': 'Deploying LLMs on Kubernetes environments', 'published_date': '2024-10-31'}, {'talk_title': 'Deploying LLMs on Kubernetes environments', 'published_date': '2024-10-31'}]}, {'item': 'redis', 'count': 4, 'percentage': 6.2, 'sample_mentions': [{'talk_title': 'Building a Fraud Detection Model with Feature Stores (Includes Bonus Case Study: How Shopify uses Feast to Manage its ML Features)', 'published_date': '2023-08-17'}, {'talk_title': 'Feature Stores in Practice: Train and Deploy an End-to-End Fraud Detection Model with Featureform, Redis, and AWS.', 'published_date': '2023-11-10'}, {'talk_title': 'Feature Stores in Practice: Train and Deploy an End-to-End Fraud Detection Model with Featureform, Redis, and AWS.', 'published_date': '2023-11-10

### Test 5: Keyword Analysis from MLOps & GenAI World 2024

**Scenario**: User wants to analyze the most frequently used keywords in a specific event  
**Query**: "What were the most common keywords in MLOps & GenAI World 2024?"

This test performs keyword frequency analysis on content from a specific event to understand the terminology and focus areas of that particular conference.

In [7]:
# Test 5: Keywords from MLOps & GenAI World 2024
result5 = analyze_topics_and_trends.invoke({
    "analysis_type": "keywords",
    "event_name": "MLOps & GenAI World 2024",
    "top_n": 20,
    "min_mentions": 4
})
print("=== Keywords from MLOps & GenAI World 2024 ===")
print(result5)

=== Keywords from MLOps & GenAI World 2024 ===
{'analysis_results': [{'item': 'models', 'count': 117, 'percentage': 0.83}, {'item': 'model', 'count': 85, 'percentage': 0.6}, {'item': 'data', 'count': 71, 'percentage': 0.5}, {'item': 'session', 'count': 70, 'percentage': 0.5}, {'item': 'systems', 'count': 67, 'percentage': 0.47}, {'item': 'llms', 'count': 62, 'percentage': 0.44}, {'item': 'applications', 'count': 61, 'percentage': 0.43}, {'item': 'generative', 'count': 55, 'percentage': 0.39}, {'item': 'llm', 'count': 55, 'percentage': 0.39}, {'item': 'deployment', 'count': 53, 'percentage': 0.38}, {'item': 'techniques', 'count': 49, 'percentage': 0.35}, {'item': 'open', 'count': 49, 'percentage': 0.35}, {'item': 'production', 'count': 48, 'percentage': 0.34}, {'item': 'challenges', 'count': 48, 'percentage': 0.34}, {'item': 'learn', 'count': 47, 'percentage': 0.33}, {'item': 'insights', 'count': 47, 'percentage': 0.33}, {'item': 'agent', 'count': 47, 'percentage': 0.33}, {'item': 'prac

### Test 6: Historical Technology Analysis (2020-2022)

**Scenario**: User wants to understand how technology focus has evolved by looking at earlier years  
**Query**: "What technologies were most discussed in the early MLOps period (2020-2022)?"

This test analyzes technology mentions from earlier years to provide historical context and understand how the field has evolved over time.

In [9]:
# Test 6: Historical Technology Analysis (2020-2022)
result6 = analyze_topics_and_trends.invoke({
    "analysis_type": "technologies",
    "date_from": "2020-01-01",
    "date_to": "2024-12-31",
    "top_n": 10
})
print("=== Historical Technology Trends (2020-2022) ===")
print(result6)

=== Historical Technology Trends (2020-2022) ===
{'analysis_results': [{'item': 'AI', 'count': 612, 'percentage': 225.0}, {'item': 'ML', 'count': 399, 'percentage': 146.7}, {'item': 'MLOps', 'count': 160, 'percentage': 58.8}, {'item': 'LLM', 'count': 151, 'percentage': 55.5}, {'item': 'machine learning', 'count': 105, 'percentage': 38.6}, {'item': 'RAG', 'count': 94, 'percentage': 34.6}, {'item': 'Machine Learning', 'count': 53, 'percentage': 19.5}, {'item': 'cloud', 'count': 37, 'percentage': 13.6}, {'item': 'real-time', 'count': 33, 'percentage': 12.1}, {'item': 'monitoring', 'count': 32, 'percentage': 11.8}], 'time_trends': {}, 'analysis_summary': 'Performed technologies analysis filtered by from 2020-01-01, until 2024-12-31 from talk abstracts/metadata', 'total_items_found': 10, 'content_stats': {'total_talks': 272, 'total_text_chunks': 272, 'content_sources': ['talk abstracts/metadata']}, 'success': True}


### Test 7: Comprehensive Analysis - All Content Sources

**Scenario**: User wants the most complete view by analyzing both abstracts and transcripts together  
**Query**: "Give me a comprehensive view of all topics discussed across both abstracts and video content"

This test uses all available content sources (abstracts + transcripts) to provide the most comprehensive topic analysis possible.

In [10]:
# Test 7: Comprehensive All-Content Analysis
result7 = analyze_topics_and_trends.invoke({
    "analysis_type": "topics",
    "content_source": "all",
    "top_n": 15,
    "min_mentions": 3
})
print("=== Comprehensive Topics Analysis (All Sources) ===")
print(result7)

=== Comprehensive Topics Analysis (All Sources) ===
{'analysis_results': [{'item': 'data', 'count': 12350, 'percentage': 72.4}, {'item': 'model', 'count': 10844, 'percentage': 63.6}, {'item': 'training', 'count': 2371, 'percentage': 13.9}, {'item': 'pipeline', 'count': 1646, 'percentage': 9.7}, {'item': 'production', 'count': 1342, 'percentage': 7.9}, {'item': 'feature', 'count': 1187, 'percentage': 7.0}, {'item': 'agent', 'count': 1149, 'percentage': 6.7}, {'item': 'performance', 'count': 1113, 'percentage': 6.5}, {'item': 'search', 'count': 1001, 'percentage': 5.9}, {'item': 'vector', 'count': 905, 'percentage': 5.3}, {'item': 'inference', 'count': 894, 'percentage': 5.2}, {'item': 'agents', 'count': 710, 'percentage': 4.2}, {'item': 'evaluation', 'count': 678, 'percentage': 4.0}, {'item': 'memory', 'count': 660, 'percentage': 3.9}, {'item': 'embedding', 'count': 612, 'percentage': 3.6}], 'time_trends': {}, 'analysis_summary': 'Performed topics analysis from abstracts and transcripts

### Test 8: MLOps Category Tool Analysis

**Scenario**: User wants to understand which tools are most important in core MLOps practices  
**Query**: "What are the essential tools mentioned in Introduction to MLOps and GenAI category talks?"

This test focuses specifically on the MLOps category to identify the core toolset and technologies that are central to MLOps and GenAI practices and workflows.

In [12]:
# Test 8: MLOps Category Tools
result8 = analyze_topics_and_trends.invoke({
    "analysis_type": "tools",
    "category": "Introduction to MLOps and GenAI",
    "top_n": 12,
    "min_mentions": 2
})
print("=== Tools in MLOps Category ===")
print(result8)

=== Tools in MLOps Category ===
{'analysis_results': [{'item': 'langchain', 'count': 7, 'percentage': 21.2, 'sample_mentions': [{'talk_title': 'Introduction to LangChain and Retrieval Augmented Generation (RAG)', 'published_date': '2023-11-02'}, {'talk_title': 'Introduction to LangChain and Retrieval Augmented Generation (RAG)', 'published_date': '2023-11-02'}, {'talk_title': 'Introduction to LangChain and Retrieval Augmented Generation (RAG)', 'published_date': '2023-11-02'}]}, {'item': 'transformers', 'count': 4, 'percentage': 12.1, 'sample_mentions': [{'talk_title': 'Building AI Applications with Transformers', 'published_date': '2023-08-17'}, {'talk_title': 'Building AI Applications with Transformers', 'published_date': '2023-08-17'}, {'talk_title': 'Transforming The Retail Industry with Transformers', 'published_date': '2023-08-17'}]}, {'item': 'mlflow', 'count': 2, 'percentage': 6.1, 'sample_mentions': [{'talk_title': 'Building Reproducible ML Processes with an Open Source Stack'

### Test 9: Single Year Deep Dive - 2023 Technology Focus

**Scenario**: User wants to understand the technology landscape for a specific year  
**Query**: "What were the key technologies and trends in 2023 specifically?"

This test performs a focused analysis on a single year (2023) to understand the technology landscape and trends during that specific period.

In [13]:
# Test 9: 2023 Technology Deep Dive
result9 = analyze_topics_and_trends.invoke({
    "analysis_type": "technologies",
    "date_from": "2023-01-01",
    "date_to": "2023-12-31",
    "top_n": 15
})
print("=== 2023 Technology Deep Dive ===")
print(result9)

=== 2023 Technology Deep Dive ===
{'analysis_results': [{'item': 'ML', 'count': 219, 'percentage': 178.0}, {'item': 'AI', 'count': 159, 'percentage': 129.3}, {'item': 'MLOps', 'count': 84, 'percentage': 68.3}, {'item': 'machine learning', 'count': 52, 'percentage': 42.3}, {'item': 'LLM', 'count': 36, 'percentage': 29.3}, {'item': 'Machine Learning', 'count': 28, 'percentage': 22.8}, {'item': 'monitoring', 'count': 18, 'percentage': 14.6}, {'item': 'deep learning', 'count': 11, 'percentage': 8.9}, {'item': 'Monitoring', 'count': 10, 'percentage': 8.1}, {'item': 'real-time', 'count': 9, 'percentage': 7.3}, {'item': 'cloud', 'count': 9, 'percentage': 7.3}, {'item': 'computer vision', 'count': 9, 'percentage': 7.3}, {'item': 'observability', 'count': 9, 'percentage': 7.3}, {'item': 'GPT', 'count': 8, 'percentage': 6.5}, {'item': 'NLP', 'count': 7, 'percentage': 5.7}], 'time_trends': {}, 'analysis_summary': 'Performed technologies analysis filtered by from 2023-01-01, until 2023-12-31 from 

### Test 10: High-Threshold Keywords Analysis

**Scenario**: User wants to find only the most frequently mentioned keywords to avoid noise  
**Query**: "What are the most commonly used terms with at least 10 mentions?"

This test uses a higher minimum mention threshold to filter out rare terms and focus on the most significant and frequently used keywords across all content.

In [14]:
# Test 10: High-Threshold Keywords
result10 = analyze_topics_and_trends.invoke({
    "analysis_type": "keywords",
    "min_mentions": 10,
    "top_n": 25
})
print("=== Most Frequent Keywords (10+ mentions) ===")
print(result10)

=== Most Frequent Keywords (10+ mentions) ===
{'analysis_results': [{'item': 'data', 'count': 496, 'percentage': 0.92}, {'item': 'models', 'count': 490, 'percentage': 0.91}, {'item': 'model', 'count': 404, 'percentage': 0.75}, {'item': 'llms', 'count': 266, 'percentage': 0.49}, {'item': 'learning', 'count': 257, 'percentage': 0.48}, {'item': 'our', 'count': 207, 'percentage': 0.38}, {'item': 'machine', 'count': 201, 'percentage': 0.37}, {'item': 'talk', 'count': 201, 'percentage': 0.37}, {'item': 'training', 'count': 179, 'percentage': 0.33}, {'item': 'challenges', 'count': 174, 'percentage': 0.32}, {'item': 'use', 'count': 173, 'percentage': 0.32}, {'item': 'applications', 'count': 165, 'percentage': 0.31}, {'item': 'mlops', 'count': 163, 'percentage': 0.3}, {'item': 'real', 'count': 162, 'percentage': 0.3}, {'item': 'generative', 'count': 162, 'percentage': 0.3}, {'item': 'systems', 'count': 160, 'percentage': 0.3}, {'item': 'llm', 'count': 153, 'percentage': 0.28}, {'item': 'languag

### Test 11: Business Category Technology Analysis

**Scenario**: User wants to understand technology discussed in business-focused talks  
**Query**: "What technologies are mentioned in business and stakeholder alignment talks?"

This test focuses on business-oriented content to understand which technologies are discussed in the context of business strategy and stakeholder concerns.

In [15]:
# Test 11: Business Category Technology Analysis
result11 = analyze_topics_and_trends.invoke({
    "analysis_type": "technologies",
    "category": "Business and stakeholder alignment",
    "top_n": 8
})
print("=== Technologies in Business Context ===")
print(result11)

=== Technologies in Business Context ===
{'analysis_results': [{'item': 'AI', 'count': 64, 'percentage': 220.7}, {'item': 'ML', 'count': 23, 'percentage': 79.3}, {'item': 'machine learning', 'count': 7, 'percentage': 24.1}, {'item': 'Machine Learning', 'count': 4, 'percentage': 13.8}, {'item': 'LLM', 'count': 4, 'percentage': 13.8}, {'item': 'Natural Language Processing', 'count': 2, 'percentage': 6.9}, {'item': 'MLOps', 'count': 2, 'percentage': 6.9}], 'time_trends': {}, 'analysis_summary': "Performed technologies analysis filtered by category 'Business and stakeholder alignment' from talk abstracts/metadata", 'total_items_found': 7, 'content_stats': {'total_talks': 29, 'total_text_chunks': 29, 'content_sources': ['talk abstracts/metadata']}, 'success': True}


### Test 12: Recent Tools with Abstract Focus

**Scenario**: User wants to understand which tools are highlighted in recent talk abstracts  
**Query**: "What tools are featured in abstracts of recent talks (2024)?"

This test focuses on recent abstracts to understand which tools are being highlighted as key components in current MLOps presentations.

In [16]:
# Test 12: Recent Tools from Abstracts (2024)
result12 = analyze_topics_and_trends.invoke({
    "analysis_type": "tools",
    "content_source": "abstracts",
    "date_from": "2024-01-01",
    "top_n": 12
})
print("=== Recent Tools from 2024 Abstracts ===")
print(result12)

=== Recent Tools from 2024 Abstracts ===
{'analysis_results': [{'item': 'pytorch', 'count': 17, 'percentage': 11.3, 'sample_mentions': [{'talk_title': 'Extending PyTorch for Custom Compiler Targets', 'published_date': '2024-10-31'}, {'talk_title': 'Extending PyTorch for Custom Compiler Targets', 'published_date': '2024-10-31'}, {'talk_title': 'Extending PyTorch for Custom Compiler Targets', 'published_date': '2024-10-31'}]}, {'item': 'langgraph', 'count': 17, 'percentage': 11.3, 'sample_mentions': [{'talk_title': 'Building Agentic and Multi-Agent Systems with LangGraph (Pt. 2)', 'published_date': '2024-12-06'}, {'talk_title': 'Building Agentic and Multi-Agent Systems with LangGraph (Pt. 2)', 'published_date': '2024-12-06'}, {'talk_title': 'Building Agentic and Multi-Agent Systems with LangGraph (Pt. 2)', 'published_date': '2024-12-06'}]}, {'item': 'kubernetes', 'count': 15, 'percentage': 9.9, 'sample_mentions': [{'talk_title': 'Leverage Kubernetes To Optimize the Utilization of Your AI