# YouTube Trends Exploratory Analysis

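This notebook demonstrates the proof-of-concept (POC) semantic search over YouTube trending videos: loading and preprocessing the data, generating embeddings for a small sample, and querying the search index.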

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
from loguru import logger

from src.data import DataLoader, DataPreprocessor
from src.embeddings import get_embedding_model
from src.search import SemanticSearch

## 1. Load and Explore Data

In [None]:
# Load data
# Build the project's DataLoader with its default settings and read the dataset
# through load_all_csv_files(); the result is bound to `df` for the cells below.
# NOTE(review): presumably this combines every CSV in the loader's default data
# directory into one DataFrame — confirm in src.data.
loader = DataLoader()
df = loader.load_all_csv_files()

# Report the record count; the bare `df.head()` is the last expression, so its
# value is rendered as the cell output.
print(f"Loaded {len(df)} records")
df.head()

In [None]:
# Data info
# Print the summary produced by `df.info()` for the freshly loaded data.
# NOTE(review): assumes `df` is a pandas DataFrame, in which case this lists the
# columns with their dtypes and non-null counts — confirm.
df.info()

## 2. Preprocess Data

In [None]:
# Run the project's DataPreprocessor over the loaded frame; the result is bound
# to a new name, `df_processed`, which the embedding cell reads from.
# NOTE(review): whether preprocess() also mutates `df` in place is not visible here — confirm in src.data.
preprocessor = DataPreprocessor()
df_processed = preprocessor.preprocess(df)

# Report the record count after preprocessing, then preview four columns as the cell output.
# NOTE(review): 'category_name' and 'searchable_text' are presumably added by preprocess() — confirm.
print(f"Preprocessed {len(df_processed)} records")
df_processed[['title', 'channel_title', 'category_name', 'searchable_text']].head()

## 3. Generate Embeddings (Sample)

In [None]:
# Take a small sample for demo.
# The sample size is capped at the number of available rows: DataFrame.sample
# raises ValueError when n exceeds the population and replace=False (the
# default), which would break this cell on small or heavily filtered datasets.
# The fixed random_state keeps the sample identical across re-runs.
SAMPLE_SIZE = 100
sample_df = df_processed.sample(
    n=min(SAMPLE_SIZE, len(df_processed)),
    random_state=42,
)

# Get embedding model (project default returned by get_embedding_model()).
embedding_model = get_embedding_model()

# Generate embeddings: one input text per sampled row.
texts = sample_df['searchable_text'].tolist()
embeddings = embedding_model.encode(texts)

# NOTE(review): `.shape[1]` assumes encode() returns a 2-D array-like with one
# row per input text — confirm against the embedding model wrapper.
print(f"Generated {len(embeddings)} embeddings")
print(f"Embedding dimension: {embeddings.shape[1]}")

## 4. Semantic Search Demo

In [None]:
# Initialize search (assumes data is already indexed)
# `search` is reused by every query cell that follows.
search = SemanticSearch()

# Show stats
# get_stats() is iterated with .items(), so it is expected to return a mapping;
# each entry is printed as an indented "key: value" line under the heading.
stats = search.get_stats()
print("Collection Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")

In [None]:
# Example search
# Free-text query passed to search() together with limit=5.
# NOTE(review): `limit` presumably caps the number of hits returned — confirm in src.search.
query = "funny cat videos"
results = search.search(query, limit=5)

print(f"\nSearch results for: '{query}'\n")
# Each result is subscripted with the string keys 'title', 'channel',
# 'category', 'views' and 'score'; enumerate(..., 1) numbers the hits from 1.
for i, result in enumerate(results, 1):
    print(f"{i}. {result['title']}")
    print(f"   Channel: {result['channel']}")
    print(f"   Category: {result['category']}")
    print(f"   Views: {result['views']:,}")  # ',' inserts thousands separators
    print(f"   Score: {result['score']:.4f}")  # fixed to four decimal places
    print()  # blank line between hits

In [None]:
# Search by category
# Rebinds `results`, replacing the hits from the previous query cell.
# NOTE(review): `category="Gaming"` presumably restricts matches to that category
# name as stored in the index — confirm in src.search.
results = search.search_by_category("gaming tutorials", category="Gaming", limit=5)

# One line per hit: title plus score formatted to four decimal places.
print("Gaming videos:")
for result in results:
    print(f"- {result['title']} (Score: {result['score']:.4f})")

In [None]:
# Search popular videos
# Rebinds `results` with the hits for "music", passing min_views=1000000 (one million).
# NOTE(review): the heading below says ">1M"; whether the threshold is strict or
# inclusive depends on search_popular() — confirm in src.search.
results = search.search_popular("music", min_views=1000000, limit=5)

# One line per hit: title, view count with thousands separators, and score to four decimals.
print("Popular music videos (>1M views):")
for result in results:
    print(f"- {result['title']} ({result['views']:,} views, Score: {result['score']:.4f})")

## 5. Visualize Embeddings (Optional)

Use dimensionality reduction (PCA) to project the sample embeddings generated in section 3 into 2D and plot them.

In [None]:
# This requires additional libraries (scikit-learn, matplotlib) and the
# `embeddings` array produced in section 3.
# Uncomment to use:

# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # Reduce to 2D
# pca = PCA(n_components=2)
# embeddings_2d = pca.fit_transform(embeddings)

# # Plot
# plt.figure(figsize=(12, 8))
# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
# plt.title('Video Embeddings (PCA 2D)')
# plt.xlabel('PC1')
# plt.ylabel('PC2')
# plt.show()