# Full Pipeline: Hybrid Movie Recommendation System

This notebook demonstrates the entire workflow:
1.  **Setup**: Install dependencies and load config.
2.  **Data**: Download and preprocess.
3.  **Training**: Train the hybrid model and upload checkpoints to Hugging Face.
4.  **Evaluation**: Evaluate model performance.
5.  **Inference**: Generate candidate recommendations, then rerank them with the LLM, which also supplies a reason for each pick.
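6.  **Comparison**: Plot SVD vs. Hybrid on Precision@10 and Recall@10 (the plotted numbers are placeholder example values, not measured results).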

In [None]:
# 1. Setup
import os
import yaml
import sys
from dotenv import load_dotenv

# Change to project root if in notebooks dir, so that every relative path
# below (config/, scripts/, src/) resolves the same way regardless of where
# the kernel was started.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

# Load environment variables via python-dotenv once the working directory is settled.
load_dotenv()

# Make the project root importable (needed for the `src.*` imports in later
# cells). The membership check keeps the cell idempotent: re-running it does
# not pile up duplicate sys.path entries.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

CONFIG_PATH = "config/config.yml"
# Explicit UTF-8 so the config parses identically on every platform instead
# of depending on the locale's default encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

print(f"Current working directory: {project_root}")
print("Config loaded.")

In [None]:
# 2. Data Download & Preprocess
# Ensure kaggle.json is in place
# The `!` prefix is an IPython shell escape: the download script runs in a
# separate Python process, and {CONFIG_PATH} is interpolated from the kernel
# variable defined in the setup cell (so that cell must have been run first).

!python scripts/download_kaggle_datasets.py --config {CONFIG_PATH}

# Preprocess
# Unlike the download step, preprocessing runs inside the kernel process.
# NOTE(review): what Preprocessor.run() reads and writes is not visible from
# this notebook -- see src/data/preprocess.py.
from src.data.preprocess import Preprocessor

preprocessor = Preprocessor(CONFIG_PATH)
preprocessor.run()

In [None]:
# 3. Training & HF Upload
# This script trains SVD + TF-IDF and uploads to xt2201/hybrid-movie-recsys
# Runs as a separate process through IPython's `!` shell escape. No --config
# argument is passed here, unlike the download step.
# NOTE(review): presumably the script locates its configuration and any
# upload credentials on its own -- confirm in scripts/train_hybrid.py.

!python scripts/train_hybrid.py

In [None]:
# 4. Evaluation
# Runs the evaluation script as a separate process (IPython `!` shell escape);
# its output appears as this cell's output. No arguments are passed.
# NOTE(review): which metrics it reports and where it finds the trained model
# are not visible from this notebook -- see scripts/evaluate.py.
!python scripts/evaluate.py

In [None]:
# 5. Inference with LLM
from src.recommender.hybrid import HybridRecommender
from src.llm.qwen_client import QwenClient
from src.llm.reranker import Reranker
from src.llm.explainer import Explainer

# Load models
recsys = HybridRecommender(CONFIG_PATH)
recsys.fit()  # Or load from checkpoint if implemented

# The reranker and the explainer share a single LLM client.
# NOTE(review): `explainer` is constructed but not called anywhere in this
# cell; the "Reason" printed below comes from the reranker's output.
client = QwenClient(CONFIG_PATH)
reranker = Reranker(client, CONFIG_PATH)
explainer = Explainer(client, CONFIG_PATH)

# User Query
user_id = 1
query = "I want a touching drama about family"
top_k = 5  # number of reranked items to display

# Get candidates
raw_recs = recsys.recommend(user_id, N=20)
movies_df = recsys.dataset.movies

# Index the catalogue by movieId once, instead of running a full boolean scan
# of the frame for every candidate. keep="first" selects the same row the
# previous per-candidate `.iloc[0]` lookup would have picked when a movieId
# appears more than once.
movies_by_id = (
    movies_df
    .drop_duplicates(subset="movieId", keep="first")
    .set_index("movieId")
)

candidates = []
missing_ids = []
for idx, score in raw_recs:
    # The recommender reports internal item indices; reverse_item_map takes
    # an internal index back to the original movieId used in movies_df.
    original_id = recsys.dataset.reverse_item_map[idx]
    if original_id not in movies_by_id.index:
        # No metadata row for this id: skip it rather than crash the cell.
        missing_ids.append(original_id)
        continue
    row = movies_by_id.loc[original_id]
    candidates.append({
        "id": idx,
        "title": row['title'],
        "genres": row['genres'],
        "overview": row['overview'],
        "base_score": score
    })

if missing_ids:
    print(f"Skipped {len(missing_ids)} candidate(s) without movie metadata: {missing_ids}")

# Rerank
prefs = {"mood": "touching", "must_genres": ["Drama"]}
ranked = reranker.rerank(query, prefs, candidates)

print(f"Query: {query}\n")
print("Top Recommendations:")
for item in ranked[:top_k]:
    # .get() tolerates reranked items that lack one of these keys (prints None).
    print(f"- {item.get('title')} (Score: {item.get('score')})")
    print(f"  Reason: {item.get('reason')}")
    print()

In [None]:
# 6. Comparison (SVD vs Hybrid)
# We can plot metrics from W&B or just run quick eval here
import matplotlib.pyplot as plt

# Dummy data for visualization if not running full eval loop again.
# NOTE: these are hard-coded placeholder numbers, not measured results --
# replace them with the output of the evaluation step before drawing conclusions.
models = ['SVD', 'Hybrid']
precision = [0.25, 0.28] # Example values
recall = [0.20, 0.22]

bar_width = 0.4  # two bars per model, drawn side by side
x = list(range(len(models)))

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.bar(x, precision, width=bar_width, label='Precision@10', align='center')
ax.bar([i + bar_width for i in x], recall, width=bar_width, label='Recall@10', align='center')

# Centre each model label between its pair of bars.
ax.set_xticks([i + bar_width / 2 for i in x])
ax.set_xticklabels(models)
ax.set_ylabel("Score")
ax.set_title("Model Comparison")
ax.legend()
plt.show()