# NER + TF-IDF Topic Extraction Backfill

This notebook backfills topics for news articles that are already stored in the database. It extracts entities with Named Entity Recognition (NER) performed by Google Gemini and scores them with TF-IDF.

## Overview
1. Load articles from database
2. Extract entities using Gemini NER
3. Calculate TF-IDF scores
4. Store topics and update trending topics
5. Generate statistics and visualizations


## 1. Setup and Configuration


In [None]:
import os
import json
import time
import asyncio
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Database and AI imports
import libsql_client
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables
# load_dotenv() is called with no arguments; the database cell further down
# then reads TURSO_DATABASE_URL and TURSO_AUTH_TOKEN through os.getenv.
# NOTE(review): presumably those values come from a local .env file — confirm.
from dotenv import load_dotenv
load_dotenv()

# Reaching this line means every import above resolved without raising.
print("✅ All imports successful")


In [None]:
# Configuration
# Tunable knobs for the backfill run; edit the values here before executing
# the cells that follow.
BATCH_SIZE = 15          # Process articles in batches of this size
RATE_LIMIT_DELAY = 2     # Seconds between API calls
MAX_ARTICLES = None      # None means all articles; a number limits the run

# Entity types for NER
ENTITY_TYPES = [
    'TECH',
    'ORG',
    'PERSON',
    'LOCATION',
    'CONCEPT',
    'EVENT',
    'OTHER',
]

# TF-IDF configuration
# NOTE(review): the key names match TfidfVectorizer keyword arguments, so this
# is presumably unpacked into that constructor later — confirm at the call site.
TFIDF_CONFIG = dict(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=2,             # minimum document frequency
    sublinear_tf=True,    # log scaling
)

# Echo the active settings, one per line. `MAX_ARTICLES or 'All'` shows 'All'
# for any falsy value, i.e. for None and also for 0.
print(
    "📊 Configuration loaded:",
    f"   Batch size: {BATCH_SIZE}",
    f"   Rate limit: {RATE_LIMIT_DELAY}s",
    f"   Max articles: {MAX_ARTICLES or 'All'}",
    f"   Entity types: {ENTITY_TYPES}",
    sep="\n",
)


## 2. Database Connection


In [None]:
# Initialize database connection
# Both settings are read from the process environment; a missing or empty
# value for either one aborts the notebook with a ValueError.
database_url = os.environ.get('TURSO_DATABASE_URL')
auth_token = os.environ.get('TURSO_AUTH_TOKEN')

if not (database_url and auth_token):
    raise ValueError("Missing TURSO_DATABASE_URL or TURSO_AUTH_TOKEN in environment")

# Create database client
# NOTE(review): per the libsql-client README, create_client() appears to return
# the asyncio client (its execute() must be awaited), while create_client_sync()
# returns the blocking one. The connection test cell calls client.execute()
# without await — confirm which client is intended.
client = libsql_client.create_client(url=database_url, auth_token=auth_token)

print("✅ Database connection established")


In [None]:
# Test database connection
# Smoke test: count the rows in the two tables queried here and report them.
# Any failure is printed and then re-raised so the notebook stops at this cell.
try:
    # Articles table
    result = client.execute("SELECT COUNT(*) as count FROM news_articles")
    total_articles = result.rows[0]["count"]
    print(f"📰 Found {total_articles} articles in database")

    # Check existing topics
    result = client.execute("SELECT COUNT(*) as count FROM article_topics")
    existing_topics = result.rows[0]["count"]
    print(f"🏷️  Found {existing_topics} existing topics")

except Exception as exc:
    print(f"❌ Database test failed: {exc}")
    raise
