# NER + TF-IDF Topic Extraction Backfill

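This notebook backfills topics for existing news articles that do not have any yet: entities are extracted with Gemini-based Named Entity Recognition (NER), scored with TF-IDF, and stored in the `article_topics` and `trending_topics` tables.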

## Setup and Configuration


In [None]:
import os
import json
import time
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
from dotenv import load_dotenv
import libsql_client
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Load environment variables (also picks up a local .env file, if present)
load_dotenv()

# Configuration
TURSO_URL = os.getenv("TURSO_DATABASE_URL")
TURSO_TOKEN = os.getenv("TURSO_AUTH_TOKEN")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail fast with a readable message: without a URL the client cannot be
# created and the status print below would die on slicing None.
if not TURSO_URL:
    raise RuntimeError(
        "TURSO_DATABASE_URL is not set; export it or add it to your .env file"
    )

# Processing configuration
BATCH_SIZE = 10
MAX_ARTICLES = None  # None = process all
DAYS_BACK = 7  # None = all time
DELAY_BETWEEN_CALLS = 1  # seconds
DELAY_BETWEEN_BATCHES = 3  # seconds

# Initialize clients
# Every cell below calls client.execute(...) synchronously and reads .rows
# from the return value, so the blocking client is required here.
# libsql_client.create_client() returns the asyncio client, whose execute()
# is a coroutine that would have to be awaited.
client = libsql_client.create_client_sync(
    url=TURSO_URL,
    auth_token=TURSO_TOKEN
)

genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash-lite')

print("✅ Configuration loaded")
print(f"Database: {TURSO_URL[:50]}...")
print(f"Batch size: {BATCH_SIZE}")
print(f"Max articles: {MAX_ARTICLES or 'All'}")
print(f"Days back: {DAYS_BACK or 'All time'}")


## Database Functions


In [None]:
def get_articles_without_topics():
    """Fetch the news articles that have no row in ``article_topics`` yet.

    The result is ordered newest first. When ``DAYS_BACK`` is truthy only
    articles created within that many days are returned, and when
    ``MAX_ARTICLES`` is truthy the result is capped at that many rows.

    Returns:
        The ``rows`` of the query result produced by the module-level client.
    """
    # Anti-join: keep only articles for which the LEFT JOIN found no topic row.
    base_query = """
        SELECT na.* 
        FROM news_articles na
        LEFT JOIN article_topics at ON na.id = at.article_id
        WHERE at.id IS NULL
    """

    # Optional clauses are collected in SQL order and appended in one go.
    clauses = []
    if DAYS_BACK:
        clauses.append(f" AND na.created_at >= datetime('now', '-{DAYS_BACK} days')")
    clauses.append(" ORDER BY na.created_at DESC")
    if MAX_ARTICLES:
        clauses.append(f" LIMIT {MAX_ARTICLES}")

    return client.execute(base_query + "".join(clauses)).rows

def store_article_topics(article_id: str, topics: List[Dict]):
    """Insert one ``article_topics`` row per extracted topic of an article.

    Args:
        article_id: Id of the article the topics belong to.
        topics: Topic dicts; each must provide ``'text'`` and ``'type'``.
            ``'tfidf_score'`` and ``'confidence'`` are optional and default
            to 0.5 and 0.8 respectively.
    """
    insert_sql = """
            INSERT INTO article_topics (id, article_id, entity_text, entity_type, tfidf_score, ner_confidence)
            VALUES (?, ?, ?, ?, ?, ?)
        """

    for entry in topics:
        # Millisecond timestamp plus random suffix keeps ids unique even when
        # several rows are written within the same millisecond.
        row_id = f"topic_{int(time.time() * 1000)}_{os.urandom(4).hex()}"
        params = [
            row_id,
            article_id,
            entry['text'],
            entry['type'],
            entry.get('tfidf_score', 0.5),
            entry.get('confidence', 0.8),
        ]
        client.execute(insert_sql, params)

def update_trending_topics(topics: List[Dict]):
    """Update or create a ``trending_topics`` row for each topic.

    Topics are matched case-insensitively on ``topic_text``. For an existing
    row the occurrence count is incremented, ``last_seen_at`` is refreshed and
    ``avg_tfidf_score`` is updated as a true running mean over all
    occurrences. Unknown topics are inserted with a count of 1.

    Note: ``article_ids`` is initialised to ``'[]'`` on insert and is not
    modified by this function.

    Args:
        topics: Topic dicts; each must provide ``'text'`` and ``'type'``.
            ``'tfidf_score'`` is optional and defaults to 0.5.
    """
    for topic in topics:
        score = topic.get('tfidf_score', 0.5)

        # Check if topic exists
        existing = client.execute("""
            SELECT id, occurrence_count, avg_tfidf_score
            FROM trending_topics
            WHERE LOWER(topic_text) = LOWER(?)
        """, [topic['text']])

        if existing.rows:
            # Update existing topic
            row = existing.rows[0]
            count = row['occurrence_count'] or 0
            old_avg = row['avg_tfidf_score']

            if old_avg is None or count <= 0:
                # No usable history: the new observation is the average.
                new_score = score
            else:
                # Weight the stored mean by the number of observations behind
                # it; averaging old and new 50/50 would let the latest article
                # dominate regardless of how often the topic was seen.
                new_score = (old_avg * count + score) / (count + 1)

            client.execute("""
                UPDATE trending_topics
                SET occurrence_count = occurrence_count + 1,
                    avg_tfidf_score = ?,
                    last_seen_at = CURRENT_TIMESTAMP
                WHERE id = ?
            """, [new_score, row['id']])
        else:
            # Create new trending topic
            topic_id = f"trending_{int(time.time() * 1000)}_{os.urandom(4).hex()}"

            client.execute("""
                INSERT INTO trending_topics (id, topic_text, entity_type, occurrence_count, avg_tfidf_score, article_ids)
                VALUES (?, ?, ?, ?, ?, ?)
            """, [
                topic_id,
                topic['text'],
                topic['type'],
                1,
                score,
                '[]'
            ])

# Cell-completion marker: printed once the database helper definitions above have run.
print("✅ Database functions ready")


## NER with Gemini


In [None]:
def extract_topics_with_gemini(title: str, content: str) -> List[Dict]:
    """Extract topics/entities from one article using the Gemini model.

    Only the first 1000 characters of ``content`` are sent to the model. The
    model is asked for a JSON array; the span from the first ``[`` to the last
    ``]`` of the reply is parsed, and entries that are not objects or lack a
    non-empty string ``text`` are skipped.

    Args:
        title: Article title; ``None`` is treated as an empty string.
        content: Article body; ``None`` is treated as an empty string.

    Returns:
        A list of dicts with keys ``'text'``, ``'type'`` (``'CONCEPT'`` when
        the model gives none), ``'confidence'`` (fixed 0.8) and
        ``'tfidf_score'`` (placeholder 0.5). An empty list is returned when
        nothing usable was produced or when the call/parse fails; failures
        are printed, not raised.
    """
    # Database rows may hold NULL here; slicing None would raise before the
    # try block below gets a chance to handle it.
    title = title or ''
    content = content or ''

    prompt = f"""Extract 5-10 key topics/entities from this news article. Focus on:
- Organizations, companies, technologies
- People, scientists, researchers
- Locations, countries, regions  
- Scientific concepts, discoveries
- Events, phenomena

Title: {title}
Content: {content[:1000]} 

Return ONLY a JSON array with this format:
[{{"text": "entity name", "type": "TECH|ORG|PERSON|LOCATION|CONCEPT|EVENT"}}]"""

    try:
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                temperature=0.3,
                max_output_tokens=1000,
            )
        )

        # Extract JSON from response (tolerates prose or code fences around it)
        text = response.text
        json_start = text.find('[')
        json_end = text.rfind(']') + 1

        if json_start < 0 or json_end <= json_start:
            return []

        parsed = json.loads(text[json_start:json_end])

        topics = []
        for item in parsed:
            # Skip malformed entries instead of discarding the whole reply.
            if not isinstance(item, dict):
                continue
            name = item.get('text')
            if not isinstance(name, str) or not name.strip():
                continue
            topics.append({
                'text': name.strip(),
                # `or` also covers an explicit JSON null for "type".
                'type': item.get('type') or 'CONCEPT',
                'confidence': 0.8,
                'tfidf_score': 0.5  # Will be updated with TF-IDF
            })
        return topics
    except Exception as e:
        # Best-effort boundary: API, safety-block and JSON errors all end up
        # here so a single bad article cannot abort the backfill.
        print(f"❌ Error extracting topics: {e}")
        return []

# Test with one article
# Smoke test: runs a single Gemini extraction on the first pending article
# (get_articles_without_topics orders newest first) and prints up to five of
# the returned topics. The extracted topics are only printed, not stored.
articles = get_articles_without_topics()
if len(articles) > 0:
    test_article = articles[0]
    test_topics = extract_topics_with_gemini(test_article['title'], test_article['content'])
    # Title is truncated to 60 characters for display only.
    print(f"\n🧪 Test extraction for: {test_article['title'][:60]}...")
    print(f"Found {len(test_topics)} topics:")
    for topic in test_topics[:5]:
        print(f"  - {topic['text']} ({topic['type']})")


## TF-IDF Calculation


In [None]:
def calculate_tfidf_scores(articles: List[Dict]) -> Dict[str, float]:
    """Calculate the mean TF-IDF score of each term across all articles.

    Each document is the article title followed by its content (missing
    values count as empty text). Unigrams and bigrams are scored, English
    stop words are removed and at most 1000 features are kept.

    Args:
        articles: Row-like mappings providing ``'title'`` and ``'content'``.

    Returns:
        Mapping of term -> TF-IDF score averaged over all documents. Empty
        when there are no articles or no term survives vectorization.
    """
    # Prepare documents
    documents = [
        f"{article['title'] or ''} {article['content'] or ''}"
        for article in articles
    ]
    if not documents:
        return {}

    # min_df=2 combined with max_df=0.8 cannot be satisfied by fewer than
    # three documents (0.8 * n < 2), which makes scikit-learn raise; relax the
    # document-frequency limits for such tiny corpora.
    small_corpus = len(documents) < 3

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 2),  # Include bigrams
        min_df=1 if small_corpus else 2,  # Normally: term in at least 2 documents
        max_df=1.0 if small_corpus else 0.8  # Normally: term in at most 80% of documents
    )

    # Fit and transform
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. only stop words) or no
        # term remains after pruning; there is nothing to score then.
        return {}
    feature_names = vectorizer.get_feature_names_out()

    # Calculate average TF-IDF scores on the sparse matrix directly, so the
    # full documents x terms array is never materialized densely.
    mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

    # Create mapping of terms to scores
    return dict(zip(feature_names, mean_scores))

def update_topic_tfidf_scores(topics: List[Dict], tfidf_scores: Dict[str, float]):
    """Attach a TF-IDF score to each topic, in place.

    A term matches a topic when either lower-cased string contains the other
    (plain substring test). The matching term with the highest positive score
    wins; the first such term is kept on ties. Its score is stored under
    ``'tfidf_score'`` and the term under ``'matched_term'``. Topics without a
    match, or whose text is blank, get the default score 0.1.

    Args:
        topics: Topic dicts with a ``'text'`` key; mutated in place.
        tfidf_scores: Mapping of term -> score.

    Returns:
        None.
    """
    for topic in topics:
        topic_text = topic['text'].lower().strip()

        # Blank text is a substring of every term and a blank term is a
        # substring of every text; neither carries information, so both are
        # excluded instead of matching everything.
        matches = [
            (score, term)
            for term, score in tfidf_scores.items()
            if topic_text and term and score > 0
            and (topic_text in term or term in topic_text)
        ]

        if matches:
            # max() returns the first maximal element, i.e. the earliest term
            # in mapping order among equal scores.
            best_score, best_term = max(matches, key=lambda match: match[0])
            topic['tfidf_score'] = best_score
            topic['matched_term'] = best_term
        else:
            topic['tfidf_score'] = 0.1  # Default low score

# Cell-completion marker: printed once the TF-IDF helper definitions above have run.
print("✅ TF-IDF functions ready")


## Main Processing Loop


In [None]:
# Main backfill driver: fetches every article without topics, computes
# corpus-wide TF-IDF scores once, then processes the articles batch by batch
# (Gemini extraction -> TF-IDF scoring -> database writes), collecting
# per-article errors instead of aborting the run.

# Get all articles without topics
articles = get_articles_without_topics()
print(f"📊 Found {len(articles)} articles without topics")

if len(articles) == 0:
    print("🎉 All articles already have topics!")
else:
    # Calculate TF-IDF scores for all articles
    print("\n🔢 Calculating TF-IDF scores...")
    try:
        tfidf_scores = calculate_tfidf_scores(articles)
    except ValueError as e:
        # TfidfVectorizer raises ValueError when the corpus is too small for
        # its document-frequency limits or no terms survive. Continue with an
        # empty mapping so topics still get stored with the default score.
        print(f"⚠️  TF-IDF calculation failed ({e}); using default scores")
        tfidf_scores = {}
    print(f"✅ Calculated TF-IDF for {len(tfidf_scores)} terms")
    
    # Process in batches
    total_processed = 0
    total_errors = 0
    all_errors = []
    # Ceiling division; constant for the whole run, so computed once.
    total_batches = (len(articles) + BATCH_SIZE - 1) // BATCH_SIZE
    
    print(f"\n🚀 Starting processing of {len(articles)} articles...")
    print("=" * 60)
    
    for i in range(0, len(articles), BATCH_SIZE):
        batch = articles[i:i + BATCH_SIZE]
        batch_num = (i // BATCH_SIZE) + 1
        
        print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch)} articles)")
        
        for j, article in enumerate(batch):
            try:
                print(f"\n[{j+1}/{len(batch)}] Processing: {article['title'][:60]}...")
                
                # Extract topics
                topics = extract_topics_with_gemini(article['title'], article['content'])
                
                if not topics:
                    print(f"  ⚠️  No topics extracted")
                    all_errors.append(f"{article['id']}: No topics extracted")
                    total_errors += 1
                    continue
                
                # Update with TF-IDF scores
                update_topic_tfidf_scores(topics, tfidf_scores)
                
                # Store topics
                store_article_topics(article['id'], topics)
                update_trending_topics(topics)
                
                total_processed += 1
                print(f"  ✅ Extracted {len(topics)} topics")
                print(f"     Topics: {', '.join([t['text'] for t in topics[:3]])}...")
                
            except Exception as e:
                print(f"  ❌ Error: {e}")
                all_errors.append(f"{article['id']}: {str(e)}")
                total_errors += 1
            finally:
                # Rate limiting must also apply after failures and empty
                # results; those are the cases most likely caused by
                # throttling, so skipping the pause would make things worse.
                time.sleep(DELAY_BETWEEN_CALLS)
        
        print(f"✅ Batch {batch_num} complete: {len(batch)} articles processed")
        
        # Delay between batches (not needed after the last one)
        if i + BATCH_SIZE < len(articles):
            print(f"⏸️  Waiting {DELAY_BETWEEN_BATCHES}s before next batch...")
            time.sleep(DELAY_BETWEEN_BATCHES)
    
    print("\n" + "=" * 60)
    print("🎉 Processing complete!")
    print(f"✅ Total processed: {total_processed}")
    print(f"❌ Total errors: {total_errors}")
    
    if all_errors:
        print(f"\nError details (first 10):")
        for error in all_errors[:10]:
            print(f"  - {error}")


## Results and Statistics


In [None]:
# Get final statistics
# Each figure comes from its own scalar subquery. Joining trending_topics
# with "ON 1=1" would multiply every article/topic row by every trending
# topic before counting, and would report 0 trending topics whenever
# news_articles is empty.
stats = client.execute("""
    SELECT
        (SELECT COUNT(DISTINCT id) FROM news_articles) AS total_articles,
        (SELECT COUNT(DISTINCT at.article_id)
         FROM article_topics at
         JOIN news_articles na ON na.id = at.article_id) AS articles_with_topics,
        (SELECT COUNT(DISTINCT id) FROM trending_topics) AS total_trending_topics
""")

row = stats.rows[0]
# Share of articles that have at least one stored topic; guarded against an empty table.
coverage = (row['articles_with_topics'] / row['total_articles'] * 100) if row['total_articles'] > 0 else 0

print("\n📊 Final Statistics:")
print(f"Total articles: {row['total_articles']}")
print(f"Articles with topics: {row['articles_with_topics']}")
print(f"Articles without topics: {row['total_articles'] - row['articles_with_topics']}")
print(f"Total trending topics: {row['total_trending_topics']}")
print(f"Coverage: {coverage:.2f}%")

# Show top trending topics, ranked by average score weighted by occurrence count
top_topics = client.execute("""
    SELECT topic_text, entity_type, occurrence_count, avg_tfidf_score
    FROM trending_topics
    ORDER BY (avg_tfidf_score * occurrence_count) DESC
    LIMIT 10
""")

print(f"\n🔥 Top 10 Trending Topics:")
for i, topic in enumerate(top_topics.rows, 1):
    print(f"{i:2d}. {topic['topic_text']} ({topic['entity_type']}) - "
          f"Count: {topic['occurrence_count']}, Score: {topic['avg_tfidf_score']:.3f}")

# Show topic distribution by type
type_dist = client.execute("""
    SELECT entity_type, COUNT(*) as count
    FROM trending_topics
    GROUP BY entity_type
    ORDER BY count DESC
""")

print(f"\n📈 Topic Distribution by Type:")
for row in type_dist.rows:
    print(f"  {row['entity_type']}: {row['count']} topics")
