In [2]:
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client, Client
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm

# Initialize Supabase client
SUPABASE_URL = "https://lgnhjzlbezpczlobeevu.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImxnbmhqemxiZXpwY3psb2JlZXZ1Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTgyMDgzNjcsImV4cCI6MjA3Mzc4NDM2N30.O5Yt0dOyYq326ESo0LBL7lGj4k8zwpuodJfTtGwrPek"  # Replace with your actual key

if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("Missing Supabase credentials")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

print("✅ Environment loaded successfully")
print(f"📊 Supabase URL: {SUPABASE_URL[:30]}...")
print(f"🔑 API Key loaded: {len(SUPABASE_KEY)} characters")

✅ Environment loaded successfully
📊 Supabase URL: https://lgnhjzlbezpczlobeevu.s...
🔑 API Key loaded: 208 characters


In [3]:
# Fetch all RSS articles where content_full is NULL or empty
response = supabase.table('news_cleaned')\
    .select('id, title, link, source')\
    .eq('source', 'The Hindu - RSS')\
    .is_('content_full', 'null')\
    .execute()

rss_articles = pd.DataFrame(response.data)

print(f"📰 Found {len(rss_articles)} RSS articles needing full content")
print(f"\n📋 Sample articles:")
print(rss_articles[['id', 'title']].head())

📰 Found 68 RSS articles needing full content

📋 Sample articles:
                                     id  \
0  41eb8377-f89f-499a-8f3a-212e537b42de   
1  6067d8d8-dff9-47c8-8121-b6ea7fa6f619   
2  7c9518c5-2224-4cd6-afc5-4318c78237c7   
3  a29498b2-9a64-4e13-b78b-7edd00272200   
4  c468b884-94b3-4973-9f05-6601255010c7   

                                               title  
0  Orange alert issued in north Tamil Nadu distri...  
1  Kalaignar International Convention Centre set ...  
2  Eminent plastic surgeon K. Mathangi Ramakrishn...  
3  What is the problem faced by paddy farmers of ...  
4                  BM reviews works in Adyar estuary  


In [4]:
def scrape_hindu_article(url, max_retries=3):
    """
    Scrape full article content from The Hindu
    Returns: (content_text, error_message)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # The Hindu article content is typically in <div> with class 'articlebodycontent'
            # Try multiple selectors
            content = None
            
            # Selector 1: Main article body
            article_body = soup.find('div', class_='articlebodycontent')
            if article_body:
                paragraphs = article_body.find_all('p')
                content = ' '.join([p.get_text(strip=True) for p in paragraphs])
            
            # Selector 2: Alternative structure
            if not content:
                article_body = soup.find('div', {'itemprop': 'articleBody'})
                if article_body:
                    paragraphs = article_body.find_all('p')
                    content = ' '.join([p.get_text(strip=True) for p in paragraphs])
            
            # Selector 3: Generic article tag
            if not content:
                article = soup.find('article')
                if article:
                    paragraphs = article.find_all('p')
                    content = ' '.join([p.get_text(strip=True) for p in paragraphs])
            
            if content and len(content) > 100:  # Ensure we got meaningful content
                # Clean the content
                content = re.sub(r'\s+', ' ', content).strip()
                return content, None
            else:
                return None, "Content too short or not found"
                
        except requests.exceptions.Timeout:
            if attempt < max_retries - 1:
                time.sleep(2)
                continue
            return None, "Timeout after retries"
        
        except requests.exceptions.RequestException as e:
            return None, f"Request error: {str(e)}"
        
        except Exception as e:
            return None, f"Parsing error: {str(e)}"
    
    return None, "Max retries exceeded"


# Test the function on first article
test_url = rss_articles.iloc[0]['link']
test_content, test_error = scrape_hindu_article(test_url)

print("🧪 Test Scrape:")
print(f"URL: {test_url}")
if test_content:
    print(f"✅ Success! Content length: {len(test_content)} characters")
    print(f"Preview: {test_content[:200]}...")
else:
    print(f"❌ Failed: {test_error}")

🧪 Test Scrape:
URL: https://www.thehindu.com/news/national/tamil-nadu/cyclone-montha-orange-alert-issued-in-north-tamil-nadu-districts-as-weather-system-advances/article70207325.ece
✅ Success! Content length: 3440 characters
Preview: Cyclone Monthais expected to bring heavy rainfall over some parts of north Tamil Nadu until it crosses the coast on Tuesday (October 28, 2025) evening or night. The Regional Meteorological Centre (RMC...


In [5]:
# Storage for results
results = {
    'successful': [],
    'failed': [],
    'errors': []
}

print(f"🚀 Starting scrape of {len(rss_articles)} articles...")
print("=" * 80)

# Process each article with progress bar
for idx, row in tqdm(rss_articles.iterrows(), total=len(rss_articles), desc="Scraping"):
    article_id = row['id']
    link = row['link']
    title = row['title']
    
    # Scrape content
    content, error = scrape_hindu_article(link)
    
    if content:
        results['successful'].append({
            'id': article_id,
            'title': title,
            'content': content,
            'content_length': len(content)
        })
    else:
        results['failed'].append({
            'id': article_id,
            'title': title,
            'link': link,
            'error': error
        })
        results['errors'].append(error)
    
    # Rate limiting - be polite to The Hindu's servers
    time.sleep(1)  # 1 second between requests

print("\n" + "=" * 80)
print("📊 Scraping Complete!")
print(f"✅ Successful: {len(results['successful'])}")
print(f"❌ Failed: {len(results['failed'])}")

🚀 Starting scrape of 68 articles...


Scraping:   0%|          | 0/68 [00:00<?, ?it/s]


📊 Scraping Complete!
✅ Successful: 68
❌ Failed: 0


In [6]:
print("💾 Updating Supabase with scraped content...")
print("=" * 80)

update_success = 0
update_failed = 0

for article in tqdm(results['successful'], desc="Updating DB"):
    try:
        response = supabase.table('news_cleaned')\
            .update({
                'content_full': article['content'],
                'scraped_at': datetime.now().isoformat()
            })\
            .eq('id', article['id'])\
            .execute()
        
        update_success += 1
        
    except Exception as e:
        print(f"\n❌ Failed to update {article['id']}: {e}")
        update_failed += 1

print("\n" + "=" * 80)
print("✅ Database Update Complete!")
print(f"   - Successfully updated: {update_success}")
print(f"   - Failed to update: {update_failed}")

💾 Updating Supabase with scraped content...


Updating DB:   0%|          | 0/68 [00:00<?, ?it/s]


✅ Database Update Complete!
   - Successfully updated: 68
   - Failed to update: 0


In [7]:
print("=" * 80)
print("📋 FINAL SUMMARY REPORT")
print("=" * 80)

print(f"\n📰 Total RSS Articles Processed: {len(rss_articles)}")
print(f"✅ Successfully Scraped: {len(results['successful'])} ({len(results['successful'])/len(rss_articles)*100:.1f}%)")
print(f"❌ Failed to Scrape: {len(results['failed'])} ({len(results['failed'])/len(rss_articles)*100:.1f}%)")
print(f"💾 Database Updates: {update_success} successful, {update_failed} failed")

# Content length statistics
if results['successful']:
    content_lengths = [r['content_length'] for r in results['successful']]
    print(f"\n📏 Content Length Statistics:")
    print(f"   - Min: {min(content_lengths)} characters")
    print(f"   - Max: {max(content_lengths)} characters")
    print(f"   - Average: {sum(content_lengths)/len(content_lengths):.0f} characters")

# Show failed articles
if results['failed']:
    print(f"\n❌ Failed Articles ({len(results['failed'])}):")
    failed_df = pd.DataFrame(results['failed'])
    print(failed_df[['title', 'error']].to_string(index=False))
    
    # Error breakdown
    print(f"\n🔍 Error Breakdown:")
    error_counts = pd.Series(results['errors']).value_counts()
    print(error_counts)

print("\n" + "=" * 80)
print("🎉 Scraping pipeline complete!")
print("=" * 80)

📋 FINAL SUMMARY REPORT

📰 Total RSS Articles Processed: 68
✅ Successfully Scraped: 68 (100.0%)
❌ Failed to Scrape: 0 (0.0%)
💾 Database Updates: 68 successful, 0 failed

📏 Content Length Statistics:
   - Min: 130 characters
   - Max: 29787 characters
   - Average: 2870 characters

🎉 Scraping pipeline complete!
