In [17]:
# 05: Multi-Date Archive Collection
# Goal: Scrape The Hindu archives for multiple dates (Sept 20-27, 2025)
# Expected: ~400-500 Tamil Nadu articles across 7 days

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from supabase import create_client, Client

print("Libraries imported successfully!")

Libraries imported successfully!


In [10]:
# ============================================
# DATE RANGE CONFIGURATION
# ============================================
START_DATE = "2025-06-05"
END_DATE = "2025-07-05"

# Generate date range
start = datetime.strptime(START_DATE, '%Y-%m-%d')
end = datetime.strptime(END_DATE, '%Y-%m-%d')
date_list = []

current = start
while current <= end:
    date_list.append(current.strftime('%Y-%m-%d'))
    current += timedelta(days=1)

print(f"🎯 DATE RANGE: {START_DATE} to {END_DATE}")
print(f"📅 Total Days: {len(date_list)}")
print(f"\nDates to scrape:")
for i, date in enumerate(date_list, 1):
    print(f"  {i}. {date}")

# ============================================
# ARCHIVE SETTINGS
# ============================================
BASE_ARCHIVE_URL = "https://www.thehindu.com/archive/web/"
MAX_PAGES_TO_CHECK = 15  # Maximum pages to check for each date

# Tamil Nadu categories
TN_CATEGORIES = {
    'tamil nadu',
    'chennai', 'coimbatore', 'madurai',
    'tiruchirappalli', 'tiruchi', 'trichy',
    'salem', 'tirunelveli', 'erode', 'vellore',
    'thoothukudi', 'tuticorin', 'tiruppur',
    'dindigul', 'thanjavur', 'nagercoil',
    'kanyakumari', 'kanniyakumari', 'karur',
    'pudukkottai', 'cuddalore', 'hosur',
    'kanchipuram', 'avadi', 'kancheepuram'
}

# Scraping settings
RATE_LIMIT_SECONDS = 2
RATE_LIMIT_BETWEEN_DATES = 5
REQUEST_TIMEOUT = 10

print(f"\n⚙️ Settings:")
print(f"  Max pages to check: {MAX_PAGES_TO_CHECK}")
print(f"  Rate limit: {RATE_LIMIT_SECONDS}s between pages")
print(f"  Rate limit: {RATE_LIMIT_BETWEEN_DATES}s between dates")
print(f"  TN categories: {len(TN_CATEGORIES)}")
print(f"\n💡 Note: Actual pages per date will be detected automatically")

🎯 DATE RANGE: 2025-06-05 to 2025-07-05
📅 Total Days: 31

Dates to scrape:
  1. 2025-06-05
  2. 2025-06-06
  3. 2025-06-07
  4. 2025-06-08
  5. 2025-06-09
  6. 2025-06-10
  7. 2025-06-11
  8. 2025-06-12
  9. 2025-06-13
  10. 2025-06-14
  11. 2025-06-15
  12. 2025-06-16
  13. 2025-06-17
  14. 2025-06-18
  15. 2025-06-19
  16. 2025-06-20
  17. 2025-06-21
  18. 2025-06-22
  19. 2025-06-23
  20. 2025-06-24
  21. 2025-06-25
  22. 2025-06-26
  23. 2025-06-27
  24. 2025-06-28
  25. 2025-06-29
  26. 2025-06-30
  27. 2025-07-01
  28. 2025-07-02
  29. 2025-07-03
  30. 2025-07-04
  31. 2025-07-05

⚙️ Settings:
  Max pages to check: 15
  Rate limit: 2s between pages
  Rate limit: 5s between dates
  TN categories: 26

💡 Note: Actual pages per date will be detected automatically


In [11]:
# ============================================
# SUPABASE CREDENTIALS - REPLACE THESE
# ============================================
SUPABASE_URL = "https://lgnhjzlbezpczlobeevu.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImxnbmhqemxiZXpwY3psb2JlZXZ1Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTgyMDgzNjcsImV4cCI6MjA3Mzc4NDM2N30.O5Yt0dOyYq326ESo0LBL7lGj4k8zwpuodJfTtGwrPek"
# ============================================

# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

print("✅ Supabase client initialized!")
print(f"Connected to: {SUPABASE_URL}")

# Test connection - check current article count
try:
    result = supabase.table('news_cleaned').select('*', count='exact').execute()
    print(f"\n📊 Current articles in database: {result.count}")
except Exception as e:
    print(f"⚠️ Connection test failed: {e}")

✅ Supabase client initialized!
Connected to: https://lgnhjzlbezpczlobeevu.supabase.co

📊 Current articles in database: 1272


In [12]:
def parse_articles_from_html(html_content):
    """
    Parse all articles from archive page HTML
    Returns: List of article dictionaries
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all article elements
    article_elements = soup.find_all('div', class_='element')
    
    articles = []
    
    for element in article_elements:
        # Extract category label
        label_div = element.find('div', class_='label')
        category = label_div.get_text(strip=True) if label_div else ''
        
        # Extract title and link
        title_div = element.find('div', class_='title')
        if title_div:
            link_tag = title_div.find('a')
            if link_tag:
                title = link_tag.get_text(strip=True)
                link = link_tag.get('href', '')
                
                articles.append({
                    'category': category,
                    'title': title,
                    'link': link
                })
    
    return articles


def is_tamil_nadu_article(category):
    """
    Check if article belongs to Tamil Nadu
    """
    return category.lower() in TN_CATEGORIES


print("✅ Helper functions defined:")
print("  - parse_articles_from_html()")
print("  - is_tamil_nadu_article()")

✅ Helper functions defined:
  - parse_articles_from_html()
  - is_tamil_nadu_article()


In [13]:
def scrape_archive_for_date(target_date, driver, total_pages=12):
    """
    Scrape all pages for a specific date
    
    Args:
        target_date: str in format 'YYYY-MM-DD'
        driver: Selenium WebDriver instance
        total_pages: int, number of pages to scrape
    
    Returns:
        DataFrame of Tamil Nadu articles for that date
    """
    all_articles = []
    
    # Convert date to URL format: YYYY/MM/DD
    date_parts = target_date.split('-')
    url_date = f"{date_parts[0]}/{date_parts[1]}/{date_parts[2]}"
    
    print(f"\n{'='*60}")
    print(f"📅 SCRAPING DATE: {target_date}")
    print(f"{'='*60}")
    
    for page_num in range(1, total_pages + 1):
        print(f"  📄 Page {page_num}/{total_pages}...", end=" ")
        
        # Construct URL
        if page_num == 1:
            url = f"{BASE_ARCHIVE_URL}{url_date}/"
        else:
            url = f"{BASE_ARCHIVE_URL}{url_date}/?page={page_num}"
        
        try:
            # Load page
            driver.get(url)
            
            # Wait for articles to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "element"))
            )
            
            # Small delay to ensure full load
            time.sleep(1)
            
            # Get page source and parse
            html_content = driver.page_source
            articles = parse_articles_from_html(html_content)
            all_articles.extend(articles)
            
            print(f"✅ {len(articles)} articles")
            
            # Rate limiting between pages
            if page_num < total_pages:
                time.sleep(RATE_LIMIT_SECONDS)
                
        except Exception as e:
            print(f"❌ Failed - {str(e)[:50]}...")
            continue
    
    # Filter for Tamil Nadu articles
    tn_articles = [a for a in all_articles if is_tamil_nadu_article(a['category'])]
    
    # Add metadata
    for article in tn_articles:
        article['archive_date'] = target_date
        article['source'] = 'The Hindu - Archive'
        article['scraped_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        article['has_summary'] = False
        article['needs_full_scrape'] = True
    
    # Create DataFrame
    df = pd.DataFrame(tn_articles)
    
    print(f"\n✅ Date complete: {len(all_articles)} total → {len(tn_articles)} TN articles ({len(tn_articles)/len(all_articles)*100:.1f}%)")
    
    return df


print("✅ Core scraping function defined: scrape_archive_for_date()")

✅ Core scraping function defined: scrape_archive_for_date()


In [14]:
def initialize_selenium_driver():
    """
    Initialize Chrome driver with appropriate settings
    Returns: WebDriver instance
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), 
        options=chrome_options
    )
    
    return driver


print("✅ Selenium initialization function defined")
print("\n⚠️ Driver will be initialized when scraping starts")

✅ Selenium initialization function defined

⚠️ Driver will be initialized when scraping starts


In [15]:
# ============================================
# MAIN MULTI-DATE SCRAPING LOOP (STABLE VERSION)
# ============================================
print("🚀 STARTING MULTI-DATE ARCHIVE COLLECTION")
print("="*60)
print(f"Date Range: {START_DATE} to {END_DATE}")
print(f"Total Dates: {len(date_list)}")
print(f"Estimated Time: ~{len(date_list) * 3} minutes")
print("="*60)

# Initialize storage for all results
all_dataframes = []
scraping_stats = []

overall_start = time.time()

# Scrape each date with its own browser instance
for i, date in enumerate(date_list, 1):
    print(f"\n{'#'*60}")
    print(f"DATE {i}/{len(date_list)}: {date}")
    print(f"{'#'*60}")
    
    date_start = time.time()
    driver = None
    
    try:
        # Initialize fresh browser for this date
        print("\n🌐 Launching browser for this date...")
        driver = initialize_selenium_driver()
        
        # Scrape this date
        df_date = scrape_archive_for_date(date, driver)
        
        date_elapsed = time.time() - date_start
        
        # Store results
        all_dataframes.append(df_date)
        scraping_stats.append({
            'date': date,
            'articles_scraped': len(df_date),
            'time_seconds': date_elapsed
        })
        
        print(f"⏱️ Time for this date: {date_elapsed:.1f}s")
        
    except Exception as e:
        print(f"❌ Error scraping {date}: {type(e).__name__}: {str(e)[:100]}")
        # Add empty dataframe to maintain alignment
        all_dataframes.append(pd.DataFrame(columns=['title', 'category', 'link', 'archive_date', 
                                                     'source', 'scraped_at', 'has_summary', 'needs_full_scrape']))
        scraping_stats.append({
            'date': date,
            'articles_scraped': 0,
            'time_seconds': 0
        })
    
    finally:
        # Always close browser for this date
        if driver:
            print("🔒 Closing browser for this date...")
            try:
                driver.quit()
            except:
                pass
    
    # Rate limiting between dates
    if i < len(date_list):
        print(f"⏳ Waiting {RATE_LIMIT_BETWEEN_DATES}s before next date...\n")
        time.sleep(RATE_LIMIT_BETWEEN_DATES)

overall_elapsed = time.time() - overall_start

# Combine all DataFrames
print("\n" + "="*60)
print("🔄 COMBINING RESULTS...")
print("="*60)

if all_dataframes:
    df_all_dates = pd.concat(all_dataframes, ignore_index=True)
else:
    df_all_dates = pd.DataFrame()

print(f"✅ Collection Complete!")
print(f"⏱️ Total Time: {overall_elapsed:.1f}s ({overall_elapsed/60:.1f} minutes)")
print(f"📊 Total Articles Collected: {len(df_all_dates)}")

# Display per-date stats
print("\n" + "="*60)
print("📊 PER-DATE STATISTICS")
print("="*60)
for stat in scraping_stats:
    print(f"{stat['date']}: {stat['articles_scraped']} articles in {stat['time_seconds']:.1f}s")

🚀 STARTING MULTI-DATE ARCHIVE COLLECTION
Date Range: 2025-06-05 to 2025-07-05
Total Dates: 31
Estimated Time: ~93 minutes

############################################################
DATE 1/31: 2025-06-05
############################################################

🌐 Launching browser for this date...

📅 SCRAPING DATE: 2025-06-05
  📄 Page 1/12... ✅ 40 articles
  📄 Page 2/12... ✅ 40 articles
  📄 Page 3/12... ✅ 40 articles
  📄 Page 4/12... ✅ 40 articles
  📄 Page 5/12... ✅ 40 articles
  📄 Page 6/12... ✅ 40 articles
  📄 Page 7/12... ✅ 40 articles
  📄 Page 8/12... ✅ 40 articles
  📄 Page 9/12... ✅ 40 articles
  📄 Page 10/12... ✅ 40 articles
  📄 Page 11/12... ✅ 40 articles
  📄 Page 12/12... ✅ 40 articles

✅ Date complete: 480 total → 68 TN articles (14.2%)
⏱️ Time for this date: 72.3s
🔒 Closing browser for this date...
⏳ Waiting 5s before next date...


############################################################
DATE 2/31: 2025-06-06
########################################################

In [16]:
# ============================================
# UPLOAD TO SUPABASE (Enhanced & Safe Version)
# ============================================

from datetime import datetime
import pandas as pd

def transform_archive_for_upload(df_archive):
    """
    Transform Archive DataFrame to match Supabase schema
    """
    records = []
    
    for _, row in df_archive.iterrows():
        def safe_str(value):
            if pd.isna(value) or value == '' or value == 'None':
                return None
            return str(value).strip()
        
        record = {
            'title': safe_str(row.get('title')),
            'link': safe_str(row.get('link')),
            'category': safe_str(row.get('category')),
            'source': 'The Hindu - Archive',
            'pub_date': None,
            'archive_date': safe_str(row.get('archive_date')),
            'description': None,
            'content_full': None,
            'guid': None,
            'image_url': None,
            'image_width': None,
            'image_height': None,
            'has_description': False,
            'has_image': False,
            'needs_full_scrape': True,
            'raw_json': None,
            'scraped_at': safe_str(row.get('scraped_at'))
        }
        records.append(record)
    
    return records


def upload_records_batch(records, batch_size=50):
    """
    Upload or update records in batches using UPSERT (no duplicates)
    """
    total = len(records)
    uploaded = 0
    updated = 0
    errors = 0

    for i in range(0, total, batch_size):
        batch = records[i:i+batch_size]
        batch_num = (i // batch_size) + 1
        total_batches = (total // batch_size) + 1

        print(f"📤 Uploading batch {batch_num}/{total_batches} ({len(batch)} records)...", end=" ")

        try:
            # ✅ Use UPSERT — avoids duplicate key errors
            result = supabase.table('news_cleaned').upsert(batch, on_conflict='link').execute()

            # Some Supabase clients don't directly show affected rows; you can infer:
            uploaded += len(batch)

            print(f"✅ Done")

        except Exception as e:
            error_msg = str(e)
            print(f"❌ Error: {error_msg[:120]}")
            errors += len(batch)

    return uploaded, updated, errors


# ============================================
# RUN UPLOAD PROCESS
# ============================================

print("🔄 Transforming data for upload...")
archive_records = transform_archive_for_upload(df_all_dates)
print(f"✅ Transformed {len(archive_records)} records")

print("\n🚀 Starting upload to Supabase...")
print("="*60)

uploaded, updated, errors = upload_records_batch(archive_records, batch_size=50)

print("="*60)
print(f"✅ Upload Complete!")
print(f"   Uploaded or Updated: {uploaded}")
print(f"   Errors: {errors}")

# Verify total count in database
try:
    result = supabase.table('news_cleaned').select("*", count='exact').execute()
    print(f"\n📊 Total articles in database: {result.count}")
except Exception as e:
    print(f"⚠️ Could not fetch count: {e}")


🔄 Transforming data for upload...
✅ Transformed 1690 records

🚀 Starting upload to Supabase...
📤 Uploading batch 1/34 (50 records)... ✅ Done
📤 Uploading batch 2/34 (50 records)... ✅ Done
📤 Uploading batch 3/34 (50 records)... ✅ Done
📤 Uploading batch 4/34 (50 records)... ❌ Error: {'message': 'ON CONFLICT DO UPDATE command cannot affect row a second time', 'code': '21000', 'hint': 'Ensure that no ro
📤 Uploading batch 5/34 (50 records)... ✅ Done
📤 Uploading batch 6/34 (50 records)... ✅ Done
📤 Uploading batch 7/34 (50 records)... ✅ Done
📤 Uploading batch 8/34 (50 records)... ✅ Done
📤 Uploading batch 9/34 (50 records)... ✅ Done
📤 Uploading batch 10/34 (50 records)... ✅ Done
📤 Uploading batch 11/34 (50 records)... ✅ Done
📤 Uploading batch 12/34 (50 records)... ✅ Done
📤 Uploading batch 13/34 (50 records)... ✅ Done
📤 Uploading batch 14/34 (50 records)... ✅ Done
📤 Uploading batch 15/34 (50 records)... ❌ Error: {'message': 'ON CONFLICT DO UPDATE command cannot affect row a second time', 'code':