In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import json

print("📦 Libraries imported successfully!")

📦 Libraries imported successfully!


In [30]:
# Configuration
# TARGET_DATE is the single source of truth for the day being scraped; the
# archive URL is derived from it so the two can never point at different days.
# strptime also rejects a malformed date here instead of at request time.
TARGET_DATE = "2025-09-27"
BASE_ARCHIVE_URL = (
    "https://www.thehindu.com/archive/web/"
    f"{datetime.strptime(TARGET_DATE, '%Y-%m-%d').strftime('%Y/%m/%d')}/"
)
TOTAL_PAGES = 12  # Number of archive pages for TARGET_DATE (checked manually)

# Tamil Nadu categories to filter.
# Entries are lowercase because article labels are lowercased before lookup.
# Alternate spellings of the same place are listed on purpose.
TN_CATEGORIES = {
    'tamil nadu',
    'chennai', 'coimbatore', 'madurai',
    'tiruchirappalli', 'tiruchi', 'trichy',
    'salem', 'tirunelveli', 'erode', 'vellore',
    'thoothukudi', 'tuticorin', 'tiruppur',
    'dindigul', 'thanjavur', 'nagercoil',
    'kanyakumari', 'kanniyakumari', 'karur',
    'pudukkottai', 'cuddalore', 'hosur',
    'kanchipuram', 'avadi', 'kancheepuram'
}

# Scraping settings
RATE_LIMIT_SECONDS = 2   # pause between consecutive page loads
REQUEST_TIMEOUT = 10     # seconds
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

print(f"🎯 Target Date: {TARGET_DATE}")
print(f"📄 Pages to Scrape: {TOTAL_PAGES}")
print(f"🏷️  TN Categories Configured: {len(TN_CATEGORIES)}")

🎯 Target Date: 2025-09-27
📄 Pages to Scrape: 12
🏷️  TN Categories Configured: 26


In [35]:
# Install selenium and webdriver
!pip install selenium webdriver-manager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_archive_page_selenium(page_num=1):
    """
    Fetch one archive page using Selenium (real headless Chrome browser).

    A fresh browser is started for the request and always shut down again,
    whether the fetch succeeds or fails.

    Args:
        page_num: 1-based archive page number. Page 1 is the bare
            BASE_ARCHIVE_URL; later pages add a ``?page=N`` query string.

    Returns:
        The rendered page HTML as a string, or None if anything went wrong
        (the error is printed, not raised).
    """
    url = BASE_ARCHIVE_URL if page_num == 1 else f"{BASE_ARCHIVE_URL}?page={page_num}"

    # Setup Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run without opening browser window
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    # Use the configured user agent rather than a second hard-coded copy
    chrome_options.add_argument(f'user-agent={USER_AGENT}')

    driver = None  # stays None if browser start-up itself fails
    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

        driver.get(url)

        # Wait until at least one article container is present
        WebDriverWait(driver, REQUEST_TIMEOUT).until(
            EC.presence_of_element_located((By.CLASS_NAME, "element"))
        )

        return driver.page_source

    except Exception as e:
        # Best-effort fetch: report the problem and let the caller see None
        print(f"❌ Selenium error on page {page_num}: {e}")
        return None

    finally:
        # Single cleanup point for both the success and the failure path
        if driver is not None:
            driver.quit()


# Smoke test: fetch page 1 through Selenium and report the size of the HTML.
# html_content is reused by the parsing test further down.
print("🔍 Testing: Fetching Page 1 with Selenium (real browser)...\n")
html_content = fetch_archive_page_selenium(1)

if not html_content:
    print("❌ Failed to fetch page")
else:
    print("✅ Page fetched successfully!")
    print(f"📏 HTML Size: {len(html_content)} characters")

🔍 Testing: Fetching Page 1 with Selenium (real browser)...

✅ Page fetched successfully!
📏 HTML Size: 201691 characters


In [36]:
def parse_articles_from_html(html_content):
    """
    Parse all articles from archive page HTML.

    Each article is a ``<div class="element">`` containing a
    ``<div class="title">`` with a link; an optional ``<div class="label">``
    holds the category. Elements without a title link are skipped.

    Args:
        html_content: HTML of one archive page. None or an empty string
            (e.g. the result of a failed fetch) yields an empty list.

    Returns:
        A list of dicts with the keys 'category', 'title' and 'link', in
        document order. 'category' is '' when the element has no label and
        'link' is '' when the anchor has no href.
    """
    # A failed fetch hands us None; treat that as "no articles" instead of
    # letting BeautifulSoup raise a TypeError.
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    articles = []
    for element in soup.find_all('div', class_='element'):
        title_div = element.find('div', class_='title')
        link_tag = title_div.find('a') if title_div is not None else None
        if link_tag is None:
            continue  # not an article entry (no title link)

        label_div = element.find('div', class_='label')
        articles.append({
            'category': label_div.get_text(strip=True) if label_div is not None else '',
            'title': link_tag.get_text(strip=True),
            'link': link_tag.get('href', ''),
        })

    return articles


# Smoke test: parse the page-1 HTML fetched earlier and preview three entries.
print("🔍 Testing: Parsing articles from Page 1...\n")
articles_page1 = parse_articles_from_html(html_content)

print(f"✅ Found {len(articles_page1)} articles on Page 1")
print("\nFirst 3 articles:")
preview = articles_page1[:3]
for position, entry in enumerate(preview, start=1):
    shortened_title = entry['title'][:80]
    print(f"\n{position}. Category: {entry['category']}")
    print(f"   Title: {shortened_title}...")
    print(f"   Link: {entry['link']}")

🔍 Testing: Parsing articles from Page 1...

✅ Found 40 articles on Page 1

First 3 articles:

1. Category: World
   Title: Moldova bans another pro-Russian party from September 28's vote...
   Link: https://www.thehindu.com/news/international/moldova-bans-another-pro-russian-party-from-september-28s-vote/article70103200.ece

2. Category: Kerala
   Title: Bison caught in residential area released into Aralam sanctuary...
   Link: https://www.thehindu.com/news/national/kerala/bison-caught-in-residential-area-released-into-aralam-sanctuary/article70102936.ece

3. Category: World
   Title: Russia says seized three villages in east Ukraine...
   Link: https://www.thehindu.com/news/international/russia-says-seized-three-villages-in-east-ukraine/article70103203.ece


In [37]:
def is_tamil_nadu_article(category, categories=None):
    """
    Check if article belongs to Tamil Nadu.

    The comparison ignores case and surrounding whitespace.

    Args:
        category: Category label of the article. Anything that is not a
            string (e.g. None) is treated as "not Tamil Nadu".
        categories: Optional collection of lowercase category names to match
            against. Defaults to the module-level TN_CATEGORIES.

    Returns:
        True if the normalised label is one of the categories, else False.
    """
    if not isinstance(category, str):
        return False

    allowed = TN_CATEGORIES if categories is None else categories
    return category.strip().lower() in allowed


# Test: Filter Page 1 articles
tn_articles_page1 = [a for a in articles_page1 if is_tamil_nadu_article(a['category'])]

# Page 1 can parse to zero articles (failed fetch, changed markup); report 0%
# in that case instead of raising ZeroDivisionError.
filter_rate_page1 = (
    len(tn_articles_page1) / len(articles_page1) * 100 if articles_page1 else 0.0
)

print("=" * 60)
print("📊 FILTERING RESULTS - PAGE 1")
print("=" * 60)
print(f"Total Articles: {len(articles_page1)}")
print(f"Tamil Nadu Articles: {len(tn_articles_page1)}")
print(f"Filter Rate: {filter_rate_page1:.1f}%")

print("\n" + "=" * 60)
print("🎯 TAMIL NADU ARTICLES FOUND:")
print("=" * 60)
for i, article in enumerate(tn_articles_page1, 1):
    print(f"\n{i}. [{article['category']}] {article['title']}")
    print(f"   {article['link']}")

📊 FILTERING RESULTS - PAGE 1
Total Articles: 40
Tamil Nadu Articles: 7
Filter Rate: 17.5%

🎯 TAMIL NADU ARTICLES FOUND:

1. [Tamil Nadu] Encroachments demolished to complete new bridge across Kamadalam river near Arani
   https://www.thehindu.com/news/national/tamil-nadu/encroachments-demolished-to-complete-new-bridge-across-kamadalam-river-near-arani/article70102100.ece

2. [Tamil Nadu] Engineering students exhibit futuristic skills at graVITas’25
   https://www.thehindu.com/news/national/tamil-nadu/engineering-students-exhibit-futuristic-skills-at-gravitas25/article70102115.ece

3. [Chennai] Pheasant-tailed jacana: raising more families
   https://www.thehindu.com/news/cities/chennai/pheasant-tailed-jacana-raising-more-families/article70103084.ece

4. [Coimbatore] Suicide pact: third sibling succumbs in Coimbatore hospital
   https://www.thehindu.com/news/cities/Coimbatore/suicide-pact-third-sibling-succumbs-in-coimbatore-hospital/article70103014.ece

5. [Chennai] Art challenge drive

In [39]:
def scrape_all_pages_selenium(total_pages):
    """
    Scrape all pages for the target date using Selenium.

    One headless Chrome instance is started and reused for every page, then
    always closed at the end. A page that fails to load or parse is reported
    and skipped; the remaining pages are still scraped.

    Args:
        total_pages: Number of archive pages to visit (pages 1..total_pages).

    Returns:
        A list with the article dicts of all successfully scraped pages, in
        page order, as produced by parse_articles_from_html.
    """
    all_articles = []

    # Setup Chrome options ONCE (reuse for all pages)
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    # Use the configured user agent rather than a second hard-coded copy
    chrome_options.add_argument(f'user-agent={USER_AGENT}')

    # Initialize driver ONCE
    print("🚀 Initializing Chrome browser...\n")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        for page_num in range(1, total_pages + 1):
            print(f"📄 Scraping Page {page_num}/{total_pages}...", end=" ")

            # Page 1 is the bare archive URL; later pages use ?page=N
            url = BASE_ARCHIVE_URL if page_num == 1 else f"{BASE_ARCHIVE_URL}?page={page_num}"

            try:
                driver.get(url)

                # Wait until at least one article container is present
                WebDriverWait(driver, REQUEST_TIMEOUT).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "element"))
                )

                # Small additional delay to ensure all content loaded
                time.sleep(1)

                articles = parse_articles_from_html(driver.page_source)
                all_articles.extend(articles)

                print(f"✅ Found {len(articles)} articles")

            except Exception as e:
                # Best effort: report and move on to the next page
                print(f"❌ Failed - {str(e)[:50]}... - Skipping")

            # Rate limiting between pages. This runs after failures too, so a
            # struggling server is not hit again immediately.
            if page_num < total_pages:
                time.sleep(RATE_LIMIT_SECONDS)

    finally:
        # Always close browser when done
        print("\n🔒 Closing browser...")
        driver.quit()

    return all_articles


# Execute: Scrape all pages
# The banner reports TOTAL_PAGES so it stays correct when the config changes.
print(f"🚀 Starting full scrape of all {TOTAL_PAGES} pages with Selenium...")
print("=" * 60)

start_time = time.time()
all_articles = scrape_all_pages_selenium(TOTAL_PAGES)
elapsed_time = time.time() - start_time

print("=" * 60)
print("✅ Scraping Complete!")
print(f"⏱️  Time Taken: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
print(f"📊 Total Articles Collected: {len(all_articles)}")

🚀 Starting full scrape of all 12 pages with Selenium...
🚀 Initializing Chrome browser...

📄 Scraping Page 1/12... ✅ Found 40 articles
📄 Scraping Page 2/12... ✅ Found 40 articles
📄 Scraping Page 3/12... ✅ Found 40 articles
📄 Scraping Page 4/12... ✅ Found 40 articles
📄 Scraping Page 5/12... ✅ Found 40 articles
📄 Scraping Page 6/12... ✅ Found 40 articles
📄 Scraping Page 7/12... ✅ Found 40 articles
📄 Scraping Page 8/12... ✅ Found 40 articles
📄 Scraping Page 9/12... ✅ Found 40 articles
📄 Scraping Page 10/12... ✅ Found 40 articles
📄 Scraping Page 11/12... ✅ Found 40 articles
📄 Scraping Page 12/12... ✅ Found 21 articles

🔒 Closing browser...
✅ Scraping Complete!
⏱️  Time Taken: 56.3 seconds (0.9 minutes)
📊 Total Articles Collected: 461


In [40]:
# Filter for Tamil Nadu articles
tn_articles_all = [a for a in all_articles if is_tamil_nadu_article(a['category'])]

# The scrape skips failed pages, so all_articles can be empty; report 0% in
# that case instead of raising ZeroDivisionError.
filter_rate_all = (
    len(tn_articles_all) / len(all_articles) * 100 if all_articles else 0.0
)

print("=" * 60)
print("📊 COMPLETE FILTERING RESULTS")
print("=" * 60)
print(f"Total Articles Scraped: {len(all_articles)}")
print(f"Tamil Nadu Articles: {len(tn_articles_all)}")
print(f"Filter Rate: {filter_rate_all:.1f}%")
print(f"Articles Excluded: {len(all_articles) - len(tn_articles_all)}")

print("\n" + "=" * 60)
print("🏷️  CATEGORY BREAKDOWN (Tamil Nadu Only)")
print("=" * 60)

# Count by category (labels are counted as scraped, i.e. case-sensitive)
category_counts = {}
for article in tn_articles_all:
    cat = article['category']
    category_counts[cat] = category_counts.get(cat, 0) + 1

# Most frequent category first; ties keep first-seen order (stable sort)
for cat, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{cat}: {count}")

📊 COMPLETE FILTERING RESULTS
Total Articles Scraped: 461
Tamil Nadu Articles: 61
Filter Rate: 13.2%
Articles Excluded: 400

🏷️  CATEGORY BREAKDOWN (Tamil Nadu Only)
Tamil Nadu: 19
Madurai: 15
Chennai: 14
Coimbatore: 13


In [41]:
# Add metadata to each article.
# The timestamp is taken once so every row of this run carries the same
# scraped_at value, and the metadata is merged into NEW dicts so the scraped
# article dicts are not modified by re-running this cell.
archive_metadata = {
    'archive_date': TARGET_DATE,
    'source': 'The Hindu - Archive',
    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'has_summary': False,  # Archive pages don't have summaries
    'needs_full_scrape': True,  # Will need to scrape article page
}
archive_records = [{**article, **archive_metadata} for article in tn_articles_all]

# Column order of the final frame
column_order = ['title', 'category', 'link', 'archive_date', 'has_summary',
                'needs_full_scrape', 'source', 'scraped_at']

# Create DataFrame. Passing columns= both orders the columns and keeps the
# schema intact when there are no records (selecting afterwards would raise
# KeyError on an empty frame).
df_archive = pd.DataFrame(archive_records, columns=column_order)

print("=" * 60)
print("📊 DATAFRAME CREATED")
print("=" * 60)
print(f"Shape: {df_archive.shape}")
print(f"\nColumns: {list(df_archive.columns)}")

print("\n" + "=" * 60)
df_archive.info()

📊 DATAFRAME CREATED
Shape: (61, 8)

Columns: ['title', 'category', 'link', 'archive_date', 'has_summary', 'needs_full_scrape', 'source', 'scraped_at']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              61 non-null     object
 1   category           61 non-null     object
 2   link               61 non-null     object
 3   archive_date       61 non-null     object
 4   has_summary        61 non-null     bool  
 5   needs_full_scrape  61 non-null     bool  
 6   source             61 non-null     object
 7   scraped_at         61 non-null     object
dtypes: bool(2), object(6)
memory usage: 3.1+ KB


In [42]:
print("=" * 60)
print("🔍 DATA QUALITY CHECKS")
print("=" * 60)

# Check for duplicate links.
# NOTE(review): duplicates are only counted here, not removed; decide whether
# they should be dropped before export.
duplicate_links = df_archive['link'].duplicated().sum()
print(f"\n✅ Duplicate Links: {duplicate_links}")

# Check for missing fields
print(f"✅ Missing Titles: {df_archive['title'].isna().sum()}")
print(f"✅ Missing Links: {df_archive['link'].isna().sum()}")
print(f"✅ Missing Categories: {df_archive['category'].isna().sum()}")

# Title length stats.
# Kept in a separate Series: writing it into df_archive would add a
# diagnostic-only column that then ends up in the exported CSV.
title_lengths = df_archive['title'].str.len()
print("\n📏 Title Length Stats:")
print(f"   Average: {title_lengths.mean():.0f} characters")
print(f"   Min: {title_lengths.min()}")
print(f"   Max: {title_lengths.max()}")

🔍 DATA QUALITY CHECKS

✅ Duplicate Links: 2
✅ Missing Titles: 0
✅ Missing Links: 0
✅ Missing Categories: 0

📏 Title Length Stats:
   Average: 67 characters
   Min: 20
   Max: 112


In [43]:
# Export the Tamil Nadu archive articles to a timestamped CSV file.
# NOTE(review): the destination is an absolute, machine-specific path;
# adjust it before running this notebook on another machine.
export_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_filename = (
    r"C:\Users\Yuvaraj\Desktop\Data-Science\Mini-project\tamil-news-drift-detection"
    f"/data/raw/archive_sept27_tn_{export_stamp}.csv"
)
df_archive.to_csv(output_filename, index=False, encoding='utf-8')

print(f"✅ Data exported to: {output_filename}")
print(f"📊 Exported {len(df_archive)} Tamil Nadu articles")

✅ Data exported to: C:\Users\Yuvaraj\Desktop\Data-Science\Mini-project\tamil-news-drift-detection/data/raw/archive_sept27_tn_20251027_132501.csv
📊 Exported 61 Tamil Nadu articles
