In [3]:
import json
import warnings
from datetime import datetime
from pathlib import Path

import feedparser
import pandas as pd
import requests
from dateutil import parser as date_parser
# Suppress every Python warning for the rest of the session so that the cell
# outputs below stay uncluttered.
warnings.filterwarnings('ignore')

# Confirm the imports worked and record which library versions this run used.
print("📦 Libraries imported successfully!")
for library_label, library_module in (("feedparser", feedparser), ("pandas", pd)):
    print(f"{library_label} version: {library_module.__version__}")

📦 Libraries imported successfully!
feedparser version: 6.0.12
pandas version: 2.2.3


In [4]:
# Address of the RSS feed to explore
RSS_URL = "https://www.thehindu.com/news/national/tamil-nadu/feeder/default.rss"

print(f"🔍 Fetching RSS feed from:\n{RSS_URL}\n")

# Download the raw feed over HTTP, using a 10-second timeout
response = requests.get(RSS_URL, timeout=10)

# Summarise the HTTP response: status, payload size and declared content type
status = response.status_code
print(f"Status Code: {status}")
print(f"Response Size: {len(response.content)} bytes")
print(f"Content Type: {response.headers.get('Content-Type')}")

# Only HTTP 200 counts as a successful fetch
fetch_ok = status == 200
print(
    "✅ RSS feed fetched successfully!"
    if fetch_ok
    else f"❌ Failed to fetch feed. Status: {status}"
)

🔍 Fetching RSS feed from:
https://www.thehindu.com/news/national/tamil-nadu/feeder/default.rss

Status Code: 200
Response Size: 76142 bytes
Content Type: application/xml
✅ RSS feed fetched successfully!


In [5]:
# Parse the bytes that were already downloaded into `response` instead of
# handing feedparser the URL. Passing the URL makes feedparser download the
# feed a second time (without the timeout used above), so the document that
# gets parsed would not be the one whose status code was just checked.
feed = feedparser.parse(response.content)

# feedparser sets `bozo` when the document is not well-formed. Entries may
# still be usable in that case, so warn instead of stopping.
if feed.get('bozo'):
    print(f"⚠️ Feed is not well-formed: {feed.get('bozo_exception')}")

# Feed metadata
print("=" * 60)
print("📰 FEED METADATA")
print("=" * 60)
print(f"Feed Title: {feed.feed.get('title', 'N/A')}")
print(f"Feed Link: {feed.feed.get('link', 'N/A')}")
print(f"Feed Description: {feed.feed.get('description', 'N/A')[:100]}...")
# feedparser normalises the RSS <lastBuildDate> element to the `updated` key;
# there is no `lastBuildDate` key, so looking that up always yields 'N/A'.
print(f"Last Build Date: {feed.feed.get('updated', 'N/A')}")
print(f"Language: {feed.feed.get('language', 'N/A')}")
print(f"\n📊 Total Entries: {len(feed.entries)}")
print("=" * 60)

📰 FEED METADATA
Feed Title: Tamil Nadu Latest News: Today’s Events & Political Developments | The Hindu
Feed Link: https://www.thehindu.com/news/national/tamil-nadu/
Feed Description: Stay informed about the latest national news and developments in Tamil Nadu. Get comprehensive cover...
Last Build Date: N/A
Language: en-US

📊 Total Entries: 100


In [6]:
# Dump the structure and the main fields of the first feed entry, if any
if not feed.entries:
    print("❌ No entries found in feed!")
else:
    print("🔍 INSPECTING FIRST ENTRY STRUCTURE\n")
    first_entry = feed.entries[0]
    divider = "=" * 60

    # Every key the parsed entry offers
    print("Available Keys:")
    for field_name in first_entry.keys():
        print(f"  - {field_name}")

    print("\n" + divider)
    print("FIRST ENTRY DETAILS:")
    print(divider)

    # Headline fields, each printed under its own label ('N/A' when absent)
    labelled_fields = (
        ("Title", "title"),
        ("Link", "link"),
        ("Description", "description"),
        ("Published", "published"),
        ("GUID", "id"),
    )
    for label, field in labelled_fields:
        print(f"\n{label}:\n{first_entry.get(field, 'N/A')}")

    # Categories are read from the `term` of each item in the entry's `tags`
    if 'tags' in first_entry:
        terms = [tag.get('term') for tag in first_entry.tags]
        print(f"\nCategories: {terms}")

    # Attached media (images), when the entry carries any
    if 'media_content' in first_entry:
        print("\nMedia Content:")
        for media in first_entry.media_content:
            media_url = media.get('url', 'N/A')
            media_width = media.get('width', 'N/A')
            media_height = media.get('height', 'N/A')
            print(f"  URL: {media_url}")
            print(f"  Width: {media_width}, Height: {media_height}")

    print("\n" + divider)

🔍 INSPECTING FIRST ENTRY STRUCTURE

Available Keys:
  - title
  - title_detail
  - summary
  - summary_detail
  - links
  - link
  - id
  - guidislink
  - tags
  - published
  - published_parsed
  - media_content

FIRST ENTRY DETAILS:

Title:
Orange alert issued in north Tamil Nadu districts as Cyclone Montha advances

Link:
https://www.thehindu.com/news/national/tamil-nadu/cyclone-montha-orange-alert-issued-in-north-tamil-nadu-districts-as-weather-system-advances/article70207325.ece

Description:
In its Nowcast till 1 p.m. on Monday (October 27), the RMC has predicted moderate rains to continue over Chennai and its neighbouring districts, and Villupuram and Ranipet

Published:
Mon, 27 Oct 2025 12:23:45 +0530

GUID:
article-70207325

Categories: ['Tamil Nadu']

Media Content:
  URL: https://th-i.thgim.com/public/incoming/z04cxf/article70207451.ece/alternates/LANDSCAPE_1200/2315_25_10_2025_16_55_13_2_CLOUDS2.JPG
  Width: 1200, Height: 675



In [7]:
def extract_entry_data(entry, source='The Hindu - Tamil Nadu'):
    """
    Extract relevant fields from a single RSS entry.

    Parameters
    ----------
    entry : mapping
        A parsed feed entry. Only ``.get()`` and ``dict(entry)`` are used, so
        both feedparser entries and plain dicts are accepted.
    source : str, optional
        Label stored in the ``source`` field of the result.

    Returns
    -------
    dict
        A flat record with the keys ``title``, ``link``, ``description``,
        ``guid``, ``published``, ``category``, ``image_url``, ``image_width``,
        ``image_height``, ``pub_date``, ``pub_date_readable``,
        ``has_description``, ``has_image``, ``source``, ``scraped_at`` and
        ``raw_json``. Missing text fields are returned as empty strings.
    """

    def _text(key):
        # A missing key and an explicit None both become an empty string.
        return (entry.get(key) or '').strip()

    # Basic fields
    data = {
        'title': _text('title'),
        'link': _text('link'),
        'description': _text('description'),
        'guid': _text('id'),
        'published': entry.get('published') or '',
    }

    # Category: the term of the first tag, if the entry has any tags
    tags = entry.get('tags') or []
    data['category'] = (tags[0].get('term') or '') if tags else ''

    # Image: URL and dimensions of the first media item, if there is one
    media_items = entry.get('media_content') or []
    first_media = media_items[0] if media_items else {}
    data['image_url'] = first_media.get('url') or ''
    data['image_width'] = first_media.get('width', '')
    data['image_height'] = first_media.get('height', '')

    # Parse the published date. The formatted values keep the wall-clock time
    # as written in the feed; the UTC offset itself is not stored.
    published = data['published']
    if published:
        try:
            parsed_date = date_parser.parse(published)
            data['pub_date'] = parsed_date.strftime('%Y-%m-%d %H:%M:%S')
            data['pub_date_readable'] = parsed_date.strftime('%d %b %Y, %I:%M %p')
        except (ValueError, OverflowError):
            # Unparseable date text: keep the raw string rather than lose it.
            # Only parsing errors are caught, so unrelated failures (and
            # KeyboardInterrupt) are no longer silently swallowed.
            data['pub_date'] = published
            data['pub_date_readable'] = published
    else:
        data['pub_date'] = ''
        data['pub_date_readable'] = ''

    # Add flags
    data['has_description'] = bool(data['description'])
    data['has_image'] = bool(data['image_url'])

    # Add metadata
    data['source'] = source
    data['scraped_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Store raw entry as JSON (for debugging); values that are not JSON
    # serialisable are written as their str() form.
    data['raw_json'] = json.dumps(dict(entry), default=str)

    return data


# Turn every feed entry into one flat record
print("🔄 Extracting data from all entries...\n")
articles_data = [extract_entry_data(feed_entry) for feed_entry in feed.entries]

print(f"✅ Extracted {len(articles_data)} articles!")

🔄 Extracting data from all entries...

✅ Extracted 100 articles!


In [8]:
# Column layout for the frame: article content first, bookkeeping fields last
column_order = [
    'title', 'description', 'link',
    'pub_date', 'pub_date_readable',
    'category', 'guid', 'image_url',
    'has_description', 'has_image',
    'source', 'scraped_at',
    'image_width', 'image_height',
    'raw_json',
]

# Build the frame from the extracted records and select the columns in that order
df = pd.DataFrame(articles_data)[column_order]

print("📊 DataFrame created successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

📊 DataFrame created successfully!
Shape: (100, 15)
Columns: ['title', 'description', 'link', 'pub_date', 'pub_date_readable', 'category', 'guid', 'image_url', 'has_description', 'has_image', 'source', 'scraped_at', 'image_width', 'image_height', 'raw_json']


In [9]:
print("=" * 60)
print("📊 DATASET OVERVIEW")
print("=" * 60)

# Compute the date range from real timestamps. `pub_date_readable` holds
# "DD Mon YYYY, hh:mm AM/PM" text, so min()/max() on it compare the strings
# alphabetically (e.g. "01 Nov 2025" sorts before "23 Oct 2025") instead of
# chronologically. Values that cannot be parsed become NaT and are skipped.
pub_dates = pd.to_datetime(df['pub_date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
if pub_dates.notna().any():
    earliest = pub_dates.min().strftime('%d %b %Y, %I:%M %p')
    latest = pub_dates.max().strftime('%d %b %Y, %I:%M %p')
else:
    # No parseable publication date at all
    earliest = latest = 'N/A'

print(f"\n📈 Total Articles: {len(df)}")
print(f"📅 Date Range: {earliest} to {latest}")

print("\n" + "=" * 60)
print("📋 COLUMN INFO")
print("=" * 60)
df.info()

print("\n" + "=" * 60)
print("🔍 DATA QUALITY CHECKS")
print("=" * 60)

# Count each kind of gap once and reuse it for both the total and the share
missing_descriptions = (~df['has_description']).sum()
missing_images = (~df['has_image']).sum()

print(f"\nMissing Descriptions: {missing_descriptions} ({missing_descriptions / len(df) * 100:.1f}%)")
print(f"Missing Images: {missing_images} ({missing_images / len(df) * 100:.1f}%)")
print(f"Duplicate Links: {df['link'].duplicated().sum()}")

print("\n" + "=" * 60)
print("📂 CATEGORY DISTRIBUTION")
print("=" * 60)
print(df['category'].value_counts())

📊 DATASET OVERVIEW

📈 Total Articles: 100
📅 Date Range: 23 Oct 2025, 08:01 PM to 27 Oct 2025, 12:23 PM

📋 COLUMN INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              100 non-null    object
 1   description        100 non-null    object
 2   link               100 non-null    object
 3   pub_date           100 non-null    object
 4   pub_date_readable  100 non-null    object
 5   category           100 non-null    object
 6   guid               100 non-null    object
 7   image_url          100 non-null    object
 8   has_description    100 non-null    bool  
 9   has_image          100 non-null    bool  
 10  source             100 non-null    object
 11  scraped_at         100 non-null    object
 12  image_width        100 non-null    object
 13  image_height       100 non-null    object
 14  raw_json           10

In [10]:
# Columns shown in the preview (raw_json is excluded for readability)
display_columns = ['title', 'description', 'pub_date_readable', 'category', 'has_description']

# First five rows, restricted to those columns
preview = df[display_columns].head()

print("=" * 60)
print("📰 SAMPLE ARTICLES (First 5)")
print("=" * 60)

# Print as plain text without truncating any cell
print(preview.to_string())

📰 SAMPLE ARTICLES (First 5)
                                                                                               title                                                                                                                                                                                                             description      pub_date_readable    category  has_description
0                       Orange alert issued in north Tamil Nadu districts as Cyclone Montha advances                                             In its Nowcast till 1 p.m. on Monday (October 27), the RMC has predicted moderate rains to continue over Chennai and its neighbouring districts, and Villupuram and Ranipet  27 Oct 2025, 12:23 PM  Tamil Nadu             True
1  Kalaignar International Convention Centre set for completion by February 2026: Minister E.V. Velu                                                                                                                                      

In [11]:
# Character count of every description, stored on the frame as a new column
df['description_length'] = df['description'].str.len()
desc_len = df['description_length']

print("=" * 60)
print("📏 DESCRIPTION LENGTH STATISTICS")
print("=" * 60)

print(desc_len.describe())

# Bucket the lengths: empty, 1-50, 51-200 and above 200 characters
empty_count = desc_len.eq(0).sum()
short_count = (desc_len.gt(0) & desc_len.le(50)).sum()
medium_count = (desc_len.gt(50) & desc_len.le(200)).sum()
long_count = desc_len.gt(200).sum()

print("\n📊 Length Distribution:")
print(f"Empty (0 chars): {empty_count}")
print(f"Short (1-50 chars): {short_count}")
print(f"Medium (51-200 chars): {medium_count}")
print(f"Long (200+ chars): {long_count}")

📏 DESCRIPTION LENGTH STATISTICS
count    100.000000
mean     121.360000
std      108.944478
min        0.000000
25%        0.000000
50%      128.500000
75%      180.000000
max      547.000000
Name: description_length, dtype: float64

📊 Length Distribution:
Empty (0 chars): 32
Short (1-50 chars): 0
Medium (51-200 chars): 51
Long (200+ chars): 17


In [12]:
print("=" * 60)
print("📊 RSS EXPLORATION SUMMARY REPORT")
print("=" * 60)

# Derive the date range from parsed timestamps. Taking min()/max() of the
# `pub_date_readable` strings ("DD Mon YYYY, hh:mm AM/PM") orders them
# alphabetically, which is only chronological by coincidence. Values that
# cannot be parsed become NaT and are skipped.
pub_dates = pd.to_datetime(df['pub_date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
if pub_dates.notna().any():
    earliest = pub_dates.min().strftime('%d %b %Y, %I:%M %p')
    latest = pub_dates.max().strftime('%d %b %Y, %I:%M %p')
else:
    # No parseable publication date at all
    earliest = latest = 'N/A'

# Counts reused for both the absolute numbers and the percentages
total_articles = len(df)
with_description = df['has_description'].sum()
without_description = (~df['has_description']).sum()
with_image = df['has_image'].sum()

print(f"""
✅ Feed Successfully Parsed
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📰 Source: The Hindu - Tamil Nadu
🔗 URL: {RSS_URL}
📅 Scraped At: {datetime.now().strftime('%d %b %Y, %I:%M %p')}

📊 STATISTICS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Total Articles: {total_articles}
Date Range: {earliest} → {latest}

✅ Articles WITH Descriptions: {with_description} ({with_description / total_articles * 100:.1f}%)
⚠️ Articles WITHOUT Descriptions: {without_description} ({without_description / total_articles * 100:.1f}%)

🖼️ Articles WITH Images: {with_image} ({with_image / total_articles * 100:.1f}%)

🏷️ CATEGORIES
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{df['category'].value_counts().to_string()}

📝 DESCRIPTION STATS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Average Length: {df['description_length'].mean():.0f} characters
Median Length: {df['description_length'].median():.0f} characters
Max Length: {df['description_length'].max():.0f} characters

💡 NEXT STEPS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1. For articles without descriptions, we need full-text scraping
2. Run this notebook multiple times to collect historical data
3. Export to CSV when satisfied
4. Proceed to archive scraping for older data
""")

print("=" * 60)

📊 RSS EXPLORATION SUMMARY REPORT

✅ Feed Successfully Parsed
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📰 Source: The Hindu - Tamil Nadu
🔗 URL: https://www.thehindu.com/news/national/tamil-nadu/feeder/default.rss
📅 Scraped At: 27 Oct 2025, 12:50 PM

📊 STATISTICS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Total Articles: 100
Date Range: 23 Oct 2025, 08:01 PM → 27 Oct 2025, 12:23 PM

✅ Articles WITH Descriptions: 68 (68.0%)
⚠️ Articles WITHOUT Descriptions: 32 (32.0%)

🖼️ Articles WITH Images: 77 (77.0%)

🏷️ CATEGORIES
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
category
Tamil Nadu        83
Chennai            5
Coimbatore         4
Madurai            2
Andhra Pradesh     2
India              1
Health             1
Tiruchirapalli     1
Videos             1

📝 DESCRIPTION STATS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Average Length: 121 characters
Median Length: 128 characters
Max Length: 547 characters

💡 NEXT STEPS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1. For articles without descriptions, we need full-

In [17]:
# Write the scraped articles to a timestamped CSV file.
#
# The target directory is relative to the notebook's working directory rather
# than a user-specific absolute path, so the cell runs on any machine. Adjust
# `output_dir` if the notebook is not started from the project root.
output_dir = Path("data") / "raw"
output_dir.mkdir(parents=True, exist_ok=True)  # create the folder on first run

output_filename = output_dir / f"hindu_rss_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

# Export (excluding raw_json for smaller file size)
export_columns = [col for col in df.columns if col != 'raw_json']
df[export_columns].to_csv(output_filename, index=False, encoding='utf-8')

# Show the absolute location so the file is easy to find
print(f"✅ Data exported to: {output_filename.resolve()}")
print(f"📊 Exported {len(df)} articles")

✅ Data exported to: C:\Users\Yuvaraj\Desktop\Data-Science\Mini-project\tamil-news-drift-detection\data\raw\hindu_rss_20251027_125409.csv
📊 Exported 100 articles
