<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Processing" data-toc-modified-id="Data-Processing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Processing</a></span></li></ul></div>

In [1]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs
import json, re, os
from tqdm import tqdm
from loguru import logger
from datetime import datetime

In [2]:
# feed_config.py and import
# RSS feed URLs to scrape, one entry per feed.
# List order matters: the scraping loop below keys each feed's records by its
# position in this list (str(idx)), so inserting or reordering entries changes
# which key a given feed is stored under.
RSS_FEED_URLS = [
    'https://timesofindia.indiatimes.com/rssfeedstopstories.cms?x=1',
    'https://feeds.feedburner.com/NDTV-LatestNews',
    'https://www.indiatoday.in/rss/1206578',
    'https://indianexpress.com/feed/',
    'https://www.thehindu.com/news/national/?service=rss',
    'https://www.news18.com/rss/india.xml',
    'https://www.business-standard.com/rss/latest.rss',
    'https://www.dnaindia.com/feeds/india.xml',
    'https://www.deccanchronicle.com/rss_feed/',
    'https://www.india.com/feed/',
    'https://www.oneindia.com/rss/news-india-fb.xml',
    'https://news.abplive.com/home/feed',
    'https://theprint.in/feed/',
    'https://www.indiatvnews.com/rssnews/topstory.xml',
    'https://www.tribuneindia.com/rss/feed?catId=42',
    'https://www.thehindubusinessline.com/?service=rss',
    'https://prod-qt-images.s3.amazonaws.com/production/freepressjournal/feed.xml',
    'https://www.siasat.com/feed/',
    'https://feeds.feedburner.com/ScrollinArticles.rss',
    'https://telanganatoday.com/feed'
]

In [3]:
# Sanity check: how many feed URLs are configured.
len(RSS_FEED_URLS)

20

In [4]:
# Smoke test: fetch only the first configured feed before looping over all of
# them, and show the HTTP status code as the cell output.
session = HTMLSession()
# Without a timeout the request can block forever if the server stalls.
response = session.get(RSS_FEED_URLS[0], timeout=15)
response.status_code

200

In [5]:
# Parse the raw response bytes as XML (the feed is RSS, not HTML).
soup = bs(response.content, features='xml')

In [6]:
# Collect every <item> element in the parsed feed.
items  = soup.find_all('item')

In [7]:
# Number of <item> elements found in the first feed.
len(items)

33

In [8]:
# Inspect the first item to see which child tags it carries.
items[0]

<item><title>Not only Joshimath, these places are also at risk of sinking</title><description/><link>https://timesofindia.indiatimes.com/india/joshimath-sinking-town-in-indian-himalayas-spotlights-risks-of-development/articleshow/97018583.cms</link><guid>https://timesofindia.indiatimes.com/india/joshimath-sinking-town-in-indian-himalayas-spotlights-risks-of-development/articleshow/97018583.cms</guid><pubDate>Mon, 16 Jan 2023 09:33:46 +0530</pubDate><dc:creator>Bloomberg</dc:creator><enclosure length="73592" type="image/jpeg" url="https://static.toiimg.com/photo/msid-97018595,imgsize-73592.cms"/></item>

In [9]:
# Text of the first item's <title> child.
title = items[0].find('title').text
title

'Not only Joshimath, these places are also at risk of sinking'

In [10]:
# Text of the first item's <description> child; for this item the tag is
# present but empty, so the result is ''.
description = items[0].find('description').text
description

''

In [11]:
# Text of the first item's <pubDate> child, kept as the raw string from the feed.
pub_date = items[0].find('pubDate').text
pub_date

'Mon, 16 Jan 2023 09:33:46 +0530'

In [12]:
# main.py
REQUEST_TIMEOUT = 15  # seconds to wait on a feed before giving up on it


def _child_text(item, tag):
    """Return the text of the first ``tag`` element under ``item``, or '' if it is absent."""
    node = item.find(tag)
    return '' if node is None else node.text


# Maps a feed's position in RSS_FEED_URLS (as a string) to a list of
# {'title', 'description', 'pub_date'} dicts, one per <item> in that feed.
# Feeds that fail to download/parse, or that contain no items, get no key.
news_records = {}
session = HTMLSession()  # a single session is reused for every feed
for idx, url in enumerate(RSS_FEED_URLS):
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        logger.info(f"Connected to: {url}")
    except Exception as e:
        logger.error(f"Unable to connect to: {url} ({e!r})")
        # Skip this feed: falling through would re-parse the previous feed's
        # response and store its articles under this feed's index.
        continue
    try:
        soup = bs(response.content, features='xml')
        items = soup.find_all('item')
        logger.info("All items fetched.")
    except Exception as e:
        logger.error(f"Unable to fetch items. ({e!r})")
        # Same reasoning as above: never reuse `items` from an earlier feed.
        continue

    # Missing child tags become '' instead of raising AttributeError, so one
    # malformed item cannot abort the whole run.
    source_list = [
        {
            'title': _child_text(item, 'title'),
            'description': _child_text(item, 'description'),
            'pub_date': _child_text(item, 'pubDate'),
        }
        for item in items
    ]
    if source_list:
        news_records[str(idx)] = source_list
    else:
        # Empty feeds are still left out of news_records, but no longer silently.
        logger.warning(f"No items found in: {url}")

2023-01-16 21:46:27.595 | INFO     | __main__:<module>:7 - Connected to: https://timesofindia.indiatimes.com/rssfeedstopstories.cms?x=1
2023-01-16 21:46:27.625 | INFO     | __main__:<module>:13 - All items fetched.
2023-01-16 21:46:28.606 | INFO     | __main__:<module>:7 - Connected to: https://feeds.feedburner.com/NDTV-LatestNews
2023-01-16 21:46:28.613 | INFO     | __main__:<module>:13 - All items fetched.
2023-01-16 21:46:28.778 | INFO     | __main__:<module>:7 - Connected to: https://www.indiatoday.in/rss/1206578
2023-01-16 21:46:28.785 | INFO     | __main__:<module>:13 - All items fetched.
2023-01-16 21:46:29.001 | INFO     | __main__:<module>:7 - Connected to: https://indianexpress.com/feed/
2023-01-16 21:46:29.077 | INFO     | __main__:<module>:13 - All items fetched.
2023-01-16 21:46:29.423 | INFO     | __main__:<module>:7 - Connected to: https://www.thehindu.com/news/national/?service=rss
2023-01-16 21:46:29.449 | INFO     | __main__:<module>:13 - All items fetched.
2023-01-16

In [16]:
# Write today's scrape to ./raw_data/<YYYYMMDD>_snapshot.json.
# The name only carries the date, so a second run on the same day overwrites
# the earlier snapshot.
date = datetime.today().strftime("%Y%m%d")

filename = f"{date}_snapshot.json"
folder_name = "./raw_data"

# exist_ok=True replaces the exists()/mkdir() pair: no check-then-create race,
# and it also works if folder_name is later changed to a nested path.
os.makedirs(folder_name, exist_ok=True)

# Plain write mode is sufficient; the handle is never read from.
with open(f"{folder_name}/{filename}", 'w') as f:
    json.dump(news_records, f)

## Data Processing

In [18]:
# Data processing (for trial)
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so a fresh-kernel run declares all dependencies up front.
import numpy as np
import pandas as pd
# Show up to 25 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 25)

In [20]:
# Load the snapshot written above. json.dump wrote news_records as a single
# JSON object, so with lines=True the whole file is read as one record:
# a single row with one column per feed index stored in news_records.
data = pd.read_json(f"{folder_name}/{filename}", lines=True)
print(data.shape)
data.head(3)

(1, 19)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19
0,"[{'title': 'Not only Joshimath, these places a...",[{'title': 'On Rahul Gandhi's Bharat Jodo Yatr...,[{'title': 'Don't run Punjab govt from Delhi: ...,[{'title': 'Gujarat CM approves 3 town plannin...,[{'title': '  Parliamentary debate orga...,[{'title': 'Lessor Says Air India to Order Aro...,[{'title': 'Increased provision on CET1 for lo...,[{'title': 'Bundelkhand Expressway to be conne...,[{'title': 'PM Modi holds roadshow as BJP nati...,[{'title': 'Viral Video: Biker’s Wheelie On Em...,[{'title': 'Have to win all 9 state polls: JP ...,[{'title': ''Don't Be A Remote Control In The ...,[{'title': 'Hong Kong judge defends judiciary ...,[{'title': 'BJP national executive: Strategy f...,"[{'title': 'In a first, Army clears over 30 wo...",[{'title': '  Has Twitter hastened Musk...,[{'title': 'Abu Dhabi govt raises Golden visa ...,[{'title': 'Frene Ginwala (1932-2023): The Par...,[{'title': 'KTR inaugurates Telangana Pavilion...


In [24]:
# Transpose so each feed becomes a row instead of a column.
# NOTE(review): this rebinds `data` to its own transpose, so re-running the
# cell flips the frame back to 1 row; consider a separate name for the
# transposed frame.
data = data.T
data

Unnamed: 0,0
0,"[{'title': 'Not only Joshimath, these places a..."
1,[{'title': 'On Rahul Gandhi's Bharat Jodo Yatr...
2,[{'title': 'Don't run Punjab govt from Delhi: ...
3,[{'title': 'Gujarat CM approves 3 town plannin...
4,[{'title': '  Parliamentary debate orga...
5,[{'title': 'Lessor Says Air India to Order Aro...
6,[{'title': 'Increased provision on CET1 for lo...
7,[{'title': 'Bundelkhand Expressway to be conne...
8,[{'title': 'PM Modi holds roadshow as BJP nati...
9,[{'title': 'Viral Video: Biker’s Wheelie On Em...
