In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [20]:
# Search term whose results are requested from every site's RSS search feed.
# Change it here to collect articles about a different topic.
SEARCH_TERM = 'kitwe'

# Display name of each news source -> base URL under which its
# "/search/<term>/feed/rss2/" endpoint lives.
FEED_SITES = {
    'Copperbelt Energy': 'https://cecinvestor.com',
    'ZNBC': 'https://znbc.co.zm/news',
    'News Invasion 24': 'https://newsinvasion24.com',
    'Mwebantu': 'https://www.mwebantu.com',
    'Lusaka Times': 'https://www.lusakatimes.com',
    'Kitwe Online': 'https://kitweonline.com',
    'Daily Revelation Zambia': 'https://dailyrevelationzambia.com',
    'Zambia Monitor': 'https://www.zambiamonitor.com',
    'Tech Africa News': 'https://www.techafricanews.com',
    'Zambian Eye': 'https://zambianeye.com',
    'DailyMail': 'https://www.daily-mail.co.zm',
}

# Define the RSS feeds: source name -> full search-feed URL for SEARCH_TERM.
# Insertion order follows FEED_SITES.
RSS_FEEDS = {
    name: f'{base_url}/search/{SEARCH_TERM}/feed/rss2/'
    for name, base_url in FEED_SITES.items()
}

In [21]:
# Helpers and entry point for reading paginated feed entries
def _tag_text(item, tag_name):
    """Return the text of the first ``tag_name`` child of ``item``, or 'N/A'."""
    tag = item.find(tag_name)
    return tag.text if tag is not None else 'N/A'


def _parse_feed_item(item):
    """Convert one RSS ``<item>`` element into a flat dict of strings."""
    categories = item.find_all('category')
    return {
        'title': _tag_text(item, 'title'),
        'link': _tag_text(item, 'link'),
        'description': _tag_text(item, 'description'),
        'pubDate': _tag_text(item, 'pubDate'),
        # An item may carry several <category> tags; flatten them into one cell
        'category': ', '.join(cat.text for cat in categories) if categories else 'N/A',
    }


def get_feed_entries(feed_url, pages=10, timeout=30):
    """Collect the items of a paginated RSS feed.

    Pages are requested one at a time by adding a ``paged=<n>`` query
    parameter to ``feed_url``, starting at page 1. Collection for this feed
    stops early when a request fails, when the server answers with a status
    other than 200, or when a page contains no ``<item>`` elements.

    Parameters
    ----------
    feed_url : str
        URL of the RSS feed.
    pages : int, default 10
        Maximum number of pages to request. Values below 1 fetch nothing.
    timeout : float, default 30
        Seconds to wait for the server before giving up on a page, so a
        stalled host cannot hang the whole run.

    Returns
    -------
    list of dict
        One dict per item with the keys 'title', 'link', 'description',
        'pubDate' and 'category'. Missing fields are reported as 'N/A'.
        Entries gathered before an early stop are still returned.
    """
    all_entries = []
    for page in range(1, pages + 1):
        print(f"Fetching page {page} from {feed_url}")
        try:
            # Passing the page through ``params`` lets requests build the query
            # string, which stays valid even if feed_url already contains one.
            response = requests.get(feed_url, params={'paged': page}, timeout=timeout)
        except requests.RequestException as exc:
            # A dead or slow feed should not abort the whole collection run
            print(f"Failed to fetch page {page}: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, 'xml')
        items = soup.find_all('item')

        if not items:
            break  # Exit if there are no more entries

        all_entries.extend(_parse_feed_item(item) for item in items)
    return all_entries

In [22]:
# Loop through each RSS feed and collect data
all_feed_data = []
for source_name, feed_url in RSS_FEEDS.items():
    entries = get_feed_entries(feed_url, pages=50)
    print(f'RSS Feed done: {source_name}')

    # Tag every entry with the feed it came from before pooling it
    for entry in entries:
        entry.update(source=source_name)
    all_feed_data += entries

# Build the table once from the pooled records, with an explicit column order
column_order = ['source', 'category', 'title', 'link', 'description', 'pubDate']
df = pd.DataFrame(all_feed_data, columns=column_order)

# Preview the first rows
df.head()

# Optional: Save to CSV file
# df.to_csv('rss_feed_data_bs_jupyter.csv', index=False)

Fetching page 1 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 2 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 3 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 4 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 5 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 6 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 7 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 8 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 9 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 10 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 11 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 12 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 13 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 14 from https://cecinvestor.com/search/kitwe/feed/rss2/
Fetching page 15 from https:/

Unnamed: 0,source,category,title,link,description,pubDate
0,Copperbelt Energy,"Careers, Current Careers",CEC Career Opportunity: Technician – Electrical,https://cecinvestor.com/cec-career-opportunity...,We currently have career opportunities in the ...,"Tue, 24 Sep 2024 09:10:46 +0000"
1,Copperbelt Energy,"Careers, Current Careers",CEC Career Opportunity: Parks & Gardens Assistant,https://cecinvestor.com/cec-career-opportunity...,We currently have career opportunities in the ...,"Tue, 24 Sep 2024 09:09:59 +0000"
2,Copperbelt Energy,"Careers, Current Careers",CEC Career Opportunity: Technician – Mechanical,https://cecinvestor.com/cec-career-opportunity...,We currently have career opportunities in the ...,"Tue, 24 Sep 2024 09:09:22 +0000"
3,Copperbelt Energy,"Careers, Current Careers",CEC Career Opportunity: CAPEX Engineer,https://cecinvestor.com/cec-career-opportunity...,We currently have career opportunities in the ...,"Mon, 23 Sep 2024 15:29:42 +0000"
4,Copperbelt Energy,"Careers, Current Careers",CEC Career Opportunity: Engineer – SCADA,https://cecinvestor.com/cec-career-opportunity...,We currently have career opportunities in the ...,"Mon, 23 Sep 2024 15:29:32 +0000"


In [23]:

# Write the collected entries to a CSV file, leaving out the row index
csv_path = 'rss_feed_data_bs_jupyter.csv'
df.to_csv(csv_path, index=False)

In [24]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(2223, 6)