In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Kitwe-Local-News-Aggregator-Omdena-\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Kitwe-Local-News-Aggregator-Omdena-'

In [5]:
from dataclasses import dataclass
from typing import List
from pathlib import Path
from typing import Dict

@dataclass
class DataIngestionConfig:
    root_dir:Path
    News_websites : Dict[str,str]
    output_path: Path

In [6]:
from src.newsaggregator.constants import *
from src.newsaggregator.utils.common import read_yaml , create_directories

[2024-10-28 17:35:47,721: INFO: __init__: Hi, welcome to the news aggregator!]


In [7]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH
                 ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])
        
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        
        data_ingestion_config = DataIngestionConfig(
            root_dir=Path(config['root_dir']),  # Convert to Path
            News_websites=config['News_websites'],  # Keep it as is, since it's a dict
            output_path=Path(config['output_path'])  # Convert to Path
        )
        return data_ingestion_config
    
    

In [8]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from src.newsaggregator import logger
from datetime import datetime

In [9]:

class DataIngestion:
    def __init__(self, config):
        self.config = config
        self.News_websites = config.News_websites  
        self.output_path = config.output_path

    def clean_text(self, text):
        """Clean HTML tags and unnecessary whitespace."""
        return ' '.join(BeautifulSoup(text, 'xml').stripped_strings) if text else 'N/A'

    def parse_pub_date(self, date_str):
        """Parse the publication date into a standardized format."""
        formats = ['%a, %d %b %Y %H:%M:%S %Z', '%Y-%m-%dT%H:%M:%SZ']  # Add more formats if needed
        for fmt in formats:
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                continue
        return 'N/A'

    def get_feed_entries(self, feed_url, pages=10):
        all_entries = []
        for page in range(1, pages + 1):
            paged_url = f"{feed_url}?paged={page}"
            logger.info(f"Fetching page {page} from {feed_url}")
            try:
                response = requests.get(paged_url, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                logger.error(f"Failed to fetch page {page}: {e}")
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            items = soup.find_all('item')

            if not items:
                logger.info("No more entries found.")
                break

            for item in items:
                entry = {
                    'title': item.find('title').text.strip() if item.find('title') else 'N/A',
                    'link': item.find('link').text.strip() if item.find('link') else 'N/A',
                    'description': BeautifulSoup(item.find('description').text, 'html.parser').text.strip() if item.find('description') else 'N/A',
                    'pubDate': self.parse_pub_date(item.find('pubDate').text) if item.find('pubDate') else 'N/A',
                    'category': ', '.join([cat.text.strip() for cat in item.find_all('category')]) if item.find_all('category') else 'N/A'
                }
                all_entries.append(entry)

        return all_entries

    def ingest_data(self, pages=10):
        all_feed_data = []
        for source_name, feed_url in self.News_websites.items():
            entries = self.get_feed_entries(feed_url, pages)
            logger.info(f'RSS Feed done: {source_name}')

            # Add source name to each entry
            for entry in entries:
                entry['source'] = source_name
            all_feed_data.extend(entries)

        # Create a pandas DataFrame
        df = pd.DataFrame(all_feed_data, columns=['source', 'category', 'title', 'link', 'description', 'pubDate'])

        # Remove rows with missing essential fields
        df.dropna(subset=['title', 'link', 'description'], inplace=True)
        
        # Save the DataFrame to CSV
        os.makedirs(self.output_path.parent, exist_ok=True)
        df.to_csv(self.output_path, index=False)
        logger.info(f"Data saved to {self.output_path}")

        return df


In [10]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    # Use data_ingestion_config.news_websites instead of News_websites
    for source_name, feed_url in data_ingestion_config.News_websites.items():
        data_ingestion.get_feed_entries(feed_url)
        data_ingestion.ingest_data()

except Exception as e:
    raise e
    

[2024-10-28 17:35:49,336: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-28 17:35:49,339: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-28 17:35:49,344: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-28 17:35:49,346: INFO: common: created directory at: artifacts]
[2024-10-28 17:35:49,346: INFO: common: created directory at: artifacts/data_ingestion]
[2024-10-28 17:35:49,346: INFO: 1719643358: Fetching page 1 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:49,856: INFO: 1719643358: Fetching page 2 from https://cecinvestor.com/search/kitwe/feed/rss2/]


  k = self.parse_starttag(i)


[2024-10-28 17:35:50,202: INFO: 1719643358: Fetching page 3 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:50,538: INFO: 1719643358: Fetching page 4 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:51,048: INFO: 1719643358: Fetching page 5 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:51,647: INFO: 1719643358: Fetching page 6 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:52,010: INFO: 1719643358: Fetching page 7 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:52,342: INFO: 1719643358: Fetching page 8 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:52,790: INFO: 1719643358: Fetching page 9 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:53,101: INFO: 1719643358: Fetching page 10 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:35:53,505: INFO: 1719643358: Fetching page 1 from https://cecinvestor.co

  k = self.parse_starttag(i)


[2024-10-28 17:38:41,761: INFO: 1719643358: Fetching page 3 from https://znbc.co.zm/news/search/kitwe/feed/rss2/]
[2024-10-28 17:38:43,712: INFO: 1719643358: Fetching page 4 from https://znbc.co.zm/news/search/kitwe/feed/rss2/]
[2024-10-28 17:38:45,357: ERROR: 1719643358: Failed to fetch page 4: 404 Client Error: Not Found for url: https://znbc.co.zm/news/search/kitwe/feed/rss2/?paged=4]
[2024-10-28 17:38:45,357: INFO: 1719643358: Fetching page 1 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:38:46,039: INFO: 1719643358: Fetching page 2 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:38:46,774: INFO: 1719643358: Fetching page 3 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:38:47,423: INFO: 1719643358: Fetching page 4 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:38:47,910: INFO: 1719643358: Fetching page 5 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:38:48,430: INFO: 1719643

  k = self.parse_starttag(i)


[2024-10-28 17:40:26,393: INFO: 1719643358: Fetching page 3 from https://newsinvasion24.com/search/kitwe/feed/rss2/]
[2024-10-28 17:40:28,839: INFO: 1719643358: Fetching page 4 from https://newsinvasion24.com/search/kitwe/feed/rss2/]
[2024-10-28 17:40:31,439: INFO: 1719643358: Fetching page 5 from https://newsinvasion24.com/search/kitwe/feed/rss2/]
[2024-10-28 17:40:33,486: INFO: 1719643358: Fetching page 6 from https://newsinvasion24.com/search/kitwe/feed/rss2/]
[2024-10-28 17:40:36,322: ERROR: 1719643358: Failed to fetch page 6: 404 Client Error: Not Found for url: https://newsinvasion24.com/search/kitwe/feed/rss2/?paged=6]
[2024-10-28 17:40:36,331: INFO: 1719643358: Fetching page 1 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:40:36,872: INFO: 1719643358: Fetching page 2 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:40:37,111: INFO: 1719643358: Fetching page 3 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:40:37,372

  k = self.parse_starttag(i)


[2024-10-28 17:43:10,566: INFO: 1719643358: Fetching page 3 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:11,757: INFO: 1719643358: Fetching page 4 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:13,107: INFO: 1719643358: Fetching page 5 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:14,726: INFO: 1719643358: Fetching page 6 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:16,556: INFO: 1719643358: Fetching page 7 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:18,012: INFO: 1719643358: Fetching page 8 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:19,493: INFO: 1719643358: Fetching page 9 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:20,673: INFO: 1719643358: Fetching page 10 from https://www.mwebantu.com/search/kitwe/feed/rss2/]
[2024-10-28 17:43:21,810: INFO: 1719643358: Fetching page 1 from https://cecinv

  k = self.parse_starttag(i)


[2024-10-28 17:45:52,241: INFO: 1719643358: Fetching page 3 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:45:53,757: INFO: 1719643358: Fetching page 4 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:45:57,393: INFO: 1719643358: Fetching page 5 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:46:00,307: INFO: 1719643358: Fetching page 6 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:46:04,358: INFO: 1719643358: Fetching page 7 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:46:06,878: INFO: 1719643358: Fetching page 8 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:46:09,959: INFO: 1719643358: Fetching page 9 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:46:13,291: INFO: 1719643358: Fetching page 10 from https://www.lusakatimes.com/search/kitwe/feed/rss2/]
[2024-10-28 17:46:16,579: INFO: 1719643358: Fetching pa

  k = self.parse_starttag(i)


[2024-10-28 17:48:44,709: INFO: 1719643358: Fetching page 3 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:44,954: INFO: 1719643358: Fetching page 4 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:45,194: INFO: 1719643358: Fetching page 5 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:45,397: INFO: 1719643358: Fetching page 6 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:45,592: INFO: 1719643358: Fetching page 7 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:45,789: INFO: 1719643358: Fetching page 8 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:46,008: INFO: 1719643358: Fetching page 9 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:46,244: INFO: 1719643358: Fetching page 10 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:48:46,424: INFO: 1719643358: RSS Feed done: Copperbelt_Energy]
[2024-10-

  k = self.parse_starttag(i)


[2024-10-28 17:51:22,731: INFO: 1719643358: Fetching page 3 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:22,992: INFO: 1719643358: Fetching page 4 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:23,245: INFO: 1719643358: Fetching page 5 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:23,492: INFO: 1719643358: Fetching page 6 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:23,784: INFO: 1719643358: Fetching page 7 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:24,035: INFO: 1719643358: Fetching page 8 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:24,324: INFO: 1719643358: Fetching page 9 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:24,565: INFO: 1719643358: Fetching page 10 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:51:24,828: INFO: 1719643358: RSS Feed done: Copperbelt_Energy]
[2024-10-

  k = self.parse_starttag(i)


[2024-10-28 17:53:49,221: INFO: 1719643358: Fetching page 3 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:53:51,292: INFO: 1719643358: Fetching page 4 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:53:53,434: INFO: 1719643358: Fetching page 5 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:53:55,373: INFO: 1719643358: Fetching page 6 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:53:57,265: INFO: 1719643358: Fetching page 7 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:53:59,148: INFO: 1719643358: Fetching page 8 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:54:00,812: INFO: 1719643358: Fetching page 9 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:54:02,744: INFO: 1719643358: Fetching page 10 from https://www.zambiamonitor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:54:04,804: INFO: 1719643

  k = self.parse_starttag(i)


[2024-10-28 17:56:39,300: ERROR: 1719643358: Failed to fetch page 2: 404 Client Error: Not Found for url: https://www.techafricanews.com/search/kitwe/feed/rss2/?paged=2]
[2024-10-28 17:56:39,306: INFO: 1719643358: Fetching page 1 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:56:39,576: INFO: 1719643358: Fetching page 2 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:56:41,210: INFO: 1719643358: Fetching page 3 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:56:42,566: INFO: 1719643358: Fetching page 4 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:56:43,245: INFO: 1719643358: Fetching page 5 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:56:43,645: INFO: 1719643358: Fetching page 6 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:56:43,896: INFO: 1719643358: Fetching page 7 from https://cecinvestor.com/search/kitwe/feed/rss2/]
[2024-10-28 17:56:44,153: INFO: 

  k = self.parse_starttag(i)


[2024-10-28 17:59:01,762: INFO: 1719643358: Fetching page 3 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:05,245: INFO: 1719643358: Fetching page 4 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:08,734: INFO: 1719643358: Fetching page 5 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:12,316: INFO: 1719643358: Fetching page 6 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:15,897: INFO: 1719643358: Fetching page 7 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:19,252: INFO: 1719643358: Fetching page 8 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:22,569: INFO: 1719643358: Fetching page 9 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:25,987: INFO: 1719643358: Fetching page 10 from https://zambianeye.com/search/kitwe/feed/rss2/]
[2024-10-28 17:59:26,378: INFO: 1719643358: Fetching page 1 from https://cecinvestor.com/search

  k = self.parse_starttag(i)


[2024-10-28 18:01:48,623: INFO: 1719643358: Fetching page 3 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:01:50,314: INFO: 1719643358: Fetching page 4 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:01:52,111: INFO: 1719643358: Fetching page 5 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:01:53,846: INFO: 1719643358: Fetching page 6 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:01:55,544: INFO: 1719643358: Fetching page 7 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:01:57,179: INFO: 1719643358: Fetching page 8 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:01:58,844: INFO: 1719643358: Fetching page 9 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:02:00,561: INFO: 1719643358: Fetching page 10 from https://www.daily-mail.co.zm/search/kitwe/feed/rss2/]
[2024-10-28 18:02:02,283: INFO: 1719643358: Fet