In [1]:
import os

In [2]:
%pwd

'/home/tiva/PycharmProjects/HeadlineGenerator/notebook'

In [3]:
# Move from the notebook/ folder up to the project root so relative paths resolve from there.
# Guarded on the current directory name so that re-running this cell does not
# climb one level higher each time.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'/home/tiva/PycharmProjects/HeadlineGenerator'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class DataScraperConfig:
    """Immutable bundle of settings consumed by the data-scraping stage."""

    # Directory that is created for this stage before scraping starts.
    root_dir: Path
    # Directory in which one "<category>_contents.csv" file is written per category.
    save_dir: Path
    # Base URL; the category name and a trailing "/" are appended to it.
    source_url: str
    # Category names to scrape, iterated in order.
    categories: list
    # Page-count limit used when paginating through each category.
    max_pages: int

In [7]:
from headlineGenerator.constants import *
from headlineGenerator.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Load the YAML configuration and parameters and build stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILEPATH, params_filepath=PARAMS_FILEPATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML file.
            params_filepath: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_scraper_config(self) -> DataScraperConfig:
        """Build the scraper settings from the ``data_scraper`` config section.

        The stage's root directory is created as a side effect.

        Returns:
            A ``DataScraperConfig`` whose directory fields are ``Path`` objects,
            matching the types declared on the dataclass.
        """
        scraper_section = self.config.data_scraper

        create_directories([scraper_section.root_dir])

        return DataScraperConfig(
            # The dataclass declares these two fields as Path, so convert the
            # raw config values instead of passing them through as-is.
            root_dir=Path(scraper_section.root_dir),
            save_dir=Path(scraper_section.save_dir),
            source_url=scraper_section.source_url,
            categories=scraper_section.categories,
            max_pages=scraper_section.max_pages,
        )

In [9]:
import os
import csv
import httpx
from selectolax.parser import HTMLParser
from headlineGenerator.entity import Content
from headlineGenerator.logging import logger
from headlineGenerator.utils.common import get_size
from dataclasses import asdict, dataclass, fields

In [10]:
class DataScraper:
    """Scrape news articles category by category and export each category to CSV."""

    def __init__(self, config: "DataScraperConfig"):
        """Store the scraper settings (source URL, categories, page limit, save dir)."""
        self.config = config


    def get_html(self, url, **kwargs):
        """Download ``url`` and return it as a parsed HTML tree.

        Args:
            url: Address to request.
            **kwargs: Optional ``page`` keyword; when truthy, ``page/<page>/`` is
                appended to ``url`` to request that listing page.

        Returns:
            The parsed HTML, or ``False`` when the site answers with its
            "Page not found" page (e.g. the page number is past the last page).
        """
        headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"}
        # NOTE(review): connect/read timeouts are disabled here, so a stalled
        # server blocks the scraper indefinitely - confirm this is intended.
        timeout = httpx.Timeout(connect=None, read=None, write=5, pool=5)
        # page keyword is used for getting news from a specific page
        page = kwargs.get("page")
        target = (url + f"page/{page}/") if page else url
        resp = httpx.get(target, headers=headers, timeout=timeout, follow_redirects=True)

        html = HTMLParser(resp.text)
        # handle when page number is exceeded; a document without <title> is
        # treated as a normal page instead of crashing on None
        title_node = html.css_first("title")
        if title_node is not None and title_node.text() == "Page not found | AIT LIVE":
            logger.info(f"No results found while requesting URL {resp.url}. Page Limit Exceeded")
            return False

        return html


    def extract_text(self, html, selector):
        """Return the stripped text of the first node matching ``selector``, or ``None`` if there is no match."""
        try:
            return html.css_first(selector).text(strip=True)
        except AttributeError:
            # css_first returned None: nothing matched the selector
            return None


    def extract_attribute(self, html, selector, attribute):
        """Return ``attribute`` of the first node matching ``selector``.

        ``None`` is returned when no node matches or when the node does not
        carry the requested attribute.
        """
        try:
            return html.css_first(selector).attributes.get(attribute)
        except AttributeError:
            # css_first returned None: nothing matched the selector
            return None


    def get_page_url(self, html: "HTMLParser"):
        """Yield the article URL of every entry on a category listing page.

        Entries without a headline link (or without an ``href``) are skipped.
        """
        for entry in html.css("div.blog-content div.block-inner div[data-pid]"):
            link = entry.css_first("h4.entry-title a")
            href = link.attributes.get("href") if link is not None else None
            if href:
                yield href


    def get_page_content(self, html: "HTMLParser", url: str):
        """Extract the fields of one article page and return them as a dict.

        Args:
            html: Parsed article page.
            url: Address the page was fetched from; stored as ``page_url``.

        Returns:
            ``asdict`` of a ``Content`` record; fields whose selector matched
            nothing are ``None`` (``main_story`` is an empty string instead).
        """
        story_paragraphs = html.css("div[itemprop=articleBody] p:not(:has(strong))")
        content = Content(
            headline=self.extract_text(html, "h1.s-title"),
            last_update=self.extract_attribute(html, "time.updated-date", "datetime"),
            writer=self.extract_text(html, "span.meta-el.meta-custom.meta-bold"),
            editor=self.extract_text(html, "div[itemprop=articleBody] p:nth-last-of-type(2) strong"),
            summary=self.extract_text(html, "div[itemprop=articleBody] p strong"),
            main_story=" ".join(node.text() for node in story_paragraphs).strip(),
            page_url=url
        )

        return asdict(content)

    def export_to_csv(self, contents: list, path: Path):
        """Write ``contents`` (a list of dicts) to ``path`` with a header row.

        Columns are the fields of ``Content``. The parent directory is created
        when missing so a finished scrape is not lost to a missing folder.
        """
        field_names = [field.name for field in fields(Content)]
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        # newline="" lets the csv module control line endings; utf-8 keeps
        # non-ASCII article text independent of the platform default encoding.
        with open(path, "w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, field_names)
            writer.writeheader()
            writer.writerows(contents)

    def scrape_data(self):
        """Scrape every configured category and save one CSV file per category.

        For each category, listing pages 1..``max_pages`` (inclusive) are
        requested until the site reports that the page does not exist; every
        article linked from a listing page is fetched and parsed. Articles
        whose page cannot be found are skipped.
        """
        for category in self.config.categories:
            contents = []

            logger.info(f"'\n>>> {category.upper()} CATEGORY'")

            url = self.config.source_url + category + "/"
            # +1 so the last configured page is scraped too, matching the
            # "page i/max_pages" progress message
            for i in range(1, self.config.max_pages + 1):
                logger.info(f"Scraping page {i}/{self.config.max_pages}")
                html = self.get_html(url, page=i)
                if html is False:
                    break

                for content_url in self.get_page_url(html):
                    content_html = self.get_html(content_url)
                    if content_html is False:
                        # the article itself is gone; nothing to parse
                        logger.info(f"Skipping {content_url}: page not found")
                        continue

                    title_node = content_html.css_first("title")
                    title = title_node.text() if title_node is not None else content_url
                    logger.info(f"#{len(contents) + 1} ==> {title}")
                    contents.append(self.get_page_content(content_html, content_url))
                    logger.info("-" * 200)

            save_path = os.path.join(self.config.save_dir, f"{category}_contents.csv")
            self.export_to_csv(contents, save_path)
            logger.info(f"{category.upper()} CONTENTS SAVED TO CSV SUCCESSFULLY.")

In [11]:
# Run the scraping stage end to end: load the configuration, build the scraper
# and scrape every configured category. Exceptions propagate unchanged, so the
# notebook shows the original traceback without a re-raise wrapper.
config = ConfigurationManager()
data_scraper_config = config.get_data_scraper_config()
data_scraper = DataScraper(config=data_scraper_config)
data_scraper.scrape_data()

[2024-03-13 09:24:38,076 || INFO || common || yaml file : config.yaml loaded successfully]
[2024-03-13 09:24:38,080 || INFO || common || yaml file : params.yaml loaded successfully]
[2024-03-13 09:24:38,082 || INFO || common || created directory at artifacts]
[2024-03-13 09:24:38,084 || INFO || common || created directory at artifacts/data_scraper]
[2024-03-13 09:24:38,085 || INFO || 1657421991 || '
>>> BUSINESS CATEGORY']
[2024-03-13 09:24:38,087 || INFO || 1657421991 || Scraping page 1/2]
[2024-03-13 09:26:49,717 || INFO || _client || HTTP Request: GET https://ait.live/category/business/page/1/ "HTTP/1.1 301 Moved Permanently"]
[2024-03-13 09:26:50,160 || INFO || _client || HTTP Request: GET https://ait.live/category/business/ "HTTP/1.1 200 OK"]
[2024-03-13 09:26:51,417 || INFO || _client || HTTP Request: GET https://ait.live/nigeria-qatar-sign-seven-agreements-as-president-tinubu-ends-state-visit/ "HTTP/1.1 200 OK"]
[2024-03-13 09:26:51,729 || INFO || 1657421991 || #1 ==> Nigeria, Q