<a target="_blank" href="https://colab.research.google.com/github/theyorubayesian/otelemuye/blob/master/notebooks/example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# Install otelemuye from GitHub when this notebook runs on Google Colab.
# The install is skipped whenever the COLAB_GPU environment variable is unset or empty.
! [ -z $COLAB_GPU ] || pip install git+https://github.com/theyorubayesian/otelemuye.git

In [1]:
from itertools import chain
from pathlib import PurePosixPath
from typing import List
from typing import Optional
from urllib.parse import urlparse
from urllib.parse import urlunparse

from bs4 import BeautifulSoup
from scrapy.http.response import Response

from otelemuye import SitemapSpider
from otelemuye import Spider
from otelemuye import CrawlerProcess

## Websites that provide a sitemap

In [2]:
class JamhuriMediaSpider(SitemapSpider):
    """Sitemap-driven spider for jamhurimedia.co.tz.

    Article URLs come from the site's post sitemaps; each article page is
    reduced to a headline and the joined text of its body paragraphs.
    Scraped items go through ``otelemuye.pipelines.JsonWriterPipeline``.
    """

    name = "jamhuri_media_spider"

    # The first sitemap has no numeric suffix; the remaining ones are
    # post-sitemap2.xml through post-sitemap11.xml.
    sitemap_urls = ["https://www.jamhurimedia.co.tz/post-sitemap.xml"] + [
        f"https://www.jamhurimedia.co.tz/post-sitemap{page}.xml"
        for page in range(2, 12)
    ]

    # Per-spider settings. Paths are relative to the notebook's working directory.
    # NOTE(review): ROBOTSTXT_OBEY is False, so robots.txt rules are not honoured
    # for this crawl - confirm this is intended.
    custom_settings = {
        "LOG_FILE": "../logs/jamhuri_media_crawl.log",
        "ROBOTSTXT_OBEY": False,
        "JOBDIR": "../crawls/jamhurimedia",
        "CONCURRENT_REQUESTS": 2,
        "OUTPUT_FILE": "../data/jamhurimedia_swahili.jsonl",
        "ITEM_PIPELINES": {
            "otelemuye.pipelines.JsonWriterPipeline": 300
        }
    }

    def _get_article_data(self, soup: BeautifulSoup):
        """Extract headline and body text from a parsed article page.

        Args:
            soup: Parsed HTML of a single article page.

        Returns:
            ``self.article_data(headline, content, category)``; ``category`` is
            always ``None`` for this site. ``article_data`` and ``_clean_string``
            are not defined in this class (presumably inherited from the base
            spider).

        Raises:
            AttributeError: If the headline ``<h1>`` or the content ``<div>`` is
                not found, because ``soup.find`` returns ``None`` in that case.
        """
        headline = soup.find(
            "h1", attrs={"class": "post-title single-post-title entry-title"}
        ).text
        category = None

        body = soup.find("div", attrs={"class": "inner-post-entry entry-content"})
        paragraph_texts = (paragraph.text for paragraph in body.find_all("p"))
        content = self._clean_string(" ".join(paragraph_texts))

        return self.article_data(headline, content, category)

In [None]:
# Run the Jamhuri Media crawl from inside the notebook.
# NOTE(review): assuming otelemuye's CrawlerProcess behaves like Scrapy's,
# start() blocks until the crawl finishes and the underlying Twisted reactor
# cannot be started a second time in the same kernel - restart the kernel
# before running another crawl cell. TODO confirm.
process = CrawlerProcess()
process.crawl(JamhuriMediaSpider)  # schedule the spider class defined in the cell above
process.start()

## Websites that do not provide a sitemap

In [None]:
class TukoSpider(Spider):
    """Spider for kiswahili.tuko.co.ke, a site crawled without a sitemap.

    Crawling starts from the category listing pages in ``start_urls``.
    Article links are picked out of each listing page, and further listing
    pages are reached by appending/incrementing a page number in the URL path
    (``/<category>/`` -> ``/<category>/2`` -> ``/<category>/3`` ...).
    Scraped items go through ``otelemuye.pipelines.JsonWriterPipeline``.
    """

    name = "tuko_spider"
    # Tags inside the article body whose text is collected as content.
    content_tags = ["p", "strong"]
    start_urls = [
        "https://kiswahili.tuko.co.ke/watu/",
        "https://kiswahili.tuko.co.ke/kenya/",
        "https://kiswahili.tuko.co.ke/burudani/",
        "https://kiswahili.tuko.co.ke/siasa/",
        "https://kiswahili.tuko.co.ke/michezo/",
        "https://kiswahili.tuko.co.ke/biashara/",
        "https://kiswahili.tuko.co.ke/mahusiano/",
        "https://kiswahili.tuko.co.ke/elimu/",
        "https://kiswahili.tuko.co.ke/habari-za-ulimwengu/",
        "https://kiswahili.tuko.co.ke/familia/"
    ]
    # Per-spider settings. Paths are relative to the notebook's working directory.
    custom_settings = {
        "LOG_FILE": "../logs/tuko_crawl.log",
        "ROBOTSTXT_OBEY": True,
        "JOBDIR": "../crawls/tuko",
        "CONCURRENT_REQUESTS": 2,
        "OUTPUT_FILE": "../data/tuko_swahili.jsonl",
        "ITEM_PIPELINES": {
            "otelemuye.pipelines.JsonWriterPipeline": 300
        }
    }

    def _get_article_data(self, soup: BeautifulSoup):
        """Extract headline and body text from a parsed article page.

        Text is taken from every ``content_tags`` element inside the
        ``post__content`` div, in document order. An element nested inside
        another matched element (e.g. ``<strong>`` inside ``<p>``) is skipped,
        because its text is already part of the ancestor's ``.text``.

        Args:
            soup: Parsed HTML of a single article page.

        Returns:
            ``self.article_data(headline, content, category)``; ``category`` is
            always ``None`` for this site. ``article_data`` and ``_clean_string``
            are not defined in this class (presumably inherited from the base
            spider).

        Raises:
            AttributeError: If the headline ``<h1>`` or the content ``<div>`` is
                not found, because ``soup.find`` returns ``None`` in that case.
        """
        headline = soup.find("h1", attrs={"class": "c-main-headline"}).text
        category = None

        content_soup = soup.find("div", attrs={"class": "post__content"})
        # A single search over all tag names keeps the matches in document order.
        matched = content_soup.find_all(list(self.content_tags))
        # Compare by identity: distinct tags with identical markup must not be
        # treated as the same element.
        matched_ids = {id(elem) for elem in matched}
        content_elements = [
            elem
            for elem in matched
            if not any(id(parent) in matched_ids for parent in elem.parents)
        ]
        content = self._clean_string(" ".join(elem.text for elem in content_elements))

        return self.article_data(headline, content, category)

    def _find_next_page(self, soup: BeautifulSoup, response: Response) -> Optional[str]:
        """Build the URL of the next listing page, if there is one.

        Args:
            soup: Parsed HTML of the current listing page.
            response: Response for the current listing page; only ``response.url``
                is read.

        Returns:
            The next page URL (``/<category>/<n + 1>``, or ``/<category>/2`` when
            the current URL has no page number), or ``None`` when the page lists
            no article card or the URL path does not have the expected
            ``/<category>[/<n>]`` shape.
        """
        first_article = soup.find(
            "article",
            attrs={"class": "c-article-card-horizontal l-article-loadable-list"},
        )
        # No article card on the page: nothing further to paginate to.
        if not first_article:
            return None

        curr_url = urlparse(response.url)
        parts = PurePosixPath(curr_url.path).parts

        if len(parts) == 2:
            # "/<category>/" - the first listing page carries no page number.
            base, category = parts
            idx = 2
        elif len(parts) == 3:
            base, category, page = parts
            try:
                idx = int(page) + 1
            except ValueError:
                # Third path segment is not a page number; stop paginating
                # instead of raising from inside the crawl.
                return None
        else:
            return None

        return urlunparse(
            (
                curr_url.scheme,
                curr_url.hostname,
                PurePosixPath(base, category, str(idx)).as_posix(),
                "", "", ""
            ))

    def _get_article_urls(self, soup: BeautifulSoup) -> List[str]:
        """Collect article URLs linked from a parsed page.

        A link counts as an article when it points at ``kiswahili.tuko.co.ke``
        and its path splits into more than three pieces on ``-``.

        Args:
            soup: Parsed HTML of a listing page.

        Returns:
            Matching URLs in order of first appearance, without duplicates.
        """
        # href=True skips anchors that have no href attribute at all.
        all_urls = (urlparse(a["href"]) for a in soup.find_all("a", href=True))
        article_urls = [
            x.geturl()
            for x in all_urls
            if x.hostname == "kiswahili.tuko.co.ke"
            and len(x.path.split("-")) > 3
        ]
        # dict.fromkeys removes repeated links while keeping first-seen order.
        return list(dict.fromkeys(article_urls))

In [None]:
# Run the Tuko crawl from inside the notebook.
# NOTE(review): assuming otelemuye's CrawlerProcess behaves like Scrapy's,
# start() blocks until the crawl finishes and the underlying Twisted reactor
# cannot be started a second time in the same kernel - if an earlier crawl cell
# already ran in this kernel, restart the kernel first. TODO confirm.
process = CrawlerProcess()
process.crawl(TukoSpider)  # schedule the spider class defined in the cell above
process.start()