In [17]:
from bs4 import BeautifulSoup
import requests
import itertools
import time

In [18]:
def get_page_soup(url:str) -> BeautifulSoup:
    """
    Makes a request to a url and creates a beautiful soup oject from the response html

    input:
        :param url: input page url
    returns:
        - page_soup: beautiful soup oject from the response html
    """

    response = requests.get(url)
    page_html = response.text
    page_soup = BeautifulSoup(page_html, "html.parser")

    return page_soup

In [63]:
def _is_valid_url_sw(href: str) -> bool:
    href = href.replace("https://www.bbc.com", "")
     
    if href.startswith("/swahili/articles/"):
        return True
    elif href.startswith("/swahili/habari-") or href.startswith("/swahili/"):
        if not (
            href.startswith("/swahili/topics")
            or href.startswith("/swahili/michezo")
            or href.startswith("/swahili/bbc_swahili_radio")
            or href.startswith("/swahili/dira-tv")
            or href.startswith('/swahili/media')
            or href.startswith("/swahili/taasisi")
        ):
            if href[-1].isdigit():
                return True
    else:
        return False

In [70]:
def get_valid_urls(category_page: BeautifulSoup):
    """
    Gets all valid urls from a category page

    input:
        :param: url: category_page
    returns:
        - valid_urls: list of all valid article urls on a given category page
    """
    all_urls = category_page.findAll("a")
    valid_article_urls = []
    for url in all_urls:
        href: str = url.get("href")
        # from a look at BBC pidgin's urls, they always begin with the following strings. 
        # so we obtain valid article urls using these strings
        # if ((href.startswith("/swahili/habari-") or href.startswith("/swahili/")) \
        #     and href[-1].isdigit() and not (href.startswith("/swahili/topics") or href.startswith("/swahili/michezo"))) | href.startswith("/swahili/articles/"):
        if _is_valid_url_sw(href):
            story_url = "https://www.bbc.com" + href if href.startswith("/swahili") else href
            valid_article_urls.append(story_url)

    return list(set(valid_article_urls))

In [22]:
page_soup = get_page_soup("https://www.bbc.com/swahili")
article_urls = get_valid_urls(page_soup)

In [60]:
def get_topics(homepage: str, known_topic_urls):
    """
    Meant to be used with the homepage to recover all sub-topics available
    """
    page_soup = get_page_soup(homepage)
    article_urls = get_valid_urls(page_soup)
    topics = {}
 
    for url in article_urls:
        url_soup = get_page_soup(url)
        topic_elements = url_soup.find_all("li", attrs={"class": "bbc-ekyvwt e2o6ii40"}) or []
        for topic in topic_elements:
            topic_url = "https://www.bbc.com" + topic.find("a").get("href")
            if topic_url not in known_topic_urls:
                topic_name = "_".join(topic.text.split()).upper()
                topics[topic_name] = topic_url
    return topics

In [25]:
categories = {
    "michezo": 'https://www.bbc.com/swahili/topics/ckdxndddjkxt',
    "video": "https://www.bbc.com/swahili/media/video",
    "dunia": "https://www.bbc.com/swahili/michezo-63553971"
}

In [26]:
other_categories = get_topics("https://www.bbc.com/swahili/", list(categories.values()))
categories.update(other_categories)

In [34]:
page_soup = get_page_soup("https://www.bbc.com/swahili/articles/cmm68ln0veyo")

In [35]:
page_soup.find_all("ul", attrs={"class": "bbc-f8df6t e19602dz4"})

[]

In [40]:
def get_page_count(soup: BeautifulSoup) -> int:
    count = 1
    ul: BeautifulSoup = soup.find_all("ul", attrs={"class": "bbc-f8df6t e19602dz4"})
    
    if ul:
        count = int(ul[0].find_all("li")[-1].text)
    
    return count

In [71]:
def get_urls(
    category_url:str,
    time_delay:bool, 
    articles_per_category = -1
    ):
    """
    Obtains all the article urls from the category url it takes in

    input:
        :param categpry_url: category url
        :param category: category name
    returns:
        - category_urls: list of all valid article urls on all the category pages
    """
    page_soup = get_page_soup(category_url)
    category_urls = get_valid_urls(page_soup)
    # logging.info(f"{len(category_urls)} urls in page 1 gotten for {category}")

    if articles_per_category > 0 and len(category_urls) >= articles_per_category:
        return category_urls
    
    # get total number of pages for given category
    # article_count_span = page_soup.find_all(
    #     "a", attrs={"class": CONFIG["ARTICLE_COUNT_SPAN"]}
    #     )
    # # if there are multiple pages, get valid urls from each page
    # else just get the articles on the first page
    total_page_count = get_page_count(page_soup)
    print(total_page_count)
    # if article_count_span:
    #     total_page_count = int(article_count_span[0].text)
        # logging.info(f"{total_page_count} pages found for {category}")
    # if total_page_count > 1:
    for count in range(1, total_page_count):
        page_soup = get_page_soup(category_url + f"?page={count+1}")
        page_urls = get_valid_urls(page_soup)
        # logging.info(f"{len(page_urls)} urls in page {count+1} gotten for {category}")
        category_urls+=page_urls
        
        if articles_per_category > 0 and len(category_urls) >= articles_per_category:
            break
            
        # articles_per_category -= len(page_urls)

        if time_delay: 
            time.sleep(10)
        
    else:
        pass
        # logging.info(f"Only one page found for {category}. {len(category_urls)} urls gotten")

    return category_urls

In [65]:
page_soup = get_page_soup("https://www.bbc.com/swahili/topics/c890g74k92qt")

In [48]:
all_urls = page_soup.findAll("a")

In [50]:
len(all_urls)

48

In [73]:
category_urls = get_urls("https://www.bbc.com/swahili/topics/c890g74k92qt", False)

4


In [81]:
category_urls

['https://www.bbc.com/swahili/articles/c72ejng1xdjo',
 'https://www.bbc.com/swahili/articles/c806zg0x44go',
 'https://www.bbc.com/swahili/articles/crgzyl9w1kyo',
 'https://www.bbc.com/swahili/articles/c283y0xjd93o',
 'https://www.bbc.com/swahili/articles/c0dr7lr8jm2o',
 'https://www.bbc.com/swahili/articles/cqq2n1yr7l1o',
 'https://www.bbc.com/swahili/articles/ce70knk74yeo',
 'https://www.bbc.com/swahili/articles/c518m95rdj5o',
 'https://www.bbc.com/swahili/articles/cq5e7q3qlv5o',
 'https://www.bbc.com/swahili/articles/cw0wryr0wg1o',
 'https://www.bbc.com/swahili/articles/cq5ev1q8381o',
 'https://www.bbc.com/swahili/articles/c0vq0yq44r1o',
 'https://www.bbc.com/swahili/articles/c8v1nz7y491o',
 'https://www.bbc.com/swahili/articles/c14ryydz7g2o',
 'https://www.bbc.com/swahili/articles/cw4n2dej1k2o',
 'https://www.bbc.com/swahili/articles/cp3yle1yyvdo',
 'https://www.bbc.com/swahili/articles/cn0ypnq942yo',
 'https://www.bbc.com/swahili/articles/c9xp2r9e57eo',
 'https://www.bbc.com/swahil

In [45]:
def get_article_data(article_url:str):
    """
    Obtains paragraphs texts and headlines input url article

    input:
        :param article_url: category_page
    returns:
        - headline: headline of url article 
        - story_text: text of url article
        - article_url: input article url
    """
    page_soup = get_page_soup(article_url)

    for cls_name in ["bbc-13tuc7f e1p3vdyi0", "bbc-csfh25 e1p3vdyi0"]:
        headline = page_soup.find(
            "h1", attrs={"class": cls_name}
            )
        if headline:
            break
    # by inspection, if the headline is not in the class above, it should be in the one below
    # TODO: Investigate if this is still necessary
    if not headline:
        for cls_name in ["ewk8wmc0 bbc-1xtm48o eglt09e1", "ewk8wmc0 bbc-12j4rku eglt09e1"]:
            headline = page_soup.find(
                "strong", attrs={"class": cls_name}
                )
            if headline:
                break
    
    if headline:
        headline = headline.text.strip()
    
    story_text = " "
    story_div = page_soup.find_all(
        "div", attrs={"class": "bbc-19j92fr ebmt73l0"}
        )
    if story_div:
        all_paragraphs = [div.findAll("p", recursive=False) for div in story_div]
        all_paragraphs = list(itertools.chain(*all_paragraphs))
        story_text = [p.text.strip().replace("\r", "").replace("\n", "\\n") for p in all_paragraphs]
        # story_text = story_text.join(str(paragraph).strip().replace("\n", "\\n") for paragraph in all_paragraphs)
        # story_text = BeautifulSoup(story_text, "html.parser") #.get_text(separator=" ", strip=True)
        # story_text = " ".join(story_text.stripped_strings)
        story_text = " ".join(story_text)
    story_text = story_text if not story_text == " " else None

    return (headline, story_text, article_url)

In [46]:
get_article_data("https://www.bbc.com/tigrinya/news-60403111")

('ድሕሪ 75 ዓመት ዝተራኸቡ ክልተ ኣሕዋት',
 'መሓመድን ሓቢብን ኣብ 1947 [ኣቆጻጽራ ፈረንጂ] ህንዲ ኣብ ክልተ ምስተመቐለት ሓዲኦም ኣብ ህንዲ ተሪፎም፤ ሓዲኦም ድማ ናብ ፓኪስታን ከይዶም። እዞም ክልተ ኣሕዋት ንመጀመርታ ግዜ ድሕሪ ናይ 75 ዓመታት ጻኒሒት ተራኺቦም።',
 'https://www.bbc.com/tigrinya/news-60403111')

In [25]:
h, t, u = _

In [26]:
print(t)

Yara rukunin mutane ne masu rauni, kuma akwai wasu cuttuka da suka fi kama su a lokacin hunturu, wadanda ya kamata iyaye mata su kiyaye. Cututtuka sun hada da wadanda suka shafi hanyoyin shakar iska da cutar sanyi ta nimoniya da mura da tari. Haka kuma yaran da ke da cutar asma da masu cutar sikila yanayinsu na kara tsananta a wannan lokaci. Shafin intanet na wikipedia ya bayyana  hunturu da cewa, lokaci ne na iska da kura wanda ke kadawa daga Sahara ya bi ta yammacin Afrika, zuwa gabar tekun Guinea. Wannan yanayi yana kamawa ne daga karshen watan Nuwamba zuwa tsakiyar watan Maris. Sannan a lokaci ne ake sanyi a mafiya yawan wurare da ke yankin. \nHaka zalika a wannan lokaci ne kuma ake hazo. Kuma akan kwashe tsawon kwanaki ba a ga rana ba.


In [1]:
import pandas as pd

In [35]:
data = pd.read_csv("../data/igbo_corpus.tsv", sep="\t")

In [36]:
data.head()

Unnamed: 0,headline,text,category,url
0,ABSUTH: Ngalaba NLC azaghachila gọọmentị Abia ...,Otu jikọrọ ndị ọrụ n'Abia steeti azaghachila g...,Abia Steeti,https://www.bbc.com/igbo/articles/czkdkdpjyzvo
1,Igbo Independence: Ụfọdụ agha nnwereonwe ndị I...,"Site na mgbe mgbo, ndị Igbo bụ ndị a ma ama na...",Abia Steeti,https://www.bbc.com/igbo/articles/ce7y9l7r53eo
2,"Obi Onitsha agaghị izute Atiku n'obi gọọmentị,...",Njem achụmnta vootu nakwa ileta Gọvanọ Anambra...,Naijiria,https://www.bbc.com/igbo/articles/cy70z45r0g7o
3,"Ihe ọhụrụ Nnamdi Kanu, Mike Ozekhome na Ifeany...",O﻿nyeisi otu 'Indigenous People of Biafra' (Ip...,Naijiria,https://www.bbc.com/igbo/articles/cpd2jz0zm2lo
4,Ọgba mbọ: 'Akwụsịrị m ezigbo ọrụ m tinye isi k...,Ọtụtụ ndị mmadụ na-ebe na ọrụ adịghị n'obodo m...,Azụmahịa nke kọmpụta,https://www.bbc.com/igbo/afirika-53144473


In [37]:
data.shape

(116, 4)

In [38]:
data.isnull().sum()

headline    1
text        0
category    0
url         0
dtype: int64

In [39]:
data[data.headline.isnull()]

Unnamed: 0,headline,text,category,url
24,,Gọọmentị Abia steeti ekwuola na ihe ji ha iwep...,Ọmụmụ na mmụta,https://www.bbc.com/igbo/articles/czqpww4j1leo
