In [6]:
import requests
import yaml
from bs4 import BeautifulSoup
import itertools

In [2]:
def get_page_soup(url: str) -> BeautifulSoup:
    """
    Makes a request to a url and creates a beautiful soup oject from the response html

    input:
        :param url: input page url
    returns:
        - page_soup: beautiful soup oject from the response html
    """

    response = requests.get(url)
    page_html = response.text
    page_soup = BeautifulSoup(page_html, "html.parser")

    return page_soup

In [5]:
CONFIG = yaml.load(open("config.yml"), Loader=yaml.FullLoader)
ALL_CATEGORIES = CONFIG["CATEGORY_URLS"]

In [7]:
def get_article_data(article_url:str):
    """
    Obtains paragraphs texts and headlines input url article

    input:
        :param article_url: category_page
    returns:
        - headline: headline of url article 
        - story_text: text of url article
        - article_url: input article url
    """
    page_soup = get_page_soup(article_url)

    headline = page_soup.find(
        "h1", attrs={"class": CONFIG["HEADLINE_SPAN_CLASS_A"]}
        )
    # by inspection, if the headline is not in the class above, it should be in the one below
    # TODO: Investigate if this is still necessary
    if not headline:
        headline = page_soup.find(
            "strong", attrs={"class": CONFIG["HEADLINE_SPAN_CLASS_B"]}
            )
    
    if headline:
        headline = headline.text.strip()
    
    story_text = " "
    story_div = page_soup.find_all(
        "div", attrs={"class": CONFIG["STORY_DIV_CLASS"]}
        )
    if story_div:
        all_paragraphs = [div.findAll("p", recursive=False) for div in story_div]
        all_paragraphs = list(itertools.chain(*all_paragraphs))
        story_text = story_text.join(str(paragraph) for paragraph in all_paragraphs)
        story_text = BeautifulSoup(story_text, "html.parser").get_text()
    story_text = story_text if not story_text == " " else None

    return (headline, story_text, article_url)


In [3]:
soup = get_page_soup("https://www.bbc.com/amharic/news-61369467")

In [8]:
get_article_data("https://www.bbc.com/amharic/news-61369467")

(None,
 'የንግድ እና ቀጠናዊ ትሥሥር ሚኒስቴር በዓለም አቀፍ ደረጃ የነዳጅ ዋጋ መናሩን በምክንያትነት በመጥቀስ ከአራት ወራት በፊት ባወጣው የዋጋ ተመን ላይ ከዛሬ ሚያዝያ 30፣ 2014 ዓ.ም ጀምሮ ተግባራዊ የሚደረግ ማሻሻያ ማድረጉን አስታውቋል። ሚኒስቴሩ ባወጣው መግለጫ መንግሥት ባለፉት አራት ወራት የነዳጅ ዋጋን ለመደጎም ባወጣው ወጪ ከፍተኛ ጫና ላይ መውደቁን ያተተ ሲሆን "በነዳጅ ማረጋጊያ ፈንድ ላይ በወር በአማካይ ወደ ተጠቃሚው መተላለፍ የነበረበት 10 ቢሊዮን ብር ጉድለት እንዲመዘገብ" አድርጓል ሲል አክሏል። በዚህም ምክንያት ከዛሬ ጀምሮ በአዲስ አበባ የነዳጅ ምርቶች የችርቻሮ መሸጫ ዋጋ ላይ ለውጥ ተደርጓል።  ቤንዚን በሊትር 36 ብር ከ87 ሳንቲም፣ ነጭ ናፍጣ በሊትር 35 ብር ከ43 ሳንቲም፣ ኬሮሲን በሊትር 35 ብር ከ43 ሳንቲም፣ ቀላል ጥቁር ናፍጣ በሊትር 52 ብር ከ45 ሳንቲም፣ ከባድ ጥቁር ናፍጣ በሊትር 51 ብር ከ78 ሳንቲም እንዲሁም የአውሮፕላን ነዳጅ በሊትር 78 ብር ከ87 ሳንቲም ሆኖ እንደሚሸጥ መወሰኑንም ነው ሚኒስቴሩ የገለጸው። ዛሬ ተግባራዊ ከሚደረገው የተመን ማሻሻያ በፊት ኢትዮጵያ በተያዘው ዓመት ታህሳስ ወር ላይ የነዳጅ ምርቶች ዋጋ ላይ ጭማሪ አድርጋ ነበር። የንግድ እና ቀጠናዊ ትሥሥር ሚኒስቴር በነዚህ አራት ወራቶች ውስጥ የነዳጅ ዋጋ በዓለም አቀፍ ደረጃ "በከፍተኛ ሁኔታ" መጨመሩን ገልጿል።  በዚህም በታኅሣሥ ወር ቤንዚን በሜትሪክ ቶን 870 የአሜሪካን ዶላር ይሸጥ እንደነበር ጠቅሶ ከአራት ወራት በኋላ ዋጋው በ27 በመቶ በመጨመር 1028 የአሜሪካ ዶላር መግባቱን አስታውሷል።  በተመሳሳይ ናፍጣ በሜትሪክ ቶን 730 የአሜሪካን ዶላር ይሸጥ ከነበረበት በ55 በመቶ ወይም በ408 ዶላር ጭማሪ በማሳየት ወደ 1138 የአሜሪካ

In [7]:
soup.find("time", attrs={"class": "bbc-1bnmgo0 e4zesg50"}).get("datetime")

'2022-05-04'

In [8]:
from datetime import datetime

In [24]:
datetime.strptime('2021-02-01', '%Y-%m-%d') <= datetime.strptime('2021-01-31', '%Y-%m-%d')

False

In [9]:
datetime('2022-05-04'.split("-"))

TypeError: an integer is required (got type str)

In [18]:
soup.find_all("ul", attrs={"class": "bbc-f8df6t e19602dz4"})[0].find_all("li")[-1].text

'40'

In [13]:
def get_valid_urls(category_page:BeautifulSoup):
    """
    Gets all valid urls from a category page

    input:
        :param: url: category_page
    returns:
        - valid_urls: list of all valid article urls on a given category page
    """
    all_urls = category_page.findAll("a")
    valid_article_urls = []
    for url in all_urls:
        href: str = url.get("href")
        if (
            href.startswith("/amharic/news") or 
            href.startswith("/amharic/") # or href.startswith("/pidgin/sport")
            ) and href[-1].isdigit() and not href.startswith("/amharic/topics"):
        #                stub[-1].isdigit():
            story_url = "https://www.bbc.com" + href if href.startswith("/amharic") else href
            if "live" in story_url.split("/"):
                continue
            valid_article_urls.append(story_url)

    return list(set(valid_article_urls))

In [15]:
get_valid_urls(soup)

['https://www.bbc.com/amharic/news-61316845',
 'https://www.bbc.com/amharic/news-61369017',
 'https://www.bbc.com/amharic/news-61346541',
 'https://www.bbc.com/amharic/news-61361343',
 'https://www.bbc.com/amharic/news-61369467',
 'https://www.bbc.com/amharic/news-61343985',
 'https://www.bbc.com/amharic/news-61337371',
 'https://www.bbc.com/amharic/news-61318619',
 'https://www.bbc.com/amharic/news-61360852',
 'https://www.bbc.com/amharic/news-61306677']

In [12]:
all_urls = soup.findAll("a")

In [14]:
for url in all_urls:
    href = url.get("href")
    print(href)

/pidgin
#content
/pidgin
/pidgin/topics/c2dwqd1zr92t
/pidgin/topics/c404v061z85t
/pidgin/topics/c0823e52dd0t
/pidgin/media/video
/pidgin/topics/cjgn7gv77vrt
/pidgin/topics/cqywjyzk2vyt
/pidgin/popular/read
/pidgin
/pidgin/topics/c2dwqd1zr92t
/pidgin/topics/c404v061z85t
/pidgin/topics/c0823e52dd0t
/pidgin/media/video
/pidgin/topics/cjgn7gv77vrt
/pidgin/topics/cqywjyzk2vyt
/pidgin/popular/read
https://www.bbc.com/pidgin/media-61325497
https://www.bbc.com/pidgin/tori-61083547
https://www.bbc.com/pidgin/media-55222820
https://www.bbc.com/pidgin/tori-61260173
https://www.bbc.com/pidgin/media-61296844
https://www.bbc.com/pidgin/live/world-61317733
https://www.bbc.com/pidgin/tori-61247469
https://www.bbc.com/pidgin/tori-61313923
https://www.bbc.com/pidgin/tori-61296842
https://www.bbc.com/pidgin/sport-61292477
https://www.bbc.com/pidgin/tori-61296841
https://www.bbc.com/pidgin/tori-61296382
https://www.bbc.com/pidgin/tori-61287415
https://www.bbc.com/pidgin/tori-61291330
https://www.bbc.com/p

In [1]:
import pandas as pd

In [3]:
data = pd.read_csv("data/pidgin.tsv", sep="\t")

In [4]:
data.head()

Unnamed: 0,headline,text,category,url
0,Chinedu Bernard: Nollywood actress die for chu...,Actors Guild of Nigeria don confam di death of...,NIGERIA,https://www.bbc.com/pidgin/tori-61283209
1,Khafi Kareem: UK police give written warning t...,UK police don give written warning to former B...,NIGERIA,https://www.bbc.com/pidgin/tori-61283212
2,Yul Edochie new wife Judy Austin: 'Yul make mi...,Di family of Yul Edochie don chook mouth for d...,NIGERIA,https://www.bbc.com/pidgin/tori-61269129
3,Indonesia ban export of palm oil: Wetin e mean...,Indonesia don ban export of any kind of palm o...,NIGERIA,https://www.bbc.com/pidgin/tori-61260168
4,Dog sleeping with human video: Wetin Nigeria l...,Nigeria police force tok-tok pesin Muyiwa Adej...,NIGERIA,https://www.bbc.com/pidgin/tori-61287415


In [6]:
data.duplicated(subset="url").sum()

183

In [7]:
deduplicated_data = data.drop_duplicates(subset="url", keep="last")

In [8]:
deduplicated_data.shape

(841, 4)

In [9]:
data.shape

(1024, 4)

In [10]:
deduplicated_data.to_csv("pidgin_nigeria.tsv", sep="\t", index=False)