# Tigrinya web crawler (Tirawler)

## Collect page links

In [1]:
import urllib.request
import requests
import os
import lazynlp
from bs4 import BeautifulSoup
import re

def url_exists(url):
    request = requests.get(url)
    return request.status_code == 200

def download_url(url, path):
    if not os.path.exists(path):
        if not url_exists(url):
            print("Url doesn't exist:", url)
            return 'END'
        
        print("Downloading", url)
        code, page_content = lazynlp.download_page(url)
        with open(path, 'wb') as f:
            f.write(page_content)
    else:
        with open(path, 'r') as f:
            page_content = f.read()
    return page_content

def parse_topic_headlines(page_content, base_url):
    soup = BeautifulSoup(page_content)
    
    media_block = soup.find("div", {"class": "media-block-wrap"})
    article_elems = media_block.findAll("a", {"class":"img-wrap"})
    
    article_links = [base_url + e.attrs["href"] for e in article_elems]
    
    return article_links

In [None]:
links = mediablock.findAll("a", {"class":"img-wrap"})
firstlink = links[0]

### (1) VOA News Tigrigna https://tigrigna.voanews.com/

In [3]:
#Download directory
voa_downdir = "urls/voanews"
topic_urls_downdir = os.path.join(voa_downdir, 'topic-headlines')
if not os.path.exists(voa_downdir):
    os.makedirs(voa_downdir)

base_url = 'https://tigrigna.voanews.com'
topic_base_urls = {'africa1':'https://tigrigna.voanews.com/z/5444', 
              'africa2':'https://tigrigna.voanews.com/z/2916', 
              'world':'https://tigrigna.voanews.com/z/3329', 
              'politics': 'https://tigrigna.voanews.com/z/3316',
              'health': 'https://tigrigna.voanews.com/z/2918',
              'youth': 'https://tigrigna.voanews.com/z/2923',
              'america': 'https://tigrigna.voanews.com/z/2917',
              'culture': 'https://tigrigna.voanews.com/z/3325',
              'people': 'https://tigrigna.voanews.com/z/2920',
              'kenya': 'https://tigrigna.voanews.com/z/4270',
              'UN': 'https://tigrigna.voanews.com/z/4611',
              '2015': 'https://tigrigna.voanews.com/z/4457'
              }

topic_article_urls = {topic:[] for topic in topic_base_urls.keys()}

#Download and process all
    
for topic_key in topic_base_urls:
    topic_dir = os.path.join(topic_urls_downdir,topic_key)
    if not os.path.exists(topic_dir):
        os.makedirs(topic_dir)
    
    topic_base_url = topic_base_urls[topic_key]

    page_no = 0
    article_list_url = topic_base_url

    while(True):
        
        article_list_html_path = os.path.join(topic_dir, topic_key + "_" + str(page_no) + ".html")

        article_list_html_content = download_url(article_list_url, article_list_html_path)

        if article_list_html_content == 'END':
            break

        topic_article_urls[topic_key].extend(parse_topic_headlines(article_list_html_content, base_url))

        #get next page
        page_no += 1
        article_list_url = topic_base_url + '?p=' + str(page_no)


Url doesn't exist: https://tigrigna.voanews.com/z/5444?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/2916?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/3329?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/3316?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/2918?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/2923?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/2917?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/3325?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/2920?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/4270?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/4611?p=101
Url doesn't exist: https://tigrigna.voanews.com/z/4457?p=101


In [37]:
topic_article_urls['prefecture2016'] = []

### Download and parse articles

In [4]:
def parse_article(html_content, out_txt_path):
    soup = BeautifulSoup(html_content)
    text = ""
    
    try:
        title = soup.find("h1", {"class":"pg-title"})
        text += title.text
#     except:
#         print("No title")
        
    try:
        media_block = soup.find("div", {"class": "wsw"})
        text_elems = media_block.findChildren("p", {"class":""})
        text_elems.extend(media_block.findChildren("p", {"class":"xmsonormal"}))

        text += '\n'.join([elem.text for elem in text_elems if elem.text and not elem.text.isspace()]) 
#     except:
#         print("No text content")
    
    if text:
        with open(out_txt_path, 'w') as f:
            f.write(text)
        return text
    else:
        return 0


In [58]:
#Experiments
soup = BeautifulSoup(article_html_content)
media_block = soup.find("div", {"class": "wsw"})
text_elems = media_block.findChildren("p", {"class":""})
text_elems.extend(media_block.findChildren("p", {"class":"xmsonormal"}))
article_text = '\n'.join([elem.text for elem in text_elems if elem.text and not elem.text.isspace()])    

title = soup.find("h1", {"class":"pg-title"})
print(title.text)

print(article_text)


ዋሽንግተን: ሱሉሳዊ ልዝብ ሚንስትራት ኢትዮጵያ፤ሱዳንን ግብጺን ቀጺሉ ውዒሉ'ሎ

ፕረዚደንት ትራምፕ፡ ትማሊ ኣብዚ ኣብ ዋሽንግተን፡ ምስ ሚንስተራት ጉዳያት ወጻኢ ኢትዮጵያ፡ ግብጽን ሱዳንን ተራኺቡ- ብዛዕባ ምዕባለታት ዝቕጽል ዝሎ ዝርርብ ዓቢ ህዳሰ ግድብ ኢትዮጵያው`ን ተዛሪቡ።
ፕረዚደንት ትራምፕ ኣብቲ እዋን፡ ዩናይትድ ስቴትስ ኣብ ምትሕብባር ዝተመስረተ፡ ቀጻልነት ዘለዎን ንኹሎም ዝረብሕን ሓባራዊ ስምምዕ ክግበር ከምትድግፍ`ውን ደጊማ ተረጋግጽ ከም ዝበለ ካብ ዋይት ሃውስ ዝረኸብናዮ ሓበሬታ የመልክት።
ፕረዚደንት ትራምፕ ብምትሕሓዝ፡ ስለስቲኤን ሃገራት ነዚ ዕድል`ዚ ተጠቒመን፡ መጻኢ ወሎዶታተን ካብ`ዚ ኣገዳሲ ጸጋታት ማይ ተጠቀምቲ ንክኾኑ ብሓባር ንክሰርሓን ክዓብያን ክርእዩ ዩናይትድ ስቴትስ ድልየታ ምዃኑ ደጊማ ተረጋግጽ ኢሉ።
ትማሊ ዝጀመረ 4ይ ዙርያ ዝርርብ ግብጺ፡ ኢትዮጵያን ሱዳንን ሎሚ`ውን ቀጺሉ ውዒሉ`ሎ፡ ዝተበጽሐ ነገር እንተሎ ዛጊድ ዝተፈልጠ ነገር የለን።
ብኻልእ ወገን ኣብ`ዚ ሰዓት`ዚ ኣብ`ዚ ኣብ ዋሽንግተን ዲሲ ዝርከብ ኢምባሲ ኢትዮጵያ፡ ብሚንስትር ጉዳያት ወጻኢ ገዱ እንዳርጋቸው ዝምርሑ ልኡኻት ኢትዮጵያ ጋዜጣዊ መግለጺ ይህቡ`ለው። 


In [48]:
#Parse all articles
articlehtmldir = os.path.join(voa_downdir, 'topic-articles')
articletextdir = os.path.join(voa_downdir, 'topic-text')

article_text_paths = []

for topic in topic_article_urls.keys():
    print(topic)
    topic_articlehtmldir = os.path.join(articlehtmldir, topic)
    if not os.path.exists(topic_articlehtmldir):
        os.makedirs(topic_articlehtmldir)
        
    topic_articletextdir = os.path.join(articletextdir, topic)
    if not os.path.exists(topic_articletextdir):
        os.makedirs(topic_articletextdir)
    
    file_counter = 1
    for article_url in topic_article_urls[topic]:
        #print(article_url)
        article_html_path = os.path.join(topic_articlehtmldir, f"{file_counter:04d}" + '.html')
        article_text_path = os.path.join(topic_articletextdir, f"{file_counter:04d}" + '.txt')
        
        article_html_content = download_url(article_url, article_html_path)
        
        article_text = parse_article(article_html_content, article_text_path)
        
        file_counter += 1
        if article_text:
            article_text_paths.append(article_text_path)


africa1
africa2
world
politics
health
No title
No text content
No title
No text content
youth
No title
No text content
No title
No text content
No title
No text content
No title
No text content
No title
No text content
america
No text content
No text content
No text content
No text content
No text content
No text content
No text content
No text content
No title
No text content
No text content
No text content
No text content
No text content
No title
No text content
No text content
No text content
No text content
No title
No text content
No text content
No text content
No title
No text content
No title
No text content
No title
No text content
No title
No text content
culture
people
No text content
No title
No text content
No title
No text content
No title
No text content
No title
No text content
No text content
No text content
No title
No text content
No text content
kenya
UN
No title
No title
No title
No text content
No text content
No text content
No text content
No text content
No tex

### Make text corpus

In [70]:
sent_ending_punkset = ['።', '፨', '፠', '፧', '?']

def sentence_tokenize(doc):
    """
    Splits the document into sentences using end punctuation.

    :param doc: to split
    :return: the list of sentence strings from the document
    """
    tokens = []
    doc = ' '.join(doc.split())

    curr_sent = ""
    for c in doc:
        curr_sent += c
        if c in sent_ending_punkset:
            tokens.append(curr_sent)
            curr_sent = ""

    if curr_sent:
        tokens.append(curr_sent)

    return tokens

In [49]:
text_corpus_lines = []

for path in article_text_paths:
    with open(path, 'r') as f:
        #article_lines = [line for line in f.read().splitlines()]
        lines = [line for line in f.read().splitlines() if line]
        text_corpus_lines.extend(lines)
    #break
    
print("No. lines", len(text_corpus_lines))

In [69]:
text_corpus_sentences = []

for line in text_corpus_lines:
    text_corpus_sentences.extend(sentence_tokenize(line))

print("No. sentences", len(text_corpus_sentences))

No. sentences 49075


In [73]:
#Write to file
output_file = 'corpora/voanews_sentences.txt'
with open(output_file, 'w') as f:
    f.writelines("%s\n" % sent for sent in text_corpus_sentences)