In [1]:
from datetime import date
from bs4 import BeautifulSoup as soup
import requests
import tqdm.notebook as tq
import random

In [54]:
def scrape_cna(timeout=30):
    """
    Scrape Channel News Asia Topic Terrorism

    Fetches the topic listing page, follows the link found in every <h6>
    header and collects the text blocks of each linked article.

    Args:
        timeout: seconds handed to every ``requests.get`` call so that a
            stalled connection raises instead of blocking the notebook.

    Returns:
        list of dicts with the keys 'headline' (header text as it appears on
        the listing page) and 'text' (list of strings, one per text block
        found in the article). Headers without a link and articles without a
        'content' div are left out.

    Raises:
        requests.exceptions.RequestException: when a request cannot be
            completed (connection error or timeout).
    """
    # Local import so this cell stays runnable without editing the import cell
    from urllib.parse import urljoin

    # Base URL
    cna_url = "https://www.channelnewsasia.com/topic/terrorism"
    cna_base_url = 'https://www.channelnewsasia.com'

    # Get webpage
    html = requests.get(cna_url, timeout=timeout)

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all headers
    headers = bsobj.find_all("h6")

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(headers, total=len(headers),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        # Show which headline is being worked on while its page downloads
        loop.set_postfix(Processing=header.text)

        # Not every <h6> is guaranteed to wrap a link; skip those that do not
        anchor = header.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        # Enter news article (urljoin copes with relative and absolute hrefs)
        news_link = urljoin(cna_base_url, href)
        news_html = requests.get(news_link, timeout=timeout)

        # Traverse news article for main body
        article = soup(news_html.content, 'lxml')
        content = article.find('div', {'class': 'content'})
        if content is None:
            continue

        article_text = []
        for wrapper in content.find_all('div', {'class': 'content-wrapper'}):
            # Extract text
            main_text = wrapper.find('div', {'class': 'text'})
            if main_text:
                article_text.append(main_text.get_text())

        articles.append({
            'headline': header.text,
            'text': article_text,
        })

    return articles

def format_articles(articles):
    """
    Format scraped data

    Converts the raw items produced by the scrapers into records with a
    uniform set of keys.

    Args:
        articles: iterable of dicts holding a 'headline' key and, optionally,
            'text' (list of strings) and 'timestamp_published'.

    Returns:
        list of dicts with the keys 'title', 'body' and
        'timestamp_published', one per input item that has a 'text' key, in
        input order. 'body' is the 'text' strings joined by newlines;
        'timestamp_published' is None when the item carries no timestamp.
    """

    formatted_articles = []

    # Format article
    for article in articles:
        # Items whose body was never extracted are dropped
        if 'text' not in article:
            continue

        formatted_articles.append({
            'title': article['headline'],
            'body': "\n".join(article['text']),
            # Not every scraper records a timestamp (scrape_cna never does,
            # scrape_apnews only when the page exposes one), so a plain
            # subscript here would raise KeyError.
            'timestamp_published': article.get('timestamp_published'),
        })

    return formatted_articles

def scrape_apnews(timeout=30):
    """
    Scrape AP News Topic Terrorism

    Fetches the hub page, follows the link found in every
    <h3 class="PagePromo-title"> element and collects the paragraphs and the
    publication timestamp of each linked article.

    Args:
        timeout: seconds handed to every ``requests.get`` call so that a
            stalled connection raises instead of blocking the notebook.

    Returns:
        list of dicts. Every dict has 'headline'; 'text' (list of paragraph
        strings) is present only when a 'RichTextStoryBody' div was found,
        and 'timestamp_published' only when a <bsp-timestamp> element with a
        'data-timestamp' attribute was found. Titles without a link are
        left out.

    Raises:
        requests.exceptions.RequestException: when a request cannot be
            completed (connection error or timeout).
    """
    # Local import so this cell stays runnable without editing the import cell
    from urllib.parse import urljoin

    # Base URL
    apnews_url = "https://apnews.com/hub/terrorism"

    # Get webpage
    html = requests.get(apnews_url, timeout=timeout)

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all article item in first section
    items = bsobj.find_all("h3", {'class': 'PagePromo-title'})

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(items, total=len(items),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        # Update progress description
        loop.set_postfix(Processing=header.text)

        # A title without an anchor cannot be followed; skip it
        anchor = header.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        item = {
            'headline': header.text
        }

        # Enter news article (urljoin leaves absolute hrefs untouched and
        # resolves relative ones against the hub URL)
        news_link = urljoin(apnews_url, href)
        news_html = requests.get(news_link, timeout=timeout)

        # Traverse news article for main body
        article = soup(news_html.content, 'lxml')
        content = article.find('div', {'class': 'RichTextStoryBody'})
        if content:
            item['text'] = [paragraph.text for paragraph in content.find_all('p')]

        # .get avoids a KeyError when the element lacks the attribute
        timestamp_element = article.find('bsp-timestamp')
        timestamp = timestamp_element.get('data-timestamp') if timestamp_element else None
        if timestamp is not None:
            item['timestamp_published'] = timestamp

        articles.append(item)

    return articles

In [55]:
# Scrape the AP News terrorism hub, then convert the raw items into
# 'title' / 'body' / 'timestamp_published' records.
# NOTE: `articles` is a scratch name that the CNA cell below reassigns.
articles = scrape_apnews()
formatted_articles_apnews = format_articles(articles)

  0%|          | 0/39 [00:00<?, ?article/s]

In [58]:
# Sanity check: list the fields of one formatted AP News record (index 2,
# so at least three records must have been kept).
print(formatted_articles_apnews[2].keys())

dict_keys(['title', 'body', 'timestamp_published'])


In [19]:
# Scrape the Channel News Asia terrorism topic page and format the result.
# NOTE: this overwrites the `articles` variable set by the AP News cell.
articles = scrape_cna()
formatted_articles_cna = format_articles(articles)


  0%|          | 0/15 [00:00<?, ?article/s]

In [12]:
# Spot-check a single formatted CNA record (index 12, so at least thirteen
# records must have been kept).
print(formatted_articles_cna[12])



      Brussels gunman suspect was known to police but not on watchlist
  


BRUSSELS: A man suspected of shooting dead two Swedish football fans and wounding another in Brussels was a 45-year-old Tunisian who had an asylum application rejected in 2020 but continued to live in Belgium illegally, according to Belgian officials.
After an overnight manhunt, police fatally shot the suspect at a cafe in the Schaerbeek district of northern Brussels on Tuesday morning (Oct 17), a day after his deadly attack, which prosecutors are treating as an act of terrorism.
Authorities said initial indications were that the suspect, who they have not named, was working as a lone wolf, rather than as part of a broad network.
In a video claiming responsibility for the attack, he said he was a member of the Islamic State militant group and gave his name as Abdesalem Al Guilani. Belgian state broadcaster RTBF named him as Abdesalem Lassoued.
At the yellow-brick apartment block in Schaerbeek overlooking a sm