In [1]:
import random
from datetime import date
from urllib.parse import urljoin

import requests
import tqdm.notebook as tq
from bs4 import BeautifulSoup as soup

In [31]:
def scrape_cna(timeout=10):
    """
    Scrape Channel News Asia Topic Terrorism

    Fetches the topic listing page, follows the link inside every <h6>
    heading and collects the text blocks of each linked article.

    Parameters:
        timeout: seconds to wait on each HTTP request before giving up
            (default 10). Without a timeout a stalled server would hang
            the notebook cell indefinitely.

    Returns:
        A list of dicts, one per article whose 'content' div was found:
            'headline' - heading text with surrounding whitespace stripped
            'text'     - list of strings, one per 'text' div found inside a
                         'content-wrapper' div (may be empty)
        Headings without a link and articles without a 'content' div are
        omitted.

    Raises:
        requests.HTTPError: if the topic listing page returns an error status.
        requests.RequestException: on connection failures or timeouts.
    """
    # Base URL
    cna_url = "https://www.channelnewsasia.com/topic/terrorism"
    cna_base_url = 'https://www.channelnewsasia.com'

    # Get webpage; fail loudly instead of silently parsing an error page
    html = requests.get(cna_url, timeout=timeout)
    html.raise_for_status()

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all headers
    headers = bsobj.find_all("h6")

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(headers, total=len(headers),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        # Headings are padded with newlines/indentation in the page markup
        headline = header.get_text().strip()

        # Update progress description
        loop.set_postfix(Processing=headline)

        # Not every <h6> wraps a link; skip those instead of crashing
        anchor = header.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        # Enter news article (urljoin copes with both relative and absolute hrefs)
        news_link = urljoin(cna_base_url, href)
        news_html = requests.get(news_link, timeout=timeout)

        # Traverse news article for main body
        article = soup(news_html.content, 'lxml')
        content = article.find('div', {'class': 'content'})
        if content is None:
            continue

        article_text = []
        for wrapper in content.find_all('div', {'class': 'content-wrapper'}):
            # Extract text
            main_text = wrapper.find('div', {'class': 'text'})
            if main_text is not None:
                article_text.append(main_text.get_text())

        articles.append({'headline': headline, 'text': article_text})

    return articles

def format_articles(articles):
    """
    Format scraped data

    Turns each scraped article dict into a single string: the headline on
    the first line followed by the body text blocks, all joined by newlines.
    Dicts that have no 'text' key are left out of the result.
    """

    def _render(entry):
        # Body first, then headline + body, each joined with a newline
        joined_body = "\n".join(entry['text'])
        return "\n".join((entry['headline'], joined_body))

    return [_render(entry) for entry in articles if 'text' in entry]

def scrape_apnews(timeout=10):
    """
    Scrape AP News Topic Terrorism

    Fetches the hub page, follows the link inside every
    <h3 class="PagePromo-title"> heading and collects the paragraphs of each
    linked story.

    Parameters:
        timeout: seconds to wait on each HTTP request before giving up
            (default 10). Without a timeout a stalled server would hang
            the notebook cell indefinitely.

    Returns:
        A list of dicts, one per heading found on the hub page:
            'headline' - heading text with surrounding whitespace stripped
            'text'     - list of paragraph strings from the story's
                         'RichTextStoryBody' div; this key is absent when
                         the heading has no link or the story body div is
                         not found

    Raises:
        requests.HTTPError: if the hub page returns an error status.
        requests.RequestException: on connection failures or timeouts.
    """

    # Base URL
    apnews_url = "https://apnews.com/hub/terrorism"

    # Get webpage; fail loudly instead of silently parsing an error page
    html = requests.get(apnews_url, timeout=timeout)
    html.raise_for_status()

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all article item in first section
    items = bsobj.find_all("h3", {'class': 'PagePromo-title'})

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(items, total=len(items),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        # Headings are padded with whitespace in the page markup
        headline = header.get_text().strip()

        # Update progress description
        loop.set_postfix(Processing=headline)

        item = {'headline': headline}

        # A heading may lack a link; keep the headline but skip the fetch
        anchor = header.a
        href = anchor.get('href') if anchor is not None else None
        if href:
            # Enter news article (urljoin also resolves relative hrefs)
            news_link = urljoin(apnews_url, href)
            news_html = requests.get(news_link, timeout=timeout)

            # Traverse news article for main body
            article = soup(news_html.content, 'lxml')
            content = article.find('div', {'class': 'RichTextStoryBody'})
            if content is not None:
                item['text'] = [para.get_text() for para in content.find_all('p')]

        articles.append(item)

    return articles
        
    

In [29]:
# Scrape the AP News terrorism hub, then flatten every article that has body
# text into one newline-joined string.
# NOTE(review): the global `articles` is also assigned by the CNA scraping
# cell, so it holds the result of whichever cell ran last.
articles = scrape_apnews()
formatted_articles_apnews = format_articles(articles)

  0%|          | 0/39 [00:00<?, ?article/s]

KeyError: 'text'

In [36]:
# Spot-check one formatted AP News article (headline followed by body text)
print(formatted_articles_apnews[2])


Judge denies bail to teen charged with terror-related offenses after stabbings at Sydney church

SYDNEY (AP) — A judge denied bail Thursday to a 15-year-old boy alleged to be in a network planning terrorist acts and who claimed to be a friend of another teen accused of stabbing a Sydney bishop last month.
The attack on the bishop triggered an investigation that led to the arrests of six teens, ages 14 to 17, who were charged last week with a range of offenses including conspiring to engage in or planning a terrorist act. All remain in custody.
The 15-year-old boy’s lawyer Ahmed Dib had applied for bail Wednesday in the Parramatta Children’s Court, arguing there were exceptional circumstances that required his client’s release.
But Magistrate James Viney ruled such circumstances did not exist. “There is an unacceptable risk to the protection of the community,” Viney said.
Viney said he found the boy’s alleged threats to stab Jewish or Assyrian people, a predominantly Christian ethnic g

In [26]:
# Scratch check: repeat the steps of scrape_apnews() by hand for the first
# listed headline only, so the extracted paragraphs can be inspected.
# NOTE(review): `cna_url` holds the AP News hub URL here; the name looks like
# a leftover from the CNA scraper.

# Base URL
cna_url="https://apnews.com/hub/terrorism"

# Get webpage
html = requests.get(cna_url)

# Initialise bs object
bsobj = soup(html.content,'lxml')

# Find all article item in first section
items = bsobj.findAll("h3", {'class':'PagePromo-title'})

# Follow the link of the first headline only (raises IndexError if none were found)
news_link = items[0].a['href']
news_html = requests.get(news_link)
article = soup(news_html.content,'lxml')

# NOTE(review): find() returns None when the page has no RichTextStoryBody div,
# in which case the find_all() call below raises AttributeError.
content = article.find('div', {'class': 'RichTextStoryBody'})
article_texts = content.find_all('p')
# Collect the text of every <p> inside the story body
article_text = []
for text in article_texts:
  article_text.append(text.text)

In [27]:
# Display the list of paragraph strings collected for the first AP News article
article_text

['TALLINN, Estonia (AP) — A Russian court on Monday opened the trial of a theater director and a playwright accused of advocating terrorism in a play, the latest step in an unrelenting crackdown on dissent in Russia that has reached new heights since Moscow sent troops into Ukraine. ',
 'Zhenya Berkovich, a prominent independent theater director, and playwright Svetlana Petriychuk have been jailed for over a year. Authorities claim their play “Finist, the Brave Falcon” justifies terrorism, which is a criminal offense in Russia punishable by up to seven years in prison. Berkovich and Petriychuk have both repeatedly rejected the accusations against them. ',
 'Berkovich told the court on Monday that she staged the play in order to prevent terrorism, and Petriychuk echoed her sentiment, saying that she wrote it in order to prevent events like those depicted in the play. ',
 'The women’s lawyers have pointed out at court hearings before the trial that the play was supported by the Russian C

In [19]:
# Scrape the CNA terrorism topic page, then flatten each article into one
# newline-joined string.
# NOTE(review): this overwrites the global `articles` that the AP News cell
# also assigns.
articles = scrape_cna()
formatted_articles_cna = format_articles(articles)


  0%|          | 0/15 [00:00<?, ?article/s]

In [12]:
# Spot-check one formatted CNA article (headline followed by body text)
print(formatted_articles_cna[12])



      Brussels gunman suspect was known to police but not on watchlist
  


BRUSSELS: A man suspected of shooting dead two Swedish football fans and wounding another in Brussels was a 45-year-old Tunisian who had an asylum application rejected in 2020 but continued to live in Belgium illegally, according to Belgian officials.
After an overnight manhunt, police fatally shot the suspect at a cafe in the Schaerbeek district of northern Brussels on Tuesday morning (Oct 17), a day after his deadly attack, which prosecutors are treating as an act of terrorism.
Authorities said initial indications were that the suspect, who they have not named, was working as a lone wolf, rather than as part of a broad network.
In a video claiming responsibility for the attack, he said he was a member of the Islamic State militant group and gave his name as Abdesalem Al Guilani. Belgian state broadcaster RTBF named him as Abdesalem Lassoued.
At the yellow-brick apartment block in Schaerbeek overlooking a sm