In [34]:
from datetime import date
from bs4 import BeautifulSoup as soup
import requests
import tqdm.notebook as tq
import random
import calendar
from datetime import datetime

In [44]:
def scrape_cna():
    """
    Scrape Channel News Asia Topic Terrorism

    Downloads the topic listing page, follows the link inside every ``<h6>``
    header and extracts the article body and publication time.

    Returns:
        list[dict]: one entry per article page that contains a
        ``div.content`` element. Each entry has ``headline`` (raw header
        text), ``text`` (list of body strings) and, when the publish line
        could be parsed, ``timestamp_published`` (epoch seconds as a string).

    Raises:
        requests.RequestException: if the topic listing page itself cannot
            be fetched. Failures on individual articles are reported and
            skipped instead.
    """
    from urllib.parse import urljoin

    # Base URL
    cna_url = "https://www.channelnewsasia.com/topic/terrorism"
    cna_base_url = 'https://www.channelnewsasia.com'
    # Without a timeout a stalled connection would block the scrape forever
    request_timeout = 30

    # Get webpage
    html = requests.get(cna_url, timeout=request_timeout)

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all headers
    headers = bsobj.findAll("h6")

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(headers, total=len(headers),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        loop.set_postfix(Processing=header.text)

        # Some <h6> elements may not wrap a link; nothing to follow then
        link = header.a
        if link is None or not link.get('href'):
            continue

        item = {
            'headline': header.text
        }

        # Enter news article (urljoin copes with relative and absolute hrefs)
        news_link = urljoin(cna_base_url, link['href'])
        try:
            news_html = requests.get(news_link, timeout=request_timeout)
        except requests.RequestException as exc:
            # One unreachable article should not discard the rest of the batch
            print(f"Skipping {news_link}: {exc!r}")
            continue

        # Traverse news article for main body
        article = soup(news_html.content, 'lxml')
        content = article.find('div', {'class': 'content'})
        if content is None:
            # Pages without a main content container are not collected
            continue

        article_text = []
        for wrapper in content.find_all('div', {'class': 'content-wrapper'}):
            # Extract text
            main_text = wrapper.find('div', {'class': 'text'})
            if main_text is not None:
                article_text.append(main_text.get_text())

        item['text'] = article_text

        # Get timestamp
        timestamp_element = article.find('div', {'class': 'article-publish'})
        if timestamp_element is not None:
            timestamp = timestamp_element.get_text(separator='|', strip=True).split('|')[0]
            try:
                timestamp_obj = datetime.strptime(timestamp, "%d %b %Y %I:%M%p")
            except ValueError:
                # Unexpected publish-line layout: leave the key absent so
                # format_articles() applies its 'null' fallback.
                pass
            else:
                # NOTE(review): timegm() treats the parsed value as UTC; the
                # page presumably shows local (Singapore) time -- confirm.
                item['timestamp_published'] = str(calendar.timegm(timestamp_obj.timetuple()))

        articles.append(item)

    return articles

def format_articles(articles):
    """
    Format scraped data

    Converts scraper items into ``title`` / ``body`` / ``timestamp_published``
    records. Items without a ``text`` key are dropped; a missing timestamp is
    represented by the string ``'null'``.
    """

    def _to_record(scraped):
        # Body paragraphs are joined with newlines into a single string
        return {
            'title': scraped['headline'],
            'body': "\n".join(scraped['text']),
            'timestamp_published': scraped.get('timestamp_published', 'null'),
        }

    return [_to_record(scraped) for scraped in articles if 'text' in scraped]

def scrape_apnews():
    """
    Scrape AP News Topic Terrorism

    Downloads the hub page, follows the link inside every
    ``h3.PagePromo-title`` element and extracts the story paragraphs and the
    publication timestamp.

    Returns:
        list[dict]: one entry per followed headline with ``headline`` (raw
        header text) and, when found on the article page, ``text`` (list of
        paragraph strings) and ``timestamp_published`` (the value of the
        ``data-timestamp`` attribute of ``<bsp-timestamp>``).

    Raises:
        requests.RequestException: if the hub page itself cannot be fetched.
            Failures on individual articles are reported and skipped instead.
    """
    from urllib.parse import urljoin

    # Base URL
    apnews_url = "https://apnews.com/hub/terrorism"
    # Without a timeout a stalled connection would block the scrape forever
    request_timeout = 30

    # Get webpage
    html = requests.get(apnews_url, timeout=request_timeout)

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all article item in first section
    items = bsobj.findAll("h3", {'class': 'PagePromo-title'})

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(items, total=len(items),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        # Update progress description
        loop.set_postfix(Processing=header.text)

        # A promo title without a link cannot be followed
        link = header.a
        if link is None or not link.get('href'):
            continue

        item = {
            'headline': header.text
        }

        # Enter news article (absolute hrefs pass through urljoin unchanged)
        news_link = urljoin(apnews_url, link['href'])
        try:
            news_html = requests.get(news_link, timeout=request_timeout)
        except requests.RequestException as exc:
            # One unreachable article should not discard the rest of the batch
            print(f"Skipping {news_link}: {exc!r}")
            continue

        # Traverse news article for main body
        article = soup(news_html.content, 'lxml')
        content = article.find('div', {'class': 'RichTextStoryBody'})
        if content is not None:
            item['text'] = [paragraph.text for paragraph in content.find_all('p')]

        timestamp_element = article.find('bsp-timestamp')
        if timestamp_element is not None:
            # NOTE(review): unit of data-timestamp is not visible here
            # (presumably epoch milliseconds) -- confirm it matches scrape_cna.
            timestamp_value = timestamp_element.get('data-timestamp')
            if timestamp_value is not None:
                item['timestamp_published'] = timestamp_value

        # Items without 'text' are still returned; format_articles() drops them
        articles.append(item)

    return articles

In [14]:
# Run the AP News scraper once and convert the raw items into
# title/body/timestamp records for inspection in the next cell.
articles = scrape_apnews()
formatted_articles_apnews = format_articles(articles)

  0%|          | 0/39 [00:00<?, ?article/s]

In [21]:
# Spot-check the keys of one formatted AP News record
# (index 2 requires at least three formatted articles).
print(formatted_articles_apnews[2].keys())

dict_keys(['title', 'body', 'timestamp_published'])

Judge denies bail to teen charged with terror-related offenses after stabbings at Sydney church



In [43]:
# Run the CNA scraper once and convert the raw items into
# title/body/timestamp records. Note: this rebinds the global `articles`.
articles = scrape_cna()
formatted_articles_cna = format_articles(articles)

  0%|          | 0/15 [00:00<?, ?article/s]

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


KeyboardInterrupt: 

In [36]:
# Spot-check one formatted CNA record. Depends on `formatted_articles_cna`
# from the CNA scrape cell and on it holding at least 13 entries.
print(formatted_articles_cna[12])



In [31]:
import os
import hashlib
from dotenv import load_dotenv
from pymongo import MongoClient

# Load variables from a local .env file into the process environment
load_dotenv()

mongo_password = os.getenv("MONGO_PASSWORD")
if not mongo_password:
    # Fail fast: otherwise the f-string below would embed the literal text
    # "None" as the password in the connection URI.
    raise RuntimeError(
        "MONGO_PASSWORD is not set; define it in the environment or in a .env file"
    )

# NOTE(review): the password is interpolated verbatim. PyMongo expects
# reserved characters in URI credentials to be percent-escaped -- confirm the
# stored value is already escaped (or contains no such characters).
mongo_uri = f"mongodb+srv://tristantanjh:{mongo_password}@cluster0.igmtl9j.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# MongoDB connection setup
client = MongoClient(mongo_uri)
db = client.articles_db
articles_collection = db.articles

def generate_article_id(title, date_published):
    """
    Build an identifier for an article from its title and publication date.

    Args:
        title (str): article title.
        date_published (str): publication timestamp as a string.

    Returns:
        str: hex MD5 digest of ``title`` immediately followed by
        ``date_published`` (no separator), encoded with the default codec.
    """
    hasher = hashlib.md5()
    hasher.update((title + date_published).encode())
    return hasher.hexdigest()

def store_article_in_mongo(article):
    """
    Insert an article into MongoDB unless one with the same title is stored.

    Args:
        article (dict): must contain ``title``. ``timestamp_published`` is
            optional and falls back to the string ``'null'``, the same
            placeholder format_articles() uses for a missing timestamp.

    Returns:
        bool: True when a new document was inserted, False when a document
        with the same (stripped) title already exists.
    """
    article_title_stripped = article['title'].strip()
    # A missing timestamp used to raise KeyError here; use the placeholder
    timestamp_published = article.get('timestamp_published', 'null')

    # Check if the article already exists (duplicates are detected by title)
    existing_article = articles_collection.find_one({'title': article_title_stripped})
    if existing_article is not None:
        print(f"Article already exists: {article['title']}")
        return False

    # Only compute the id once we know an insert is needed
    article_id = generate_article_id(article_title_stripped, timestamp_published)

    # Insert new article into MongoDB
    # NOTE(review): the article body is not persisted -- confirm that storing
    # only title and timestamp is intentional.
    articles_collection.insert_one({
        '_id': article_id,
        'title': article_title_stripped,
        'timestamp_published': timestamp_published
    })
    print(f"Stored new article: {article['title']}")
    return True

Scraping CNA and storing articles...


  0%|          | 0/15 [00:00<?, ?article/s]

KeyError: 'timestamp_published'

In [45]:
import schedule
import time

# Define the job to be scheduled
def scheduled_job():
    """
    Scrape every configured news source and store the formatted articles.

    Each source is processed independently: an exception raised while
    scraping or storing one source is reported and does not prevent the
    remaining sources from running, nor does it propagate to the scheduler.
    """
    sources = (
        ("AP News", scrape_apnews),
        ("CNA", scrape_cna),
    )

    for source_name, scrape in sources:
        print(f"Scraping {source_name} and storing articles...")
        try:
            for article in format_articles(scrape()):
                store_article_in_mongo(article)
        except Exception as exc:
            # Deliberately broad: a transient network/database failure in one
            # source must not abort the other source or end the polling loop.
            print(f"Scraping {source_name} failed: {exc!r}")

# Schedule the job to run every minute.
# Jobs live on the module-level default scheduler, so re-running this cell
# would otherwise register one more copy of the job each time. Tag the job
# and clear that tag first to keep the cell idempotent.
SCRAPE_JOB_TAG = "news-scrape"
schedule.clear(SCRAPE_JOB_TAG)
schedule.every().minute.do(scheduled_job).tag(SCRAPE_JOB_TAG)

# Infinite loop to keep the script running (interrupt the kernel to stop)
while True:
    schedule.run_pending()
    time.sleep(1)  # Sleep for 1 second to avoid high CPU usage

Scraping AP News and storing articles...


  0%|          | 0/39 [00:00<?, ?article/s]

Article already exists: 
Russian theater director and playwright go on trial over a play authorities say justifies terrorism

Article already exists: 
Man kills 2 officers at police station in Malaysia in a suspected Jemaah Islamiyah attack

Article already exists: 
Judge denies bail to teen charged with terror-related offenses after stabbings at Sydney church

Article already exists: 
Saudi Arabia confirms a fitness influencer received an 11-year sentence over ‘terrorist offenses’

Article already exists: 
News anchor Poppy Harlow is leaving CNN after more than 15 years

Article already exists: 
Muslim groups claim double standards in police handling of two high-profile stabbings in Sydney

Article already exists: 
Dutch intelligence sees the wars in Gaza and Ukraine as triggers for terrorist threats

Article already exists: 
Teenager is charged with terrorism offenses in stabbings of bishop and priest at Sydney church

Article already exists: 
Russian forces kill 2 suspected of plott

  0%|          | 0/15 [00:00<?, ?article/s]

Stored new article: 

      Ulu Tiram attack: Suspect’s family held radical beliefs, remanded further, says police chief 
  

Stored new article: 

      Heightened security as man tries to snatch Penang police officer’s gun, days after fatal Ulu Tiram attack
  

Stored new article: 

      Johor police station attack: Suspect’s neighbours say no inkling of his family's suspected JI links
  

Stored new article: 

      Commentary: Why is Sydney church stabbing an act of terrorism, but not the Bondi attack?
  

Stored new article: 

      Commentary: Putin’s conspiracy theories make Russians less safe
  

Stored new article: 

      Russia mourns victims of deadly concert hall attack
  

Stored new article: 

      What is ISIS-K, the group that attacked a Moscow concert hall?
  

Stored new article: 

      Death and fear stoked hell in the Moscow concert attack, witnesses say 
  

Stored new article: 

      Putin vows to punish those behind Moscow concert massacre; Kremlin says 4 su

  0%|          | 0/39 [00:00<?, ?article/s]

KeyboardInterrupt: 