In [3]:
import random
from datetime import date
from urllib.parse import quote_plus, urljoin

import requests
import tqdm.notebook as tq
from bs4 import BeautifulSoup as soup

In [1]:
def scrape_cna(timeout=30):
    """
    Scrape Channel News Asia Topic Terrorism

    Downloads the topic listing page, follows the link found inside every
    ``<h6>`` element and collects the body text of each linked article.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout a stalled
        server would block the scrape indefinitely.

    Returns
    -------
    list of dict
        One ``{'headline': str, 'text': list of str}`` entry per article
        page that contains a ``div`` with class ``content``. ``text`` holds
        the text of the ``div.text`` element of each ``div.content-wrapper``.
        Headers without a link and pages without the content ``div`` are
        skipped.

    Raises
    ------
    requests.RequestException
        If any HTTP request fails or exceeds ``timeout``.
    """
    # Base URL
    cna_url = "https://www.channelnewsasia.com/topic/terrorism"
    cna_base_url = 'https://www.channelnewsasia.com'

    # Get webpage
    html = requests.get(cna_url, timeout=timeout)

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all headers
    headers = bsobj.find_all("h6")

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(headers, total=len(headers),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        # Show the headline currently being processed
        loop.set_postfix(Processing=header.text)

        # A header without a linked anchor cannot be followed; skip it
        # instead of failing on a missing tag or href attribute.
        link = header.find('a', href=True)
        if link is None:
            continue

        # Enter news article (urljoin copes with relative and absolute hrefs)
        news_link = urljoin(cna_base_url, link['href'])
        news_html = requests.get(news_link, timeout=timeout)

        # Traverse news article for main body
        article = soup(news_html.content, 'lxml')
        content = article.find('div', {'class': 'content'})
        if not content:
            continue

        article_text = []
        for wrapper in content.find_all('div', {'class': 'content-wrapper'}):
            # Extract text
            main_text = wrapper.find('div', {'class': 'text'})
            if main_text:
                article_text.append(main_text.get_text())

        articles.append({
            'headline': header.text,
            'text': article_text
        })

    return articles

def format_articles(articles):
    """
    Format scraped data

    Each record that has a 'text' entry becomes a single string: the
    headline on the first line, followed by the text items joined with
    newlines. Records without a 'text' key are dropped.

    Returns the list of formatted strings, in input order.
    """

    def render(record):
        # Join the body first, then prepend the headline on its own line
        paragraphs = "\n".join(record['text'])
        return "\n".join((record['headline'], paragraphs))

    return [render(record) for record in articles if 'text' in record]

def scrape_apnews(timeout=30):
    """
    Scrape AP News Topic Terrorism

    Downloads the hub page, follows the link inside every
    ``<h3 class="PagePromo-title">`` element and collects the paragraphs of
    each linked story.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout a stalled
        server would block the scrape indefinitely.

    Returns
    -------
    list of dict
        One entry per headline, always holding ``'headline'``. ``'text'``
        (a list with the text of every ``<p>`` inside the
        ``div.RichTextStoryBody`` element) is only present when the headline
        has a link and the linked page contains that ``div``.

    Raises
    ------
    requests.RequestException
        If any HTTP request fails or exceeds ``timeout``.
    """

    # Base URL
    apnews_url = "https://apnews.com/hub/terrorism"

    # Get webpage
    html = requests.get(apnews_url, timeout=timeout)

    # Initialise bs object
    bsobj = soup(html.content, 'lxml')

    # Find all article item in first section
    items = bsobj.find_all("h3", {'class': 'PagePromo-title'})

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(items, total=len(items),
                   leave=True, colour=random_color, unit='article')

    articles = []

    for header in loop:
        # Update progress description with the headline being processed
        loop.set_postfix(Processing=header.text)

        item = {
            'headline': header.text
        }

        # A title without a linked anchor is kept headline-only instead of
        # failing on a missing tag or href attribute.
        link = header.find('a', href=True)
        if link is not None:
            # Enter news article (urljoin leaves absolute hrefs untouched
            # and resolves relative ones against the hub URL)
            news_link = urljoin(apnews_url, link['href'])
            news_html = requests.get(news_link, timeout=timeout)

            # Traverse news article for main body
            article = soup(news_html.content, 'lxml')
            content = article.find('div', {'class': 'RichTextStoryBody'})
            if content:
                item['text'] = [paragraph.text for paragraph in content.find_all('p')]

        articles.append(item)

    return articles
        
    

In [4]:
# Scrape the AP News hub, then flatten every record that has body text into
# one "headline + body" string.
# NOTE: the name `articles` is reassigned by the CNA cell further down.
articles = scrape_apnews()
formatted_articles_apnews = format_articles(articles)

  0%|          | 0/39 [00:00<?, ?article/s]

In [None]:
# Spot-check the third formatted AP News article.
# Raises IndexError if fewer than three articles were formatted.
print(formatted_articles_apnews[2])

In [8]:
# Scrape the CNA topic page, then flatten every record into one
# "headline + body" string.
# NOTE: this overwrites the `articles` value produced by the AP News cell.
articles = scrape_cna()
formatted_articles_cna = format_articles(articles)


  0%|          | 0/15 [00:00<?, ?article/s]

In [None]:
# Spot-check the thirteenth formatted CNA article.
# Raises IndexError if fewer than thirteen articles were formatted.
print(formatted_articles_cna[12])

In [11]:
import os
import hashlib
from dotenv import load_dotenv
from pymongo import MongoClient

# Pull variables from a local .env file into the process environment
load_dotenv()

mongo_password = os.getenv("MONGO_PASSWORD")
if not mongo_password:
    # Fail here with a clear message rather than later with an opaque
    # authentication error on the first database operation.
    raise RuntimeError("MONGO_PASSWORD is not set; define it in the environment or in .env")

# MongoDB connection setup
# The URI must be an f-string, otherwise the literal text "{mongo_password}"
# is sent as the password. quote_plus percent-escapes characters such as
# '@', ':' or '/' that would otherwise corrupt the URI.
client = MongoClient(
    f'mongodb+srv://tristantanjh:{quote_plus(mongo_password)}'
    '@cluster0.igmtl9j.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0'
)
db = client.articles_db
articles_collection = db.articles

def generate_article_id(title, date_published):
    """
    Build a deterministic article identifier.

    The title and publication date strings are concatenated (no separator),
    encoded with the default ``str.encode`` encoding and hashed with MD5.

    Returns the 32-character hexadecimal digest.
    """
    hasher = hashlib.md5()
    hasher.update((title + date_published).encode())
    return hasher.hexdigest()

def store_article_in_mongo(article):
    """
    Insert an article into ``articles_collection`` unless one with the same
    title is already stored.

    Parameters
    ----------
    article : mapping
        Must provide the keys 'title' and 'timestamp_published'.

    Returns
    -------
    bool
        True when a new document was inserted, False when a document with
        the same title already existed.
    """
    # The ID is derived up front, so a missing key fails before any query
    article_id = generate_article_id(article['title'], article['timestamp_published'])

    # NOTE(review): duplicates are detected by title only, not by the
    # computed _id, so the same title with a different timestamp is treated
    # as already stored. Confirm this is the intended policy.
    duplicate = articles_collection.find_one({'title': article['title']})
    if duplicate is None:
        # Insert new article into MongoDB
        document = {
            '_id': article_id,
            'title': article['title'],
            'timestamp_published': article['timestamp_published']
        }
        articles_collection.insert_one(document)
        print(f"Stored new article: {article['title']}")
        return True

    print(f"Article already exists: {article['title']}")
    return False
