In [7]:
from datetime import date
from bs4 import BeautifulSoup as soup
import requests
import tqdm.notebook as tq
import random
import calendar
from datetime import datetime
from time import sleep


scraper methods for cna and ap news

In [8]:
def scrape_cna(): 
    """
    Scrape Channel News Asia Topic Terrorism
    """   
    # Base URL
    cna_url="https://www.channelnewsasia.com/topic/terrorism"
    cna_base_url = 'https://www.channelnewsasia.com'

    # Get webpage
    html = requests.get(cna_url)

    # Initialise bs object
    bsobj = soup(html.content,'lxml')

    # Find all headers
    headers = bsobj.findAll("h6")

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(enumerate(headers), total=len(headers), 
                        leave=True, colour=random_color, unit='article')

    articles = []

    for _, header in loop:
        item = {
            'headline': header.text
        }

        # Enter news article
        news_link = cna_base_url + header.a['href']
        news_html = requests.get(news_link)

        # Traverse news article for main body
        article = soup(news_html.content,'lxml')
        content = article.find('div', {'class': 'content'})
        if content:
            content_wrapper = content.find_all('div', {'class': 'content-wrapper'})

            article_text = []
            for content in content_wrapper:
                # Extract text
                main_text = content.find('div', {'class': 'text'})
                if main_text:
                    article_text.append(main_text.get_text())
        
            item['text'] = article_text
            articles.append(item)
        
        # Get timestamp
        timestamp_element = article.find('div', {'class': 'article-publish'})
        if timestamp_element:
            timestamp = timestamp_element.get_text(separator='|', strip=True).split('|')[0]
            timestamp_obj = datetime.strptime(timestamp, "%d %b %Y %I:%M%p")
            item['timestamp_published'] = str(calendar.timegm(timestamp_obj.timetuple()))
        
        loop.set_postfix(Processing=header.text)
        sleep(2)

    return articles

def format_articles(articles):
    """
    Format scraped data
    """

    formatted_articles = []
    
    # Format article
    for article in articles:
        if 'text' not in article:
            continue
        
        article = {
            'title': article['headline'],
            'body': "\n".join(article['text']),
            'timestamp_published': article['timestamp_published'] if 'timestamp_published' in article else 'null'
        }

        formatted_articles.append(article)

    return formatted_articles

def scrape_apnews():
    """
    Scrape AP News Topic Terrorism
    """

    # Base URL
    cna_url="https://apnews.com/hub/terrorism"

    # Get webpage
    html = requests.get(cna_url)

    # Initialise bs object
    bsobj = soup(html.content,'lxml')

    # Find all article item in first section
    items = bsobj.findAll("h3", {'class':'PagePromo-title'})

    # initialize the progress bar
    # select a random color
    colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'steelblue']
    random_color = random.choice(colors)
    loop = tq.tqdm(enumerate(items), total=len(items), 
                        leave=True, colour=random_color, unit='article')
    
    articles = []

    for _, header in loop:
        item = {
            'headline': header.text
        }

        # Enter news article
        news_link = header.a['href']
        news_html = requests.get(news_link)

        # Traverse news article for main body
        article = soup(news_html.content,'lxml')
        content = article.find('div', {'class': 'RichTextStoryBody'})
        if content:
            article_texts = content.find_all('p')
            article_text = []
            for text in article_texts:
                article_text.append(text.text)
            item['text'] = article_text

        timestamp_element = article.find('bsp-timestamp')
        if timestamp_element:
            item['timestamp_published'] = timestamp_element['data-timestamp']

        articles.append(item)
        
        # Update progress description
        loop.set_postfix(Processing=header.text)
        sleep(5)
    
    return articles

method call to scrape articles

In [9]:
articles = scrape_cna()
formatted_articles_cna = format_articles(articles)



  0%|          | 0/15 [00:00<?, ?article/s]

In [10]:
articles = scrape_apnews()
formatted_articles_apnews = format_articles(articles)

  0%|          | 0/39 [00:00<?, ?article/s]

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [5]:
print(formatted_articles_apnews[2].keys())
print(formatted_articles_cna[12])

dict_keys(['title', 'body', 'timestamp_published'])


storing scraped articles to mongo

In [None]:
import os
import hashlib
from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()

mongo_password = os.getenv("MONGO_PASSWORD")
mongo_uri = f"mongodb+srv://tristantanjh:{mongo_password}@cluster0.igmtl9j.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# MongoDB connection setup
client = MongoClient(mongo_uri)
db = client.articles_db
articles_collection = db.articles

def generate_article_id(title, date_published):
    # Create a unique identifier using title and date
    unique_string = title + date_published
    return hashlib.md5(unique_string.encode()).hexdigest()

def store_article_in_mongo(article):
    # Check if the article already exists
    article_title_stripped = article['title'].strip()
    article_id = generate_article_id(article_title_stripped, article['timestamp_published'])
    existing_article = articles_collection.find_one({'title': article_title_stripped})
    if existing_article is not None:
        print(f"Article already exists: {article['title']}")
        return False

    # Insert new article into MongoDB
    articles_collection.insert_one({
        '_id': article_id,
        'title': article_title_stripped,
        'timestamp_published': article['timestamp_published']
    })
    print(f"Stored new article: {article['title']}")
    return True

schedule scraper

In [None]:
import schedule
import time
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from langchain_core.documents import Document

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
neo4j_uri = os.getenv("NEO4J_URI")
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")

os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["NEO4J_URI"] = neo4j_uri
os.environ["NEO4J_USERNAME"] = neo4j_username
os.environ["NEO4J_PASSWORD"] = neo4j_password
    
graph = Neo4jGraph()

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

llm_transformer = LLMGraphTransformer(llm=llm)

def process_and_store_text(clean_text, llm_transformer, graph):
    text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
    texts = text_splitter.split_text(clean_text)
    documents = [Document(page_content=text) for text in texts]
    graph_documents = llm_transformer.convert_to_graph_documents(documents)
    graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

# Define the job to be scheduled
def scheduled_job():
    unique_articles = []
    
    print("Scraping AP News and storing articles...")
    articles_ap = scrape_apnews()
    formatted_articles_apnews = format_articles(articles_ap)
    for article in formatted_articles_apnews:
        article_stored = store_article_in_mongo(article)
        if article_stored:
            unique_articles.append(article)
        
    print("Scraping CNA and storing articles...")
    articles_cna = scrape_cna()
    formatted_articles_cna = format_articles(articles_cna)
    for article in formatted_articles_cna:
        article_stored = store_article_in_mongo(article)
        if article_stored:
            unique_articles.append(article)
            
    for unique_article in unique_articles:
        process_and_store_text(unique_article['body'], llm_transformer, graph)

# Schedule the job to run every minute
schedule.every().monday.at("09:00").do(scheduled_job)

# Infinite loop to keep the script running
while True:
    schedule.run_pending()
    time.sleep(1)  # Sleep for 1 second to avoid high CPU usage