# Article Web Scraper

## Introduction
This notebook scrapes the latest articles from Ledger Insights and Blockworks websites and appends the data to a single CSV file named `scraped_articles.csv`. The notebook ensures that the CSV file is not overwritten but appended with new data.


## Import necessary libraries

In [8]:
# automating browser commands
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

# web scraping tool
from bs4 import BeautifulSoup

# general libraries
import csv
import time
import os
import json

## Setup and CSV Handling

#### Load configuration from JSON file

In [9]:
# Set the working directory
os.chdir('C:/Users/sunil/OneDrive/Documents/PythonScripts/article_web_scraper_v2')

# Print the current working directory to verify
print(f"Current working directory: {os.getcwd()}")

# Path to config.json
file_path = 'config.json'

# Check if the file exists
file_exists = os.path.exists(file_path)
print(f"Does the file exist? {file_exists}")

# If the file exists, load the JSON file
if file_exists:
    with open(file_path, 'r') as config_file:
        config = json.load(config_file)
        print("Config loaded successfully:", config)
else:
    print(f"File '{file_path}' not found.")

Current working directory: C:\Users\sunil\OneDrive\Documents\PythonScripts\article_web_scraper_v2
Does the file exist? True
Config loaded successfully: {'webdriver_path': 'C:/Program Files/chromedriver-win64/chromedriver.exe', 'ledger_insights_url': 'https://www.ledgerinsights.com/tokenization/', 'blockworks_url': 'https://blockworks.co/search', 'keywords': ['digital assets', 'digital securities', 'tokenized', 'tokenization', 'bond', 'security', 'asset', 'token'], 'searchword': 'tokeniz', 'num_clicks_li': 20, 'num_clicks_bw': 20, 'csv_file_path': 'scraped_articles.csv'}


#### Extract configuration parameters

In [10]:
webdriver_path = config['webdriver_path']
ledger_insights_url = config['ledger_insights_url']
blockworks_url = config['blockworks_url']
keywords = config['keywords']
num_clicks_li = config['num_clicks_li']
num_clicks_bw = config['num_clicks_bw']
searchword = config['searchword']
csv_file_path = config['csv_file_path']

#### CSV checks

In [11]:
# Function to check if the CSV file exists and contains headers
def csv_file_exists_and_has_headers(file_path, headers):
    if not os.path.isfile(file_path):
        return False
    with open(file_path, 'r', encoding='utf-8') as file:
        first_line = file.readline().strip()
        if first_line != ','.join(headers):
            return False
    return True

# Function to append data to CSV
def append_to_csv(file_path, data, headers):
    file_has_headers = csv_file_exists_and_has_headers(file_path, headers)
    with open(file_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if not file_has_headers:
            writer.writerow(headers)
        writer.writerows(data)

#### Selenium setup

In [12]:
# Setup Selenium WebDriver
service = Service(webdriver_path)
options = Options()
# options.add_argument('--headless')

#### Interacting with cookie consent button

In [13]:
# Function to close cookie consent button
def close_cookie_consent():
    try:
        cookie_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'wt-cli-settings-btn'))
        )
        cookie_button.click()
        time.sleep(2)  # Wait for the consent dialog to close
    except Exception as e:
        print(f"Failed to close cookie consent: {e}")

#### Define the helper function for 'robots.txt'

In [14]:
# Function to fetch and print robots.txt content
def fetch_and_print_robots_txt(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            print(f"robots.txt content for {url}:\n{response.text}")
            rp = RobotFileParser()
            rp.parse(response.text.split('\n'))
            return rp
        else:
            print(f"Failed to fetch robots.txt from {url}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Exception occurred while fetching robots.txt from {url}: {e}")
        return None

# Fetch and print robots.txt for Ledger Insights
ledger_insights_robots_url = "https://www.ledgerinsights.com/robots.txt"
ledger_rp = fetch_and_print_robots_txt(ledger_insights_robots_url)
    
# Fetch and print robots.txt for Blockworks
blockworks_robots_url = "https://blockworks.co/robots.txt"
blockworks_rp = fetch_and_print_robots_txt(blockworks_robots_url)

# Check specific URL access for Blockworks
blockworks_test_url = blockworks_url  # Use the base URL or a specific URL you want to test
if blockworks_rp:
    print(f"Testing URL access for {blockworks_test_url}")
    if blockworks_rp.can_fetch("*", blockworks_test_url):
        print(f"Allowed to scrape: {blockworks_test_url}")
    else:
        print(f"Not allowed to scrape: {blockworks_test_url}")
else:
    print("Failed to initialize RobotFileParser for Blockworks; proceeding with scraping based on assumed allowance.")

robots.txt content for https://www.ledgerinsights.com/robots.txt:

User-Agent: *
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php

User-agent: AhrefsBot
Crawl-Delay: 60

Failed to fetch robots.txt from https://blockworks.co/robots.txt
Failed to initialize RobotFileParser for Blockworks; proceeding with scraping based on assumed allowance.


#### Check to ensure we are allowed to scrape from sites
This is done by assessing and complying with the associated 'robots.txt' files embedded within a site

In [15]:
# Function to scrape Ledger Insights articles
def scrape_ledger_insights():
    urls_to_scrape = [
        ledger_insights_url,
        # Add more URLs as needed
    ]

    for url in urls_to_scrape:
        if ledger_rp.can_fetch("*", url):
            print(f"Allowed to scrape: {url}")
            response = requests.get(url)
            # Continue with your scraping logic (i.e. parsing the page with BeautifulSoup)
        else:
            print(f"Not allowed to scrape: {url}")

# Function to scrape Blockworks articles
def scrape_blockworks():
    urls_to_scrape = [
        blockworks_url,
        # Add more URLs as needed
    ]

    for url in urls_to_scrape:
        # Proceed if robots.txt is not fetched, assuming scraping is allowed
        if blockworks_rp is None or blockworks_rp.can_fetch("*", url):
            print(f"Allowed to scrape: {url}")
            response = requests.get(url)
            # Continue with scraping logic (i.e. parsing the page with BeautifulSoup)
        else:
            print(f"Not allowed to scrape: {url}")

# Run the scraping functions
scrape_ledger_insights()
scrape_blockworks()

Allowed to scrape: https://www.ledgerinsights.com/tokenization/
Allowed to scrape: https://blockworks.co/search


## Ledger Insights Scraper

#### Setup and initializtion

In [17]:
# Initialize WebDriver for Ledger Insights
driver = webdriver.Chrome(service=service, options=options)
driver.get(ledger_insights_url)

# Define the CSV file headers
headers = ['headline', 'link', 'date']

# Read existing headlines to avoid duplicates
existing_headlines = set()
if csv_file_exists_and_has_headers(csv_file_path, headers):
    with open(csv_file_path, 'r', encoding='utf-8') as read_file:
        csv_reader = csv.reader(read_file)
        next(csv_reader)  # Skip header row
        for row in csv_reader:
            if row:
                existing_headlines.add(row[0])
                
# Close cookie consent if it appears
close_cookie_consent()

#### Loading, parsing and scraping articles

In [18]:
# Load articles by clicking the 'Load More' button multiple times
for _ in range(num_clicks_li):
    try:
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "Load more")]'))
        )
        load_more_button.click()
        time.sleep(3)  # Wait for the content to load
    except Exception as e:
        print(f"Exception occurred: {e}")
        break

# Parse the loaded page content with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')

# Use a set to store seen links to avoid duplication
seen_links = set()

# Loop through each link that might contain a headline
ledger_data = []
for link in soup.find_all('a', title=True):
    headline = link.get('title').strip()
    url = link.get('href')

    # Check if the link is already processed or the headline already exists in the CSV file
    if url in seen_links or headline in existing_headlines:
        continue

    # Check if any of the keywords are in the headline
    if any(keyword in headline.lower() for keyword in keywords):
        # Mark this link as seen
        seen_links.add(url)

        # Open the article in a new tab to fetch the date
        driver.execute_script("window.open(arguments[0], '_blank');", url)
        driver.switch_to.window(driver.window_handles[1])
        
        try:
            # Wait for the date element to be present
            date_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'updated'))
            )
            date_text = date_element.text
        except Exception as e:
            print(f"Error fetching date for article {headline}: {e}")
            date_text = "Unknown"
        
        # Close the new tab and switch back to the main tab
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        
        # Print the scraped data to the console
        print(f"Headline: {headline}")
        print(f"Link: {url}")
        print(f"Date: {date_text}")
        
        # Add the data to ledger_data
        ledger_data.append([headline, url, date_text])

# Close the WebDriver
driver.quit()

# Append Ledger Insights data to CSV
append_to_csv(csv_file_path, ledger_data, headers)

Headline: Glasstower plans to use tokenized money market funds for cross border payments
Link: https://www.ledgerinsights.com/glasstower-plans-to-use-tokenized-money-market-funds-for-cross-border-payments/
Date: May 16, 2024


## Blockworks Scraper

#### Setup and initializtion

In [19]:
# Initialize WebDriver for Blockworks
driver = webdriver.Chrome(service=service, options=options)
driver.get(blockworks_url)

# Read existing headlines to avoid duplicates
existing_headlines = set()
if csv_file_exists_and_has_headers(csv_file_path, headers):
    with open(csv_file_path, 'r', encoding='utf-8') as read_file:
        csv_reader = csv.reader(read_file)
        next(csv_reader)  # Skip header row
        for row in csv_reader:
            if row:
                existing_headlines.add(row[0])
                
# Close cookie consent if it appears
close_cookie_consent()

Failed to close cookie consent: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7298B1522+60802]
	(No symbol) [0x00007FF72982AC22]
	(No symbol) [0x00007FF7296E7CE4]
	(No symbol) [0x00007FF729736D4D]
	(No symbol) [0x00007FF729736E1C]
	(No symbol) [0x00007FF72977CE37]
	(No symbol) [0x00007FF72975ABBF]
	(No symbol) [0x00007FF72977A224]
	(No symbol) [0x00007FF72975A923]
	(No symbol) [0x00007FF729728FEC]
	(No symbol) [0x00007FF729729C21]
	GetHandleVerifier [0x00007FF729BB41BD+3217949]
	GetHandleVerifier [0x00007FF729BF6157+3488183]
	GetHandleVerifier [0x00007FF729BEF0DF+3459391]
	GetHandleVerifier [0x00007FF72996B8E6+823622]
	(No symbol) [0x00007FF729835FBF]
	(No symbol) [0x00007FF729830EE4]
	(No symbol) [0x00007FF729831072]
	(No symbol) [0x00007FF7298218C4]
	BaseThreadInitThunk [0x00007FFC9F27257D+29]
	RtlUserThreadStart [0x00007FFCA0F0AA48+40]



#### Loading, parsing and scraping articles

In [20]:
# Find the search bar and input 'tokeniz'
try:
    search_bar = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, 'blockworks-search'))  # Use correct ID for the search bar
    )
    search_bar.send_keys(searchword)
    search_bar.send_keys(Keys.RETURN)
    print("Search query submitted.")
except Exception as e:
    print(f"Error locating search bar: {e}")
    driver.quit()
    exit()

# Load More button click settings
for _ in range(num_clicks_bw):
    try:
        load_more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//button[text()="Load More"]'))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)  # Scroll into view if needed
        load_more_button.click()
        time.sleep(3)  # Wait for the content to load
        print(f"'Load More' button clicked {_ + 1} times.")
    except Exception as e:
        print(f"Exception occurred while clicking 'Load More': {e}")
        break

# Parse the loaded page content with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Locate all article links on the search results page
article_links = [a['href'] for a in soup.find_all('a', class_='font-headline flex-grow text-base font-semibold leading-snug hover:text-primary')]

# Use a set to store seen links to avoid duplication
seen_links = set()

# Loop through each article link
blockworks_data = []
for relative_url in article_links:
    url = 'https://blockworks.co' + relative_url

    # Open the article
    driver.execute_script("window.open(arguments[0], '_blank');", url)
    driver.switch_to.window(driver.window_handles[1])
    
    try:
        # Extract the headline
        headline_tag = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'font-headline'))
        )
        headline = headline_tag.text.strip()

        # Extract the date
        date_tag = driver.find_element(By.TAG_NAME, 'time')
        date_text = date_tag.text.strip()
        
        # Print the scraped data to the console
        print(f"Headline: {headline}")
        print(f"Link: {url}")
        print(f"Date: {date_text}")

        # Add the data to blockworks_data
        if headline not in existing_headlines:
            blockworks_data.append([headline, url, date_text])
            existing_headlines.add(headline)
    except Exception as e:
        print(f"Error fetching details for article: {e}")
    
    # Close the new tab and switch back to the main tab
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

# Close the WebDriver
driver.quit()

# Append Blockworks data to CSV
append_to_csv(csv_file_path, blockworks_data, headers)

Search query submitted.
'Load More' button clicked 1 times.
'Load More' button clicked 2 times.
Exception occurred while clicking 'Load More': Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7298B1522+60802]
	(No symbol) [0x00007FF72982AC22]
	(No symbol) [0x00007FF7296E7CE4]
	(No symbol) [0x00007FF729736D4D]
	(No symbol) [0x00007FF729736E1C]
	(No symbol) [0x00007FF72977CE37]
	(No symbol) [0x00007FF72975ABBF]
	(No symbol) [0x00007FF72977A224]
	(No symbol) [0x00007FF72975A923]
	(No symbol) [0x00007FF729728FEC]
	(No symbol) [0x00007FF729729C21]
	GetHandleVerifier [0x00007FF729BB41BD+3217949]
	GetHandleVerifier [0x00007FF729BF6157+3488183]
	GetHandleVerifier [0x00007FF729BEF0DF+3459391]
	GetHandleVerifier [0x00007FF72996B8E6+823622]
	(No symbol) [0x00007FF729835FBF]
	(No symbol) [0x00007FF729830EE4]
	(No symbol) [0x00007FF729831072]
	(No symbol) [0x00007FF7298218C4]
	BaseThreadInitThunk [0x00007FFC9F27257D+29]
	RtlUserThreadStart [0x00007FFCA0F0AA48+40]

Headline: Binance executive deni