# Article Web Scraper

## Introduction
This notebook scrapes the latest articles from Ledger Insights and Blockworks websites and appends the data to a single CSV file named `scraped_articles.csv`. The notebook ensures that the CSV file is not overwritten but appended with new data.


## Import necessary libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import csv
import time
import os
import json

## Setup and CSV Handling

#### Load configuration from JSON file

In [2]:
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

#### Extract configuration parameters

In [3]:
webdriver_path = config['webdriver_path']
ledger_insights_url = config['ledger_insights_url']
blockworks_url = config['blockworks_url']
keywords = config['keywords']
num_clicks_li = config['num_clicks_li']
num_clicks_bw = config['num_clicks_bw']
searchword = config['searchword']
csv_file_path = config['csv_file_path']

#### CSV checks

In [4]:
# Function to check if the CSV file exists and contains headers
def csv_file_exists_and_has_headers(file_path, headers):
    if not os.path.isfile(file_path):
        return False
    with open(file_path, 'r', encoding='utf-8') as file:
        first_line = file.readline().strip()
        if first_line != ','.join(headers):
            return False
    return True

# Function to append data to CSV
def append_to_csv(file_path, data, headers):
    file_has_headers = csv_file_exists_and_has_headers(file_path, headers)
    with open(file_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if not file_has_headers:
            writer.writerow(headers)
        writer.writerows(data)

#### Selenium setup

In [5]:
# Setup Selenium WebDriver
service = Service(webdriver_path)
options = Options()
# options.add_argument('--headless')

#### Interacting with cookie consent button

In [6]:
# Function to close cookie consent button
def close_cookie_consent():
    try:
        cookie_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'wt-cli-settings-btn'))
        )
        cookie_button.click()
        time.sleep(2)  # Wait for the consent dialog to close
    except Exception as e:
        print(f"Failed to close cookie consent: {e}")

## Ledger Insights Scraper

#### Setup and initializtion

In [7]:
# Initialize WebDriver for Ledger Insights
driver = webdriver.Chrome(service=service, options=options)
driver.get(ledger_insights_url)

# Define the CSV file headers
headers = ['headline', 'link', 'date']

# Read existing headlines to avoid duplicates
existing_headlines = set()
if csv_file_exists_and_has_headers(csv_file_path, headers):
    with open(csv_file_path, 'r', encoding='utf-8') as read_file:
        csv_reader = csv.reader(read_file)
        next(csv_reader)  # Skip header row
        for row in csv_reader:
            if row:
                existing_headlines.add(row[0])
                
# Close cookie consent if it appears
close_cookie_consent()

#### Loading, parsing and scraping articles

In [8]:
# Load articles by clicking the 'Load More' button multiple times
for _ in range(num_clicks_li):
    try:
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "Load more")]'))
        )
        load_more_button.click()
        time.sleep(3)  # Wait for the content to load
    except Exception as e:
        print(f"Exception occurred: {e}")
        break

# Parse the loaded page content with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')

# Use a set to store seen links to avoid duplication
seen_links = set()

# Loop through each link that might contain a headline
ledger_data = []
for link in soup.find_all('a', title=True):
    headline = link.get('title').strip()
    url = link.get('href')

    # Check if the link is already processed or the headline already exists in the CSV file
    if url in seen_links or headline in existing_headlines:
        continue

    # Check if any of the keywords are in the headline
    if any(keyword in headline.lower() for keyword in keywords):
        # Mark this link as seen
        seen_links.add(url)

        # Open the article in a new tab to fetch the date
        driver.execute_script("window.open(arguments[0], '_blank');", url)
        driver.switch_to.window(driver.window_handles[1])
        
        try:
            # Wait for the date element to be present
            date_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'updated'))
            )
            date_text = date_element.text
        except Exception as e:
            print(f"Error fetching date for article {headline}: {e}")
            date_text = "Unknown"
        
        # Close the new tab and switch back to the main tab
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        
        # Print the scraped data to the console
        print(f"Headline: {headline}")
        print(f"Link: {url}")
        print(f"Date: {date_text}")
        
        # Add the data to ledger_data
        ledger_data.append([headline, url, date_text])

# Close the WebDriver
driver.quit()

# Append Ledger Insights data to CSV
append_to_csv(csv_file_path, ledger_data, headers)

Headline: World Bank to issue Swiss digital bond settled in wholesale CBDC
Link: https://www.ledgerinsights.com/world-bank-to-issue-swiss-digital-bond-settled-in-wholesale-cbdc/
Date: 3 hours ago
Headline: Hana backed Busan Digital Asset Exchange incorporates. Focuses on RWA tokenization
Link: https://www.ledgerinsights.com/hana-backed-busan-digital-asset-exchange-incorporates-focuses-on-rwa-tokenization/
Date: 20 hours ago
Headline: Mastercard executes first tokenized deposit transactions with StanChart subsidiaries
Link: https://www.ledgerinsights.com/mastercard-executes-first-tokenized-deposit-transactions-with-stanchart-subsidiaries/
Date: May 15, 2024
Headline: StanChart confirms Euro transactions on the Partior interbank tokenized deposit network
Link: https://www.ledgerinsights.com/standard-chartered-partior-interbank-tokenized-deposit-network/
Date: May 15, 2024
Headline: Deutsche Bank joins Singapore’s Project Guardian tokenization initiative
Link: https://www.ledgerinsights.c

Headline: BlackRock tokenizes money market fund on Ethereum. Invests in Securitize
Link: https://www.ledgerinsights.com/blackrock-tokenization-money-market-fund-invests-in-securitize/
Date: March 21, 2024
Headline: BlackRock partners tokenization firm Securitize for first digital fund
Link: https://www.ledgerinsights.com/blackrock-digital-fund-partners-tokenization-firm-securitize/
Date: March 20, 2024
Headline: Israel finds transparency as key benefit of government bond tokenization
Link: https://www.ledgerinsights.com/israel-finds-transparency-as-key-benefit-of-government-bond-tokenization/
Date: March 19, 2024
Headline: Nomura backed fund tokenization protocol Libre launches with Brevan Howard funds
Link: https://www.ledgerinsights.com/fund-tokenization-protocol-libre-launches/
Date: March 19, 2024
Headline: Deutsche Börse unveils another digital assets initiative
Link: https://www.ledgerinsights.com/deutsche-borse-digital-assets-business-platform/
Date: March 18, 2024
Headline: Rus

Headline: Securitize partners with Japan’s DeCurret for tokenized deposit settlement
Link: https://www.ledgerinsights.com/securitize-japan-decurret-tokenized-deposit-settlement/
Date: January 30, 2024
Headline: Hong Kong listed broker GF Securities issues $100m tokenized securities on public blockchain
Link: https://www.ledgerinsights.com/gf-securities-tokenized-bond-public-blockchain/
Date: January 26, 2024
Headline: The legal hurdles obstructing U.S. digital bond issuance
Link: https://www.ledgerinsights.com/digital-bonds-us-legal-hurdles/
Date: January 25, 2024
Headline: Digital asset bank Sygnum raises $40m led by Azimut
Link: https://www.ledgerinsights.com/digital-asset-bank-sygnum-raises-40m-led-by-azimut/
Date: January 25, 2024
Headline: Solana public blockchain targets enterprises, institutions with token functionality
Link: https://www.ledgerinsights.com/solana-public-blockchain-targets-enterprises-institutions-with-token-functionality/
Date: January 24, 2024
Headline: Deka In

Headline: Basel, Zurich tokenized bonds settle with Swiss wholesale CBDC
Link: https://www.ledgerinsights.com/wholesale-cbdc-basel-zurich-tokenized-bonds/
Date: December 4, 2023
Headline: AXA IM buys digital green bond using SocGen FORGE’s Ethereum stablecoin
Link: https://www.ledgerinsights.com/stablecoin-socgen-forge-digital-bond/
Date: December 4, 2023
Headline: JP Morgan is live on multi bank tokenized deposit platform Partior
Link: https://www.ledgerinsights.com/jp-morgan-is-live-on-multi-bank-tokenized-deposit-platform-partior/
Date: December 1, 2023
Headline: Daiwa Securities to trial security tokens on public blockchain
Link: https://www.ledgerinsights.com/daiwa-security-tokens-public-blockchain/
Date: December 1, 2023
Headline: Hong Kong Monetary Authority: tokenized bonds  reduce funding costs
Link: https://www.ledgerinsights.com/tokenized-bonds-reduce-costs-hong-kong-monetary-authority/
Date: November 30, 2023
Headline: IMF doesn’t plan to build X-C platform for tokenized cr

Headline: BlackRock, Barclays join JP Morgan’s Tokenized Collateral Network
Link: https://www.ledgerinsights.com/jp-morgan-tokenized-collateral-network-tcn/
Date: October 11, 2023
Headline: Zimbabwe’s gold-backed digital token now used for payments
Link: https://www.ledgerinsights.com/zimbabwe-gold-digital-token-payments/
Date: October 11, 2023
Headline: SIX Digital exchange partners Invest Direct for tokenized private equity
Link: https://www.ledgerinsights.com/six-digital-exchange-sdx-tokenized-private-equity/
Date: October 6, 2023
Headline: Mitsui & Co Digital Assets enables direct investment in real estate fund using store card
Link: https://www.ledgerinsights.com/mitsui-co-digital-assets-real-estate-store-card/
Date: October 3, 2023
Headline: Why Swift may be the path to faster TradFi digital asset adoption
Link: https://www.ledgerinsights.com/swift-tradfi-digital-asset-adoption/
Date: October 3, 2023
Headline: Franklin Templeton CEO: tokenization is securitization on steroids
Lin

Headline: Canada’s Basel crypto proposal: extra risk weighting for tokenized securities
Link: https://www.ledgerinsights.com/canada-basel-crypto-tokenized-securities/
Date: July 26, 2023
Headline: Avalanche Foundation commits $50m to buy tokenized assets. Will regulators like that?
Link: https://www.ledgerinsights.com/avalanche-tokenization-vista/
Date: July 25, 2023
Headline: Financial Stability Board, CPMI to explore tokenization, including for payments
Link: https://www.ledgerinsights.com/tokenization-financial-stability-board-fsb-cpmi/
Date: July 17, 2023
Headline: BNY Mellon, HSBC, Siemens share lessons learned from digital securities issuances
Link: https://www.ledgerinsights.com/digital-securities-bny-mellon-hsbc-siemens/
Date: July 14, 2023
Headline: WisdomTree goes live with its Prime digital assets app
Link: https://www.ledgerinsights.com/wisdomtree-goes-live-with-its-prime-digital-assets-app/
Date: July 12, 2023
Headline: Italy’s Mediobanca tokenizes mutual fund units
Link: 

## Blockworks Scraper

#### Setup and initializtion

In [9]:
# Initialize WebDriver for Blockworks
driver = webdriver.Chrome(service=service, options=options)
driver.get(blockworks_url)

# Read existing headlines to avoid duplicates
existing_headlines = set()
if csv_file_exists_and_has_headers(csv_file_path, headers):
    with open(csv_file_path, 'r', encoding='utf-8') as read_file:
        csv_reader = csv.reader(read_file)
        next(csv_reader)  # Skip header row
        for row in csv_reader:
            if row:
                existing_headlines.add(row[0])
                
# Close cookie consent if it appears
close_cookie_consent()

Failed to close cookie consent: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6917A1522+60802]
	(No symbol) [0x00007FF69171AC22]
	(No symbol) [0x00007FF6915D7CE4]
	(No symbol) [0x00007FF691626D4D]
	(No symbol) [0x00007FF691626E1C]
	(No symbol) [0x00007FF69166CE37]
	(No symbol) [0x00007FF69164ABBF]
	(No symbol) [0x00007FF69166A224]
	(No symbol) [0x00007FF69164A923]
	(No symbol) [0x00007FF691618FEC]
	(No symbol) [0x00007FF691619C21]
	GetHandleVerifier [0x00007FF691AA41BD+3217949]
	GetHandleVerifier [0x00007FF691AE6157+3488183]
	GetHandleVerifier [0x00007FF691ADF0DF+3459391]
	GetHandleVerifier [0x00007FF69185B8E6+823622]
	(No symbol) [0x00007FF691725FBF]
	(No symbol) [0x00007FF691720EE4]
	(No symbol) [0x00007FF691721072]
	(No symbol) [0x00007FF6917118C4]
	BaseThreadInitThunk [0x00007FFD8AD1257D+29]
	RtlUserThreadStart [0x00007FFD8B24AA48+40]



#### Loading, parsing and scraping articles

In [10]:
# Find the search bar and input 'tokeniz'
try:
    search_bar = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, 'blockworks-search'))  # Use correct ID for the search bar
    )
    search_bar.send_keys(searchword)
    search_bar.send_keys(Keys.RETURN)
    print("Search query submitted.")
except Exception as e:
    print(f"Error locating search bar: {e}")
    driver.quit()
    exit()

# Load More button click settings
for _ in range(num_clicks_bw):
    try:
        load_more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//button[text()="Load More"]'))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)  # Scroll into view if needed
        load_more_button.click()
        time.sleep(3)  # Wait for the content to load
        print(f"'Load More' button clicked {_ + 1} times.")
    except Exception as e:
        print(f"Exception occurred while clicking 'Load More': {e}")
        break

# Parse the loaded page content with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Locate all article links on the search results page
article_links = [a['href'] for a in soup.find_all('a', class_='font-headline flex-grow text-base font-semibold leading-snug hover:text-primary')]

# Use a set to store seen links to avoid duplication
seen_links = set()

# Loop through each article link
blockworks_data = []
for relative_url in article_links:
    url = 'https://blockworks.co' + relative_url

    # Open the article
    driver.execute_script("window.open(arguments[0], '_blank');", url)
    driver.switch_to.window(driver.window_handles[1])
    
    try:
        # Extract the headline
        headline_tag = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'font-headline'))
        )
        headline = headline_tag.text.strip()

        # Extract the date
        date_tag = driver.find_element(By.TAG_NAME, 'time')
        date_text = date_tag.text.strip()
        
        # Print the scraped data to the console
        print(f"Headline: {headline}")
        print(f"Link: {url}")
        print(f"Date: {date_text}")

        # Add the data to blockworks_data
        if headline not in existing_headlines:
            blockworks_data.append([headline, url, date_text])
            existing_headlines.add(headline)
    except Exception as e:
        print(f"Error fetching details for article: {e}")
    
    # Close the new tab and switch back to the main tab
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

# Close the WebDriver
driver.quit()

# Append Blockworks data to CSV
append_to_csv(csv_file_path, blockworks_data, headers)

Search query submitted.
'Load More' button clicked 1 times.
'Load More' button clicked 2 times.
Exception occurred while clicking 'Load More': Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6917A1522+60802]
	(No symbol) [0x00007FF69171AC22]
	(No symbol) [0x00007FF6915D7CE4]
	(No symbol) [0x00007FF691626D4D]
	(No symbol) [0x00007FF691626E1C]
	(No symbol) [0x00007FF69166CE37]
	(No symbol) [0x00007FF69164ABBF]
	(No symbol) [0x00007FF69166A224]
	(No symbol) [0x00007FF69164A923]
	(No symbol) [0x00007FF691618FEC]
	(No symbol) [0x00007FF691619C21]
	GetHandleVerifier [0x00007FF691AA41BD+3217949]
	GetHandleVerifier [0x00007FF691AE6157+3488183]
	GetHandleVerifier [0x00007FF691ADF0DF+3459391]
	GetHandleVerifier [0x00007FF69185B8E6+823622]
	(No symbol) [0x00007FF691725FBF]
	(No symbol) [0x00007FF691720EE4]
	(No symbol) [0x00007FF691721072]
	(No symbol) [0x00007FF6917118C4]
	BaseThreadInitThunk [0x00007FFD8AD1257D+29]
	RtlUserThreadStart [0x00007FFD8B24AA48+40]

Headline: It’s time to overturn 