In [71]:
# Install the required libraries (selenium, pandas)
%pip install selenium --quiet
%pip install pandas --quiet

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [72]:
#Library Import
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd 
import time
import json
import os
import datetime

In [73]:
def get_driver(url):
    """Start a headless Chrome session and load ``url`` in it.

    Args:
        url: Address of the page to open.

    Returns:
        The ``webdriver.Chrome`` instance, already navigated to ``url``.
        The caller is responsible for calling ``quit()`` on it.

    Raises:
        Whatever ``webdriver.Chrome(...)`` or ``driver.get(url)`` raises.
        If navigation fails, the browser session is shut down before the
        exception propagates so that no Chrome process is left behind.
    """
    colab_options = webdriver.ChromeOptions()
    for flag in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--headless',
        '--start-maximized',
        '--start-fullscreen',
        '--single-process',
    ):
        colab_options.add_argument(flag)

    driver = webdriver.Chrome(options=colab_options)
    try:
        driver.get(url)
    except Exception:
        # The caller never receives the driver on failure, so it could not
        # clean it up itself; release the browser here and re-raise.
        driver.quit()
        raise
    return driver

In [74]:
def get_table_header(driver):
    """Return the text of the first ten ``<th>`` elements found on the page.

    Fewer than ten entries are returned when the page has fewer ``<th>`` tags.
    """
    header_cells = driver.find_elements(By.TAG_NAME, value= 'th')
    # zip() with range(10) stops after ten cells, or earlier if there are fewer.
    return [cell.text for _, cell in zip(range(10), header_cells)]

def get_table_rows(driver):
    """Return how many ``<tr>`` elements the results table body currently holds."""
    row_xpath = '//*[@id="scr-res-table"]/div[1]/table/tbody/tr'
    matching_rows = driver.find_elements(By.XPATH, value=row_xpath)
    return len(matching_rows)

def parse_table_rows(rownum, driver, header_list):
    """Read one table row and return it as ``{column name: cell text}``.

    Args:
        rownum: 1-based position of the row inside the table body.
        driver: Selenium driver positioned on the results page.
        header_list: Column names; the n-th name is paired with the n-th
            ``<td>`` of the row.

    Returns:
        A dictionary with one entry per name in ``header_list``.
    """
    cell_template = '//*[@id="scr-res-table"]/div[1]/table/tbody/tr[{}]/td[{}]'
    cells = {}
    # XPath positions are 1-based, hence start=1.
    for col_position, col_name in enumerate(header_list, start=1):
        time.sleep(1/20)  # short pause before each cell lookup
        cell_xpath = cell_template.format(rownum, col_position)
        cells[col_name] = driver.find_element(By.XPATH, value=cell_xpath).text
    return cells

In [75]:
# Parsing Data 
def parse_multiple_pages(driver, total_crypto):
    """Scrape table rows page by page until enough rows are collected.

    After each page is parsed, the "next page" button is clicked unless the
    target has been reached. Scraping also stops early when a page yields no
    rows or when the "next page" button is disabled, so the loop cannot spin
    forever or keep re-reading the same page.

    Args:
        driver: Selenium driver positioned on the first results page.
        total_crypto: Minimum number of rows wanted. Whole pages are always
            appended, so more rows than this may be returned.

    Returns:
        A list of row dictionaries as produced by ``parse_table_rows``. It
        may be shorter than ``total_crypto`` when the site runs out of pages.

    Raises:
        selenium TimeoutException: if the "next page" button is not present
            within 20 seconds.
    """
    table_data = []
    page_num = 1
    header_list = get_table_header(driver)

    while True:
        table_rows = get_table_rows(driver)
        print('Found {} rows on Page : {}'.format(table_rows, page_num))
        print('Parsing Page : {}'.format(page_num))
        table_data += [parse_table_rows(i, driver, header_list) for i in range (1, table_rows + 1)]
        total_count = len(table_data)
        print('Total rows scraped : {}'.format(total_count))

        if total_count >= total_crypto:
            print('Done Parsing...')
            break

        if table_rows == 0:
            # An empty page adds nothing; continuing could loop forever.
            print('No rows found on Page : {}. Stopping.'.format(page_num))
            break

        print('== Moving to Next Page ==')
        element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[@id="scr-res-table"]/div[2]/button[3]')))
        if not element.is_enabled():
            # Presumably the button is disabled on the last page; clicking it
            # would leave the same rows on screen and duplicate them.
            print('Next page button is disabled. Stopping.')
            break
        element.click()
        # NOTE(review): nothing waits for the table to refresh after the
        # click, so the next iteration may read the previous page's rows.
        page_num += 1

    return table_data

# Converting Large Number Data
def convert(attribute):
    """Convert an abbreviated number string to a float expressed in millions.

    Recognised suffixes (case-insensitive):
        ``T`` -> value * 1e6, ``B`` -> value * 1e3, ``M`` -> value * 1,
        ``K`` -> value / 1e3.
    A plain number without a suffix is divided by 1e6. Thousands separators
    and surrounding whitespace are ignored.

    Args:
        attribute: Text such as ``"1.5T"``, ``"2B"``, ``"3M"``, ``"500K"``
            or ``"2500000"``. ``None`` is accepted.

    Returns:
        The value in millions as a float, or ``None`` when the input is
        ``None``, empty, or not a finite number (for example ``"N/A"``).

    Raises:
        ValueError: if a recognised suffix follows non-numeric text.
    """
    if attribute is None:
        return None

    text = attribute.replace(',', '').strip()
    if not text:
        return None

    multipliers = {'T': 1e6, 'B': 1e3, 'M': 1}
    suffix = text[-1].upper()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    if suffix == 'K':
        return float(text[:-1]) / 1e3

    # No known suffix: treat the text as a raw count, if it is numeric at all.
    try:
        value = float(text)
    except ValueError:
        return None
    # float() also accepts "nan"/"inf"; those are not usable values.
    if value != value or value in (float('inf'), float('-inf')):
        return None
    return value / 1e6

# Data Cleaning
def clean_data(data):
    """Turn raw scraped rows (all strings) into typed records.

    For each row:
      * "Symbol" and "Name" are stripped of surrounding whitespace.
      * "Price (Intraday)", "Change" and "% Change" become floats; thousands
        separators are removed from all three, and the percent sign from
        "% Change".
      * "Market Cap", "Total Volume All Currencies (24Hr)" and
        "Circulating Supply" are passed through ``convert`` after removing
        thousands separators.
      * The two "Volume in Currency ..." fields stay strings, with thousands
        separators removed.

    Args:
        data: Iterable of dictionaries keyed by the table's column names.

    Returns:
        A new list with one cleaned dictionary per input row.

    Raises:
        KeyError: if a row lacks one of the expected columns.
        ValueError: if a price/change field is not numeric.
    """
    scaled_columns = (
        "Market Cap",
        "Total Volume All Currencies (24Hr)",
        "Circulating Supply",
    )

    cleaned_data = []
    for item in data:
        # Abbreviated figures (e.g. "1.2B") are rescaled by convert().
        scaled = {
            column: convert(item[column].replace(',', ''))
            for column in scaled_columns
        }

        cleaned_item = {
            "Symbol": item["Symbol"].strip(),
            "Name": item["Name"].strip(),
            "Price (Intraday)": float(item["Price (Intraday)"].replace(',', '')),
            "Change": float(item["Change"].replace(',', '')),
            # Large moves are rendered with separators (e.g. "+1,234.56%").
            "% Change": float(item["% Change"].replace('%', '').replace(',', '')),
            "Market Cap": scaled["Market Cap"],
            "Volume in Currency (Since 0:00 UTC)": (item["Volume in Currency (Since 0:00 UTC)"].replace(',', '')),
            "Volume in Currency (24Hr)": (item["Volume in Currency (24Hr)"].replace(',', '')),
            "Total Volume All Currencies (24Hr)": scaled["Total Volume All Currencies (24Hr)"],
            "Circulating Supply": scaled["Circulating Supply"]
        }
        cleaned_data.append(cleaned_item)

    return cleaned_data

In [78]:
# Data Export
def scrape_data(url, total_crypto):
    """Scrape the crypto table at ``url`` and export it as three JSON files.

    Steps:
      1. Open the page, scrape at least ``total_crypto`` rows (when the site
         provides that many), and always shut the browser down afterwards.
      2. Clean the rows with ``clean_data``.
      3. Keep only the first row seen for each symbol.
      4. Write ``cryptocurrency_data.json``, ``price_data.json`` and
         ``volume_data.json`` into ``../data`` (created if missing).

    Args:
        url: Page to scrape.
        total_crypto: Minimum number of rows to collect.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print('Creating driver')
    driver = get_driver(url)
    try:
        print("Web Title: ", driver.title)
        print("The web scraped at:", datetime.datetime.now())
        table_data = parse_multiple_pages(driver, total_crypto)
    finally:
        # Always end the browser session, even when scraping fails.
        driver.quit()

    # Clean the scraped data
    cleaned_data = clean_data(table_data)

    # Separate the data into different lists based on entities
    cryptocurrency_data = []
    price_data = []
    volume_data = []

    unique_symbols = set()  # Symbols already exported

    for item in cleaned_data:
        symbol = item["Symbol"]

        # Skip duplicate symbols; only the first occurrence is kept
        if symbol in unique_symbols:
            continue
        unique_symbols.add(symbol)

        cryptocurrency_data.append({
            "Symbol": symbol,
            "Name": item["Name"],
            "Market Cap (in M)": item["Market Cap"],
            "Circulating Supply (in M)": item["Circulating Supply"]
        })

        price_data.append({
            "Symbol": symbol,
            "Price (Intraday)": item["Price (Intraday)"],
            "Change": item["Change"],
            "% Change": item["% Change"]
        })

        volume_data.append({
            "Symbol": symbol,
            "Total Volume All Currencies in 24hr (in M)": item["Total Volume All Currencies (24Hr)"]
        })

    # Make sure the output folder exists so the scraped data is not lost
    # to a FileNotFoundError at the very last step.
    os.makedirs('../data', exist_ok=True)

    # Save the data to separate JSON files in the data folder
    outputs = (
        ('../data/cryptocurrency_data.json', cryptocurrency_data),
        ('../data/price_data.json', price_data),
        ('../data/volume_data.json', volume_data),
    )
    for path, payload in outputs:
        with open(path, "w") as file:
            json.dump(payload, file, indent=4)

    print("Data scraped to JSON files successfully!")

In [79]:
# Run configuration: page to scrape and the minimum number of rows to collect.
YAHOO_FINANCE_URL = 'https://finance.yahoo.com/crypto/'
TOTAL_CRYPTO = 100

# Scrape the page and write the JSON exports.
scrape_data(url=YAHOO_FINANCE_URL, total_crypto=TOTAL_CRYPTO)

Creating driver
Web Title:  Crypto Real Time Prices & Latest News - Yahoo Finance
The web scraped at: 2023-07-16 00:26:13.456055
Found 25 rows on Page : 1
Parsing Page : 1
Total rows scraped : 25
== Moving to Next Page ==
Found 25 rows on Page : 2
Parsing Page : 2
Total rows scraped : 50
== Moving to Next Page ==
Found 25 rows on Page : 3
Parsing Page : 3
Total rows scraped : 75
== Moving to Next Page ==
Found 25 rows on Page : 4
Parsing Page : 4
Total rows scraped : 100
Done Parsing...
Data scraped to JSON files successfully!
