In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from tqdm import tqdm

In [2]:
# Configures webdriver

options = Options()
# options.headless = True # Hides GUI

# Sets window size to native GUI size
# options.add_argument("--window-size=1920,1080") 
options.add_argument("start-maximized")

# Removes automation tags from the browser
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# Reference:
# https://scrapfly.io/blog/web-scraping-with-selenium-and-python/
# https://python.plainenglish.io/how-to-bypass-google-recaptcha-with-selenium-and-python-9470664fb575

In [3]:
# Initializes driver

service = Service('C:/Users/ZH128895/chromedriver.exe') # Your/Absolute/Path/To/chromedriver.exe 
driver = webdriver.Chrome(service=service, options = options) # Uses Chrome brower

# Changes User Agent at each request
# driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

driver.get('https://www.coingecko.com') # Website to scrap

In [4]:
# Finds the total pages

last_page_btn = driver.find_element(By.XPATH, '/html/body/div[4]/div[4]/div[7]/div[2]/nav/ul/li[8]/a')
total_page = int(last_page_btn.text)

In [5]:
# Performs scrapping in a loop

# Init two lists for storing gecko coin symbol and name 
gecko_symbol_lst = []
gecko_name_lst = []

count = 1

# Init progress bar
with tqdm(total = total_page) as pbar: 

    # Loops through all pages
    while True:
        # Prints the scrapping status
        # print('Scrapping page ' + str(i + 1) + ' / 135 ...', end = '\r')

        # Finds coin symbol and coin name by XPATH
        coin_symbols = driver.find_elements(By.XPATH, "//span[@class='tw-hidden d-lg-inline font-normal text-3xs ml-2']")
        coin_names = driver.find_elements(By.XPATH, "//a[@class='tw-hidden lg:tw-flex font-bold tw-items-center tw-justify-between']")

        # Appends the text info to its corresponding list
        for j in range(len(coin_symbols)):
            gecko_symbol_lst.append(coin_symbols[j].text)
            gecko_name_lst.append(coin_names[j].text)

        # Try to click next page
        try:
            next_page_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next ›')))

            next_page_btn.click()
            
            count += 1

        except WebDriverException:
            if count == total_page:
                print('Scrapping finished!', end = '\r')
            else:
                print('Scrapping incomplete! Something went wrong', end = '\r')
            break

        pbar.update(1)
        
# Asserts 
assert(len(gecko_symbol_lst) == len(gecko_name_lst))
        
#     # Stop at the last page
#     if i != (total_page-1):
#         # Finds the next page button, perform the lick
#         next_page_btn = driver.find_element(By.LINK_TEXT, 'Next ›')
        
#         action = ActionChains(driver)

#         action.click(on_element = next_page_btn)

#         action.perform()
#     else:
#         print('Scrapping finished!', end = '\r')

 99%|███████████████████████████████████████████████████████████████████████████████▍| 133/134 [22:03<00:09,  9.95s/it]

Scrapping finished!




In [6]:
# Creates dataframe, export as excel

gecko_coin_data = {
    'Coin(Gecko) Symbol' : gecko_symbol_lst,
    'Coin(Gecko) Name' : gecko_name_lst,
}

df_gecko_coin_data = pd.DataFrame(gecko_coin_data)

df_gecko_coin_data.to_excel('C:/Users/ZH128895/Work/CryptoCoinWebScrap/CoinGecko.xlsx', index = False)