In [1]:
from bs4 import BeautifulSoup
import concurrent.futures
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
import time
import re
import os
import threading

In [2]:
def main(letter):
    """Save the HTML page of every player listed under one index letter.

    Loads ``https://www.basketball-reference.com/players/<letter>/`` in a
    headless Chrome session, collects the rows of the element with id
    ``players`` and, for each player row, opens the player's page and writes
    its page source to ``threadedPlayers/<letter>/<player name>.html``.
    Players whose output file already exists are skipped, so an interrupted
    run can be resumed without re-downloading.

    Args:
        letter: Index letter; used both in the listing URL and as the name of
            the output sub-folder.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        Any exception other than a per-player ``TimeoutException`` propagates
        to the caller; the browser is shut down first in every case.
    """
    # Creating driver for each letter instance and using template url
    # NOTE(review): executable_path points at a directory, not at a
    # chromedriver binary -- confirm this is what the local setup expects.
    service = Service(executable_path="/usr/local/bin")
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # NOTE(review): execute_script is used below, so this flag presumably has
    # no effect -- confirm before relying on it.
    options.add_argument("--disable-javascript")
    driver = webdriver.Chrome(service=service, options=options)

    template_url = 'https://www.basketball-reference.com{}'
    letter_url = 'https://www.basketball-reference.com/players/{}/'.format(letter)

    # Everything that uses the driver lives inside try/finally so the Chrome
    # process is always released, even when parsing or writing fails.
    try:
        # Scanning entire page to get list of players
        driver.get(letter_url)
        html_main = driver.page_source
        soup = BeautifulSoup(html_main, "lxml")

        # Create array of all players without header rows
        all_players = soup.find(id="players")
        all_player_rows = all_players.find_all(
            "tr", class_=lambda x: x is None or 'thead' not in x.split()
        )

        folder = "threadedPlayers/{}".format(letter)
        os.makedirs(folder, exist_ok=True)

        # Store every player's page in the letter folder to look at later.
        # The first matched row is skipped, exactly as before -- presumably it
        # is the table's column-header row, which carries no 'thead' class.
        for row in all_player_rows[1:]:
            player_link = [a['href'] for a in row.find_all('a', href=True)][0]
            player_name = row.find('a').text
            full_url = template_url.format(player_link)
            output_path = folder + "/{}.html".format(player_name)

            # Already downloaded on a previous run.
            if os.path.exists(output_path):
                continue

            try:
                print(f"Scraping for '{player_name}'")
                # Wall-clock timer: process_time() would report process-wide
                # CPU time and ignore the sleep and the network wait.
                start_time = time.perf_counter()
                driver.get(full_url)
                driver.execute_script("window.scrollTo(1, 100000)")
                time.sleep(5)  # give the page time to finish rendering
                full_html = driver.page_source

                # Writing results to folder. UTF-8 is forced because the pages
                # contain non-ASCII player names that the platform default
                # encoding may not be able to represent.
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(full_html)
                finish_time = time.perf_counter() - start_time
                print(f"Duration: for '{player_name}' is {finish_time}")
            except TimeoutException:
                print(f"Timeout occurred for '{player_name}'. Skipping to the next player.")
                time.sleep(30)  # back off before trying the next player
                continue
    finally:
        driver.quit()

    print(f"Scraping for letter '{letter}' completed")

In [None]:
# FOR LOOP TO RUN FOR ALL LETTERS

# List of letters to process concurrently
letters = ['b', 'c']

# Using ThreadPoolExecutor to submit one scraping task per letter.
failed_letters = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Futures for the scraping tasks, plus a lookup from each future back to
    # the letter it is processing so failures can be reported by letter.
    futures = []
    letter_by_future = {}

    for letter in letters:
        # Submit tasks for each letter to the executor
        future = executor.submit(main, letter)
        futures.append(future)
        letter_by_future[future] = letter

        print(f"Scraping for letter '{letter}' started...")

    # Wait for all tasks to complete. result() re-raises whatever the worker
    # raised; without this call a crashed task would go unnoticed.
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as exc:
            failed_letter = letter_by_future[future]
            failed_letters.append(failed_letter)
            print(f"Scraping for letter '{failed_letter}' failed: {exc!r}")

if failed_letters:
    print(f"Scraping finished with failures for letters: {sorted(failed_letters)}")
else:
    print("Scraping completed!")

Scraping for letter 'b' started...
Scraping for letter 'c' started...
Scraping for 'Žarko Čabarkapa'
Scraping for 'Barney Cable'
Scraping for 'Bruno Caboclo'
Scraping for 'Devontae Cacok'
Scraping for 'Jason Caffey'
Scraping for 'Michael Cage'
Scraping for 'Jamal Cain'
Scraping for 'Gerry Calabrese'
Scraping for 'Nick Calathes'
Scraping for 'José Calderón'
Scraping for 'Adrian Caldwell'
Scraping for 'Jim Caldwell'
Scraping for 'Joe Caldwell'
Scraping for 'Kentavious Caldwell-Pope'
Scraping for 'Bill Calhoun'
Scraping for 'Corky Calhoun'
Scraping for 'Demetrius Calip'
Scraping for 'Tom Callahan'
Scraping for 'Rick Calloway'
Scraping for 'Ernie Calverley'
Scraping for 'Mack Calvin'
Scraping for 'Chris Babb'
Scraping for 'Luke Babbitt'
Scraping for 'Miloš Babić'
Scraping for 'Johnny Bach'
Scraping for 'Dwayne Bacon'
Scraping for 'Henry Bacon'
Scraping for 'Jim Baechtold'
Scraping for 'Dalibor Bagarić'
Scraping for 'John Bagley'
Scraping for 'Marvin Bagley III'
Scraping for 'Carl Bailey'
S

Scraping for 'Marquese Chriss'
Scraping for 'Spider Bennett'
Scraping for 'Fred Christ'
Scraping for 'Tony Bennett'
Scraping for 'Cal Christensen'
Scraping for 'Bob Christian'
Scraping for 'Winston Bennett'
Scraping for 'Doug Christie'
Scraping for 'David Benoit'
Scraping for 'Max Christie'
Scraping for 'Keith Benson'
Scraping for 'Dionte Christmas'
Scraping for 'Rakeem Christmas'
Scraping for 'Kent Benson'
Scraping for 'Semaj Christon'
Scraping for 'Ben Bentil'
Scraping for 'Josh Christopher'
Scraping for 'Gene Berce'
Scraping for 'Patrick Christopher'
Scraping for 'Gary Bergen'
Scraping for 'Steve Chubin'
Timeout occurred for 'Gary Bergen'. Skipping to the next player.
Scraping for 'Larry Bergh'
Scraping for 'Robert Churchwell'
Scraping for 'Ricky Berry'
Timeout occurred for 'Robert Churchwell'. Skipping to the next player.
Scraping for 'Archie Clark'
Scraping for 'Walter Berry'
Scraping for 'Carlos Clark'
Scraping for 'Dick Clark'
Scraping for 'Dairis Bertāns'
Scraping for 'Earl Cla