In [1]:
import pandas as pd
import numpy as np
import os
import time
import pickle
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException 
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
import warnings
warnings.filterwarnings('ignore')

## Set Up Driver

In [4]:
# Resolve the chromedriver binary (downloaded or taken from the local cache),
# wrap it in a Service and start the Chrome session used by every scraper below.
chromedriver_path = ChromeDriverManager().install()
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)



Current google-chrome version is 110.0.5481
Get LATEST chromedriver version for 110.0.5481 google-chrome
Driver [/Users/xulian/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


In [5]:
# Site root; get_team_urls and get_team_stats prepend it to the paths they visit.
proballers_url = 'https://www.proballers.com'

## League ID Dictionary

In [6]:
# Maps a league name to the numeric id that get_team_urls places in the
# league URL and that get_team_stats uses to build the page's element ids.
leagues_dict = {
    'Eurocup': 192
}

## Functions

In [7]:
def create_dirs(league, seasons):
    """
    Ensure a ``data/<league>_<season>`` directory exists for every season.

    Parameters
    ----------
    league : str
        League name used as the directory-name prefix.
    seasons : iterable
        Season identifiers; each one is formatted into a directory name.

    Notes
    -----
    The parent ``data`` directory is created when it is missing, and
    directories that already exist are left untouched, so the call can be
    repeated safely (e.g. on Restart & Run All).
    """
    for season in seasons:
        # makedirs also creates the parent 'data' folder and, with
        # exist_ok=True, is a no-op when the target is already present.
        os.makedirs(os.path.join('data', f'{league}_{season}'), exist_ok=True)

In [8]:
def fetch(driver):
    """
    Parse the driver's current page into a BeautifulSoup tree.

    The HTML comment delimiters ``<!--`` and ``-->`` are deleted from the
    page source before parsing, so markup that was wrapped in a comment is
    parsed like any other markup.
    """
    comment_delimiters = re.compile("<!--|-->")
    uncommented_html = comment_delimiters.sub("", driver.page_source)
    return BeautifulSoup(uncommented_html, "html.parser")

In [9]:
def get_team_urls(league, season,
                  driver=driver,
                  proballers_url=proballers_url,
                  leagues_dict=leagues_dict):
    """
    Collect the 'Team Roster' links from a league's teams page for a season.

    Parameters
    ----------
    league : str
        Key into ``leagues_dict``; its lower-cased form is also a URL segment.
    season : int or str
        Season identifier appended to the teams-page URL.
    driver : selenium WebDriver
        Browser session used to load the page.
    proballers_url : str
        Site root the teams-page path is appended to.
    leagues_dict : dict
        Maps league names to the numeric ids used in the URL.

    Returns
    -------
    list of str
        The ``href`` of every anchor whose text is exactly 'Team Roster',
        in page order.
    """
    teams_page = '/'.join([
        f'{proballers_url}/basketball/league',
        f'{leagues_dict[league]}',
        league.lower(),
        'teams',
        f'{season}',
    ])
    driver.get(teams_page)

    # Keep only the anchors that link to a roster page.
    roster_links = []
    for anchor in fetch(driver).find_all('a'):
        if anchor.text == 'Team Roster':
            roster_links.append(anchor['href'])
    return roster_links

In [27]:
def get_team_stats(league, team_url,
                   driver=driver,
                   proballers_url=proballers_url,
                   leagues_dict=leagues_dict):
    """
    Scrape a team's statistics table for one league into a DataFrame.

    Parameters
    ----------
    league : str
        Key into ``leagues_dict``.
    team_url : str
        Site-relative path of the team page; appended to ``proballers_url``.
    driver : selenium WebDriver
        Browser session used to load and interact with the page.
    proballers_url : str
        Site root.
    leagues_dict : dict
        Maps league names to the numeric ids used in the page's element ids.

    Returns
    -------
    pandas.DataFrame
        Columns are the texts of the first table's ``<th>`` cells. Rows are
        the table rows whose number of ``<td>`` cells equals the number of
        headers. Values are the raw cell strings; in the 'Player', 'W-L' and
        '%'-containing columns surrounding newlines and spaces are stripped.

    Raises
    ------
    KeyError
        If ``league`` is not in ``leagues_dict``.
    IndexError
        If the loaded page contains no ``<table>``.

    Notes
    -----
    Element lookups use ``find_element(By.<strategy>, value)``; the
    ``find_element_by_*`` helpers used previously no longer exist in
    Selenium 4.3+. A lookup that matches nothing raises Selenium's
    ``NoSuchElementException``.
    """
    driver.get(f'{proballers_url}{team_url}')

    # Fixed pauses give the page time to render before it is interacted with.
    time.sleep(3)
    driver.execute_script('window.scrollTo(0, 1600)')

    # Click League
    league_id = leagues_dict[league]
    driver.find_element(By.ID, f'league-{league_id}-list').click()

    time.sleep(3)

    # Change to Total
    # NOTE: both XPath expressions start with '//' and therefore search from
    # the document root, not only below the element they are called on.
    league_panel = driver.find_element(By.ID, f'league-{league_id}')
    select_box = league_panel.find_element(
        By.XPATH, "//select[@class='select-box']")
    select_box.find_element(
        By.XPATH, f"//option[@value='accumulated-{league_id}']").click()

    # Get Data
    soup = fetch(driver)
    table = soup.find_all('table')[0]

    # Build DataFrame
    cols = [th.text for th in table.find_all('th')]
    rows = [[td.text for td in tr.find_all('td')]
            for tr in table.find_all('tr')]
    # Drop rows (e.g. the header row, which has no <td>) whose cell count
    # does not line up with the headers.
    rows = [row for row in rows if len(row) == len(cols)]
    df = pd.DataFrame(rows, columns=cols)

    # Clean a bit
    def _strip_cell(text):
        return text.strip('\n').strip(' ').strip('\n')

    for col in cols:
        if col in ['Player', 'W-L'] or '%' in col:
            df[col] = df[col].apply(_strip_cell)

    return df

In [28]:
def download_season(league, season,
                    driver=driver,
                    proballers_url=proballers_url,
                    leagues_dict=leagues_dict):
    """
    Scrape every team of a league season and save one CSV per team.

    Parameters
    ----------
    league : str
        Key into ``leagues_dict``; also the output-directory prefix.
    season : int or str
        Season identifier used in the teams-page URL and the directory name.
    driver : selenium WebDriver
        Browser session used for every page load.
    proballers_url : str
        Site root.
    leagues_dict : dict
        Maps league names to numeric league ids.

    Notes
    -----
    Files are written to ``data/<league>_<season>/<team>.csv``, where
    ``<team>`` is the second-to-last '/'-separated segment of the team URL.
    The same ``driver``, ``proballers_url`` and ``leagues_dict`` are used for
    both the team-list lookup and the per-team scrape. Returns ``None``.
    """
    data_dir = f'data/{league}_{season}'
    # Make the function usable on its own, without a prior create_dirs call.
    os.makedirs(data_dir, exist_ok=True)

    # Forward the caller's arguments so an overridden driver / URL / league
    # table is honoured here as well as in get_team_stats.
    team_urls = get_team_urls(league, season,
                              driver=driver,
                              proballers_url=proballers_url,
                              leagues_dict=leagues_dict)

    for team_url in tqdm(team_urls):
        team = team_url.split('/')[-2]
        team_df = get_team_stats(league, team_url,
                                 driver=driver,
                                 proballers_url=proballers_url,
                                 leagues_dict=leagues_dict)
        team_df['Team'] = team
        team_df.to_csv(f'{data_dir}/{team}.csv', index=False)
        # Pause between teams to avoid hammering the site.
        time.sleep(5)

## Scrape

In [29]:
# League to scrape and the season ids 2017 through 2021 (inclusive);
# make sure each season has its output directory before downloading.
league = 'Eurocup'
seasons = [*range(2017, 2022)]
create_dirs(league, seasons)

### 2021-2022

In [30]:
# Season 2021 (the 2021-2022 campaign): saves one CSV per team into
# data/<league>_2021/.
season = 2021
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [06:10<00:00, 18.52s/it]


### 2020-2021

In [31]:
# Season 2020 (the 2020-2021 campaign): saves one CSV per team into
# data/<league>_2020/.
season = 2020
download_season(league, season)

100%|███████████████████████████████████████████| 24/24 [07:10<00:00, 17.96s/it]


### 2019-2020

In [32]:
# Season 2019 (the 2019-2020 campaign): saves one CSV per team into
# data/<league>_2019/.
season = 2019
download_season(league, season)

100%|███████████████████████████████████████████| 24/24 [06:48<00:00, 17.04s/it]


### 2018-2019

In [34]:
# Season 2018 (the 2018-2019 campaign): saves one CSV per team into
# data/<league>_2018/.
season = 2018
download_season(league, season)

100%|███████████████████████████████████████████| 24/24 [07:02<00:00, 17.61s/it]


### 2017-2018

In [35]:
# Season 2017 (the 2017-2018 campaign): saves one CSV per team into
# data/<league>_2017/.
season = 2017
download_season(league, season)

100%|█████████████████████████████████████████████| 8/8 [02:08<00:00, 16.08s/it]


In [36]:
# End the whole WebDriver session: quit() closes every window and shuts down
# the chromedriver process, whereas close() only closes the current window.
driver.quit()