In [1]:
import pandas as pd
import numpy as np
import os
import time
import pickle
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException 
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [3]:
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

## Set Up Driver

In [4]:
# Let webdriver_manager resolve a chromedriver binary, then start a Chrome
# session through it. `driver` is the shared browser used by the functions below.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)



Current google-chrome version is 112.0.5615
Get LATEST chromedriver version for 112.0.5615 google-chrome
There is no [mac64] chromedriver for browser 112.0.5615 in cache
Trying to download new driver from https://chromedriver.storage.googleapis.com/112.0.5615.49/chromedriver_mac64.zip
Driver has been saved in cache [/Users/xulian/.wdm/drivers/chromedriver/mac64/112.0.5615.49]


In [5]:
# Base URL that every site-relative path requested below is appended to.
proballers_url = 'https://www.proballers.com'

## Dictionary

In [6]:
# League display name -> numeric id placed in the league URL.
# The regular season and the play-offs share id 159; the display name itself is
# what gets compared with the competition options listed on each team page.
leagues_dict = {
    'China - CBA': 159,
    'China - CBA Play-Offs': 159
}

## Functions

In [7]:
def create_dirs(league, seasons):
    """
    Make sure an output directory ``data/{league}_{season}`` exists for every season.

    Parameters
    ----------
    league : str
        League name used as the directory-name prefix.
    seasons : iterable
        Season identifiers; one directory is created per element.

    Notes
    -----
    The parent ``data`` directory is created when it is missing, and
    directories that already exist are left untouched, so the function can be
    re-run safely.
    """
    for season in seasons:
        # makedirs also creates the parent 'data' folder; exist_ok turns a
        # re-run into a no-op instead of an error.
        os.makedirs(os.path.join('data', f'{league}_{season}'), exist_ok=True)

In [8]:
def fetch(driver):
    """
    Parse the page currently loaded in ``driver`` into a BeautifulSoup tree.

    The ``<!--`` and ``-->`` markers are removed from the HTML before parsing,
    so markup that was wrapped in HTML comments is parsed like regular content.
    """
    comment_markers = re.compile("<!--|-->")
    uncommented_html = comment_markers.sub("", driver.page_source)
    return BeautifulSoup(uncommented_html, "html.parser")

In [9]:
def get_team_urls(league, season,
                  driver=driver,
                  proballers_url=proballers_url,
                  leagues_dict=leagues_dict):
    """
    Load a league's teams page for one season and collect the roster links.

    The page address is built from the base URL, the league id looked up in
    ``leagues_dict``, the lower-cased league name and the season. Returns the
    ``href`` of every anchor whose text is exactly 'Team Roster', in page order.
    """
    league_id = leagues_dict[league]
    teams_page = '/'.join([
        f'{proballers_url}/basketball/league',
        f'{league_id}',
        f'{league.lower()}',
        'teams',
        f'{season}',
    ])
    driver.get(teams_page)

    # Keep only the anchors labelled as roster links.
    roster_links = []
    for anchor in fetch(driver).find_all('a'):
        if anchor.text == 'Team Roster':
            roster_links.append(anchor['href'])
    return roster_links

In [10]:
def get_team_stats(league, team_url,
                   driver=driver,
                   proballers_url=proballers_url,
                   leagues_dict=leagues_dict):
    """
    Open a team page and return the table that belongs to ``league``.

    Parameters
    ----------
    league : str
        League display name; must be a key of ``leagues_dict`` and is compared
        with the competition options listed on the team page.
    team_url : str
        Site-relative team path, appended to ``proballers_url``.
    driver, proballers_url, leagues_dict
        Browser, base URL and league mapping; default to the notebook globals.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table for the matching competition, or ``None`` when
        ``league`` is not among the competitions listed for this team.

    Raises
    ------
    KeyError
        If ``league`` is not a key of ``leagues_dict``.
    ValueError
        If the page has no competition list, or fewer tables than the
        per-competition layout requires.
    """
    # The page is laid out as blocks of 7 tables per competition option; the
    # wanted table is the second one (offset 1) of the matching block.
    # NOTE(review): which statistic that table holds is not visible here - confirm.
    tables_per_league = 7
    wanted_offset = 1

    # Reject unknown leagues before spending a page load on them.
    if league not in leagues_dict:
        raise KeyError(league)

    driver.get(f'{proballers_url}{team_url}')

    # Give the page time to finish rendering before reading its source.
    time.sleep(3)

    # Locate & Get Data
    soup = fetch(driver)
    tables = soup.find_all('table')
    option_group = soup.find('div', {'class': 'list-group'})
    if option_group is None:
        raise ValueError('No competition list found on team page')
    league_options = [a.text.strip('\n').strip(' ').strip('\n')
                      for a in option_group.find_all('a')]

    # Raise (rather than return) the error so callers never receive an
    # exception object in place of a DataFrame.
    if len(tables) < len(league_options) * tables_per_league:
        raise ValueError('Error in number of tables')

    for league_order, option in enumerate(league_options):
        if option == league:
            return pd.read_html(str(tables))[league_order * tables_per_league + wanted_offset]

    # The team did not take part in the requested competition.
    return None

In [11]:
def download_season(league, season,
                    driver=driver,
                    proballers_url=proballers_url,
                    leagues_dict=leagues_dict):
    """
    Scrape every team of one league season and save one CSV per team.

    Each table returned by ``get_team_stats`` gets a ``Team`` column and is
    written to ``data/{league}_{season}/{team}.csv``. Teams for which no table
    is returned are skipped.

    Parameters
    ----------
    league : str
        League display name (a key of ``leagues_dict``).
    season
        Season identifier used in the teams-page URL and the output folder name.
    driver, proballers_url, leagues_dict
        Browser, base URL and league mapping; forwarded unchanged to both
        ``get_team_urls`` and ``get_team_stats``.
    """
    data_dir = f'data/{league}_{season}'

    # Forward the overrides so the URL listing uses the same driver and
    # settings as the per-team scraping.
    team_urls = get_team_urls(league, season,
                              driver=driver,
                              proballers_url=proballers_url,
                              leagues_dict=leagues_dict)

    for team_url in tqdm(team_urls):
        # Second-to-last path segment is used as the team name.
        # NOTE(review): assumes roster links end in '/<team>/<something>' - confirm.
        team = team_url.split('/')[-2]
        team_df = get_team_stats(league, team_url,
                                 driver=driver,
                                 proballers_url=proballers_url,
                                 leagues_dict=leagues_dict)
        if team_df is None:
            continue
        if not isinstance(team_df, pd.DataFrame):
            # Anything that is not a table cannot be saved; report it and
            # continue instead of failing on the column assignment below.
            tqdm.write(f'Skipping {team}: {team_df!r}')
            continue

        team_df['Team'] = team
        team_df.to_csv(f'{data_dir}/{team}.csv', index=False)
        # Pause between teams to avoid hammering the site.
        time.sleep(5)

## Scrape: Regular Season

In [12]:
# Regular season: make one output folder per season, for 2017 through 2022.
league = 'China - CBA'
seasons = list(range(2017, 2023))
create_dirs(league, seasons)

### 2022-2023

In [13]:
# Save one CSV per team for the 2022-2023 season under data/{league}_2022/.
season = 2022
download_season(league, season)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [05:27<00:00, 16.38s/it]


### 2021-2022

In [13]:
# Save one CSV per team for the 2021-2022 season under data/{league}_2021/.
season = 2021
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [06:21<00:00, 19.05s/it]


### 2020-2021

In [14]:
# Save one CSV per team for the 2020-2021 season under data/{league}_2020/.
season = 2020
download_season(league, season)

100%|███████████████████████████████████████████| 19/19 [05:27<00:00, 17.26s/it]


### 2019-2020

In [15]:
# Save one CSV per team for the 2019-2020 season under data/{league}_2019/.
season = 2019
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [05:32<00:00, 16.64s/it]


### 2018-2019

In [16]:
# Save one CSV per team for the 2018-2019 season under data/{league}_2018/.
season = 2018
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [04:50<00:00, 14.51s/it]


### 2017-2018

In [17]:
# Save one CSV per team for the 2017-2018 season under data/{league}_2017/.
season = 2017
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [05:12<00:00, 15.64s/it]


## Scrape: Play-Offs

In [18]:
# Play-offs: make one output folder per season, for 2017 through 2021.
league = 'China - CBA Play-Offs'
seasons = list(range(2017, 2022))
create_dirs(league, seasons)

### 2021-2022

In [24]:
# Save one CSV per team for the 2021-2022 play-offs under data/{league}_2021/.
season = 2021
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [03:31<00:00, 10.59s/it]


### 2020-2021

In [25]:
# Save one CSV per team for the 2020-2021 play-offs under data/{league}_2020/.
season = 2020
download_season(league, season)

100%|███████████████████████████████████████████| 19/19 [04:04<00:00, 12.86s/it]


### 2019-2020

In [26]:
# Save one CSV per team for the 2019-2020 play-offs under data/{league}_2019/.
season = 2019
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [04:00<00:00, 12.02s/it]


### 2018-2019

In [27]:
# Save one CSV per team for the 2018-2019 play-offs under data/{league}_2018/.
season = 2018
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [04:02<00:00, 12.14s/it]


### 2017-2018

In [28]:
# Save one CSV per team for the 2017-2018 play-offs under data/{league}_2017/.
season = 2017
download_season(league, season)

100%|███████████████████████████████████████████| 20/20 [03:45<00:00, 11.30s/it]


In [14]:
# End the whole WebDriver session: quit() closes every window and also stops
# the chromedriver process, whereas close() only closes the current window.
driver.quit()