In [1]:
import pandas as pd
import numpy as np
import os
import time
import pickle
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
import warnings
# Silences every warning category for the rest of the session, which also
# hides library deprecation warnings.
warnings.filterwarnings('ignore')

In [4]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize

In [5]:
# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

## Login

In [6]:
# Base URL of the site; the stats page URLs used below are built from it.
ctg_url = 'https://cleaningtheglass.com'

In [7]:
# Sign-in page; used as the default `ctg_login_url` argument of `login`.
ctg_login_url = 'https://cleaningtheglass.memberful.com/auth/sign_in'

In [8]:
def login(driver, cred, ctg_login_url=ctg_login_url):
    """
    Sign in through the two-step (email, then password) sign-in form.

    driver: Selenium WebDriver used to load the page and fill in the form.
    cred: path to a text file whose first line is '<email>, <password>'.
    ctg_login_url: URL of the sign-in page.

    Elements are located with `find_element(By.<strategy>, value)`; the
    `find_element_by_*` helpers no longer exist in Selenium >= 4.3.
    """
    driver.get(ctg_login_url)

    # Get username & password
    with open(cred, 'r') as f:
        user_password = f.readlines()[0].replace('\n', '').split(', ')

    # Login: submit the email first
    userfield = driver.find_element(By.ID, 'session_email')
    userfield.send_keys(user_password[0])
    driver.find_element(By.XPATH, "//button[@class='btn-main btn-expanded']").click()

    # Fixed pause before looking up the password field
    # (presumably it is only rendered after the email step -- confirm).
    time.sleep(3)

    passwordfield = driver.find_element(By.ID, 'session_password')
    passwordfield.send_keys(user_password[1])
    driver.find_element(By.XPATH, "//button[@data-session-form-target='passwordButton']").click()

In [9]:
# webdriver_manager's install() returns the path of a chromedriver binary,
# which is handed to Selenium through a Service object.
service = Service(executable_path=ChromeDriverManager().install())
# Starts a Chrome session; `driver` is reused by every scraping cell below.
driver = webdriver.Chrome(service=service)



Current google-chrome version is 114.0.5735
Get LATEST chromedriver version for 114.0.5735 google-chrome
Driver [/Users/xulian/.wdm/drivers/chromedriver/mac64/114.0.5735.90/chromedriver] found in cache


In [10]:
# Plain-text credentials file (first line: '<email>, <password>'), resolved
# relative to the working directory. Keep it out of version control.
cred = 'ctg_cred.txt'
login(driver, cred)

In [11]:
# Build {team id: team name} from the links on the teams index page.
# The id is the last path segment of each link's href; the name is the alt
# text of the image inside the link.
# Uses find_element(s)(By.<strategy>, value): the find_element(s)_by_* helpers
# no longer exist in Selenium >= 4.3.
driver.get(f'{ctg_url}/stats/teams')
team_container = driver.find_element(By.ID, 'team_container')
team_dict = {a.get_attribute('href').split('/')[-1]:
             a.find_element(By.TAG_NAME, 'img').get_attribute('alt')
             for a in team_container.find_elements(By.TAG_NAME, 'a')}
team_dict

{'1': 'Atlanta Hawks',
 '2': 'Boston Celtics',
 '3': 'Brooklyn Nets',
 '4': 'Charlotte Hornets',
 '5': 'Chicago Bulls',
 '6': 'Cleveland Cavaliers',
 '7': 'Dallas Mavericks',
 '8': 'Denver Nuggets',
 '9': 'Detroit Pistons',
 '10': 'Golden State Warriors',
 '11': 'Houston Rockets',
 '12': 'Indiana Pacers',
 '13': 'Los Angeles Clippers',
 '14': 'Los Angeles Lakers',
 '15': 'Memphis Grizzlies',
 '16': 'Miami Heat',
 '17': 'Milwaukee Bucks',
 '18': 'Minnesota Timberwolves',
 '19': 'New Orleans Pelicans',
 '20': 'New York Knicks',
 '21': 'Oklahoma City Thunder',
 '22': 'Orlando Magic',
 '23': 'Philadelphia 76ers',
 '24': 'Phoenix Suns',
 '25': 'Portland Trail Blazers',
 '26': 'Sacramento Kings',
 '27': 'San Antonio Spurs',
 '28': 'Toronto Raptors',
 '29': 'Utah Jazz',
 '30': 'Washington Wizards'}

## Basic Function

In [12]:
def fetch(driver):
    """
    Parse the page currently loaded in `driver` into a BeautifulSoup tree.

    The HTML comment delimiters ('<!--' and '-->') are removed from the page
    source before parsing, so markup inside comments is parsed as regular
    elements.
    """
    uncommented_html = re.sub("<!--|-->", "", driver.page_source)
    return BeautifulSoup(uncommented_html, "html.parser")

## Download Lineup Data

In [13]:
def scrape_team_lineup_table(team_id, season, driver,
                             team_dict=team_dict):
    """
    Scrape one team's lineup four-factors table for one season.

    team_id: key of `team_dict`; also used in the page URL.
    season: season value placed in the URL query string.
    driver: Selenium WebDriver used to load the page.
    team_dict: mapping from team id to team name.

    Returns a DataFrame with one row per table row: the five position
    columns (read from each cell's 'data-full-name' attribute), the remaining
    cells as their raw text (strings, e.g. '51.5%'), plus 'season' and 'team'.

    Raises IndexError if the page has no 'lineups_four_factors' table.
    """
    driver.get(f'{ctg_url}/stats/team/{team_id}/lineups?season={season}')
    time.sleep(3)
    soup = fetch(driver)
    tables = soup.find_all('table', {'id': 'lineups_four_factors'})
    if not tables:
        # Same exception type as the bare `[0]` lookup, but with context.
        raise IndexError(
            f"no 'lineups_four_factors' table found for team {team_id}, "
            f"season {season}"
        )
    table = tables[0]
    # Build dataframe
    cols = ['PG', 'SG', 'SF', 'PF', 'C', 'Poss', 'Diff_Rank', 'Diff']
    for side in ['off', 'def']:
        for factor in ['Pts/Poss', 'eFG%', 'TOV%', 'OREB%', 'FTr']:
            cols.extend([f'{side}_{factor}_rank', f'{side}_{factor}'])
        if side == 'off':
            # Placeholder name for the extra cell between the offense and
            # defense groups; that column is dropped below.
            cols.append('')
    data = []
    # The rows are read from the table's second <tbody>.
    for row in table.find_all('tbody')[1].find_all('tr'):
        cells = row.find_all('td')  # parse each row's cells once
        data.append([td['data-full-name'] for td in cells[:5]] +
                    [td.text for td in cells[5:]])
    df = pd.DataFrame(data, columns=cols).drop('', axis=1)
    df['season'] = season
    df['team'] = team_dict[team_id]
    return df

In [14]:
# Season values 2017 through 2022 inclusive (the range end is exclusive).
seasons = [*range(2017, 2023)]

In [15]:
# Scrape every season for every team; one concatenated frame per team name.
print('Start Scraping...')
team_dfs = {}
for team_id, team in team_dict.items():
    print(team)
    team_dfs[team] = pd.concat(
        [scrape_team_lineup_table(team_id, season, driver) for season in seasons]
    )
print('Done.')

Start Scraping...
Atlanta Hawks
Boston Celtics
Brooklyn Nets
Charlotte Hornets
Chicago Bulls
Cleveland Cavaliers
Dallas Mavericks
Denver Nuggets
Detroit Pistons
Golden State Warriors
Houston Rockets
Indiana Pacers
Los Angeles Clippers
Los Angeles Lakers
Memphis Grizzlies
Miami Heat
Milwaukee Bucks
Minnesota Timberwolves
New Orleans Pelicans
New York Knicks
Oklahoma City Thunder
Orlando Magic
Philadelphia 76ers
Phoenix Suns
Portland Trail Blazers
Sacramento Kings
San Antonio Spurs
Toronto Raptors
Utah Jazz
Washington Wizards
Done.


In [18]:
# quit() ends the whole WebDriver session (all windows plus the chromedriver
# process); close() would only close the current window.
driver.quit()

In [16]:
# Stack the per-team frames in team_dict order and renumber the rows 0..n-1.
df = pd.concat([team_dfs[team] for team in team_dict.values()],
               ignore_index=True)
df.head()

Unnamed: 0,PG,SG,SF,PF,C,Poss,Diff_Rank,Diff,off_Pts/Poss_rank,off_Pts/Poss,off_eFG%_rank,off_eFG%,off_TOV%_rank,off_TOV%,off_OREB%_rank,off_OREB%,off_FTr_rank,off_FTr,def_Pts/Poss_rank,def_Pts/Poss,def_eFG%_rank,def_eFG%,def_TOV%_rank,def_TOV%,def_OREB%_rank,def_OREB%,def_FTr_rank,def_FTr,season,team
0,Dennis Schroder,Kent Bazemore,Taurean Prince,Ersan Ilyasova,Miles Plumlee,733,26,-6.7,18,99.9,41,51.5%,12,17.7%,59,25.9%,37,15.0,50,106.5,34,54.0%,57,15.0%,44,25.9%,73,14.2,2017,Atlanta Hawks
1,Dennis Schroder,Kent Bazemore,Taurean Prince,John Collins,Dewayne Dedmon,245,69,8.6,66,112.7,81,57.9%,15,17.1%,37,23.1%,41,15.7,60,104.0,56,51.6%,92,19.4%,15,30.6%,73,14.2,2017,Atlanta Hawks
2,Dennis Schroder,Kent Bazemore,Taurean Prince,Ersan Ilyasova,Dewayne Dedmon,193,25,-7.0,22,101.0,18,48.8%,78,12.4%,22,20.4%,73,22.1,42,108.0,38,53.3%,94,19.8%,18,29.7%,63,15.7,2017,Atlanta Hawks
3,Dennis Schroder,Kent Bazemore,Taurean Prince,Luke Babbitt,Dewayne Dedmon,142,66,7.9,62,112.0,91,59.7%,7,19.0%,44,23.8%,5,8.9,59,104.1,44,52.9%,49,14.4%,80,20.6%,38,18.9,2017,Atlanta Hawks
4,Isaiah Taylor,Taurean Prince,Damion Lee,John Collins,Dewayne Dedmon,140,18,-10.0,36,105.7,41,51.5%,62,13.6%,36,23.1%,4,8.3,16,115.8,11,58.3%,14,11.0%,96,15.6%,15,24.2,2017,Atlanta Hawks


In [17]:
# Write the combined table to CSV in the working directory; index=False
# leaves the row index out of the file.
df.to_csv('ctg_nba_lineup_stats.csv', index=False)