In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd

In [13]:
schedule = pd.read_csv("Schedule.csv")
schedule

Unnamed: 0,G,Date,Web,Away,Opponent,W/L,Pacers,Opp,Notes,Game ID
0,1,20231025,https://www.nba.com/game/was-vs-ind-0022300064...,,WAS,W,143,120,,202310250IND
1,2,20231028,https://www.nba.com/game/ind-vs-cle-0022300091...,@,CLE,W,125,113,,202310280CLE
2,3,20231030,https://www.nba.com/game/chi-vs-ind-0022300102...,,CHI,L,105,112,,202310300IND
3,4,20231101,https://www.nba.com/game/ind-vs-bos-0022300118...,@,BOS,L,104,155,,202311010BOS
4,5,20231103,https://www.nba.com/game/cle-vs-ind-0022300001...,,CLE,W,121,116,In-Season Tournament,202311030IND
...,...,...,...,...,...,...,...,...,...,...
62,63,20240305,https://www.nba.com/game/ind-vs-dal-0022300892...,@,DAL,W,137,120,,202403050DAL
63,64,20240307,https://www.nba.com/game/min-vs-ind-0022300903...,,MIN,L,111,113,,202403070IND
64,65,20240310,https://www.nba.com/game/ind-vs-orl-0022300928...,@,ORL,W,111,97,,202403100ORL
65,66,20240312,https://www.nba.com/game/ind-vs-okc-0022300941...,@,OKC,W,121,111,,202403120OKC


In [14]:
schedule.rename(columns={'Web':'url'}, inplace =True)

In [15]:
schedule['url'] = schedule['url'].str.replace('#box-score$', '?type=hustle', regex=True)
schedule

Unnamed: 0,G,Date,url,Away,Opponent,W/L,Pacers,Opp,Notes,Game ID
0,1,20231025,https://www.nba.com/game/was-vs-ind-0022300064...,,WAS,W,143,120,,202310250IND
1,2,20231028,https://www.nba.com/game/ind-vs-cle-0022300091...,@,CLE,W,125,113,,202310280CLE
2,3,20231030,https://www.nba.com/game/chi-vs-ind-0022300102...,,CHI,L,105,112,,202310300IND
3,4,20231101,https://www.nba.com/game/ind-vs-bos-0022300118...,@,BOS,L,104,155,,202311010BOS
4,5,20231103,https://www.nba.com/game/cle-vs-ind-0022300001...,,CLE,W,121,116,In-Season Tournament,202311030IND
...,...,...,...,...,...,...,...,...,...,...
62,63,20240305,https://www.nba.com/game/ind-vs-dal-0022300892...,@,DAL,W,137,120,,202403050DAL
63,64,20240307,https://www.nba.com/game/min-vs-ind-0022300903...,,MIN,L,111,113,,202403070IND
64,65,20240310,https://www.nba.com/game/ind-vs-orl-0022300928...,@,ORL,W,111,97,,202403100ORL
65,66,20240312,https://www.nba.com/game/ind-vs-okc-0022300941...,@,OKC,W,121,111,,202403120OKC


In [16]:
# Function to scrape data from a given URL
def scrape_datahome(url):
    try:
        # chrome driver to scrape dynamic webpages
        driver = webdriver.Chrome()
        driver.get(url)

        # Wait for the table to be present on the page
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

        # beautiful soup to parse it
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all tables on the page
        tables = soup.find_all('table')

        # Select the second table (indexing starts from 0)
        t = tables[1]

        # Extract headers and data from the table
        headers = [i.text.strip() for i in t.find_all('th')]
        data = [[i.text.strip() for i in b.find_all('td')] for b in t.find_all('tr')]

        # Remove empty rows
        data = [row for row in data if len(row) > 1]

        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)

        return df
    except TimeoutException:
        print("Page load timed out. Retrying...")
        return None
    finally:
        driver.quit()

# Initialize an empty list to store DataFrames
hustlehome_dfs = []

# Iterate through the schedule DataFrame and scrape data for rows where "Away" is NaN
for index, row in schedule.iterrows():
    if pd.isna(row['Away']):
        game_id = row['Game ID']
        url = row['url']
        print(f"Scraping data for Game ID: {game_id}")

        # Attempt to scrape data from the URL
        df = scrape_datahome(url)

        if df is not None:
            # Add Game ID to the DataFrame
            df['Game ID'] = game_id

            # Append the scraped DataFrame to the list
            hustlehome_dfs.append(df)
        else:
            print(f"Failed to scrape data for Game ID: {game_id}")

# Concatenate all scraped DataFrames into a single DataFrame
hustlehome1 = pd.concat(hustlehome_dfs, ignore_index=True)

# Display the combined DataFrame
print(hustlehome1)

Scraping data for Game ID: 202310250IND
Scraping data for Game ID: 202310300IND
Scraping data for Game ID: 202311030IND
Scraping data for Game ID: 202311040IND
Scraping data for Game ID: 202311060IND
Scraping data for Game ID: 202311080IND
Scraping data for Game ID: 202311090IND
Scraping data for Game ID: 202311190IND
Page load timed out. Retrying...
Failed to scrape data for Game ID: 202311190IND
Scraping data for Game ID: 202311220IND
Scraping data for Game ID: 202311240IND
Scraping data for Game ID: 202311270IND
Scraping data for Game ID: 202312040IND
Scraping data for Game ID: 202312180IND
Scraping data for Game ID: 202312200IND
Scraping data for Game ID: 202312230IND
Scraping data for Game ID: 202312300IND
Scraping data for Game ID: 202401030IND
Scraping data for Game ID: 202401050IND
Scraping data for Game ID: 202401060IND
Scraping data for Game ID: 202401080IND
Scraping data for Game ID: 202401100IND
Scraping data for Game ID: 202401230IND
Scraping data for Game ID: 202401250IND

In [17]:
# Function to scrape data from a given URL
def scrape_dataaway(url):
    try:
        # chrome driver to scrape dynamic webpages
        driver = webdriver.Chrome()
        driver.get(url)

        # Wait for the table to be present on the page
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

        # beautiful soup to parse it
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all tables on the page
        tables = soup.find_all('table')

        # Select the second table (indexing starts from 0)
        t = tables[0]

        # Extract headers and data from the table
        headers = [i.text.strip() for i in t.find_all('th')]
        data = [[i.text.strip() for i in b.find_all('td')] for b in t.find_all('tr')]

        # Remove empty rows
        data = [row for row in data if len(row) > 1]

        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)

        return df
    except TimeoutException:
        print("Page load timed out. Retrying...")
        return None
    finally:
        driver.quit()

# Initialize an empty list to store DataFrames
hustleaway1_dfs = []

# Iterate through the schedule DataFrame and scrape data for rows where "Away" is NaN
for index, row in schedule.iterrows():
    if not pd.isna(row['Away']):
        game_id = row['Game ID']
        url = row['url']
        print(f"Scraping data for Game ID: {game_id}")

        # Attempt to scrape data from the URL
        df = scrape_dataaway(url)

        if df is not None:
            # Add Game ID to the DataFrame
            df['Game ID'] = game_id

            # Append the scraped DataFrame to the list
            hustleaway1_dfs.append(df)
        else:
            print(f"Failed to scrape data for Game ID: {game_id}")

# Concatenate all scraped DataFrames into a single DataFrame
hustleaway1 = pd.concat(hustleaway1_dfs, ignore_index=True)

# Display the combined DataFrame
print(hustleaway1)

Scraping data for Game ID: 202310280CLE
Scraping data for Game ID: 202311010BOS
Scraping data for Game ID: 202311120PHI
Scraping data for Game ID: 202311140PHI
Scraping data for Game ID: 202311210ATL
Scraping data for Game ID: 202311300MIA
Scraping data for Game ID: 202312020MIA
Scraping data for Game ID: 202312070MIL
Scraping data for Game ID: 202312110DET
Scraping data for Game ID: 202312130MIL
Scraping data for Game ID: 202312150WAS
Page load timed out. Retrying...
Failed to scrape data for Game ID: 202312150WAS
Scraping data for Game ID: 202312160MIN
Scraping data for Game ID: 202312210MEM
Scraping data for Game ID: 202312260HOU
Scraping data for Game ID: 202312280CHI
Scraping data for Game ID: 202401010MIL
Page load timed out. Retrying...
Failed to scrape data for Game ID: 202401010MIL
Scraping data for Game ID: 202401120ATL
Scraping data for Game ID: 202401140DEN
Scraping data for Game ID: 202401150UTA
Scraping data for Game ID: 202401180SAC
Scraping data for Game ID: 202401190PO

In [18]:
# Initialize empty lists to store the URLs
failedhome_urls = []

# Iterate through the schedule DataFrame to get the URLs for the failed games
for index, row in schedule.iterrows():
    if row['Game ID'] in ['202311190IND']:
        failedhome_urls.append(row['url'])

# Initialize an empty list to store DataFrames for the failed games
failedhome_dfs = []

# Scrape data for the failed games
for url in failedhome_urls:
    print(f"Scraping data for URL: {url}")

    # Attempt to scrape data from the URL
    df = scrape_datahome(url)

    if df is not None:
        # Append the scraped DataFrame to the list of failed ones
        failedhome_dfs.append(df)
    else:
        print(f"Failed to scrape data for URL: {url}")

# Concatenate the DataFrames for the failed games
failedhome_df = pd.concat(failedhome_dfs, ignore_index=True)

Scraping data for URL: https://www.nba.com/game/orl-vs-ind-0022300209/box-score?type=hustle


In [22]:
# Initialize empty lists to store the URLs
failedaway_urls = []

# Iterate through the schedule DataFrame to get the URLs for the failed games
for index, row in schedule.iterrows():
    if row['Game ID'] in ['202312150WAS','202401010MIL','202401190POR','202403010NOP']:
        failedaway_urls.append(row['url'])

# Initialize an empty list to store DataFrames for the failed games
failedaway_dfs = []

# Scrape data for the failed games
for url in failedaway_urls:
    print(f"Scraping data for URL: {url}")

    # Attempt to scrape data from the URL
    df = scrape_dataaway(url)

    if df is not None:
        # Append the scraped DataFrame to the list of failed ones
        failedaway_dfs.append(df)
    else:
        print(f"Failed to scrape data for URL: {url}")

# Concatenate the DataFrames for the failed games
failedaway_df = pd.concat(failedaway_dfs, ignore_index=True)

Scraping data for URL: https://www.nba.com/game/ind-vs-was-0022300328/box-score?type=hustle
Scraping data for URL: https://www.nba.com/game/ind-vs-mil-0022300454/box-score?type=hustle
Scraping data for URL: https://www.nba.com/game/ind-vs-por-0022300590/box-score?type=hustle
Scraping data for URL: https://www.nba.com/game/ind-vs-nop-0022300863/box-score?type=hustle


In [23]:
# Concatenate the DataFrames for the failed games
home = pd.concat([hustlehome1, failedhome_df], ignore_index=True)
home

Unnamed: 0,PLAYER,MIN,SCREEN\nAST,SCREEN\nAST PTS,deflections,OFF LOOSE BALLS\nRECOVERED,DEF LOOSE BALLS\nRECOVERED,LOOSE BALLS\nRECOVERED,CHARGES\nDRAWN,CONTESTED\n2PT SHOTS,CONTESTED\n3PT SHOTS,CONTESTED\nSHOTS,OFF\nBOX OUTS,DEF\nBOX OUTS,BOX OUTS,Game ID
0,Bennedict MathurinB. MathurinF,25:50,1,2,3,0,0,0,0,1,2,3,0,1,1,202310250IND
1,Obi ToppinO. ToppinF,19:14,2,5,1,0,1,1,0,3,0,3,0,1,1,202310250IND
2,Myles TurnerM. TurnerC,23:20,7,16,0,0,0,0,0,6,2,8,0,2,2,202310250IND
3,Bruce BrownB. BrownG,27:12,0,0,1,0,0,0,0,3,3,6,0,0,0,202310250IND
4,Tyrese HaliburtonT. HaliburtonG,26:54,0,0,2,0,0,0,0,2,1,3,0,0,0,202310250IND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,Ben SheppardB. Sheppard,16:10,0,0,3,0,1,1,0,1,1,2,0,0,0,
403,Jarace WalkerJ. Walker,14:00,1,3,1,0,0,0,0,3,0,3,0,0,0,
404,Jordan NworaJ. Nwora,23:11,0,0,4,0,0,0,0,2,0,2,0,0,0,
405,Isaiah JacksonI. Jackson,12:00,1,2,2,0,1,1,0,1,2,3,0,0,0,


In [24]:
# Concatenate the DataFrames for the failed games
away = pd.concat([hustleaway1, failedaway_df], ignore_index=True)
away

Unnamed: 0,PLAYER,MIN,SCREEN\nAST,SCREEN\nAST PTS,deflections,OFF LOOSE BALLS\nRECOVERED,DEF LOOSE BALLS\nRECOVERED,LOOSE BALLS\nRECOVERED,CHARGES\nDRAWN,CONTESTED\n2PT SHOTS,CONTESTED\n3PT SHOTS,CONTESTED\nSHOTS,OFF\nBOX OUTS,DEF\nBOX OUTS,BOX OUTS,Game ID
0,Bennedict MathurinB. MathurinF,17:04,0,0,0,0,0,0,0,0,1,1,0,1,1,202310280CLE
1,Obi ToppinO. ToppinF,18:00,0,0,0,1,0,1,0,4,3,7,0,0,0,202310280CLE
2,Myles TurnerM. TurnerC,29:25,0,0,1,0,0,0,0,9,1,10,0,2,2,202310280CLE
3,Bruce BrownB. BrownG,32:11,1,2,4,0,0,0,0,1,1,2,0,0,0,202310280CLE
4,Tyrese HaliburtonT. HaliburtonG,32:01,0,0,1,0,0,0,0,3,2,5,0,0,0,202310280CLE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,Isaiah JacksonI. Jackson,17:57,1,3,3,0,0,0,0,3,0,3,0,0,0,
387,James JohnsonJ. Johnson,7:18,0,0,0,0,0,0,0,1,0,1,0,0,0,
388,Kendall BrownK. Brown,6:37,0,0,2,1,0,1,0,0,0,0,0,0,0,
389,Oscar TshiebweO. Tshiebwe,4:42,0,0,0,1,0,1,0,1,0,1,0,1,1,


In [25]:
hustle = pd.concat([home, away], ignore_index=True)

In [26]:
hustle

Unnamed: 0,PLAYER,MIN,SCREEN\nAST,SCREEN\nAST PTS,deflections,OFF LOOSE BALLS\nRECOVERED,DEF LOOSE BALLS\nRECOVERED,LOOSE BALLS\nRECOVERED,CHARGES\nDRAWN,CONTESTED\n2PT SHOTS,CONTESTED\n3PT SHOTS,CONTESTED\nSHOTS,OFF\nBOX OUTS,DEF\nBOX OUTS,BOX OUTS,Game ID
0,Bennedict MathurinB. MathurinF,25:50,1,2,3,0,0,0,0,1,2,3,0,1,1,202310250IND
1,Obi ToppinO. ToppinF,19:14,2,5,1,0,1,1,0,3,0,3,0,1,1,202310250IND
2,Myles TurnerM. TurnerC,23:20,7,16,0,0,0,0,0,6,2,8,0,2,2,202310250IND
3,Bruce BrownB. BrownG,27:12,0,0,1,0,0,0,0,3,3,6,0,0,0,202310250IND
4,Tyrese HaliburtonT. HaliburtonG,26:54,0,0,2,0,0,0,0,2,1,3,0,0,0,202310250IND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,Isaiah JacksonI. Jackson,17:57,1,3,3,0,0,0,0,3,0,3,0,0,0,
794,James JohnsonJ. Johnson,7:18,0,0,0,0,0,0,0,1,0,1,0,0,0,
795,Kendall BrownK. Brown,6:37,0,0,2,1,0,1,0,0,0,0,0,0,0,
796,Oscar TshiebweO. Tshiebwe,4:42,0,0,0,1,0,1,0,1,0,1,0,1,1,
