In [96]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

In [97]:
schedule = pd.read_csv("Schedule.csv")
schedule['Web'] = schedule['Web'].str.replace('#box-score$', '?type=advanced', regex=True)
schedule

Unnamed: 0,G,Date,Web,Away,Opponent,W/L,Pacers,Opp,Notes,Game ID
0,1,20231025,https://www.nba.com/game/was-vs-ind-0022300064...,,WAS,W,143,120,,202310250IND
1,2,20231028,https://www.nba.com/game/ind-vs-cle-0022300091...,@,CLE,W,125,113,,202310280CLE
2,3,20231030,https://www.nba.com/game/chi-vs-ind-0022300102...,,CHI,L,105,112,,202310300IND
3,4,20231101,https://www.nba.com/game/ind-vs-bos-0022300118...,@,BOS,L,104,155,,202311010BOS
4,5,20231103,https://www.nba.com/game/cle-vs-ind-0022300001...,,CLE,W,121,116,In-Season Tournament,202311030IND
...,...,...,...,...,...,...,...,...,...,...
62,63,20240305,https://www.nba.com/game/ind-vs-dal-0022300892...,@,DAL,W,137,120,,202403050DAL
63,64,20240307,https://www.nba.com/game/min-vs-ind-0022300903...,,MIN,L,111,113,,202403070IND
64,65,20240310,https://www.nba.com/game/ind-vs-orl-0022300928...,@,ORL,W,111,97,,202403100ORL
65,66,20240312,https://www.nba.com/game/ind-vs-okc-0022300941...,@,OKC,W,121,111,,202403120OKC


In [98]:
schedule.rename(columns={'Web':'url'}, inplace =True)
schedule

Unnamed: 0,G,Date,url,Away,Opponent,W/L,Pacers,Opp,Notes,Game ID
0,1,20231025,https://www.nba.com/game/was-vs-ind-0022300064...,,WAS,W,143,120,,202310250IND
1,2,20231028,https://www.nba.com/game/ind-vs-cle-0022300091...,@,CLE,W,125,113,,202310280CLE
2,3,20231030,https://www.nba.com/game/chi-vs-ind-0022300102...,,CHI,L,105,112,,202310300IND
3,4,20231101,https://www.nba.com/game/ind-vs-bos-0022300118...,@,BOS,L,104,155,,202311010BOS
4,5,20231103,https://www.nba.com/game/cle-vs-ind-0022300001...,,CLE,W,121,116,In-Season Tournament,202311030IND
...,...,...,...,...,...,...,...,...,...,...
62,63,20240305,https://www.nba.com/game/ind-vs-dal-0022300892...,@,DAL,W,137,120,,202403050DAL
63,64,20240307,https://www.nba.com/game/min-vs-ind-0022300903...,,MIN,L,111,113,,202403070IND
64,65,20240310,https://www.nba.com/game/ind-vs-orl-0022300928...,@,ORL,W,111,97,,202403100ORL
65,66,20240312,https://www.nba.com/game/ind-vs-okc-0022300941...,@,OKC,W,121,111,,202403120OKC


In [99]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Function to scrape data from a given URL
def scrape_data(url):
    try:
        # chrome driver to scrape dynamic webpages
        driver = webdriver.Chrome()
        driver.get(url)

        # Wait for the table to be present on the page
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

        # beautiful soup to parse it
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all tables on the page
        tables = soup.find_all('table')

        # Select the second table (indexing starts from 0)
        t = tables[1]

        # Extract headers and data from the table
        headers = [i.text.strip() for i in t.find_all('th')]
        data = [[i.text.strip() for i in b.find_all('td')] for b in t.find_all('tr')]

        # Remove empty rows
        data = [row for row in data if len(row) > 1]

        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)

        return df
    except TimeoutException:
        print("Page load timed out. Retrying...")
        return None
    finally:
        driver.quit()

# Initialize an empty list to store DataFrames
advancedhome_dfs = []

# Iterate through the schedule DataFrame and scrape data for rows where "Away" is NaN
for index, row in schedule.iterrows():
    if pd.isna(row['Away']):
        game_id = row['Game ID']
        url = row['url']
        print(f"Scraping data for Game ID: {game_id}")

        # Attempt to scrape data from the URL
        df = scrape_data(url)

        if df is not None:
            # Add Game ID to the DataFrame
            df['Game ID'] = game_id

            # Append the scraped DataFrame to the list
            advancedhome_dfs.append(df)
        else:
            print(f"Failed to scrape data for Game ID: {game_id}")

# Concatenate all scraped DataFrames into a single DataFrame
advancedhome = pd.concat(advancedhome_dfs, ignore_index=True)

# Display the combined DataFrame
print(advancedhome)

Scraping data for Game ID: 202310250IND
Scraping data for Game ID: 202310300IND
Scraping data for Game ID: 202311030IND
Scraping data for Game ID: 202311040IND
Scraping data for Game ID: 202311060IND
Scraping data for Game ID: 202311080IND
Scraping data for Game ID: 202311090IND
Scraping data for Game ID: 202311190IND
Scraping data for Game ID: 202311220IND
Scraping data for Game ID: 202311240IND
Scraping data for Game ID: 202311270IND
Scraping data for Game ID: 202312040IND
Scraping data for Game ID: 202312180IND
Scraping data for Game ID: 202312200IND
Scraping data for Game ID: 202312230IND
Page load timed out. Retrying...
Failed to scrape data for Game ID: 202312230IND
Scraping data for Game ID: 202312300IND
Scraping data for Game ID: 202401030IND
Scraping data for Game ID: 202401050IND
Scraping data for Game ID: 202401060IND
Scraping data for Game ID: 202401080IND
Scraping data for Game ID: 202401100IND
Scraping data for Game ID: 202401230IND
Scraping data for Game ID: 202401250IND

In [100]:
# Initialize empty lists to store the URLs
failed_urls = []

# Iterate through the schedule DataFrame to get the URLs for the failed games
for index, row in schedule.iterrows():
    if row['Game ID'] in ['202312230IND']:
        failed_urls.append(row['url'])

# Initialize an empty list to store DataFrames for the failed games
failed_dfs = []

# Scrape data for the failed games
for url in failed_urls:
    print(f"Scraping data for URL: {url}")

    # Attempt to scrape data from the URL
    df = scrape_data(url)

    if df is not None:
        # Append the scraped DataFrame to the list of failed ones
        failed_dfs.append(df)
    else:
        print(f"Failed to scrape data for URL: {url}")

# Concatenate the DataFrames for the failed games
failed_combined_df = pd.concat(failed_dfs, ignore_index=True)

# Combine the failed DataFrames with the advancedhome DataFrame
combinedhome_df = pd.concat([advancedhome, failed_combined_df], ignore_index=True)

# Display the combined DataFrame
print(combinedhome_df)

Scraping data for URL: https://www.nba.com/game/orl-vs-ind-0022300391/box-score?type=advanced
                              PLAYER                     MIN OFFRTG DEFRTG  \
0     Bennedict MathurinB. MathurinF                   25:50  127.4  106.6   
1               Obi ToppinO. ToppinF                   19:14  137.0  121.7   
2             Myles TurnerM. TurnerC                   23:20  137.0  109.1   
3               Bruce BrownB. BrownG                   27:12  141.0  112.7   
4    Tyrese HaliburtonT. HaliburtonG                   26:54  125.0  115.6   
..                               ...                     ...    ...    ...   
513          James JohnsonJ. Johnson  DNP - Coach's Decision   None   None   
514             Jordan NworaJ. Nwora  DNP - Coach's Decision   None   None   
515          Ben SheppardB. Sheppard  DNP - Coach's Decision   None   None   
516           Jarace WalkerJ. Walker  DNP - Coach's Decision   None   None   
517                           TOTALS            

In [101]:
# Function to scrape data from a given URL
def scrape_dataaway(url):
    try:
        # chrome driver to scrape dynamic webpages
        driver = webdriver.Chrome()
        driver.get(url)

        # Wait for the table to be present on the page
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

        # beautiful soup to parse it
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all tables on the page
        tables = soup.find_all('table')

        # Select the second table (indexing starts from 0)
        t = tables[0]

        # Extract headers and data from the table
        headers = [i.text.strip() for i in t.find_all('th')]
        data = [[i.text.strip() for i in b.find_all('td')] for b in t.find_all('tr')]

        # Remove empty rows
        data = [row for row in data if len(row) > 1]

        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)

        return df
    except TimeoutException:
        print("Page load timed out. Retrying...")
        return None
    finally:
        driver.quit()

# Initialize an empty list to store DataFrames
advancedaway_dfs = []

# Iterate through the schedule DataFrame and scrape data for rows where "Away" is NaN
for index, row in schedule.iterrows():
    if not pd.isna(row['Away']):
        game_id = row['Game ID']
        url = row['url']
        print(f"Scraping data for Game ID: {game_id}")

        # Attempt to scrape data from the URL
        df = scrape_dataaway(url)

        if df is not None:
            # Add Game ID to the DataFrame
            df['Game ID'] = game_id

            # Append the scraped DataFrame to the list
            advancedaway_dfs.append(df)
        else:
            print(f"Failed to scrape data for Game ID: {game_id}")

# Concatenate all scraped DataFrames into a single DataFrame
advancedaway = pd.concat(advancedaway_dfs, ignore_index=True)

# Display the combined DataFrame
print(advancedaway) 

Scraping data for Game ID: 202310280CLE
Scraping data for Game ID: 202311010BOS
Scraping data for Game ID: 202311120PHI
Scraping data for Game ID: 202311140PHI
Scraping data for Game ID: 202311210ATL
Scraping data for Game ID: 202311300MIA
Scraping data for Game ID: 202312020MIA
Scraping data for Game ID: 202312070MIL
Scraping data for Game ID: 202312110DET
Scraping data for Game ID: 202312130MIL
Scraping data for Game ID: 202312150WAS
Scraping data for Game ID: 202312160MIN
Scraping data for Game ID: 202312210MEM
Scraping data for Game ID: 202312260HOU
Scraping data for Game ID: 202312280CHI
Scraping data for Game ID: 202401010MIL
Scraping data for Game ID: 202401120ATL
Scraping data for Game ID: 202401140DEN
Scraping data for Game ID: 202401150UTA
Scraping data for Game ID: 202401180SAC
Scraping data for Game ID: 202401190POR
Scraping data for Game ID: 202401210PHO
Scraping data for Game ID: 202401300BOS
Scraping data for Game ID: 202402010NYK
Scraping data for Game ID: 202402040CHO


In [102]:
advanced = pd.concat([combinedhome_df, advancedaway], ignore_index=True)

In [104]:
advanced.to_csv("combined_advanced.csv", index = False)

In [105]:
advanced

Unnamed: 0,PLAYER,MIN,OFFRTG,DEFRTG,NETRTG,AST%,AST/TO,AST RATIO,OREB%,DREB%,REB%,TO RATIO,EFG%,TS%,USG%,PACE,PIE,Game ID
0,Bennedict MathurinB. MathurinF,25:50,127.4,106.6,20.9,20.0,2.5,20.8,0.0,13.8,7.4,8.3,46.7,53.7,27.5,114.26,11.7,202310250IND
1,Obi ToppinO. ToppinF,19:14,137.0,121.7,15.2,5.3,0.0,10.0,5.0,17.6,10.8,0.0,55.6,58.3,17.0,114.80,9.8,202310250IND
2,Myles TurnerM. TurnerC,23:20,137.0,109.1,27.9,4.2,0.5,8.3,0.0,29.6,18.2,16.7,62.5,61.9,19.3,112.11,8.5,202310250IND
3,Bruce BrownB. BrownG,27:12,141.0,112.7,28.3,3.8,1.0,7.1,4.2,6.7,5.6,7.1,100.0,101.0,18.8,109.41,12.7,202310250IND
4,Tyrese HaliburtonT. HaliburtonG,26:54,125.0,115.6,9.4,52.4,3.7,36.7,0.0,10.3,5.2,10.0,62.5,62.5,25.7,114.21,16.8,202310250IND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Jarace WalkerJ. Walker,6:51,157.1,121.4,35.7,0.0,0.0,0.0,25.0,11.1,15.4,0.0,75.0,75.0,13.3,98.10,8.8,202403120OKC
996,Kendall BrownK. Brown,DNP - Coach's Decision,,,,,,,,,,,,,,,,202403120OKC
997,Isaiah JacksonI. Jackson,DNP - Coach's Decision,,,,,,,,,,,,,,,,202403120OKC
998,James JohnsonJ. Johnson,DND - Injury/Illness,,,,,,,,,,,,,,,,202403120OKC
