In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from lxml import html
import time
import pandas as pd

In [2]:
# Build the Chrome launch configuration, then start a browser session that the
# remaining cells drive through the module-level `driver`.
options = webdriver.ChromeOptions()
# options.add_argument('--headless')

# Command-line switches, passed to Chrome in this order.
for chrome_flag in (
    '--disable-blink-features=AutomationControlled',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
):
    options.add_argument(chrome_flag)

# Experimental options that remove the automation switch and extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# webdriver_manager resolves (and, if needed, downloads) a chromedriver binary.
driver_path = ChromeDriverManager().install()
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=options)

# Hide webdriver property
# NOTE(review): this script runs against whatever document is loaded right now
# (no page has been requested yet) -- confirm it still applies after driver.get().
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

In [3]:
# Scrape parameters. Neither name is referenced by the cells visible here;
# presumably they identify the batting / Depth Charts projection set -- confirm.
stats_type, projection_type = 'bat', 'fangraphsdc'

In [4]:
# Projections landing page; the specific projection set is chosen later by
# clicking through the page rather than through the URL.
url = "https://www.fangraphs.com/projections"
print(f"Loading: {url}")

Loading: https://www.fangraphs.com/projections


In [5]:
# Point the browser session created above at the projections landing page.
driver.get(url)

In [6]:
# Switch the page to the "Depth Charts" projection set.
# Wait (up to 15 s) for the link to be clickable instead of looking it up
# immediately: under Restart & Run All this cell runs right after driver.get(),
# when the link may not have been rendered yet.
dc_link = WebDriverWait(driver, 15).until(
    EC.element_to_be_clickable((By.LINK_TEXT, "Depth Charts"))
)
dc_link.click()

In [12]:
# Change the table's page-size control so that every row is rendered at once.
# Wait (up to 15 s) for the control to exist: the preceding click reloads the
# table, so an immediate lookup can fail when the cells run back to back.
dropdown = WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "page-item-control"))
)
# First <select> inside the control; find_element raises NoSuchElementException
# (rather than an opaque IndexError) when it is missing.
select_elem = dropdown.find_element(By.TAG_NAME, "select")
select = Select(select_elem)
# NOTE(review): '2000000000' is presumably the option value meaning "show all
# rows" -- confirm against the page's <option> list.
select.select_by_value('2000000000')

In [13]:
# Snapshot the browser's current DOM as HTML text and parse it with lxml so the
# tables can be queried with XPath without further browser round-trips.
# NOTE(review): the snapshot is taken immediately after the page-size change;
# confirm the table has finished re-rendering before this cell runs.
page_source = driver.page_source
tree = html.fromstring(page_source)

In [14]:
# Collect every <table> element anywhere in the parsed document; the next cell
# decides which one holds the projections.
tables = tree.xpath('//table')
print(f"Found {len(tables)} tables in HTML")

Found 16 tables in HTML


In [15]:
# Pick the table with the most <tr> rows as the projections table.
# On a tie the earliest table wins, because only a strictly larger row count
# replaces the current best.
best_table = None
best_table_idx = None  # bound up front so later cells never hit a NameError
max_rows = 0
for i, table in enumerate(tables):
    rows = table.xpath('.//tr')
    print(f"Table {i}: {len(rows)} rows")
    if len(rows) > max_rows:
        max_rows = len(rows)
        best_table = table
        best_table_idx = i

# Fail here with a clear message instead of a confusing AttributeError further
# down when the page contained no populated table.
if best_table is None:
    raise RuntimeError("No table with rows was found in the page source")

Table 0: 2 rows
Table 1: 1 rows
Table 2: 0 rows
Table 3: 31 rows
Table 4: 5 rows
Table 5: 5 rows
Table 6: 5 rows
Table 7: 5 rows
Table 8: 5 rows
Table 9: 5 rows
Table 10: 2 rows
Table 11: 2 rows
Table 12: 5 rows
Table 13: 5 rows
Table 14: 655 rows
Table 15: 655 rows


In [16]:
# Report which table index was selected and how many <tr> rows it contains.
print(f"\nUsing table {best_table_idx} with {max_rows} rows")


Using table 14 with 655 rows


In [90]:
# Re-query the rows of the selected table and derive the column names from the
# <th> cells of its first row.
data = []
rows = best_table.xpath('.//tr')

header_row = rows[0]
# 'FGId' leads the column list; it is not a table column but is filled in from
# each player's link when the body rows are parsed.
headers = ['FGId']
headers.extend(
    header_cell.text_content().strip()
    for header_cell in header_row.xpath('.//th')
)
print(f"Headers ({len(headers)}): {headers[:10]}...")

Headers (23): ['FGId', '#', 'Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB']...


In [110]:
def extract_player_data_from_row(row_element):
    """Extract the stats and the player ID from one table row.

    Args:
        row_element: An element exposing ``xpath()`` (e.g. an lxml ``<tr>``)
            whose ``<td>`` cells carry a ``data-stat`` attribute.

    Returns:
        dict: One entry per ``<td>`` that has a non-empty ``data-stat``
        attribute, mapping that attribute to the cell's stripped text, plus an
        ``'FGId'`` entry. ``'FGId'`` is the first all-digit path segment of the
        Name cell's first link when that link's href contains ``/players/``;
        otherwise it is ``None``. The key is always present.
    """
    data = {}

    # Cells without a data-stat attribute (or with an empty one) are skipped.
    for cell in row_element.xpath('.//td'):
        stat_name = cell.get('data-stat')
        if stat_name:
            data[stat_name] = cell.text_content().strip()

    # Resolve the player ID with explicit checks rather than a blanket
    # try/except, so genuine errors are not silently swallowed.
    fg_id = None
    name_links = row_element.xpath('.//td[@data-stat="Name"]//a')
    href = name_links[0].get('href') if name_links else None
    if href and '/players/' in href:
        # The ID is the first purely numeric segment of the link path.
        fg_id = next((part for part in href.split('/') if part.isdigit()), None)

    data['FGId'] = fg_id
    return data
    

In [111]:
# Convert every body row into a dict; rows[0] is the header row and is skipped.
data = []
for row in rows[1:]:
    row_data = extract_player_data_from_row(row)
    data.append(row_data)
    print(row_data)

{'Name': 'Aaron Judge', 'Team': 'NYY', 'G': '156', 'PA': '672', 'HR': '45', 'R': '116', 'RBI': '110', 'SB': '9', 'BB%': '17.7%', 'K%': '24.5%', 'ISO': '.302', 'BABIP': '.325', 'AVG': '.285', 'OBP': '.417', 'SLG': '.587', 'wOBA': '.415', 'wRC+': '171', 'BsR': '-0.5', 'Off': '56.1', 'Def': '-7.3', 'WAR': '7.4', 'FGId': '15640'}
{'Name': 'Bobby Witt Jr.', 'Team': 'KCR', 'G': '157', 'PA': '679', 'HR': '28', 'R': '104', 'RBI': '92', 'SB': '34', 'BB%': '7.4%', 'K%': '16.7%', 'ISO': '.219', 'BABIP': '.316', 'AVG': '.291', 'OBP': '.349', 'SLG': '.510', 'wOBA': '.364', 'wRC+': '132', 'BsR': '3.2', 'Off': '29.0', 'Def': '14.0', 'WAR': '6.8', 'FGId': '25764'}
{'Name': 'Julio Rodríguez', 'Team': 'SEA', 'G': '159', 'PA': '686', 'HR': '33', 'R': '94', 'RBI': '101', 'SB': '25', 'BB%': '7.2%', 'K%': '21.7%', 'ISO': '.215', 'BABIP': '.309', 'AVG': '.274', 'OBP': '.334', 'SLG': '.489', 'wOBA': '.351', 'wRC+': '133', 'BsR': '2.3', 'Off': '29.0', 'Def': '8.4', 'WAR': '6.2', 'FGId': '23697'}
{'Name': 'Juan

In [112]:
# Build the frame from the row dicts, using `headers` to fix the column order.
# `headers` also contains '#', a key the row dicts printed above do not carry,
# so that column has no values.
df = pd.DataFrame(data, columns=headers)

In [114]:
# Remove the '#' column. Rebinding with errors='ignore' (instead of an in-place
# drop) keeps this cell safe to re-run: a second execution is a no-op rather
# than a KeyError.
df = df.drop(columns='#', errors='ignore')

In [115]:
# Show the final frame as the cell's rich output.
df

Unnamed: 0,FGId,Name,Team,G,PA,HR,R,RBI,SB,BB%,...,BABIP,AVG,OBP,SLG,wOBA,wRC+,BsR,Off,Def,WAR
0,15640,Aaron Judge,NYY,156,672,45,116,110,9,17.7%,...,.325,.285,.417,.587,.415,171,-0.5,56.1,-7.3,7.4
1,25764,Bobby Witt Jr.,KCR,157,679,28,104,92,34,7.4%,...,.316,.291,.349,.510,.364,132,3.2,29.0,14.0,6.8
2,23697,Julio Rodríguez,SEA,159,686,33,94,101,25,7.2%,...,.309,.274,.334,.489,.351,133,2.3,29.0,8.4,6.2
3,20123,Juan Soto,NYM,159,686,38,110,102,22,18.9%,...,.285,.273,.413,.533,.403,165,-1.4,51.3,-14.9,6.1
4,26289,Gunnar Henderson,BAL,156,672,28,94,93,23,10.6%,...,.310,.274,.356,.487,.361,135,2.3,29.8,5.4,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,15429,Kris Bryant,COL,32,140,4,15,15,1,7.8%,...,.301,.239,.309,.388,.306,81,-0.3,-3.4,-3.0,-0.2
650,33098,Carson McCusker,MIN,21,91,3,9,9,1,6.1%,...,.305,.210,.264,.353,.270,70,-0.1,-3.4,-1.9,-0.2
651,13723,Jacob Stallings,,39,154,3,14,13,1,7.9%,...,.269,.203,.278,.307,.262,65,-0.8,-7.2,1.9,-0.2
652,,Munetaka Murakami,,122,525,12,55,50,9,7.5%,...,.276,.217,.284,.341,.276,75,-0.1,-15.9,-5.7,-0.4
