In [None]:
import time
import unicodedata
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Scrape the 2025 standard-batting table into the DataFrame `df`.
#
# The browser is only needed to fetch the rendered page source, so it is shut
# down in a `finally` block right after that step. This guarantees the headless
# Chrome process is released even if navigation or a later step raises.

# Set up Selenium (headless Chrome)
options = Options()
options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("user-agent=Mozilla/5.0")

url = "https://www.baseball-reference.com/leagues/majors/2025-standard-batting.shtml"

driver = webdriver.Chrome(options=options)
try:
    # Go to the page
    driver.get(url)

    # Fixed pause to give the page's JavaScript time to finish loading
    time.sleep(5)

    # Capture the rendered markup while the browser is still alive
    html = driver.page_source
finally:
    # Always release the browser, on success or failure
    driver.quit()

# Parse page
soup = BeautifulSoup(html, "html.parser")

# After the JS load the table is expected directly in the page, not hidden in a comment
table = soup.find('table', {'id': 'players_standard_batting'})

# Fail loudly if the table is missing so later cells don't run on stale data
if table is None:
    print("❌ Table not found after JS load")
    raise ValueError("Table still not found after waiting.")

# Read table into pandas. The markup is wrapped in StringIO because passing a
# literal HTML string to read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))[0]

# Drop header rows that are repeated inside the table body
df = df[df['Rk'] != 'Rk'].reset_index(drop=True)

print("✅ DataFrame preview:")
print(df.head())


In [None]:
# Keep only the two columns needed for the player -> team lookup.
df2 = df.loc[:, ['Player', 'Team']]

In [None]:
# Step 1: Drop the combined rows whose Team value is a multi-team marker
# rather than an actual club.
multi_team_flags = ['2TM', '3TM', '4TM', '5TM']
filtered_df = df2.loc[~df2['Team'].isin(multi_team_flags)].copy()

# Step 2: Keep only the LAST row listed for each player (treated as the current team).
# NOTE(review): assumes the source lists a player's teams in chronological order,
# and that no two different players share the same display name -- confirm.
df2 = (
    filtered_df
    .drop_duplicates(subset='Player', keep='last')
    .reset_index(drop=True)
)

In [None]:
def clean_name_symbols(name):
    """Return ``name`` with every ``*`` and ``#`` character removed.

    Non-string values are returned unchanged.
    """
    if isinstance(name, str):
        # The third maketrans argument lists the characters to delete.
        return name.translate(str.maketrans('', '', '*#'))
    return name
# Strip the '*' / '#' marker characters from every player name.
df2['Player'] = df2['Player'].map(clean_name_symbols)

In [None]:
# Normalize the names: strip accents and any other non-ASCII characters
def normalize_name(name):
    """Return an ASCII-only version of ``name``.

    The string is decomposed with NFKD so accented letters split into a base
    letter plus combining marks. Every non-ASCII code point is then dropped,
    including characters that have no ASCII decomposition.
    Non-string values are returned unchanged.
    """
    if isinstance(name, str):
        decomposed = unicodedata.normalize('NFKD', name)
        return ''.join(ch for ch in decomposed if ord(ch) < 128)
    return name
# Convert every player name to its ASCII-only form.
df2['Player'] = df2['Player'].map(normalize_name)


In [None]:
# Show the distinct cleaned player names as a quick sanity check.
df2.loc[:, 'Player'].unique()

In [None]:
# Write the player -> team table to disk. index=True is the pandas default,
# spelled out here: the row index is written as an unnamed leading column.
df2.to_csv('../../general/stats/batter_team_2025.csv', index=True)