In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
import time
import random
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
import unicodedata
import re
from difflib import get_close_matches

In [3]:
#this scraper is to scrape multiple days at a time
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from bs4 import BeautifulSoup
# import pandas as pd
# import datetime
# import time

# # 1) Chrome setup (headless + desktop viewport)
# options = Options()
# options.add_argument("--headless")
# options.add_argument("--disable-gpu")
# options.add_argument("--window-size=1920,1080")
# driver = webdriver.Chrome(options=options)

# # 2) Date range
# start_date = datetime.datetime.strptime("20250327", "%Y%m%d")
# end_date   = datetime.datetime.strptime("20250513", "%Y%m%d")

# # 3) Helper: parse an HTML <table> into a pandas DataFrame
# def parse_table(table):
#     thead = table.find('thead')
#     if thead:
#         headers = [th.get_text(strip=True) for th in thead.find_all('th')]
#     else:
#         first = table.find('tbody').find('tr')
#         headers = [f"Col{i+1}" for i,_ in enumerate(first.find_all('td'))]
#     rows = []
#     for tr in table.find('tbody').find_all('tr'):
#         cells = [td.get_text(strip=True) for td in tr.find_all('td')]
#         cells += [''] * (len(headers) - len(cells))
#         rows.append(cells)
#     return pd.DataFrame(rows, columns=headers)

# all_games = []
# current = start_date

# while current <= end_date:
#     date_str      = current.strftime("%Y%m%d")
#     scoreboard_url = f"https://www.espn.com/mlb/scoreboard/_/date/{date_str}"
#     print(f"\n🔄 Processing {date_str}")
    
#     driver.get(scoreboard_url)
#     time.sleep(5)  # wait for scoreboard to load

#     # 4) grab all boxscore links by href
#     anchors = driver.find_elements(
#         By.CSS_SELECTOR,
#         "a[href*='/mlb/boxscore/_/gameId/']"
#     )
#     hrefs = [a.get_attribute("href") for a in anchors]
#     print(f"  🎯 Found {len(hrefs)} games")

#     # 5) loop through each game link
#     for idx, href in enumerate(hrefs, start=1):
#         try:
#             print(f"    🔍 Game {idx}/{len(hrefs)} → {href}")
#             driver.get(href)
#             time.sleep(3)  # let boxscore page render

#             soup   = BeautifulSoup(driver.page_source, 'html.parser')
#             tables = soup.find_all('table', class_='Table')
#             print(f"       🧾 {len(tables)} tables found")

#             if len(tables) < 6:
#                 print("       ⚠️ Skipping — insufficient tables")
#                 continue

#             away_names  = parse_table(tables[2])
#             away_stats  = parse_table(tables[3])
#             home_names  = parse_table(tables[4])
#             home_stats  = parse_table(tables[5])

#             # align row counts
#             max_len = max(len(away_names), len(away_stats), len(home_names), len(home_stats))
#             for df in (away_names, away_stats, home_names, home_stats):
#                 df.reindex(range(max_len), fill_value='')

#             # prefix & merge
#             away_df = pd.concat([away_names, away_stats], axis=1)
#             away_df.columns = [f"away_{c}" for c in away_df.columns]
#             home_df = pd.concat([home_names, home_stats], axis=1)
#             home_df.columns = [f"home_{c}" for c in home_df.columns]
#             merged  = pd.concat([away_df, home_df], axis=1)

#             all_games.append(merged)
#             print("       ✅ Scraped")

#             # polite pause between games
#             time.sleep(2)
#         except Exception as e:
#             print(f"       ❌ Error scraping {href}: {e}")
#             time.sleep(2)
#             continue

#     # polite pause before next day's scoreboard
#     time.sleep(5)
#     current += datetime.timedelta(days=1)

# driver.quit()

# # 6) Combine and save
# if all_games:
#     final_df = pd.concat(all_games, ignore_index=True)
#     print("\n✅ DONE — total rows:", len(final_df))
#     final_df.to_csv("boxscores_20250327_to_20250513.csv", index=False)
# else:
#     print("⚠️ No data was scraped.")


In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time

# 1) Chrome setup: headless browser with a desktop-sized window
options = Options()
for chrome_flag in ("--headless", "--disable-gpu", "--window-size=1920,1080"):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

# 2) Date range to scrape; the loop below treats both ends as inclusive
start_date = datetime.datetime(2024, 7, 1)
end_date   = datetime.datetime(2024, 9, 29)

def parse_table(table):
    """Convert one parsed HTML ``<table>`` element into a DataFrame of cell text.

    Column names are the whitespace-stripped text of the ``<th>`` cells inside
    ``<thead>``.  When the table has no ``<thead>``, names are synthesised as
    ``Col1..ColN`` from the width of the first body row.

    Args:
        table: BeautifulSoup-style element exposing ``find``; the elements it
            returns must expose ``find_all`` and ``get_text``.

    Returns:
        pandas.DataFrame with one row per ``<tr>`` found in ``<tbody>`` and one
        string per ``<td>``.  Rows shorter than the header are right-padded
        with ``''``.  A table without ``<tbody>`` (or without body rows) yields
        an empty frame rather than raising ``AttributeError``, and rows wider
        than the header add extra ``ColN`` columns rather than making the
        DataFrame constructor raise.
    """
    thead = table.find('thead')
    tbody = table.find('tbody')
    body_rows = tbody.find_all('tr') if tbody is not None else []

    rows = [
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in body_rows
    ]

    if thead is not None:
        headers = [th.get_text(strip=True) for th in thead.find_all('th')]
    else:
        # No header row: name the columns after the first body row's width.
        headers = [f"Col{i+1}" for i in range(len(rows[0]))] if rows else []

    # Make header and every row the same width so DataFrame() cannot fail on
    # ragged input: widen the header for over-long rows, pad short rows.
    width = max([len(headers)] + [len(cells) for cells in rows])
    headers += [f"Col{i+1}" for i in range(len(headers), width)]
    rows = [cells + [''] * (width - len(cells)) for cells in rows]
    return pd.DataFrame(rows, columns=headers)

# Per-game frames collected across the whole date range.
all_games = []
current = start_date

# try/finally guarantees the headless Chrome process is shut down even when the
# run is interrupted or an unexpected error escapes the loop.
try:
    while current <= end_date:
        date_str      = current.strftime("%Y%m%d")
        scoreboard_url = f"https://www.espn.com/mlb/scoreboard/_/date/{date_str}"
        print(f"\n🔄 Processing {date_str}")

        driver.get(scoreboard_url)
        # wait until at least one boxscore link appears for this date
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "a[href*='/mlb/boxscore/_/gameId/']")
                )
            )
        except Exception:
            # `Exception`, not a bare `except`, so Ctrl-C / kernel interrupts
            # stop the run instead of being treated as a slow page.
            print("   ⚠️ Scoreboard did not load in time, skipping date")
            current += datetime.timedelta(days=1)
            continue

        anchors = driver.find_elements(
            By.CSS_SELECTOR,
            "a[href*='/mlb/boxscore/_/gameId/']"
        )
        raw_hrefs = [a.get_attribute("href") for a in anchors]
        # dedupe while preserving order (dict keys keep insertion order);
        # anchors whose href is missing are dropped
        hrefs = [href for href in dict.fromkeys(raw_hrefs) if href]
        print(f"  🎯 Found {len(hrefs)} unique games")

        for idx, href in enumerate(hrefs, start=1):
            print(f"    🔍 Game {idx}/{len(hrefs)} → {href}")
            try:
                driver.get(href)
                # wait until at least one stats table is present
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "table.Table"))
                )
                time.sleep(1)  # give JS a moment

                soup   = BeautifulSoup(driver.page_source, 'html.parser')
                tables = soup.find_all('table', class_='Table')
                print(f"       🧾 {len(tables)} tables found")

                if len(tables) < 6:
                    print("       ⚠️ Skipping — insufficient tables")
                    continue

                # Tables 2-5 are read as away names, away stats, home names,
                # home stats (positional; depends on the page layout).
                away_names  = parse_table(tables[2])
                away_stats  = parse_table(tables[3])
                home_names  = parse_table(tables[4])
                home_stats  = parse_table(tables[5])

                # ensure equal row counts: reindex() returns a NEW frame, so the
                # results must be rebound for the '' padding to take effect
                max_len = max(map(len, (away_names, away_stats, home_names, home_stats)))
                away_names, away_stats, home_names, home_stats = (
                    frame.reindex(range(max_len), fill_value='')
                    for frame in (away_names, away_stats, home_names, home_stats)
                )

                # merge with prefixes
                away_df = pd.concat([away_names, away_stats], axis=1)
                away_df.columns = [f"away_{c}" for c in away_df.columns]
                home_df = pd.concat([home_names, home_stats], axis=1)
                home_df.columns = [f"home_{c}" for c in home_df.columns]
                merged = pd.concat([away_df, home_df], axis=1)

                all_games.append(merged)
                print("       ✅ Scraped")
                time.sleep(2)  # polite pause
            except Exception as e:
                # best-effort: one bad game must not abort the whole range
                print(f"       ❌ Error scraping {href}: {e}")
                time.sleep(2)
                continue

        current += datetime.timedelta(days=1)
        time.sleep(5)  # pause before next date
finally:
    driver.quit()

# 6) Combine and save
if all_games:
    final_df = pd.concat(all_games, ignore_index=True)
    print(f"\n✅ DONE — total rows: {len(final_df)}")
    final_df.to_csv("2024_batting_stats.csv", index=False)
else:
    print("⚠️ No data was scraped.")



🔄 Processing 20240701
  🎯 Found 6 unique games
    🔍 Game 1/6 → https://www.espn.com/mlb/boxscore/_/gameId/401695581
       🧾 13 tables found
       ✅ Scraped
    🔍 Game 2/6 → https://www.espn.com/mlb/boxscore/_/gameId/401695585
       🧾 13 tables found
       ✅ Scraped
    🔍 Game 3/6 → https://www.espn.com/mlb/boxscore/_/gameId/401695580
       🧾 12 tables found
       ✅ Scraped
    🔍 Game 4/6 → https://www.espn.com/mlb/boxscore/_/gameId/401569738
       🧾 13 tables found
       ✅ Scraped
    🔍 Game 5/6 → https://www.espn.com/mlb/boxscore/_/gameId/401569740
       🧾 12 tables found
       ✅ Scraped
    🔍 Game 6/6 → https://www.espn.com/mlb/boxscore/_/gameId/401569739
       🧾 13 tables found
       ✅ Scraped

🔄 Processing 20240702
  🎯 Found 15 unique games
    🔍 Game 1/15 → https://www.espn.com/mlb/boxscore/_/gameId/401569744
       🧾 13 tables found
       ✅ Scraped
    🔍 Game 2/15 → https://www.espn.com/mlb/boxscore/_/gameId/401569745
       🧾 12 tables found
       ✅ Scraped
    🔍

In [4]:
#this scraper is to scrape multiple days at a time with game IDS

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from bs4 import BeautifulSoup
# import pandas as pd
# import datetime
# import time

# # 1) Chrome setup (headless + desktop viewport)
# options = Options()
# options.add_argument("--headless")
# options.add_argument("--disable-gpu")
# options.add_argument("--window-size=1920,1080")
# driver = webdriver.Chrome(options=options)

# # 2) Date range
# start_date = datetime.datetime.strptime("20240328", "%Y%m%d")
# end_date   = datetime.datetime.strptime("20240930", "%Y%m%d")

# # 3) Helper: parse an HTML <table> into a pandas DataFrame
# def parse_table(table):
#     thead = table.find('thead')
#     if thead:
#         headers = [th.get_text(strip=True) for th in thead.find_all('th')]
#     else:
#         first = table.find('tbody').find('tr')
#         headers = [f"Col{i+1}" for i,_ in enumerate(first.find_all('td'))]
#     rows = []
#     for tr in table.find('tbody').find_all('tr'):
#         cells = [td.get_text(strip=True) for td in tr.find_all('td')]
#         cells += [''] * (len(headers) - len(cells))
#         rows.append(cells)
#     return pd.DataFrame(rows, columns=headers)

# all_games = []
# current = start_date

# while current <= end_date:
#     date_str      = current.strftime("%Y%m%d")
#     scoreboard_url = f"https://www.espn.com/mlb/scoreboard/_/date/{date_str}"
#     print(f"\n Processing {date_str}")
    
#     driver.get(scoreboard_url)
#     time.sleep(5)  # wait for scoreboard to load

#     # 4) grab all boxscore links by href
#     anchors = driver.find_elements(
#         By.CSS_SELECTOR,
#         "a[href*='/mlb/boxscore/_/gameId/']"
#     )
#     hrefs = [a.get_attribute("href") for a in anchors]
#     print(f"  Found {len(hrefs)} games")

#     # 5) loop through each game link
#     for idx, href in enumerate(hrefs, start=1):
#         try:
#             # extract the numeric game ID from the URL
#             game_id = href.split("/gameId/")[1].split('?')[0]

#             print(f"    Game {idx}/{len(hrefs)} → ID {game_id}")
#             driver.get(href)
#             time.sleep(3)  # let boxscore page render

#             soup   = BeautifulSoup(driver.page_source, 'html.parser')
#             tables = soup.find_all('table', class_='Table')
#             print(f"        {len(tables)} tables found")

#             if len(tables) < 6:
#                 print("        Skipping — insufficient tables")
#                 continue

#             away_names  = parse_table(tables[2])
#             away_stats  = parse_table(tables[3])
#             home_names  = parse_table(tables[4])
#             home_stats  = parse_table(tables[5])

#             # align row counts
#             max_len = max(len(away_names), len(away_stats), len(home_names), len(home_stats))
#             for df in (away_names, away_stats, home_names, home_stats):
#                 df.reindex(range(max_len), fill_value='')

#             # prefix & merge
#             away_df = pd.concat([away_names, away_stats], axis=1)
#             away_df.columns = [f"away_{c}" for c in away_df.columns]
#             home_df = pd.concat([home_names, home_stats], axis=1)
#             home_df.columns = [f"home_{c}" for c in home_df.columns]
#             merged  = pd.concat([away_df, home_df], axis=1)

#             # insert gameId column at the front
#             merged.insert(0, "gameId", game_id)

#             all_games.append(merged)
#             print("        Scraped")

#             # polite pause between games
#             time.sleep(2)
#         except Exception as e:
#             print(f"        Error scraping {href}: {e}")
#             time.sleep(2)
#             continue

#     # polite pause before next day's scoreboard
#     time.sleep(5)
#     current += datetime.timedelta(days=1)

# driver.quit()

# # 6) Combine and save
# if all_games:
#     final_df = pd.concat(all_games, ignore_index=True)
#     print("\n DONE — total rows:", len(final_df))
#     final_df.to_csv("batter_stats_2024_2.csv", index=False)
# else:
#     print("No data was scraped.")


In [5]:
# #daily scraper
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from bs4 import BeautifulSoup
# import pandas as pd
# import datetime
# import time

# # 1) Configure headless Chrome with a desktop viewport
# options = Options()
# options.add_argument("--headless")
# options.add_argument("--disable-gpu")
# options.add_argument("--window-size=1920,1080")
# driver = webdriver.Chrome(options=options)

# # 2) Load yesterday's scoreboard
# yesterday   = datetime.datetime.now() - datetime.timedelta(days=1)
# date_str    = yesterday.strftime('%Y%m%d')
# scoreboard_url = f"https://www.espn.com/mlb/scoreboard/_/date/{date_str}"

# driver.get(scoreboard_url)
# time.sleep(5)  # give ESPN time to render

# # 3) Grab all the box-score URLs in one shot
# anchors = driver.find_elements(
#     By.CSS_SELECTOR,
#     "a[href*='/mlb/boxscore/_/gameId/']"
# )
# hrefs = [a.get_attribute("href") for a in anchors]
# print(f" Found {len(hrefs)} box-score links for {date_str}")

# # 4) Helper to turn a <table> into a DataFrame
# def parse_table(table):
#     # get header row(s)
#     thead = table.find('thead')
#     if thead:
#         headers = [th.get_text(strip=True) for th in thead.find_all('th')]
#     else:
#         # fallback: first row of tbody
#         first = table.find('tbody').find('tr')
#         headers = [f"Col{i+1}" for i,_ in enumerate(first.find_all('td'))]
#     # get all body rows
#     rows = []
#     for tr in table.find('tbody').find_all('tr'):
#         cells = [td.get_text(strip=True) for td in tr.find_all('td')]
#         # pad missing cells
#         cells += [''] * (len(headers) - len(cells))
#         rows.append(cells)
#     return pd.DataFrame(rows, columns=headers)

# # 5) Visit each box-score, scrape, and collect
# all_games = []
# for idx, href in enumerate(hrefs, start=1):
#     try:
#         print(f" Scraping game {idx}/{len(hrefs)} → {href}")
#         driver.get(href)
#         time.sleep(3)

#         soup   = BeautifulSoup(driver.page_source, 'html.parser')
#         tables = soup.find_all('table', class_='Table')
#         print(f"    Found {len(tables)} tables")

#         # ESPN box scores usually have at least 6 tables:
#         #  2 for away (names + stats) and 2 for home (names + stats)
#         if len(tables) < 6:
#             print("    Skipping — not enough tables")
#             continue

#         # parse them
#         away_names  = parse_table(tables[2])
#         away_stats  = parse_table(tables[3])
#         home_names  = parse_table(tables[4])
#         home_stats  = parse_table(tables[5])

#         # align lengths
#         max_len = max(len(away_names), len(away_stats), len(home_names), len(home_stats))
#         for df in (away_names, away_stats, home_names, home_stats):
#             df.reindex(range(max_len), fill_value='')

#         # prefix columns and merge
#         away_df = pd.concat([away_names, away_stats], axis=1)
#         away_df.columns = [f"away_{c}" for c in away_df.columns]

#         home_df = pd.concat([home_names, home_stats], axis=1)
#         home_df.columns = [f"home_{c}" for c in home_df.columns]

#         merged = pd.concat([away_df, home_df], axis=1)
#         all_games.append(merged)

#         print("    Success")
#     except Exception as e:
#         print(f"    Error scraping {href}: {e}")
#         continue

# # 6) Build final DataFrame
# if all_games:
#     final_df = pd.concat(all_games, ignore_index=True)
#     print("\n✅ FINAL PREVIEW:")
#     print(final_df.head())
#     # final_df.to_csv(f"boxscores_{date_str}.csv", index=False)
# else:
#     print(" No games scraped.")

# driver.quit()


In [6]:
final_df

NameError: name 'final_df' is not defined

In [8]:
#daily scraper with gameIDs
# daily scraper with gameId column
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time
import os


# 1) Configure headless Chrome with a desktop viewport
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)

# 2) Load yesterday's scoreboard (relative to the local clock)
yesterday     = datetime.datetime.now() - datetime.timedelta(days=1)
date_str      = yesterday.strftime('%Y%m%d')
scoreboard_url = f"https://www.espn.com/mlb/scoreboard/_/date/{date_str}"

driver.get(scoreboard_url)
time.sleep(5)  # give ESPN time to render

# 3) Grab all the box-score URLs in one shot
anchors = driver.find_elements(
    By.CSS_SELECTOR,
    "a[href*='/mlb/boxscore/_/gameId/']"
)
# Dedupe (order-preserving) and drop anchors without an href, matching the
# date-range scraper in this notebook: a game linked twice on the page would
# otherwise be scraped twice and appended to the output CSV twice.
hrefs = list(dict.fromkeys(
    href for href in (a.get_attribute("href") for a in anchors) if href
))
print(f" Found {len(hrefs)} box-score links for {date_str}")

# 4) Helper to turn a <table> into a DataFrame
def parse_table(table):
    """Convert one parsed HTML ``<table>`` element into a DataFrame of cell text.

    Column names are the whitespace-stripped text of the ``<th>`` cells inside
    ``<thead>``.  When the table has no ``<thead>``, names are synthesised as
    ``Col1..ColN`` from the width of the first body row.

    Args:
        table: BeautifulSoup-style element exposing ``find``; the elements it
            returns must expose ``find_all`` and ``get_text``.

    Returns:
        pandas.DataFrame with one row per ``<tr>`` found in ``<tbody>`` and one
        string per ``<td>``.  Rows shorter than the header are right-padded
        with ``''``.  A table without ``<tbody>`` (or without body rows) yields
        an empty frame rather than raising ``AttributeError``, and rows wider
        than the header add extra ``ColN`` columns rather than making the
        DataFrame constructor raise.
    """
    thead = table.find('thead')
    tbody = table.find('tbody')
    body_rows = tbody.find_all('tr') if tbody is not None else []

    rows = [
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in body_rows
    ]

    if thead is not None:
        headers = [th.get_text(strip=True) for th in thead.find_all('th')]
    else:
        # No header row: name the columns after the first body row's width.
        headers = [f"Col{i+1}" for i in range(len(rows[0]))] if rows else []

    # Make header and every row the same width so DataFrame() cannot fail on
    # ragged input: widen the header for over-long rows, pad short rows.
    width = max([len(headers)] + [len(cells) for cells in rows])
    headers += [f"Col{i+1}" for i in range(len(headers), width)]
    rows = [cells + [''] * (width - len(cells)) for cells in rows]
    return pd.DataFrame(rows, columns=headers)

# 5) Visit each box-score, scrape, and collect
all_games = []
# try/finally guarantees the headless Chrome process is shut down even when the
# run is interrupted or an unexpected error escapes the loop.
try:
    for idx, href in enumerate(hrefs, start=1):
        try:
            # extract the numeric game ID from the URL
            game_id = href.split("/gameId/")[1].split('?')[0]
            print(f"🔍 Scraping game {idx}/{len(hrefs)} → ID {game_id}")

            driver.get(href)
            time.sleep(3)

            soup   = BeautifulSoup(driver.page_source, 'html.parser')
            tables = soup.find_all('table', class_='Table')
            print(f"   🧾 Found {len(tables)} tables")

            if len(tables) < 6:
                print("    Skipping — not enough tables")
                continue

            # Tables 2-5 are read as away names, away stats, home names,
            # home stats (positional; depends on the page layout).
            away_names  = parse_table(tables[2])
            away_stats  = parse_table(tables[3])
            home_names  = parse_table(tables[4])
            home_stats  = parse_table(tables[5])

            # align lengths: reindex() returns a NEW frame, so the results
            # must be rebound for the '' padding to take effect
            max_len = max(len(away_names), len(away_stats), len(home_names), len(home_stats))
            away_names, away_stats, home_names, home_stats = (
                frame.reindex(range(max_len), fill_value='')
                for frame in (away_names, away_stats, home_names, home_stats)
            )

            # prefix columns and merge
            away_df = pd.concat([away_names, away_stats], axis=1)
            away_df.columns = [f"away_{c}" for c in away_df.columns]

            home_df = pd.concat([home_names, home_stats], axis=1)
            home_df.columns = [f"home_{c}" for c in home_df.columns]

            merged = pd.concat([away_df, home_df], axis=1)

            # insert gameId column at the front
            merged.insert(0, "gameId", game_id)

            all_games.append(merged)
            print("    Success")
        except Exception as e:
            # best-effort: one bad game must not abort the remaining games
            print(f"    Error scraping {href}: {e}")
            continue
finally:
    driver.quit()

# 6) Build final DataFrame
if all_games:
    final_df = pd.concat(all_games, ignore_index=True)
    print("\n FINAL PREVIEW:")
    print(final_df.head())
    # final_df.to_csv(f"boxscores_{date_str}.csv", index=False)

    output_path = "../../general/stats/batter_stats_2025.csv"
    final_df.to_csv(
        output_path,
        mode='a',                                # append
        index=False,
        header=not os.path.exists(output_path)   # write header only if file is new
    )

else:
    print(" No games scraped.")


 Found 16 box-score links for 20250516
🔍 Scraping game 1/16 → ID 401695581
   🧾 Found 12 tables
    Success
🔍 Scraping game 2/16 → ID 401695570
   🧾 Found 13 tables
    Success
🔍 Scraping game 3/16 → ID 401695564
   🧾 Found 13 tables
    Success
🔍 Scraping game 4/16 → ID 401695567
   🧾 Found 13 tables
    Success
🔍 Scraping game 5/16 → ID 401695566
   🧾 Found 13 tables
    Success
🔍 Scraping game 6/16 → ID 401695569
   🧾 Found 13 tables
    Success
🔍 Scraping game 7/16 → ID 401695565
   🧾 Found 13 tables
    Success
🔍 Scraping game 8/16 → ID 401695563
   🧾 Found 13 tables
    Success
🔍 Scraping game 9/16 → ID 401695568
   🧾 Found 13 tables
    Success
🔍 Scraping game 10/16 → ID 401695573
   🧾 Found 13 tables
    Success
🔍 Scraping game 11/16 → ID 401695571
   🧾 Found 12 tables
    Success
🔍 Scraping game 12/16 → ID 401695572
   🧾 Found 13 tables
    Success
🔍 Scraping game 13/16 → ID 401695575
   🧾 Found 12 tables
    Success
🔍 Scraping game 14/16 → ID 401695577
   🧾 Found 13 tables
  

In [None]:
# final_df.to_csv('../../batter_record_1_hit/stats/batter_stats_2025.csv', mode='a', header=False, index=True)
# once final_df exists:
print(final_df.columns.tolist())


In [None]:
batter_team_2025 = pd.read_csv('../../general/stats/batters_team_2025.csv')

In [None]:
rbi_def_rank_2025=pd.read_csv('../def_scraper/batter_rbi_def_rank.csv')
hits_def_rank_2025=pd.read_csv('../def_scraper/batter_hits_def_rank.csv')
runs_def_rank_2025=pd.read_csv('../def_scraper/batter_runs_def_rank.csv')
# rbi_def_rank_2025=pd.read_csv('../def_scraper/batter_rbi_def_rank.csv')

In [None]:
final_df_2=final_df

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# Keep only the per-batter columns used by the rest of the notebook.
# .copy() makes this an independent frame: it is later aliased as stats_2025
# and has columns assigned to it, which on a plain column slice can raise
# pandas' SettingWithCopyWarning.
final_df_2=final_df_2[['away_hitters','away_R','away_H','away_K','away_RBI','home_hitters','home_R','home_H','home_K','home_RBI']].copy()
final_df_2

In [None]:
#normalizing the names, getting rid of accents
def normalize_name(name):
    """Return ``name`` with accents removed; non-string values pass through.

    The text is NFKD-decomposed and every character that cannot be encoded as
    ASCII is dropped.
    """
    if isinstance(name, str):
        decomposed = unicodedata.normalize('NFKD', name)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8')
    return name

In [None]:
batter_team_2025['player'] = batter_team_2025['player'].apply(normalize_name)


In [None]:
# Drop rows whose team is one of the '2TM'..'5TM' markers
# (presumably multi-team season totals — TODO confirm against the source file),
# then keep the last remaining row for each player.
multi_team_flags = ['2TM', '3TM', '4TM', '5TM']
is_multi_team_row = batter_team_2025['team'].isin(multi_team_flags)
filtered_df_2 = batter_team_2025[~is_multi_team_row].copy()

batter_team_2025 = (
    filtered_df_2
    .drop_duplicates(subset='player', keep='last')
    .reset_index(drop=True)
)

In [None]:
def clean_name_symbols(name):
    """Remove every ``*`` and ``#`` from ``name``; non-strings pass through."""
    if isinstance(name, str):
        cleaned = name
        for symbol in ('*', '#'):
            cleaned = cleaned.replace(symbol, '')
        return cleaned
    return name

In [None]:
batter_team_2025['player'] = batter_team_2025['player'].apply(clean_name_symbols)

In [None]:
batter_team_2025

In [None]:
#changing names to match other files
stats_2025=final_df_2

In [None]:
def clean_name(raw):
    """Extract a ``"F. Lastname"`` short name from a scraped hitters cell.

    Args:
        raw: Cell text such as ``"J. Altuve2B"`` or ``"a-T. TaylorPH-CF"``.
            Anything that is not a ``str`` yields ``None``.

    Returns:
        The short name (first initial, period, space, surname) or ``None``
        when no such pattern is found.  Compound surnames joined by a space,
        hyphen or apostrophe are kept whole (``"I. Kiner-Falefa"``,
        ``"T. O'Neill"``), as are camel-case surnames with a prefix of up to
        three letters (``"A. McCutchen"``, ``"J. DeLuca"``).
    """
    if not isinstance(raw, str):
        return None

    # A typographic apostrophe has no ASCII form and would be silently dropped
    # by the encode step below, gluing "O’Neill" into "ONeill".
    raw = raw.replace("\u2019", "'")

    # Normalize unicode to remove accents (e.g., Hernández → Hernandez)
    raw = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore').decode('utf-8')

    # Remove leading garbage (e.g., bT. Taylor → T. Taylor)
    raw = re.sub(r"^[^A-Z]*", "", raw)

    # Fix missing space after period (e.g., T.Taylor → T. Taylor)
    raw = re.sub(r"([A-Z])\.([A-Z])", r"\1. \2", raw)

    # Remove trailing all-caps position labels (e.g., PH-CF, DH).  Labels that
    # contain digits are only partly removed here; the final match below stops
    # before whatever is left over.
    raw = re.sub(r"[A-Z]{1,3}(-[A-Z]{1,3})?$", "", raw.strip())

    # Remove anything else weird
    raw = re.sub(r"[^\w\s\.\'\-]", "", raw)

    # Match short name: first initial + surname.
    #   - leading parts may be a single letter or lowercase-led ("O'", "d'",
    #     "De ") and must be followed by a space, apostrophe or hyphen;
    #   - the last part may carry a short capitalised prefix ("Mc", "De",
    #     "Mac") so camel-case surnames are not cut after the prefix.
    match = re.match(
        r"([A-Z]\.\s"
        r"(?:[A-Za-z][a-z]*(?:\s|['\-]))*"
        r"(?:[A-Z][a-z]{1,2})?[A-Z][a-z]+)",
        raw,
    )
    return match.group(1).strip() if match else None


# Apply cleaner to get short names
stats_2025['short_name'] = stats_2025['away_hitters'].apply(clean_name)

# Match short names to full names
def match_full_name(short_name, full_names):
    """Resolve a ``"F. Lastname"`` short name to one entry of ``full_names``.

    Candidates must start with the same character as ``short_name``.  They are
    tried in list order, in three passes:

    1. the surname equals a whole word of the full name (so ``"J. Soto"``
       prefers ``"Juan Soto"`` over ``"Jesus Sotomayor"``);
    2. the surname occurs anywhere in the full name (the original rule);
    3. closest fuzzy match over all names via ``difflib.get_close_matches``
       with cutoff 0.5.

    A trailing generational suffix (Jr, Sr, II, III, IV) is not treated as the
    surname, so ``"R. Acuna Jr"`` is matched on ``"Acuna"``.

    Args:
        short_name: Short name string; non-strings and blank strings give
            ``None``.
        full_names: Iterable of full names; non-string entries are ignored.

    Returns:
        The matching full name, or ``None`` when nothing matches.
    """
    if not isinstance(short_name, str):
        return None

    parts = short_name.split()
    if not parts:
        # empty / whitespace-only input has no surname to key on
        return None

    # Walk back over suffix tokens, but never past the token after the initial.
    suffixes = {"jr", "sr", "ii", "iii", "iv"}
    idx = len(parts) - 1
    while idx > 1 and parts[idx].rstrip('.').lower() in suffixes:
        idx -= 1
    last_name = parts[idx]
    initial = short_name[0]

    full_names = [name for name in full_names if isinstance(name, str)]
    same_initial = [name for name in full_names if name[:1] == initial]

    # Pass 1: surname is a whole word of the full name.
    for name in same_initial:
        if last_name in (word.strip('.,') for word in name.split()):
            return name

    # Pass 2: surname is a substring of the full name.
    for name in same_initial:
        if last_name in name:
            return name

    # Pass 3: fuzzy fallback over every name, regardless of initial.
    close = get_close_matches(short_name, full_names, n=1, cutoff=0.5)
    return close[0] if close else None

# Now apply matching
full_name_list = batter_team_2025['player'].tolist()
stats_2025['matched_name'] = stats_2025['short_name'].apply(lambda x: match_full_name(x, full_name_list))


In [None]:
stats_2025

In [None]:
stats_2025=stats_2025.rename(columns={"matched_name":"away_batters"})

In [None]:
merged_df_5=stats_2025.drop(columns=['away_hitters','short_name'])

In [None]:
merged_df_5

In [None]:
def clean_name(raw):
    """Extract a ``"F. Lastname"`` short name from a scraped hitters cell.

    Args:
        raw: Cell text such as ``"J. Altuve2B"`` or ``"a-T. TaylorPH-CF"``.
            Anything that is not a ``str`` yields ``None``.

    Returns:
        The short name (first initial, period, space, surname) or ``None``
        when no such pattern is found.  Compound surnames joined by a space,
        hyphen or apostrophe are kept whole (``"I. Kiner-Falefa"``,
        ``"T. O'Neill"``), as are camel-case surnames with a prefix of up to
        three letters (``"A. McCutchen"``, ``"J. DeLuca"``).
    """
    if not isinstance(raw, str):
        return None

    # A typographic apostrophe has no ASCII form and would be silently dropped
    # by the encode step below, gluing "O’Neill" into "ONeill".
    raw = raw.replace("\u2019", "'")

    # Normalize unicode to remove accents (e.g., Hernández → Hernandez)
    raw = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore').decode('utf-8')

    # Remove leading garbage (e.g., bT. Taylor → T. Taylor)
    raw = re.sub(r"^[^A-Z]*", "", raw)

    # Fix missing space after period (e.g., T.Taylor → T. Taylor)
    raw = re.sub(r"([A-Z])\.([A-Z])", r"\1. \2", raw)

    # Remove trailing all-caps position labels (e.g., PH-CF, DH).  Labels that
    # contain digits are only partly removed here; the final match below stops
    # before whatever is left over.
    raw = re.sub(r"[A-Z]{1,3}(-[A-Z]{1,3})?$", "", raw.strip())

    # Remove anything else weird
    raw = re.sub(r"[^\w\s\.\'\-]", "", raw)

    # Match short name: first initial + surname.
    #   - leading parts may be a single letter or lowercase-led ("O'", "d'",
    #     "De ") and must be followed by a space, apostrophe or hyphen;
    #   - the last part may carry a short capitalised prefix ("Mc", "De",
    #     "Mac") so camel-case surnames are not cut after the prefix.
    match = re.match(
        r"([A-Z]\.\s"
        r"(?:[A-Za-z][a-z]*(?:\s|['\-]))*"
        r"(?:[A-Z][a-z]{1,2})?[A-Z][a-z]+)",
        raw,
    )
    return match.group(1).strip() if match else None


# Apply cleaner to get short names
merged_df_5['short_name'] = merged_df_5['home_hitters'].apply(clean_name)

# Match short names to full names
def match_full_name(short_name, full_names):
    """Resolve a ``"F. Lastname"`` short name to one entry of ``full_names``.

    Candidates must start with the same character as ``short_name``.  They are
    tried in list order, in three passes:

    1. the surname equals a whole word of the full name (so ``"J. Soto"``
       prefers ``"Juan Soto"`` over ``"Jesus Sotomayor"``);
    2. the surname occurs anywhere in the full name (the original rule);
    3. closest fuzzy match over all names via ``difflib.get_close_matches``
       with cutoff 0.5.

    A trailing generational suffix (Jr, Sr, II, III, IV) is not treated as the
    surname, so ``"R. Acuna Jr"`` is matched on ``"Acuna"``.

    Args:
        short_name: Short name string; non-strings and blank strings give
            ``None``.
        full_names: Iterable of full names; non-string entries are ignored.

    Returns:
        The matching full name, or ``None`` when nothing matches.
    """
    if not isinstance(short_name, str):
        return None

    parts = short_name.split()
    if not parts:
        # empty / whitespace-only input has no surname to key on
        return None

    # Walk back over suffix tokens, but never past the token after the initial.
    suffixes = {"jr", "sr", "ii", "iii", "iv"}
    idx = len(parts) - 1
    while idx > 1 and parts[idx].rstrip('.').lower() in suffixes:
        idx -= 1
    last_name = parts[idx]
    initial = short_name[0]

    full_names = [name for name in full_names if isinstance(name, str)]
    same_initial = [name for name in full_names if name[:1] == initial]

    # Pass 1: surname is a whole word of the full name.
    for name in same_initial:
        if last_name in (word.strip('.,') for word in name.split()):
            return name

    # Pass 2: surname is a substring of the full name.
    for name in same_initial:
        if last_name in name:
            return name

    # Pass 3: fuzzy fallback over every name, regardless of initial.
    close = get_close_matches(short_name, full_names, n=1, cutoff=0.5)
    return close[0] if close else None

# Now apply matching
full_name_list = batter_team_2025['player'].tolist()
merged_df_5['matched_name'] = merged_df_5['short_name'].apply(lambda x: match_full_name(x, full_name_list))


In [None]:
merged_df_6=merged_df_5.rename(columns={"matched_name":"home_batters"})

In [None]:
merged_df_6=merged_df_6.drop(columns=['home_hitters','short_name'])

In [None]:
merged_df_6

In [None]:
# Rename columns
# merged_df_7.rename(columns={
#     'player_x':'away_hitters',
#     'player_y':'home_hitters',
#     'team_x':'away_team',
#     'team_y':'home_team'
# }, inplace=True)

In [None]:
away=merged_df_6[['away_batters','away_R','away_H','away_K','away_RBI']]
home=merged_df_6[['home_batters','home_R','home_H','home_K','home_RBI']]

In [None]:
# Give the away and home frames the same column names so they can be stacked.
# rename() is rebound rather than run with inplace=True: `away` and `home` are
# column slices of merged_df_6, and mutating a slice in place can raise
# pandas' SettingWithCopyWarning.
away = away.rename(columns={
    'away_batters':'player',
    'away_K':'strikeouts',
    'away_R':'runs',
    'away_H':'hits',
    'away_RBI':'RBI'
})

home = home.rename(columns={
    'home_batters':'player',
    'home_K':'strikeouts',
    'home_R':'runs',
    'home_H':'hits',
    'home_RBI':'RBI'
})



In [None]:
stacked_df = pd.concat([away, home], ignore_index=True)


In [None]:
stacked_df

In [None]:
# Write the stacked away+home player rows to mlb_results.csv in the working directory.
# NOTE(review): index=False is not passed, so the frame's index is written as an
# unnamed first column, unlike the scrapers' to_csv calls in this notebook —
# confirm that downstream readers expect that column.
stacked_df.to_csv('mlb_results.csv')