# Data Scraping

## Imports

In [51]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Request / Parsing

In [52]:
# Wikipedia article whose tables hold the Hall of Fame membership list.
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_members_of_the_National_Baseball_Hall_of_Fame"
)

In [53]:
# Fetch the article, identifying this script through a descriptive user-agent.
# requests applies no timeout by default, so set one explicitly to keep a
# stalled connection from hanging the notebook forever.
r = requests.get(
    url,
    headers={"user-agent": "Baseball HoF Analysis (tghardy@byu.edu)"},
    timeout=30,
)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
r.raise_for_status()

In [54]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parsed tree can differ between
# machines. That matters here because the table is later selected by position.
soup = BeautifulSoup(r.content, "html.parser")

Wikipedia articles tend to contain a lot of tables, and they don't always cooperate with bs4's `find_all()`. However, the third table on the page is the one we're looking for (this is easy to confirm by trial and error).

In [55]:
# Collect every <table> element in the parsed page; the one we need is picked by index in the next cell.
tables = soup.find_all("table")

In [56]:
# Parse the third table on the page (index 2) into a DataFrame.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated and emits a FutureWarning.
df = pd.read_html(StringIO(str(tables[2])))[0]

# The index is positional, so it would silently pick the wrong table if the
# article layout changed; check for columns the later cells rely on.
assert {"Name", "Career"} <= set(df.columns), "tables[2] does not look like the inductee table"

  df = pd.read_html(str(tables[2]))[0]


## Wrangling

In [57]:
# Preview the table exactly as scraped, before any cleaning.
df

Unnamed: 0,Year,Name,Primary position,Primary team,Career,Induction method,Vote %,Ref.
0,1936,Ty Cobb,CF,Detroit Tigers,1905–1928,BBWAA,98.23%,[10]
1,1936,Walter Johnson,P,Washington Senators,1907–1927,BBWAA,83.63%,[11]
2,1936,Christy Mathewson,P,New York Giants,1900–1916,BBWAA,90.71%,[12]
3,1936,Babe Ruth,RF,New York Yankees,1914–1935,BBWAA,95.13%,[13]
4,1936,Honus Wagner,SS,Pittsburgh Pirates,1897–1917,BBWAA,95.13%,[14]
...,...,...,...,...,...,...,...,...
346,2025,CC Sabathia,P,New York Yankees,2001–2019,BBWAA,86.8%,[356]
347,2025,Ichiro Suzuki,RF,Seattle Mariners,2001–2019,BBWAA,99.75%,[357]
348,2025,Billy Wagner,P,Houston Astros,1995–2010,BBWAA,82.5%,[358]
349,2025,Dick Allen,1B,Philadelphia Phillies,1963–1977,VC (CLB),81.3%,[359]


In [58]:
# Shorten the column names used later in the notebook.
# Note: the scraped "Vote %" header contains a non-breaking space (\xa0),
# so the key must be spelled with the escape rather than a plain space.
df = df.rename(
    columns={
        "Primary position": "position",
        "Induction method": "method",
        'Vote\xa0%': "vote_percentage",
    }
)

In [59]:
# Preview the frame to confirm the renamed columns.
df

Unnamed: 0,Year,Name,position,Primary team,Career,method,vote_percentage,Ref.
0,1936,Ty Cobb,CF,Detroit Tigers,1905–1928,BBWAA,98.23%,[10]
1,1936,Walter Johnson,P,Washington Senators,1907–1927,BBWAA,83.63%,[11]
2,1936,Christy Mathewson,P,New York Giants,1900–1916,BBWAA,90.71%,[12]
3,1936,Babe Ruth,RF,New York Yankees,1914–1935,BBWAA,95.13%,[13]
4,1936,Honus Wagner,SS,Pittsburgh Pirates,1897–1917,BBWAA,95.13%,[14]
...,...,...,...,...,...,...,...,...
346,2025,CC Sabathia,P,New York Yankees,2001–2019,BBWAA,86.8%,[356]
347,2025,Ichiro Suzuki,RF,Seattle Mariners,2001–2019,BBWAA,99.75%,[357]
348,2025,Billy Wagner,P,Houston Astros,1995–2010,BBWAA,82.5%,[358]
349,2025,Dick Allen,1B,Philadelphia Phillies,1963–1977,VC (CLB),81.3%,[359]


In [60]:
# Split "Career" into a start year and an end year.
#   (\d{4})            -> first captured four-digit year
#   .*\n*.*\n*.*\n*.*  -> anything in between, allowed to cross up to three
#                         line breaks (presumably for cells whose text wraps
#                         onto several lines -- TODO confirm against the raw data)
#   (\d{4})            -> because the preceding `.*` is greedy, the last
#                         four-digit year reachable within that span
# str.extract returns the captured text as strings; rows with no match get NaN.
pattern = re.compile(r"(\d{4}).*\n*.*\n*.*\n*.*(\d{4})")
df[["career_start", "career_end"]] = df["Career"].str.extract(pattern, expand=True)

In [61]:
# "Career" has been split into career_start / career_end above and "Ref." only
# holds citation markers, so neither column is carried forward.
df = df.drop(columns=["Ref.", "Career"])

In [62]:
# Strip the trailing percent sign so the values can be converted to numbers.
# regex=False makes this a literal replacement regardless of the pandas
# version's default for `regex` (and avoids the single-character-pattern warning
# that older versions emit).
df["vote_percentage"] = df["vote_percentage"].str.replace("%", "", regex=False)

In [63]:
# Convert the vote share to float, turning em-dash placeholders into NaN.
# mask() blanks every cell containing "—"; na=False means cells that are
# already missing are left as NaN instead of raising, which the previous
# `"—" in x` membership test did for any non-string value.
df["vote_percentage"] = (
    df["vote_percentage"]
    .mask(lambda s: s.str.contains("—", regex=False, na=False))
    .astype(float)
)

In [64]:
# Final look at the cleaned frame before it is written to disk.
df

Unnamed: 0,Year,Name,position,Primary team,method,vote_percentage,career_start,career_end
0,1936,Ty Cobb,CF,Detroit Tigers,BBWAA,98.23,1905,1928
1,1936,Walter Johnson,P,Washington Senators,BBWAA,83.63,1907,1927
2,1936,Christy Mathewson,P,New York Giants,BBWAA,90.71,1900,1916
3,1936,Babe Ruth,RF,New York Yankees,BBWAA,95.13,1914,1935
4,1936,Honus Wagner,SS,Pittsburgh Pirates,BBWAA,95.13,1897,1917
...,...,...,...,...,...,...,...,...
346,2025,CC Sabathia,P,New York Yankees,BBWAA,86.80,2001,2019
347,2025,Ichiro Suzuki,RF,Seattle Mariners,BBWAA,99.75,2001,2019
348,2025,Billy Wagner,P,Houston Astros,BBWAA,82.50,1995,2010
349,2025,Dick Allen,1B,Philadelphia Phillies,VC (CLB),81.30,1963,1977


In [67]:
# Write the cleaned table to a CSV in the current working directory.
# index=False leaves out the positional row index.
df.to_csv("baseball_hof.csv", index=False)