# SF Giants batting, 1958-2023
> This notebook downloads historic batting tables from [Baseball Reference](https://www.baseball-reference.com/teams/SFG/2024-batting.shtml) and outputs them to CSV, JSON and Parquet formats for later analysis and visualization.

---

#### Import Python tools and Jupyter config

In [16]:
from pathlib import Path
from random import randint
from time import sleep

import pandas as pd
#import jupyter_black
from tqdm.notebook import tqdm

In [17]:
#jupyter_black.load()
# Widen pandas' display limits so large tables render without truncation
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

---

## Fetch

#### List comprehension of historic batting stats pages

In [18]:
# One batting page per season, 1958 through 2023 (the range end, 2024, is exclusive)
seasons = range(1958, 2024)
urls = [
    f"https://www.baseball-reference.com/teams/SFG/{year}-batting.shtml"
    for year in seasons
]

#### Loop through the URLs, fetch each season's batting table, separate player rows from team summary rows, and store each in its own list of dataframes

In [20]:
src_dfs = []
summary_dfs = []

for url in tqdm(urls):
    # The season is the leading part of the page's file name, e.g. "1958-batting.shtml"
    year = url.split("/")[-1].replace("-batting.shtml", "")

    # Request and parse the page ONCE, then derive both the player frame and
    # the team-summary frame from the same table. Calling read_html a second
    # time would double the load on the server for identical data.
    batting_table = pd.read_html(url)[0]

    # Player stats: rows that have a rank value, skipping rows whose Rk cell
    # is the literal text 'Rk' (presumably header rows repeated in the table body)
    src_df = (
        batting_table
        .query("~Rk.isna() and Rk != 'Rk'")
        .dropna(thresh=7)  # keep rows with at least 7 non-missing values
        .assign(season=year)
    )
    # regex=False so "+" is always treated as a literal character
    src_df.columns = src_df.columns.str.lower().str.replace(
        "+", "_plus", regex=False
    )
    src_dfs.append(src_df)

    # Team stats: rows without a rank value (team totals, league rank, etc.)
    summary_df = (
        batting_table
        .query("Rk.isna() and Rk != 'Rk'")
        .dropna(thresh=7)
        .assign(season=year)
    )
    summary_df.columns = summary_df.columns.str.lower().str.replace(
        "+", "_plus", regex=False
    )
    summary_dfs.append(summary_df)

    # Be kind to the server
    sleep(randint(3, 6))

  0%|          | 0/66 [00:00<?, ?it/s]

     rk  pos                name   age    g    pa    ab    r     h   2b  3b  \
40  NaN  NaN         Team Totals  26.7  154  6003  5318  727  1399  250  42   
41  NaN  NaN  Rank in 8 NL teams   NaN  NaN   NaN     2    1     3    1   5   
42  NaN  NaN  Non-Pitcher Totals  26.7  154  5546  4910  692  1325  238  41   
43  NaN  NaN      Pitcher Totals  27.2  154   457   408   35    74   12   1   

     hr  rbi  sb  cs   bb   so    ba   obp   slg   ops ops_plus    tb  gdp  \
40  170  682  64  29  531  817  .263  .331  .422  .753      100  2243  119   
41    3  NaN   2   4    4    5     5     2     2     2      NaN     2  NaN   
42  169  655  63  29  515  696  .270  .340  .438  .779      107  2152  108   
43    1   27   1   0   16  121  .181  .213  .223  .436       17    91   11   

   hbp   sh  sf  ibb season  
40  34   68  45   54   1958  
41   4  NaN   1  NaN   1958  
42  33   41  43   54   1958  
43   1   27   2    0   1958  
(4, 29)


#### Concatenate dataframes

In [5]:
# Stack the per-season frames, drop exact duplicate rows, and renumber the index from 0
player_df = pd.concat(src_dfs, ignore_index=True).drop_duplicates(ignore_index=True)
summary_df = pd.concat(summary_dfs, ignore_index=True).drop_duplicates(
    ignore_index=True
)

---

## Player stats

#### Remove injury details listed parenthetically next to some players' names

In [6]:
# Keep only the text before the first "(" and trim surrounding whitespace
text_before_paren = player_df["name"].str.partition("(")[0]
player_df["name"] = text_before_paren.str.strip()

#### Determine batter type, clean special characters from names

In [7]:
def determine_and_clean_bats(name):
    """
    Derive a batting side from the marker at the end of a player's name and
    return the name with that marker removed.

    Trailing markers recognised:
        "*" -> "Left"
        "#" -> "Both"
        "?" -> "Unknown"
    Any other name (including an empty string) is reported as "Right" and
    returned unchanged.

    Parameters
    ----------
    name : str
        Player name, optionally ending in one of the markers above.

    Returns
    -------
    tuple[str, str]
        (batting side, name without the trailing marker)
    """
    bats_by_marker = {"*": "Left", "#": "Both", "?": "Unknown"}

    # Slice instead of indexing: name[-1:] is "" for an empty name, whereas
    # name[-1] would raise IndexError.
    marker = name[-1:]

    if marker in bats_by_marker:
        return bats_by_marker[marker], name[:-1]

    return "Right", name

#### Apply the function and separate the results into two columns

In [8]:
# Each call yields a (batting side, cleaned name) pair; fan the pairs out into two columns
bats_and_names = [determine_and_clean_bats(raw_name) for raw_name in player_df["name"]]
player_df["bats"] = [side for side, _ in bats_and_names]
player_df["name_clean"] = [cleaned for _, cleaned in bats_and_names]

#### Replace the original 'name' column with the cleaned names

In [9]:
# pop() removes the temporary column and hands back its values in one step
player_df["name"] = player_df.pop("name_clean")

In [10]:
# Columns cast to integers in a single pass
counting_cols = [
    "g",
    "pa",
    "ab",
    "r",
    "h",
    "2b",
    "3b",
    "hr",
    "rbi",
    "sb",
    "cs",
    "bb",
    "so",
    "tb",
    "gdp",
    "hbp",
    "sh",
    "sf",
    "ibb",
]
player_df[counting_cols] = player_df[counting_cols].astype(int)

In [11]:
# Columns cast to floats
rate_cols = ["ba", "obp", "slg", "ops", "ops_plus"]
player_df[rate_cols] = player_df[rate_cols].astype(float)

---

## Team stats
> The main batting table has totals for the team, with ranks and aggregates by season

In [12]:
# Keep the "Team Totals" rows, then drop every column that contains a missing value
is_team_total = summary_df["name"] == "Team Totals"
team_totals_df = summary_df[is_team_total].dropna(axis="columns")

In [13]:
# Keep rows whose name contains "Rank", then drop every column that contains a missing value
mentions_rank = summary_df["name"].str.contains("Rank")
team_ranks_df = summary_df[mentions_rank].dropna(axis="columns")

---

## Exports

In [14]:
def save_dataframe(df, path_without_extension, formats):
    """
    Save a DataFrame in multiple formats.

    The destination folder is created if it does not already exist, so the
    export also works on a fresh checkout where the output directory is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to write.
    path_without_extension : str
        Output path without a file extension; each format's name is appended
        as the extension (e.g. "out" -> "out.csv").
    formats : iterable of str
        Any of "csv", "json", "parquet". Other values are reported with a
        printed message and skipped.
    """
    # Without this, the writers below raise OSError when the folder is absent
    Path(path_without_extension).parent.mkdir(parents=True, exist_ok=True)

    for file_format in formats:
        output_path = f"{path_without_extension}.{file_format}"
        if file_format == "csv":
            df.to_csv(output_path, index=False)
        elif file_format == "json":
            df.to_json(output_path, indent=4, orient="records")
        elif file_format == "parquet":
            df.to_parquet(output_path, index=False)
        else:
            print(f"Unsupported format: {file_format}")

In [15]:
formats = ["csv", "json", "parquet"]

# (dataframe, output path without extension) pairs, written in this order
export_targets = [
    (
        team_totals_df,
        "../data/batting/archive/giants_team_batting_statistics_1958_2023",
    ),
    (
        player_df,
        "../data/batting/archive/giants_player_batting_statistics_1958_2023",
    ),
    (
        team_ranks_df,
        "../data/batting/archive/giants_team_batting_rankings_1958_2023",
    ),
]

for frame, destination in export_targets:
    save_dataframe(frame, destination, formats)