# [Baseball Databank](http://www.seanlahman.com/baseball-archive/statistics/)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from pathlib import Path
from typing import Tuple, Callable

In [4]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

In [5]:
# Render matplotlib figures inline in the notebook output (IPython line magic)
%matplotlib inline

In [6]:
# Use the "fivethirtyeight" matplotlib style sheet for the figures drawn after this cell
plt.style.use("fivethirtyeight")

In [7]:
# plotly defaults: display figures with the "notebook" renderer and
# make "seaborn" the default figure template
pio.renderers.default = "notebook"
pio.templates.default = "seaborn"

In [8]:
from tabulate import tabulate
# `IPython.display` is the public home of these names; importing `display`
# from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, Markdown, HTML


def markdown(s: str) -> None:
    """Render the string `s` as Markdown in the current cell's output."""
    display(Markdown(s))


def html(s: str) -> None:
    """Render the string `s` as raw HTML in the current cell's output."""
    display(HTML(s))

In [9]:
# Directory holding the data CSV files (relative path, resolved against the
# notebook's working directory)
datapath = Path("../baseballdatabank/core")

___________________
## Pitching info

In [11]:
# Load the pitching table and print a column / dtype / null-count summary
pitching = pd.read_csv(datapath.joinpath("pitching.csv"))
pitching.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48399 entries, 0 to 48398
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   playerID  48399 non-null  object 
 1   yearID    48399 non-null  int64  
 2   stint     48399 non-null  int64  
 3   teamID    48399 non-null  object 
 4   lgID      48267 non-null  object 
 5   W         48399 non-null  int64  
 6   L         48399 non-null  int64  
 7   G         48399 non-null  int64  
 8   GS        48399 non-null  int64  
 9   CG        48399 non-null  int64  
 10  SHO       48399 non-null  int64  
 11  SV        48399 non-null  int64  
 12  IPouts    48399 non-null  int64  
 13  H         48399 non-null  int64  
 14  ER        48399 non-null  int64  
 15  HR        48399 non-null  int64  
 16  BB        48399 non-null  int64  
 17  SO        48399 non-null  int64  
 18  BAOpp     43958 non-null  float64
 19  ERA       48305 non-null  float64
 20  IBB       33821 non-null  fl

## People info

In [13]:
# Load the people table, then convert the first/last game columns to datetimes
people = pd.read_csv(datapath.joinpath("people.csv"))
people[["debut", "finalGame"]] = people[["debut", "finalGame"]].apply(pd.to_datetime)

In [14]:
# Slimmed people dataframe: player ID, full name, and the years of the first and
# final games as plain integers (a missing date becomes year 0).
people_slim = pd.concat(
    [
        people["playerID"],
        # Vectorised "First Last"; strip() drops the stray space that is left
        # when either part of the name is missing.
        (people["nameFirst"].fillna("") + " " + people["nameLast"].fillna(""))
        .str.strip()
        .rename("fullName"),
        # .dt.year gives NaN for missing dates, so fill before the integer cast.
        # The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
        people["debut"].dt.year.fillna(0).astype(int),
        people["finalGame"].dt.year.fillna(0).astype(int),
    ],
    axis=1,
)
people_slim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20093 entries, 0 to 20092
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   playerID   20093 non-null  object
 1   fullName   20093 non-null  object
 2   debut      20093 non-null  int64 
 3   finalGame  20093 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 628.0+ KB


_____________________
### Convenience functions

In [25]:
# All-time <stat> leaders (using sum aggregation)
def leaders(df_stat: pd.DataFrame, df_people: pd.DataFrame, stat: str, count: int=10) -> pd.DataFrame:
    """Return the top `count` players ranked by their career total of `stat`.

    Parameters
    ----------
    df_stat : pd.DataFrame
        Stat table with a "playerID" column and a numeric `stat` column; every
        row belonging to a player is summed into that player's total.
    df_people : pd.DataFrame
        Table with a "playerID" column; its other columns are carried into the
        result.
    stat : str
        Name of the column in `df_stat` to total.
    count : int
        Number of leaders to keep.

    Returns
    -------
    pd.DataFrame
        The rows of `df_people` for the leading players plus a `stat` column
        holding the career total, sorted from highest to lowest. Leaders with
        no matching row in `df_people` are dropped (inner merge).
    """
    _leaders = (df_stat.groupby("playerID")[stat]
                  .sum()          # missing values are skipped, i.e. count as 0
                  .astype(int)    # builtin int: `np.int` was removed in NumPy 1.24
                  .sort_values(ascending=False)
                  .head(count))
    return pd.merge(df_people, _leaders, on="playerID").sort_values(stat, ascending=False)

In [26]:
# All-time <stat> leaders (using mean aggregation)
def leaders_mean(df_stat: pd.DataFrame, df_people: pd.DataFrame, stat: str, count: int=10) -> pd.DataFrame:
    """Return the top `count` players ranked by the average of their yearly mean `stat`.

    The stat is first averaged within each (player, year) pair, then those
    yearly means are averaged per player; years in which a player has no value
    are ignored.

    Parameters
    ----------
    df_stat : pd.DataFrame
        Stat table with "playerID", "yearID" and a numeric `stat` column.
    df_people : pd.DataFrame
        Table with a "playerID" column; its other columns are carried into the
        result.
    stat : str
        Name of the column in `df_stat` to average.
    count : int
        Number of leaders to keep.

    Returns
    -------
    pd.DataFrame
        The rows of `df_people` for the leading players plus a `stat` column
        holding the averaged value, sorted from highest to lowest. Leaders with
        no matching row in `df_people` are dropped (inner merge).
    """
    # The string "mean" is used instead of np.mean: passing NumPy callables as
    # `aggfunc` emits a FutureWarning on pandas >= 2.1.
    _leaders = (df_stat.pivot_table(values=stat, columns="yearID", index="playerID", aggfunc="mean")
                  .mean(axis=1)
                  .sort_values(ascending=False)
                  .rename(stat)
                  .head(count))
    return pd.merge(df_people, _leaders, on="playerID").sort_values(stat, ascending=False)

In [27]:
# Bar-plot leaders
def leaders_barplot_sns(df: pd.DataFrame, stat: str, figsize: Tuple[int, int]=(7,6), stat_label: str=None):
    """Draw a seaborn bar chart of `stat` per player and return its axes.

    `df` needs a "fullName" column (x axis) and a `stat` column (bar height).
    `stat_label` is the human-readable stat name used in the title and y-axis
    label; it falls back to `stat` when omitted. A new matplotlib figure of
    size `figsize` is opened for the chart.
    """
    stat_label = stat if stat_label is None else stat_label

    plt.figure(figsize=figsize)
    axes = sns.barplot(data=df, x="fullName", y=stat, palette="crest_r")
    axes.set(
        xlabel=None,
        ylabel=f"Career {stat_label}",
        title=f"All-time {stat_label} Leaders",
    )
    # Tilt the player names so they do not overlap.
    name_ticks = axes.get_xticklabels()
    axes.set_xticklabels(name_ticks, rotation=60)
    plt.tight_layout()
    return axes

**NOTE:** the bizarre name difference between setting `custom_data` (the `px.bar` argument) and reading `customdata` (in the hover template) in plotly.express

In [28]:
# plotly plot
def leaders_barplot_plotly(df: pd.DataFrame, stat:str, stat_label: str=None):
    """Build a plotly bar chart of `stat` per player and return the figure.

    `df` needs "fullName", "debut", "finalGame" and `stat` columns. The debut
    and final-game values are attached as custom data so the hover text can
    show them next to the stat value. `stat_label` is the human-readable stat
    name used in the title and y-axis label; it falls back to `stat` when
    omitted.
    """
    stat_label = stat if stat_label is None else stat_label

    figure = px.bar(
        df,
        x="fullName",
        y=stat,
        title=f"All-time {stat_label} Leaders",
        custom_data=["debut", "finalGame"],
    )
    # Hover text: bold player name, the stat value, then the two custom-data values.
    hover_text = "<b>%{x}</b>: %{y} (%{customdata[0]}-%{customdata[1]})"
    figure.update_traces(hovertemplate=hover_text)
    figure.update_xaxes(title=None)
    figure.update_yaxes(title=f"Career {stat_label}")
    return figure

_____________________________________
# Players: All-time Pitching Leaders

## Strikeouts

In [16]:
## Shutouts

In [18]:
## Wins

In [19]:
## Games

In [20]:
## Wins - Losses

In [21]:
## Saves

In [22]:
## Shutouts

In [23]:
## WP + HBP

In [24]:
## ERA (with minimum N games)

______________________________________________