# [Baseball Databank](http://www.seanlahman.com/baseball-archive/statistics/)

In [1206]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [1207]:
from pathlib import Path
from typing import Tuple, Callable

In [1208]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

In [1209]:
# IPython magic: show matplotlib plots inline in the notebook output
%matplotlib inline

In [1210]:
# set matplotlib style/theme: apply the "fivethirtyeight" style sheet to the
# pyplot session (set once here instead of styling figures individually)
plt.style.use("fivethirtyeight")

In [1211]:
# plotly defaults
# default renderer used when a plotly figure is displayed
pio.renderers.default = "notebook"
# default template (theme) applied to plotly figures
pio.templates.default = "seaborn"

In [1212]:
from tabulate import tabulate
# `IPython.display` is the public import location; importing these names from
# `IPython.core.display` is deprecated.
from IPython.display import display, Markdown, HTML


def markdown(s: str) -> None:
    """Render the string ``s`` as Markdown in the cell output."""
    display(Markdown(s))


def html(s: str) -> None:
    """Render the string ``s`` as HTML in the cell output."""
    display(HTML(s))

In [1213]:
# path to data files: relative directory holding the Baseball Databank CSVs;
# individual files are addressed as `datapath / "<name>.csv"`
datapath = Path("../baseballdatabank/core")

## People info

In [1215]:
# Load the people table, then parse the first/last game columns as datetimes.
_date_columns = ("debut", "finalGame")
people = pd.read_csv(datapath / "people.csv")
people = people.assign(**{name: pd.to_datetime(people[name]) for name in _date_columns})
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20093 entries, 0 to 20092
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   playerID      20093 non-null  object        
 1   birthYear     19979 non-null  float64       
 2   birthMonth    19811 non-null  float64       
 3   birthDay      19670 non-null  float64       
 4   birthCountry  20032 non-null  object        
 5   birthState    19561 non-null  object        
 6   birthCity     19919 non-null  object        
 7   deathYear     9887 non-null   float64       
 8   deathMonth    9886 non-null   float64       
 9   deathDay      9885 non-null   float64       
 10  deathCountry  9884 non-null   object        
 11  deathState    9836 non-null   object        
 12  deathCity     9878 non-null   object        
 13  nameFirst     20056 non-null  object        
 14  nameLast      20093 non-null  object        
 15  nameGiven     20056 non-null  object

In [1216]:
# slimmed people dataframe with player ID, full name, first and final games as years
# - fullName: "nameFirst nameLast"; a missing part is treated as "" and the
#   resulting stray leading/trailing space is stripped
# - debut / finalGame: calendar year as int; 0 marks a missing date
# NOTE: uses the builtin `int` because the `np.int` alias was removed in NumPy 1.24.
people_slim = pd.DataFrame({
    "playerID": people["playerID"],
    "fullName": (people[["nameFirst", "nameLast"]]
                 .fillna("")
                 .agg(" ".join, axis=1)
                 .str.strip()),
    "debut": people["debut"].dt.year.fillna(0).astype(int),
    "finalGame": people["finalGame"].dt.year.fillna(0).astype(int),
})

In [1217]:
# preview the slimmed frame as the cell's output
people_slim

Unnamed: 0,playerID,fullName,debut,finalGame
0,aardsda01,David Aardsma,2004,2015
1,aaronha01,Hank Aaron,1954,1976
2,aaronto01,Tommie Aaron,1962,1971
3,aasedo01,Don Aase,1977,1990
4,abadan01,Andy Abad,2001,2006
...,...,...,...,...
20088,zupofr01,Frank Zupo,1957,1961
20089,zuvelpa01,Paul Zuvella,1982,1991
20090,zuverge01,George Zuverink,1951,1959
20091,zwilldu01,Dutch Zwilling,1910,1916


_____________________
### Convenience functions

In [1218]:
# All-time <stat> leaders
def leaders(df_stat: pd.DataFrame, df_people: pd.DataFrame, stat: str, count: int=10) -> pd.DataFrame:
    """Return the ``count`` players with the highest career total of ``stat``.

    Parameters
    ----------
    df_stat : table with ``playerID`` and ``yearID`` columns plus the ``stat`` column.
    df_people : table with a ``playerID`` column; its columns are carried into the result.
    stat : name of the column in ``df_stat`` to total.
    count : number of leaders to return.

    Returns
    -------
    ``df_people`` rows for the leaders with an added ``stat`` column holding the
    career total, sorted from highest to lowest.
    """
    career_totals = (
        # player x year table of per-season sums; seasons not played count as 0
        df_stat.pivot_table(values=stat, columns="yearID", index="playerID",
                            fill_value=0, aggfunc="sum")
        # builtin `int` instead of the removed `np.int` alias; per-season sums
        # are cast to integers before they are added up
        .astype(int)
        .sum(axis=1)
        .sort_values(ascending=False)
        .rename(stat)
        .head(count)
    )
    # the merge does not keep the ranking order, so sort again afterwards
    return pd.merge(df_people, career_totals, on="playerID").sort_values(stat, ascending=False)

In [1219]:
# Bar-plot leaders
def leaders_barplot_sns(df: pd.DataFrame, y: str, figsize: Tuple[int, int]=(7,6), stat_label: str=None):
    """Draw a seaborn bar chart of ``df[y]`` per player (``fullName`` on the x axis).

    ``stat_label`` is the human-readable stat name used in the y-axis label and
    the title; when it is ``None`` the column name ``y`` is used instead.
    The chart is drawn on a new matplotlib figure of size ``figsize``.
    """
    label = y if stat_label is None else stat_label

    plt.figure(figsize=figsize)
    ax = sns.barplot(data=df, x="fullName", y=y, palette="crest_r")

    ax.set_xlabel(None)
    ax.set_ylabel(f"Career {label}")
    ax.set_title(f"All-time {label} Leaders")

    # re-apply the existing tick labels, rotated
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=60)

    plt.tight_layout()

**NOTE** the bizarre naming difference in plotly.express: the columns are passed in with the `custom_data` argument, but the hover template refers to them as `customdata`.

In [1220]:
# plotly plot
def leaders_barplot_plotly(df: pd.DataFrame, stat:str, y_title: str=None):
    """Build a plotly bar chart of ``df[stat]`` per player and return the figure.

    ``y_title`` is the human-readable stat name used in the title and the
    y-axis label; when it is empty or ``None`` the column name ``stat`` is used.
    The ``debut`` and ``finalGame`` columns of ``df`` are attached as custom
    data and shown in the hover text.
    """
    label = y_title if y_title else stat
    hover_text = "<b>%{x}</b>: %{y} (%{customdata[0]}-%{customdata[1]})"

    return (
        px.bar(df,
               x="fullName",
               y=stat,
               title=f"All-time {label} Leaders",
               custom_data=["debut", "finalGame"])
        .update_traces(hovertemplate=hover_text)
        .update_xaxes(title=None)
        .update_yaxes(title=f"Career {label}")
    )

In [None]:
## Most teams played for

In [None]:
## How many players played for a single team?

In [None]:
## Pie of birth-country

In [None]:
## Average retirement age

In [None]:
## Oldest retirement age

In [None]:
## Batting average
- absolute highest (with N AB)
- combined-career (total H/total AB)
- most consistent (highest average across career)

______________________________________________