In [5]:
import csv
import io
import subprocess # from mdbtools
from statistics import mean

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# for regression
import statsmodels.formula.api as smf
import statsmodels.api as sm

stats_mdb = "stats.mdb"

# `mdb-tables -1` prints one table name per line, so a name that contains
# spaces stays intact; splitting the default output on whitespace would break
# such a name into several bogus entries.
tables = [
    name
    for name in subprocess.check_output(["mdb-tables", "-1", stats_mdb]).decode().splitlines()
    if name.strip()
]

# exploratory table name inquiry
print(tables) # "Amber" is the default table name given by the online conversion tool

['Amber']


## Problem Statement
### 3.b
Everyone loves a shiny new toy, and baseball GMs are no different.  But is there a material benefit to putting all your eggs in the young-player basket?  The data is set up to test the hypothesis: are younger players better?  Or, asked another way: what is a baseball player's prime age?  The response variable will be wOBA, which stands for weighted on-base average.  This is an advanced statistic based on linear weights, designed to measure a player's overall offensive contribution per plate appearance.  The weights are based on observed run values for individual events, league-wide, and vary from year to year.  The data was taken from baseballsavant.mlb.com, exported to .csv, converted online to .mdb, and then loaded and cleaned in the notebook.

In [66]:
# Name of the table inside stats_mdb to export ("Amber" is the only table
# reported by the mdb-tables listing above).
stats_table = "Amber"

def convert_mdb2df(file, table) -> pd.DataFrame:
    """Export one table of an Access ``.mdb`` file into a DataFrame of strings.

    Runs ``mdb-export <file> <table>`` and parses its CSV output. The first
    record holds the arbitrary letter designators (A, B, C, ...) added by the
    online conversion and is skipped; the second record supplies the column
    names; every later record becomes a data row. No type conversion is done
    here, so every value is text.

    Parameters
    ----------
    file
        Path of the ``.mdb`` file passed to ``mdb-export``.
    table
        Name of the table to export.

    Returns
    -------
    pd.DataFrame
        One row per exported record, columns named after the second record.

    Raises
    ------
    subprocess.CalledProcessError
        If ``mdb-export`` exits with a non-zero status.
    ValueError
        If the export does not contain the two leading header records.
    """
    exported = subprocess.check_output(["mdb-export", file, table]).decode()

    # csv.reader honours the quoting, so a comma or an escaped quote inside a
    # field stays part of that field. Blank lines (such as the one after the
    # final newline) come back as empty records and are dropped.
    records = [
        record
        for record in csv.reader(io.StringIO(exported, newline=""))
        if record
    ]
    if len(records) < 2:
        raise ValueError(
            f"mdb-export output for table {table!r} has no column-name record"
        )

    columns, rows = records[1], records[2:]  # data starts in the third record
    return pd.DataFrame(columns=columns, data=rows)

# Load the exported table, then discard every row that has a missing value in
# any column (for example a short record that pandas padded out).
df = convert_mdb2df(file=stats_mdb, table=stats_table).dropna(axis=0)

# df.columns


['A,B,C,D,E,F,G,H,I,J,K', '"last_name","first_name","player_id","year","player_age","slg_percent","on_base_percent","woba","exit_velocity_avg","barrel_batted_rate","hard_hit_percent"', '"Martinez","Victor","400121","2018","39","0.353","0.297","0.281","87.8","4.7","30.6"', '"Mauer","Joe","408045","2018","35","0.379","0.35","0.319","91.1","5","44.3"', '"Choo","Shin-Soo","425783","2018","35","0.434","0.376","0.355","89.3","11.1","40.5"', '"Molina","Yadier","425877","2018","35","0.436","0.314","0.323","88.2","5.5","33.8"', '"Encarnacion","Edwin","429665","2018","35","0.474","0.335","0.346","89.9","11.2","41.4"', '"Jones","Adam","430945","2018","32","0.419","0.313","0.315","88.3","4.9","33.1"', '"Kinsler","Ian","435079","2018","36","0.38","0.301","0.298","85.3","3.1","26.2"', '"Desmond","Ian","435622","2018","32","0.422","0.307","0.315","90.1","7","43.2"', '"Cruz Jr.","Nelson","443558","2018","37","0.509","0.342","0.361","93.9","13.8","51.7"', '"Peralta","David","444482","2018","30","0.516"

Unnamed: 0,last_name,first_name,player_id,year,player_age,slg_percent,on_base_percent,woba,exit_velocity_avg,barrel_batted_rate,hard_hit_percent
0,Martinez,Victor,400121,2018,39,0.353,0.297,0.281,87.8,4.7,30.6
1,Mauer,Joe,408045,2018,35,0.379,0.35,0.319,91.1,5.0,44.3
2,Choo,Shin-Soo,425783,2018,35,0.434,0.376,0.355,89.3,11.1,40.5
3,Molina,Yadier,425877,2018,35,0.436,0.314,0.323,88.2,5.5,33.8
4,Encarnacion,Edwin,429665,2018,35,0.474,0.335,0.346,89.9,11.2,41.4


All values decode as strings, so the numeric columns need to be cast to appropriate numeric dtypes.

In [68]:
# Every field arrives as text, so the numeric columns are recast here.
# The float columns are listed once and reused on both sides of the assignment.
float_columns = [
    "slg_percent",
    "on_base_percent",
    "woba",
    "exit_velocity_avg",
    "barrel_batted_rate",
    "hard_hit_percent",
]

# Bracket assignment (rather than attribute assignment) makes it explicit that
# an existing column is being replaced.
df["player_age"] = df["player_age"].astype(int)
df[float_columns] = df[float_columns].astype(float)
print(df.dtypes)

last_name              string
first_name             string
player_id              string
year                   string
player_age              int64
slg_percent           float64
on_base_percent       float64
woba                  float64
exit_velocity_avg     float64
barrel_batted_rate    float64
hard_hit_percent      float64
dtype: object


## Data Understanding & Figures
### 2.a

In [70]:
# Top five rows ranked by wOBA, highest first.
df.sort_values(by="woba", ascending=False).head(5)

Unnamed: 0,last_name,first_name,player_id,year,player_age,slg_percent,on_base_percent,woba,exit_velocity_avg,barrel_batted_rate,hard_hit_percent
626,Soto,Juan,665742,2020,21,0.695,0.49,0.478,92.1,18.3,51.6
256,Harper,Bryce,547180,2015,22,0.649,0.46,0.461,91.4,12.9,47.5
732,Judge,Aaron,592450,2022,30,0.686,0.422,0.458,95.9,26.5,61.8
559,Freeman,Freddie,518692,2020,30,0.64,0.462,0.456,92.4,14.7,54.2
110,Betts,Mookie,605141,2018,25,0.64,0.438,0.449,92.2,14.1,50.6


In [74]:
# Distribution of the response variable: with a single entry in `vars`, the
# pair plot consists of one panel for woba.
# NOTE(review): the saved output of this cell shows
# "TypeError: 'NoneType' object is not callable" next to the figure. The grid
# is now bound to a name and the cell ends with `;` so no return value is
# echoed; confirm on a fresh kernel that the error no longer appears.
woba_grid = sns.pairplot(data=df, vars=["woba"])
plt.suptitle("Distribution of wOBA", y=1.02);

TypeError: 'NoneType' object is not callable

TypeError: 'NoneType' object is not callable

<Figure size 250x250 with 1 Axes>