# STAT488 Independent Project - An analysis of baseball data

In [None]:
import pandas as pd
import numpy as np
import math as m
import statistics as stat
from scipy import stats as test

# Project 1: Age of Prime Year vs. Weight

In [195]:
# Is there a correlation between age, size/weight, and on field performance?
# Hypothesis: Players that weigh more tend to have a prime later than the average age to have your prime season.

# Developing sample set of players who have 250 PA or more in at lease 5 seasons dating back to 2015

# Starting with a table taken from https://baseballsavant.mlb.com/
playerData = pd.DataFrame(pd.read_csv("C:/Users/tmeli/OneDrive/Documents/CLASSES/Fall 2022/STAT488/playerAgePerformanceData.csv"))
playerData = playerData.dropna()
playerData.info()
playerData.describe()
playerData.sample(n = 10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2489 entries, 0 to 3193
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   last_name    2489 non-null   object 
 1    first_name  2489 non-null   object 
 2   player_id    2489 non-null   float64
 3   year         2489 non-null   int64  
 4   player_age   2489 non-null   int64  
 5   xwoba        2489 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 136.1+ KB


Unnamed: 0,last_name,first_name,player_id,year,player_age,xwoba
734,Martin,Russell,431145.0,2017,34,0.339
1150,Escobar,Eduardo,500871.0,2018,29,0.323
161,Ward,Taylor,621493.0,2022,28,0.361
2697,Dietrich,Derek,518618.0,2015,25,0.332
191,Garver,Mitch,641598.0,2022,31,0.319
1384,Dozier,Hunter,641531.0,2018,26,0.304
786,Upton,Justin,457708.0,2017,29,0.361
213,Schwindel,Frank,643524.0,2022,30,0.278
2510,Victorino,Shane,425664.0,2015,34,0.275
495,Carter,Chris,474892.0,2016,29,0.368


In [196]:
# Form hypothesis test using a sample of players who weigh X lbs

#Step 1: Find Primes for every player
#Do this by grouping by player, and gathering their age for their prime year by woba

# Finding the best season of every player by using their highest xwoba since 2015
findPrimeYear = playerData.groupby(['player_id'])['xwoba'].transform('max') == playerData['xwoba']
primeYears = playerData.loc[findPrimeYear].drop_duplicates(subset=['player_id']).set_index(['player_id'])

# Filter by players that have at least 5 seasons (ideally would use more seasons but data only dates back this far)
minYears = 5
filterSeasons = playerData.groupby(['player_id']).size()
filterSeasons = filterSeasons >= minYears
primeYears = primeYears.loc[filterSeasons]

# Preparing to merge with Lahman database
primeYears[' first_name'] = primeYears[' first_name'].str.replace(" ", "")
primeYears['full_name'] = primeYears[' first_name'].str.cat(primeYears['last_name'], sep=' ')
primeYears = primeYears.drop(" first_name",axis = 1).drop("last_name",axis = 1)
primeYears.info()
# print(primeYears.loc[547989,"full_name"] == " Jose Abreu") # SO MUCH TROUBLE SHOOTING JUST TO FIGURE OUT THERES A SPACE IN FRONT OF THE NAMES

<class 'pandas.core.frame.DataFrame'>
Float64Index: 214 entries, 488726.0 to 660162.0
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        214 non-null    int64  
 1   player_age  214 non-null    int64  
 2   xwoba       214 non-null    float64
 3   full_name   214 non-null    object 
dtypes: float64(1), int64(2), object(1)
memory usage: 8.4+ KB


In [197]:
# First import data from R
weightData = pd.DataFrame(pd.read_csv("C:/Users/tmeli/OneDrive/Documents/CLASSES/Fall 2022/STAT488/weightMerge.csv"))
weightData['full_name'] = weightData['nameFirst'].str.cat(weightData['nameLast'], sep=' ')
weightData = weightData.drop("nameFirst", axis = 1).drop("nameLast", axis = 1).drop("Unnamed: 0", axis = 1).drop("finalGame", axis = 1)
weightData.info()
print(weightData.loc[:,"full_name"])
print(weightData.loc[5,"full_name"] == "Jose Abreu")
print((weightData.loc[200,"full_name"]))

finalData = primeYears.merge(weightData, on = "full_name", how = "inner")
print(finalData["player_age"].mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2958 entries, 0 to 2957
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   weight     2958 non-null   int64 
 1   full_name  2958 non-null   object
dtypes: int64(1), object(1)
memory usage: 46.3+ KB
0       David Aardsma
1       Fernando Abad
2         Cory Abbott
3        Albert Abreu
4         Bryan Abreu
            ...      
2953       Barry Zito
2954      Ben Zobrist
2955      Tyler Zuber
2956      Mike Zunino
2957        Tony Zych
Name: full_name, Length: 2958, dtype: object
True
Eduard Bazardo
28.24271844660194


In [198]:
meanWeight = finalData['weight'].mean()
sdWeight = stat.stdev(finalData['weight'])
z_score = (finalData['weight'] - meanWeight)/(sdWeight)
print("Testing for players with weights higher than " + str(meanWeight+sdWeight))
highWeight = z_score > 1 #OVer 1 std
highWeightSample = finalData.loc[highWeight]
print(highWeightSample)
meanPrimeHighWeight = highWeightSample['player_age'].mean()
print(meanPrimeHighWeight)

Testing for players with weights higher than 232.59481346027235
     year  player_age  xwoba          full_name  weight
0    2022          35  0.373         Jose Abreu     235
3    2022          30  0.463        Aaron Judge     282
8    2016          36  0.367      Albert Pujols     235
9    2016          33  0.449     Miguel Cabrera     267
11   2016          33  0.389    Kendrys Morales     242
18   2016          28  0.349       Wilson Ramos     241
28   2016          25  0.315    Jonathan Villar     233
46   2017          31  0.347      Tyler Flowers     260
47   2017          31  0.331   Asdrubal Cabrera     235
71   2017          31  0.371     Mitch Moreland     245
72   2017          27  0.395      Anthony Rizzo     240
82   2017          25  0.330    Jonathan Schoop     247
97   2017          26  0.359        Yasiel Puig     240
98   2017          24  0.398       Rhys Hoskins     245
102  2018          28  0.356      Jesus Aguilar     277
108  2018          26  0.338    Gregory 

In [199]:
ageSplit = primeYears.groupby(['player_age']).size()
# print(ageSplit)
print(sum(ageSplit)) #Total samples
meanPrime = primeYears['player_age'].mean() #Average age of prime season
sdPrime = stat.pstdev(primeYears['player_age']) #Standard deviation of average age
print(meanPrime)
print(sdPrime)

214
28.345794392523363
3.2240417071880727


In [299]:
# Conclusion
z_star = (meanPrimeHighWeight - meanPrime)/(sdPrime/(m.sqrt(len(highWeightSample))))
print(z_star)
# The sample and the population have a mean that is within almost 1/100th of a decimal.
# The difference z score for the comparison to the population is
# Due to this I am deeming a p test unnessary as the two clearly have no correlation
# This gives us insight that weight should not be taken into consideration for how
# late in the career a player may have their prime season
# Possible sources of error: Should have used full, completed careers. Decided against it since performance statistics were better after 2015. Probably shouldn't have

0.01862582303365157


# Project 2: Pitch Usage vs. Strikeout Rate

In [300]:
min4pitches = pitchUsage["unique_pitches"] >= 4
min4pitches = pitchUsage.loc[min4pitches]
meanSample = min4pitches["p_k_percent"].mean()
sdSample = stat.stdev(min3pitches['p_k_percent'])
z_star = (meanSample - meanKpercent)/(sdKpercent/m.sqrt(len(min4pitches)))
print(meanSample)
print(sdSample)
print(z_star)
print("95% CI: " + str(meanSample - 1.96* sdSample/m.sqrt(len(min4pitches))) + ", " + str(meanSample + 1.96* sdSample/m.sqrt(len(min4pitches))))
print(min4pitches["woba"].mean())

min4pitches.info()
min4pitches.sample(10)

effectSize = (meanSample-meanKpercent)/sdKpercent
print(effectSize)

21.28831168831168
3.323401871576773
-2.2420351989654077
95% CI: 20.545987573851498, 22.03063580277186
0.3085064935064937
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 10 to 1108
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   last_name           77 non-null     object 
 1    first_name         77 non-null     object 
 2   player_id           77 non-null     int64  
 3   year                77 non-null     int64  
 4   innings_pitched     77 non-null     float64
 5   p_k_percent         77 non-null     float64
 6   p_bb_percent        77 non-null     float64
 7   woba                77 non-null     float64
 8   barrel_batted_rate  77 non-null     float64
 9   ff_rate             77 non-null     float64
 10  sl_rate             52 non-null     float64
 11  ch_rate             74 non-null     float64
 12  curve_rate          71 non-null     float64
 13  si_rate             67 non-null  

p value of about 0.014
Conclusion: When a pitcher throws a larger amount of pitches at a high rate, they tend to strikeout less batters.
Inference: Pitchers who mix more pitches into their arsenal tend to be the type of pitchers who try to "fool" batters.
This results in a subset of pitchers who are more pitch to contact instead of trying to strike people out.

In [6]:
# Future Inqueries:

# Can we gather an understanding of what assessing play-by-play data is like through retrosheet.com data?
# To find out we will be attempting to assess possitional shift data.