In [14]:
# import statements
import numpy as np
import pandas as pd
import pybaseball as pyball
import matplotlib as plt
import math

pyball.cache.enable()

In [15]:
# pulling in statcast data
statcast_2021_raw = pyball.statcast('2021-04-01', '2021-10-04')

# pull in fences data from CSV
fences = pd.read_csv('/Users/wrowley/Desktop/python-projects/little-league/fence_heights_complete.csv')

This is a large query, it may take a moment to complete


100%|██████████| 187/187 [00:09<00:00, 19.38it/s]


In [16]:
# data cleaning
hit_balls = statcast_2021_raw[statcast_2021_raw['description'] == 'hit_into_play']
potential_hrs = hit_balls[hit_balls['hit_distance_sc'] > 300].copy()
potential_hrs['uID'] = range(1, len(potential_hrs) + 1)
potential_hrs['actual_hr'] = potential_hrs['events'].copy() == 'home_run'

# selecting just the columns we need
potential_hrs = potential_hrs[['uID',
                               'launch_speed', 
                               'launch_angle',
                               'events',
                               'plate_z',
                               'hit_distance_sc', 
                               'hc_x', 
                               'hc_y',
                               'home_team',
                               'actual_hr']].dropna()

# we also want a list of all stadiums
mlb_teams = fences['team_abbr'].unique()


In [17]:

def height_at_d(data, distance):
    """
    This function is to calculate the projected height at a desired distance away from the plate 
    of a ball, given statcast data. We will use this to calculate whether or not a ball is
    higher than the fence of a particular field

    CREDIT FOR ALL CALCULATIONS GOES TO danmorse314 ON GITHUB. THIS IS A RECREATION OF HIS METHODS
    FOR PROJECTING THE OUTCOMES OF HOMERUNS IN DIFFERENT STADIUMS

    Inputs: statcast data (one row)
            desired distance to get the height at
    Outputs: Height (in feet) at that distance
    """
    # if the hit distance is less than the wall, we just return 0 as we know its not a HR
    if distance > data['hit_distance_sc']:
        return 0
    # pre-calculations on our data
    g = -32
    launch_angle_radians = math.radians(data['launch_angle'])
    launch_speed_fts = (data['launch_speed'] * 5280) / 3600
    V_x = launch_speed_fts * math.cos(launch_angle_radians)
    V_y = launch_speed_fts * math.sin(launch_angle_radians)
    # time for projections using ~physics~
    total_time = -(V_y + math.sqrt((V_y ** 2) + (2 * g * data['plate_z']))) / g
    print("total time " + str(total_time))
    A_x = ((-2 * V_x) / total_time) + ((2 * data['hit_distance_sc']) / (total_time ** 2))
    time_at_wall = (-V_x + math.sqrt(V_x ** 2 + 2 * A_x * distance)) / A_x
    # sanity check adjustments
    check_time_at_wall = (-V_x + math.sqrt(V_x ** 2 + 2 * A_x * data['hit_distance_sc'])) / A_x
    print("Check " + str(check_time_at_wall))
    if (round(total_time,1) != round(check_time_at_wall,1)):
        print("sanity failed")
        time_at_wall = 2 * total_time - ((-V_x - math.sqrt(V_x ** 2 + 2 * A_x * distance)) / A_x)
    print("time at wall: " + str(time_at_wall))
    height_at_wall = ((V_y * time_at_wall) + (.5 * g * (time_at_wall ** 2)))
    return height_at_wall

# Testing
# SANITY CHECK - does projected distance roughly equal height at 0
# Sanity check checks out for the most part
# test_set_1 = potential_hrs[(potential_hrs['uID'] == 519) | 
#                            (potential_hrs['uID'] == 92) |
#                            (potential_hrs['uID'] == 877)].copy()
test_set_1 = potential_hrs.iloc[0:1000].copy()
test_set_1['calculated_height'] = test_set_1.apply(lambda row : height_at_d(row, row['hit_distance_sc'] - 2), axis=1) 


total time 4.049732706761632
Check 4.049732706761632
time at wall: 3.9924908814616784
total time 5.708977991638701
Check 5.7089779916387045
time at wall: 5.531617144293765
total time 2.9756275141390174
Check 2.975627514139017
time at wall: 2.9530040481331365
total time 3.528688480945749
Check 3.528688480945748
time at wall: 3.491801480991387
total time 5.05809772809869
Check 5.058097728098688
time at wall: 4.981369829925172
total time 5.743491675658198
Check 5.743491675658194
time at wall: 5.611187656628759
total time 5.407673701666711
Check 5.40767370166671
time at wall: 5.281661383844268
total time 3.831300336305209
Check 3.8313003363052096
time at wall: 3.7891769414327086
total time 3.886903993783357
Check 3.8869039937833567
time at wall: 3.8534073226805665
total time 3.1440161615510984
Check 3.144016161551099
time at wall: 3.1214467767320815
total time 5.454491353826221
Check 5.454491353826223
time at wall: 5.371188872989812
total time 6.2317812017067205
Check 6.231781201706725
tim

In [18]:
# TODO function get_closest_wall | inp = statcast data , stadium| opt = height, distance of wall
def get_closest_wall(data, team_name):
    # a quirk of our data - WSH is WAS in the fence data
    if team_name == 'WSH':
        team_name = 'WAS'
    # preparing data
    this_stadium_fences = fences[fences['team_abbr'] == team_name].copy()
    spray_angle = round(
        (math.atan(
            (data['hc_x'] - 125)/(199 - data['hc_y'])) * 180/ math.pi * .75
        )
    ,1)
    # calculating closest wall
    this_stadium_fences['delta'] = abs(this_stadium_fences['spray_angle_stadia'] - spray_angle) 
    closest_wall = this_stadium_fences[this_stadium_fences['delta'] == this_stadium_fences['delta'].min()]
    return (closest_wall['fence_height'].iloc[0], closest_wall['d_wall'].iloc[0])

# TEST 
# Sanity check with the NYY
test_set_2 = potential_hrs[potential_hrs['home_team'] == 'NYY'].copy()
test_set_2 = test_set_2.iloc[0:10]
test_set_2['closest_wall_height'] = test_set_2.apply(lambda row : get_closest_wall(row, row['home_team'])[1], axis=1) 

In [19]:
# TODO would it dong | inp = statcast data , stadium | opt = boolean
def would_it_dong(data, stadium): 
    fence = get_closest_wall(data, stadium)
    fence_height = fence[0]
    fence_distance = fence[1]
    ball_height_at_fence = height_at_d(data, fence_distance)
    return ball_height_at_fence > fence_height 

In [20]:
# Creating test set 1

# Sanity check - ensure that all calculations agree with the homeruns in their park
test_set = potential_hrs.copy()
test_set['calculated_hr'] = test_set.apply(lambda row : would_it_dong(row, row['home_team']), axis=1) 


total time 3.831300336305209
Check 3.8313003363052096
time at wall: 2.925801859569885
total time 5.454491353826221
Check 5.454491353826223
time at wall: 5.229672456880084
total time 3.8742236885245456
Check 3.8742236885245456
time at wall: 3.773328815769537
total time 3.4598613005816468
Check 3.4598613005816476
time at wall: 2.7922795288820117
total time 3.2792497074674465
Check 3.2792497074674456
time at wall: 3.059238414836996
total time 5.785102445229379
Check 5.785102445229372
time at wall: 5.361422372105011
total time 5.615549039997301
Check 5.615549039997301
time at wall: 4.775099313813712
total time 3.7049886304861834
Check 3.704988630486182
time at wall: 3.485922732841104
total time 4.646269808250393
Check 4.646269808250393
time at wall: 3.3095441932081386
total time 4.731114260990847
Check 4.731114260990847
time at wall: 4.226435061438037
total time 5.218392138708186
Check 5.218392138708191
time at wall: 5.138854118125523
total time 3.820795269549763
Check 3.820795269549762
ti

In [21]:
# Verifying test set

# We now check number of homeuns in their own stadium that are not calculated homeruns
print("Actual Homeruns: " + str(test_set[test_set['actual_hr'] == True].shape[0]))
print("Actual Homeruns that are calced HRs: " + str(test_set[(test_set['actual_hr'] == True) & (test_set['calculated_hr'] == True)].shape[0]))
print("Non Homeruns: " + str(test_set[test_set['actual_hr'] == False].shape[0]))
print("Non Homeruns that are calced HRs: " + str(test_set[(test_set['actual_hr'] == False) & (test_set['calculated_hr'] == True)].shape[0]))

# Verify that the inaccuracy isnt stadium specific
test_counts = test_set[test_set['actual_hr'] == True].groupby(['home_team', 'calculated_hr'])['home_team', 'calculated_hr'].size().reset_index()

error_rates = pd.DataFrame()

for team in mlb_teams:
    if team == 'WAS':
        team = 'WSH'
    df = test_counts[test_counts['home_team'] == team].copy()
    df['stadium'] = team
    df['rate'] = df.iloc[0, 2] / (df.iloc[1, 2] + df.iloc[0, 2])
    error_rates = pd.concat([error_rates, df[['stadium', 'rate']].head(1)])

        


Actual Homeruns: 5893
Actual Homeruns that are calced HRs: 5448
Non Homeruns: 20956
Non Homeruns that are calced HRs: 193


  test_counts = test_set[test_set['actual_hr'] == True].groupby(['home_team', 'calculated_hr'])['home_team', 'calculated_hr'].size().reset_index()


In [None]:
# ok, creating our big dataset
calculated_hrs = pd.DataFrame()

for team in mlb_teams:
    # make our calculations
    df = potential_hrs.copy()
    df['calculated_hr'] = df.apply(lambda row : would_it_dong(row, team), axis=1) 
    # clean data and return
    df['stadium'] = team
    calculated_hrs = pd.concat([calculated_hrs, df[['stadium', 'calculated_hr']]])


total time 3.831300336305209
Check 3.8313003363052096
time at wall: 3.1670637025354687
total time 5.454491353826221
Check 5.454491353826223
time at wall: 5.135096723861005
total time 3.4598613005816468
Check 3.4598613005816476
time at wall: 3.0480605356310524
total time 3.2792497074674465
Check 3.2792497074674456
time at wall: 3.130287601119733
total time 5.615549039997301
Check 5.615549039997301
time at wall: 5.183625668568537
total time 4.646269808250393
Check 4.646269808250393
time at wall: 4.1788440280832315
total time 4.81312858541485
Check 4.813128585414849
time at wall: 4.646690326474477
total time 5.355918785009894
Check 5.355918785009894
time at wall: 4.631755061215883
total time 3.461990037506604
Check 3.4619900375066033
time at wall: 3.2248291319034164
total time 4.110805930054008
Check 4.110805930054008
time at wall: 3.6204003230847137
total time 4.530556192405913
Check 4.530556192405914
time at wall: 3.924234977863208
total time 3.802362106814291
Check 3.80236210681429
tim

KeyboardInterrupt: 

In [13]:
# getting counts of num of hrs in each stadium
all_calc_hrs = calculated_hrs[calculated_hrs['calculated_hr'] == True].copy()
counts = all_calc_hrs.groupby(['stadium'])['calculated_hr'].count().reset_index() 


SyntaxError: invalid syntax (3204499282.py, line 4)