In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings("ignore")
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, HistGradientBoostingClassifier
from xgboost.sklearn import XGBRegressor, XGBClassifier
from sklearn.preprocessing import StandardScaler
from scipy.stats import truncnorm, gamma, norm, cosine, invgamma, gennorm, ttest_ind
import scipy.special as sc
from tqdm import tqdm
import math
import torch
import pickle

In [2]:
#mount Google Drive so the project data is reachable, then read in the raw data from just 2023
from google.colab import drive
drive.mount('/drive')

raw23 = pd.read_csv('/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/data/raw23.csv')
#keep only the rows whose `type` is 'X' (NOTE(review): presumably balls put in play - confirm against the data dictionary)
raw23 = raw23[raw23['type'].eq('X')]

Mounted at /drive


In [3]:
#read in the field dims
#(one row per stadium wall point; the stadium, x, y and distance columns are used later to drop balls that would clear the Tropicana Field wall)
field_dims = pd.read_csv('/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/data/field_dims.csv')

In [4]:
#filter down to just regular season
#I named this df rays so that I could copy and paste code from the rays only testing notebook
#NOTE(review): this is a string comparison, so it assumes game_date is stored as 'YYYY-MM-DD' text - confirm
in_season = raw23.game_date > '2023-03-29'

#remove HR and plays with runners on base since we don't test our positioning on those plays
#.isna() (rather than np.isnan) also works if the runner-id columns are object or nullable dtype
bases_empty = raw23.on_3b.isna() & raw23.on_2b.isna() & raw23.on_1b.isna()
not_home_run = raw23.events != 'home_run'

#explicit copy: the column assignment below must write to rays itself, not to a slice of raw23
rays = raw23.loc[in_season & bases_empty & not_home_run].copy()

#now define the matchups (batter id - pitcher id - batter side)
rays['matchup'] = rays.batter.astype(str) + '-' + rays.pitcher.astype(str) + '-' + rays.stand

In [5]:
#we only want plays with non-null measurements
rays = rays.dropna(subset = ['hc_x', 'hc_y', 'launch_speed', 'launch_angle', 'hit_distance_sc'])

#one hot encode handedness (the new columns are 1 where the original value is 'R')
pit_hand = pd.get_dummies(rays['p_throws'], drop_first = False, dtype = int)
bat_hand = pd.get_dummies(rays['stand'], drop_first = False, dtype = int)
rays['pit_handR'] = pit_hand['R']
rays['bat_handR'] = bat_hand['R']

#hit direction: angle in degrees measured from the y axis, after re-centering the hit coordinates on (125.42, 198.27)
#NOTE(review): presumably that offset is home plate in the hc_x/hc_y coordinate system - confirm
rays['x'] = rays.hc_x.values - 125.42
rays['y'] = 198.27 - rays.hc_y.values
rays['hit_direction'] = np.arctan2(rays.x.values, rays.y.values) * 180/np.pi

#remove extreme angles.
#explicit copy so the columns added below are written to rays itself rather than to a filtered slice
rays = rays.loc[np.abs(rays.hit_direction) < 55].copy()

#launch angle type: 0 = gb (<= 10), 1 = ld (10, 25], 2 = fb (25, 50], 3 = pu (> 50)
#np.select takes the first matching condition, so the checks run from the steepest bucket down
la = rays.launch_angle.values
rays['la_type'] = np.select([la > 50, la > 25, la > 10], [3, 2, 1], default = 0)

#one 0/1 indicator column per launch angle type
for la_code, flag_col in enumerate(['is_gb', 'is_ld', 'is_fb', 'is_pu']):
    rays[flag_col] = (rays.la_type == la_code).astype(int)

In [6]:
def _load_pickle(path):
    """Unpickle one helper model from `path`, closing the file handle afterwards."""
    #NOTE(review): pickle.load can execute arbitrary code - only load model files you produced yourself
    with open(path, 'rb') as fh:
        return pickle.load(fh)

#predict the landing spots and hang times
model_t = _load_pickle('/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/helper_models/hangtime_model.pkl')
model_dist = _load_pickle('/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/helper_models/landing_dist_model.pkl')

#all three helper models are fed the same four batted-ball features, so build the matrix once
bb_features = rays[['launch_angle', 'launch_speed', 'hit_direction', 'bat_handR']].values
rays['hangtime'] = model_t.predict(bb_features)
rays['distance'] = model_dist.predict(bb_features)
#calculate landing x and y using distance and spray angle (hit_direction is in degrees)
spray_rad = np.pi * rays.hit_direction.values / 180
rays['landing_x'] = rays.distance.values * np.sin(spray_rad)
rays['landing_y'] = rays.distance.values * np.cos(spray_rad)

#also predict xwoba given a hit for all batted balls. We will use this to prioritize higher xwoba balls in our optimizer
xwoba_mod = _load_pickle('/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/helper_models/xwoba_model.pkl')
#I used fangraphs 2023 woba weights
#NOTE(review): assumes predict_proba returns four columns ordered single, double, triple, home run - confirm against xwoba_mod.classes_
rays['xwoba'] = xwoba_mod.predict_proba(bb_features) @ np.array([0.883, 1.244, 1.569, 2.004])

In [7]:
#keep only plays with a 'Standard' outfield alignment, i.e. remove non-standard outfield positioning like we did before in the sloan paper
rays = rays[rays.of_fielding_alignment.eq('Standard')]

## MLB Avg Positioning

In [8]:
df = rays.copy()

#fielders are numbered 3-9; 3-6 are the infielders. a = angle in degrees from the y axis, d = depth
fielders = range(3, 10)
infielders = range(3, 7)
pos_cols = ['a3', 'a4','a5','a6', 'd3','d4','d5','d6', 'a7', 'a8', 'a9', 'd7', 'd8', 'd9']

#initialize fielder depths and angles; these fallback values stay in place for any handedness/alignment combination not listed below
#float dtype up front: some of the overrides are fractional (e.g. -22.5), and writing those into integer columns relies on deprecated silent upcasting
df[pos_cols] = np.array([34, 12, -32,-13,111,148,119,147, -27, 0, 27, 297, 322, 294], dtype = float)

#average positioning keyed by (bat_handR, infield alignment); values are in the same order as pos_cols
avg_positions = {
    (0, 'Infield shade'): [42, 20, -22.5, -5, 126, 147, 132, 152, -27,-1,27,297,323,294],
    (0, 'Standard'): [42, 20, -27, -7.5, 124, 148, 117, 152, -30, -5, 26, 293, 329, 302],
    (1, 'Infield shade'): [28, 5, -42.5, -20, 116, 154, 125, 145, -27, 1, 27, 303,323,291],
    (1, 'Standard'): [30, 7.5, -42.5, -20, 116, 153, 118, 148, -25,5,29,300,321,293],
}
for (bat_r, alignment), values in avg_positions.items():
    df.loc[(df.bat_handR == bat_r) & (df.if_fielding_alignment == alignment), pos_cols] = np.array(values, dtype = float)

#use angle and depth to calculate coordinates
for i in fielders:
    theta = np.pi * df[f'a{i}'].values / 180
    df[f'x{i}'] = df[f'd{i}'].values * np.sin(theta)
    df[f'y{i}'] = df[f'd{i}'].values * np.cos(theta)

#calculate distance to landing spot of ball
for i in fielders:
    df[f'dist{i}'] = np.sqrt((df[f'x{i}'].values - df.landing_x.values)**2 + (df[f'y{i}'].values - df.landing_y.values)**2)

#get the closest fielder by landing spot (argmin index 0 corresponds to fielder 3, hence the + 3)
df['closest_fielder'] = np.argmin(df[[f'dist{i}' for i in fielders]].values, axis = 1) + 3
closest_fielder_cols = ['closest_fielder_x', 'closest_fielder_y', 'closest_fielder_angle', 'closest_fielder_depth', 'closest_fielder_coords_diff']
for col in closest_fielder_cols:
    df[col] = np.nan
for i in fielders:
    is_closest = df.closest_fielder == i
    df.loc[is_closest, closest_fielder_cols] = df.loc[is_closest, [f'x{i}', f'y{i}', f'a{i}', f'd{i}', f'dist{i}']].values

#calculate angular distance (signed: fielder angle minus hit direction)
for i in infielders:
    df[f'ad{i}'] = df[f'a{i}'].values - df.hit_direction.values

#get the closest infielder by angle, their angle, and their depth. We're going to ignore the catcher and pitcher because obviously their positions are fixed
df['closest_infielder'] = np.argmin(np.abs(df[[f'ad{i}' for i in infielders]].values),axis=1)  + 3
#get the corresponding angle, depth, and angular distance for that infielder
closest_infielder_cols = ['closest_infielder_angle', 'closest_infielder_depth', 'closest_infielder_angle_diff']
for col in closest_infielder_cols:
    df[col] = np.nan
for i in infielders:
    is_closest = df.closest_infielder == i
    df.loc[is_closest, closest_infielder_cols] = df.loc[is_closest, [f'a{i}', f'd{i}', f'ad{i}']].values

#outcome codes: -1 = anything not listed, 0 = in play out, 1 = single, 2 = double, 3 = triple, 4 = hr
#note that field_error is counted with the outs
out_events = ['field_out', 'force_out', 'field_error', 'grounded_into_double_play', 'sac_fly', 'fielders_choice',
              'fielders_choice_out', 'double_play', 'other_out', 'triple_play', 'sac_bunt',
              'sac_fly_double_play']
df['outcome'] = -1
df.loc[df.events.isin(out_events), 'outcome'] = 0 # in play out
for outcome_code, event in enumerate(['single', 'double', 'triple', 'home_run'], start = 1):
    df.loc[df.events == event, 'outcome'] = outcome_code

df['is_out'] = (df.outcome == 0).astype(int)

In [9]:
#remove the balls that likely get out of Tropicana Field since we optimized for the Trop
#.copy() so the new angle column is written to our own frame, not to a slice of field_dims
trop_dims = field_dims.loc[field_dims.stadium == 'Tropicana Field'].copy()
if trop_dims.empty:
    raise ValueError("field_dims has no rows for stadium 'Tropicana Field'")
#angle (degrees) of each wall point, using the same arctan2(x, y) convention as hit_direction
trop_dims['angle'] = np.arctan2(trop_dims.x.values, trop_dims.y.values)*180/np.pi

#pull the arrays out once instead of re-extracting them for every batted ball
wall_angles = trop_dims.angle.values
wall_distances = trop_dims.distance.values
#wall distance for a ball = distance at the wall point whose angle is nearest the ball's hit direction
df['wall_distance'] = [wall_distances[np.argmin(np.abs(direction - wall_angles))] for direction in df.hit_direction.values]
#keep balls predicted to land short of the wall; copy so later column assignments don't target a slice
df = df.loc[df.distance < df.wall_distance].copy()

In [10]:
#calculate expected outs given our positioning recommendations
exit_velo = df.launch_speed.values

#ground ball features: angular gap to the closest infielder, and that infielder's depth divided by the ball's speed
df['abs_angle_diff'] = np.abs(df.closest_infielder_angle_diff.values)
df['ev_angle_inter'] = exit_velo * df.abs_angle_diff.values
#NOTE(review): 1.4667 looks like an mph -> ft/s conversion, which would make bttf a travel time in seconds - confirm units
df['bttf'] = df.closest_infielder_depth.values / (exit_velo * 1.4667)

#air ball features: hang time minus the time the closest fielder needs to cover the distance to the landing spot
#NOTE(review): 27 is presumably the assumed fielder running speed - confirm
df['time_diff'] = df.hangtime.values - df.closest_fielder_coords_diff.values/27
#1 when the ball is predicted to travel past the closest fielder's depth
df['closest_fielder_backwards'] = (df.closest_fielder_depth < df.distance).astype(int)
df['back_time_inter'] = df.time_diff.values*df.closest_fielder_backwards.values

#logistic coefficients for the ground ball out model
gb_int =-2.7808253
gb_angle_diff = -0.1463456
gb_bttf = 5.86696302
gb_bttf2 = -1.31835977

#logistic coefficients for the out model applied to everything that is not a ground ball
fb_int = -2.16869748
fb_td_slope = 1.50402427

gb_logit = gb_int + gb_angle_diff * df.abs_angle_diff.values + gb_bttf * df.bttf.values + gb_bttf2 * df.bttf.values**2
fb_logit = fb_int + fb_td_slope*df.time_diff.values
gb_flag = df.is_gb.values
#is_gb is 0/1, so this selects the ground ball model for ground balls and the other model otherwise
df['xout_prob'] = gb_flag * sc.expit(gb_logit) + (1-gb_flag) * sc.expit(fb_logit)

In [11]:
#rows/columns left after the wall filter; the row count is compared across positioning schemes in Direct Comps
df.shape

(61200, 169)

In [12]:
#(mean hit probability = 1 - mean out probability, mean of hit probability x xwoba-given-a-hit) under MLB average positioning
1-df.xout_prob.mean(), ((1-df.xout_prob.values) * df.xwoba.values).mean()

(np.float64(0.31961714643666495), np.float64(0.3206047660774199))

In [13]:
#snapshot the MLB average positioning results; df is reused for the next positioning scheme
avg_df = df.copy()

## Sloan Positioning

In [14]:
#read in the sloan positioning recs, which aren't pitcher specific
#(they are joined to the batted balls on batter id and batter handedness)
pos = pd.read_csv('/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/pitcher_specific_positioning/recs_2023.csv')

In [15]:
#merge with pos to get positioning recs
#merge defaults to an inner join: batted balls with no matching (batter, handedness) rec are dropped,
#and duplicate keys in pos would duplicate batted balls
#NOTE(review): consider validate='many_to_one' if pos is meant to hold exactly one rec per batter/handedness - confirm
df = rays.merge(pos, left_on = ['batter', 'bat_handR'], right_on = ['batter_id', 'bat_handR'])

In [16]:
#fielders are numbered 3-9; 3-6 are the infielders
fielders = range(3, 10)
infielders = range(3, 7)

#calculate angles (degrees from the y axis) from each fielder's x/y coordinates
for i in fielders:
    df[f'a{i}'] = np.arctan2(df[f'x{i}'].values, df[f'y{i}'].values)*180/np.pi

#calculate depths (straight-line distance of each fielder from the origin)
for i in fielders:
    df[f'd{i}'] = np.sqrt(df[f'x{i}'].values**2 + df[f'y{i}'].values**2)

#calculate angular distance (signed: fielder angle minus hit direction)
for i in infielders:
    df[f'ad{i}'] = df[f'a{i}'].values - df.hit_direction.values

#get the closest infielder by angle, their angle, and their depth. We're going to ignore the catcher and pitcher because obviously their positions are fixed
#(argmin index 0 corresponds to fielder 3, hence the + 3)
df['closest_infielder'] = np.argmin(np.abs(df[[f'ad{i}' for i in infielders]].values),axis=1)  + 3
#get the corresponding angle, depth, and angular distance for that infielder
closest_infielder_cols = ['closest_infielder_angle', 'closest_infielder_depth', 'closest_infielder_angle_diff']
for col in closest_infielder_cols:
    df[col] = np.nan
for i in infielders:
    is_closest = df.closest_infielder == i
    df.loc[is_closest, closest_infielder_cols] = df.loc[is_closest, [f'a{i}', f'd{i}', f'ad{i}']].values

#calculate distance to landing spot of ball
for i in fielders:
    df[f'dist{i}'] = np.sqrt((df[f'x{i}'].values - df.landing_x.values)**2 + (df[f'y{i}'].values - df.landing_y.values)**2)

#get the closest fielder by landing spot
df['closest_fielder'] = np.argmin(df[[f'dist{i}' for i in fielders]].values, axis = 1) + 3
closest_fielder_cols = ['closest_fielder_x', 'closest_fielder_y', 'closest_fielder_angle', 'closest_fielder_depth', 'closest_fielder_coords_diff']
for col in closest_fielder_cols:
    df[col] = np.nan
for i in fielders:
    is_closest = df.closest_fielder == i
    df.loc[is_closest, closest_fielder_cols] = df.loc[is_closest, [f'x{i}', f'y{i}', f'a{i}', f'd{i}', f'dist{i}']].values

#outcome codes: -1 = anything not listed, 0 = in play out, 1 = single, 2 = double, 3 = triple, 4 = hr
#note that field_error is counted with the outs
out_events = ['field_out', 'force_out', 'field_error', 'grounded_into_double_play', 'sac_fly', 'fielders_choice',
              'fielders_choice_out', 'double_play', 'other_out', 'triple_play', 'sac_bunt',
              'sac_fly_double_play']
df['outcome'] = -1
df.loc[df.events.isin(out_events), 'outcome'] = 0 # in play out
for outcome_code, event in enumerate(['single', 'double', 'triple', 'home_run'], start = 1):
    df.loc[df.events == event, 'outcome'] = outcome_code

df['is_out'] = (df.outcome == 0).astype(int)

In [17]:
#remove the balls that likely get out of Tropicana Field since we optimized for the Trop
#.copy() so the new angle column is written to our own frame, not to a slice of field_dims
trop_dims = field_dims.loc[field_dims.stadium == 'Tropicana Field'].copy()
if trop_dims.empty:
    raise ValueError("field_dims has no rows for stadium 'Tropicana Field'")
#angle (degrees) of each wall point, using the same arctan2(x, y) convention as hit_direction
trop_dims['angle'] = np.arctan2(trop_dims.x.values, trop_dims.y.values)*180/np.pi

#pull the arrays out once instead of re-extracting them for every batted ball
wall_angles = trop_dims.angle.values
wall_distances = trop_dims.distance.values
#wall distance for a ball = distance at the wall point whose angle is nearest the ball's hit direction
df['wall_distance'] = [wall_distances[np.argmin(np.abs(direction - wall_angles))] for direction in df.hit_direction.values]
#keep balls predicted to land short of the wall; copy so later column assignments don't target a slice
df = df.loc[df.distance < df.wall_distance].copy()

In [18]:
#calculate expected outs given our positioning recommendations
exit_velo = df.launch_speed.values

#ground ball features: angular gap to the closest infielder, and that infielder's depth divided by the ball's speed
df['abs_angle_diff'] = np.abs(df.closest_infielder_angle_diff.values)
df['ev_angle_inter'] = exit_velo * df.abs_angle_diff.values
#NOTE(review): 1.4667 looks like an mph -> ft/s conversion, which would make bttf a travel time in seconds - confirm units
df['bttf'] = df.closest_infielder_depth.values / (exit_velo * 1.4667)

#air ball features: hang time minus the time the closest fielder needs to cover the distance to the landing spot
#NOTE(review): 27 is presumably the assumed fielder running speed - confirm
df['time_diff'] = df.hangtime.values - df.closest_fielder_coords_diff.values/27
#1 when the ball is predicted to travel past the closest fielder's depth
df['closest_fielder_backwards'] = (df.closest_fielder_depth < df.distance).astype(int)
df['back_time_inter'] = df.time_diff.values*df.closest_fielder_backwards.values

#logistic coefficients for the ground ball out model
gb_int =-2.7808253
gb_angle_diff = -0.1463456
gb_bttf = 5.86696302
gb_bttf2 = -1.31835977

#logistic coefficients for the out model applied to everything that is not a ground ball
fb_int = -2.16869748
fb_td_slope = 1.50402427

gb_logit = gb_int + gb_angle_diff * df.abs_angle_diff.values + gb_bttf * df.bttf.values + gb_bttf2 * df.bttf.values**2
fb_logit = fb_int + fb_td_slope*df.time_diff.values
gb_flag = df.is_gb.values
#is_gb is 0/1, so this selects the ground ball model for ground balls and the other model otherwise
df['xout_prob'] = gb_flag * sc.expit(gb_logit) + (1-gb_flag) * sc.expit(fb_logit)

In [19]:
#rows/columns left after the wall filter; the row count should match the other positioning schemes so the comparisons use the same sample size
df.shape

(61200, 170)

In [20]:
#(mean hit probability = 1 - mean out probability, mean of hit probability x xwoba-given-a-hit) under the sloan recs
1-df.xout_prob.mean(), ((1-df.xout_prob.values) * df.xwoba.values).mean()

(np.float64(0.30530641353796717), np.float64(0.308395007798304))

In [21]:
#snapshot the sloan positioning results; df is reused for the next positioning scheme
old_df = df.copy()

## Pitcher Specific Positioning

In [22]:
teams = ['AZ', 'TEX', 'PHI', 'HOU', 'MIN', 'ATL', 'LAD', 'BAL', 'TB', 'MIL',
       'LAA', 'SF', 'PIT', 'DET', 'SEA', 'COL', 'TOR', 'CWS', 'NYM',
       'STL', 'KC', 'BOS', 'CLE', 'NYY', 'WSH', 'CIN', 'CHC', 'MIA', 'SD',
       'OAK']
#collect each team's merged frame and concatenate once at the end,
#instead of re-copying a growing frame (seeded with an empty DataFrame) on every iteration
team_frames = []
for team in teams:
  #batted balls hit while `team` was pitching: home team in the 'Top' half, away team in the 'Bot' half
  team_pitching = ((rays.home_team == team) & (rays.inning_topbot == 'Top')) | ((rays.away_team == team) & (rays.inning_topbot == 'Bot'))
  d = rays.loc[team_pitching]

  #that team's pitcher-specific positioning recs, one file per team
  ppos = pd.read_csv(f'/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/pitcher_specific_positioning/results/{team}_pitchers.csv')
  #merge defaults to an inner join: only batted balls with a matching batter/pitcher/handedness rec are kept
  d = d.merge(ppos, on = ['batter', 'pitcher', 'bat_handR'])
  team_frames.append(d)
df = pd.concat(team_frames, ignore_index = True)

In [23]:
#fielders are numbered 3-9; 3-6 are the infielders
fielders = range(3, 10)
infielders = range(3, 7)

#calculate angles (degrees from the y axis) from each fielder's x/y coordinates
for i in fielders:
    df[f'a{i}'] = np.arctan2(df[f'x{i}'].values, df[f'y{i}'].values)*180/np.pi

#calculate depths (straight-line distance of each fielder from the origin)
for i in fielders:
    df[f'd{i}'] = np.sqrt(df[f'x{i}'].values**2 + df[f'y{i}'].values**2)

#calculate angular distance (signed: fielder angle minus hit direction)
for i in infielders:
    df[f'ad{i}'] = df[f'a{i}'].values - df.hit_direction.values

#get the closest infielder by angle, their angle, and their depth. We're going to ignore the catcher and pitcher because obviously their positions are fixed
#(argmin index 0 corresponds to fielder 3, hence the + 3)
df['closest_infielder'] = np.argmin(np.abs(df[[f'ad{i}' for i in infielders]].values),axis=1)  + 3
#get the corresponding angle, depth, and angular distance for that infielder
closest_infielder_cols = ['closest_infielder_angle', 'closest_infielder_depth', 'closest_infielder_angle_diff']
for col in closest_infielder_cols:
    df[col] = np.nan
for i in infielders:
    is_closest = df.closest_infielder == i
    df.loc[is_closest, closest_infielder_cols] = df.loc[is_closest, [f'a{i}', f'd{i}', f'ad{i}']].values

#calculate distance to landing spot of ball
for i in fielders:
    df[f'dist{i}'] = np.sqrt((df[f'x{i}'].values - df.landing_x.values)**2 + (df[f'y{i}'].values - df.landing_y.values)**2)

#get the closest fielder by landing spot
df['closest_fielder'] = np.argmin(df[[f'dist{i}' for i in fielders]].values, axis = 1) + 3
closest_fielder_cols = ['closest_fielder_x', 'closest_fielder_y', 'closest_fielder_angle', 'closest_fielder_depth', 'closest_fielder_coords_diff']
for col in closest_fielder_cols:
    df[col] = np.nan
for i in fielders:
    is_closest = df.closest_fielder == i
    df.loc[is_closest, closest_fielder_cols] = df.loc[is_closest, [f'x{i}', f'y{i}', f'a{i}', f'd{i}', f'dist{i}']].values

#outcome codes: -1 = anything not listed, 0 = in play out, 1 = single, 2 = double, 3 = triple, 4 = hr
#note that field_error is counted with the outs
out_events = ['field_out', 'force_out', 'field_error', 'grounded_into_double_play', 'sac_fly', 'fielders_choice',
              'fielders_choice_out', 'double_play', 'other_out', 'triple_play', 'sac_bunt',
              'sac_fly_double_play']
df['outcome'] = -1
df.loc[df.events.isin(out_events), 'outcome'] = 0 # in play out
for outcome_code, event in enumerate(['single', 'double', 'triple', 'home_run'], start = 1):
    df.loc[df.events == event, 'outcome'] = outcome_code

df['is_out'] = (df.outcome == 0).astype(int)

In [24]:
#remove the balls that likely get out of Tropicana Field since we optimized for the Trop
#.copy() so the new angle column is written to our own frame, not to a slice of field_dims
trop_dims = field_dims.loc[field_dims.stadium == 'Tropicana Field'].copy()
if trop_dims.empty:
    raise ValueError("field_dims has no rows for stadium 'Tropicana Field'")
#angle (degrees) of each wall point, using the same arctan2(x, y) convention as hit_direction
trop_dims['angle'] = np.arctan2(trop_dims.x.values, trop_dims.y.values)*180/np.pi

#pull the arrays out once instead of re-extracting them for every batted ball
wall_angles = trop_dims.angle.values
wall_distances = trop_dims.distance.values
#wall distance for a ball = distance at the wall point whose angle is nearest the ball's hit direction
df['wall_distance'] = [wall_distances[np.argmin(np.abs(direction - wall_angles))] for direction in df.hit_direction.values]
#keep balls predicted to land short of the wall; copy so later column assignments don't target a slice
df = df.loc[df.distance < df.wall_distance].copy()

In [25]:
#calculate expected outs given our positioning recommendations
exit_velo = df.launch_speed.values

#ground ball features: angular gap to the closest infielder, and that infielder's depth divided by the ball's speed
df['abs_angle_diff'] = np.abs(df.closest_infielder_angle_diff.values)
df['ev_angle_inter'] = exit_velo * df.abs_angle_diff.values
#NOTE(review): 1.4667 looks like an mph -> ft/s conversion, which would make bttf a travel time in seconds - confirm units
df['bttf'] = df.closest_infielder_depth.values / (exit_velo * 1.4667)

#air ball features: hang time minus the time the closest fielder needs to cover the distance to the landing spot
#NOTE(review): 27 is presumably the assumed fielder running speed - confirm
df['time_diff'] = df.hangtime.values - df.closest_fielder_coords_diff.values/27
#1 when the ball is predicted to travel past the closest fielder's depth
df['closest_fielder_backwards'] = (df.closest_fielder_depth < df.distance).astype(int)
df['back_time_inter'] = df.time_diff.values*df.closest_fielder_backwards.values

#logistic coefficients for the ground ball out model
gb_int =-2.7808253
gb_angle_diff = -0.1463456
gb_bttf = 5.86696302
gb_bttf2 = -1.31835977

#logistic coefficients for the out model applied to everything that is not a ground ball
fb_int = -2.16869748
fb_td_slope = 1.50402427

gb_logit = gb_int + gb_angle_diff * df.abs_angle_diff.values + gb_bttf * df.bttf.values + gb_bttf2 * df.bttf.values**2
fb_logit = fb_int + fb_td_slope*df.time_diff.values
gb_flag = df.is_gb.values
#is_gb is 0/1, so this selects the ground ball model for ground balls and the other model otherwise
df['xout_prob'] = gb_flag * sc.expit(gb_logit) + (1-gb_flag) * sc.expit(fb_logit)

In [26]:
#rows/columns left after the wall filter; the row count should match the other positioning schemes so the comparisons use the same sample size
df.shape

(61200, 170)

In [27]:
#(mean hit probability = 1 - mean out probability, mean of hit probability x xwoba-given-a-hit) under the pitcher specific recs
1-df.xout_prob.mean(), ((1-df.xout_prob.values) * df.xwoba.values).mean()

(np.float64(0.3042366733427807), np.float64(0.307637894722835))

In [28]:
#snapshot the pitcher specific positioning results; df is reused for the next positioning scheme
pit_df = df.copy()

## Pitch-Specific Positioning

In [29]:
#collect each team's merged frame and concatenate once at the end,
#instead of re-copying a growing frame (seeded with an empty DataFrame) on every iteration
team_frames = []
for team in teams:
  #batted balls hit while `team` was pitching: home team in the 'Top' half, away team in the 'Bot' half
  team_pitching = ((rays.home_team == team) & (rays.inning_topbot == 'Top')) | ((rays.away_team == team) & (rays.inning_topbot == 'Bot'))
  d = rays.loc[team_pitching]

  #that team's pitch-type-specific positioning recs, one file per team
  ppos = pd.read_csv(f'/drive/My Drive/Colab Notebooks/sequencing_positioning_journal_paper/pitcher_specific_positioning/results/{team}_pitches.csv')
  #merge defaults to an inner join: only batted balls with a matching batter/pitcher/handedness/pitch type rec are kept
  d = d.merge(ppos, on = ['batter', 'pitcher', 'bat_handR', 'pitch_type'])
  team_frames.append(d)
#an empty teams list yields an empty frame, as the original accumulator did
df = pd.concat(team_frames, ignore_index = True) if team_frames else pd.DataFrame()

In [30]:
#fielders are numbered 3-9; 3-6 are the infielders
fielders = range(3, 10)
infielders = range(3, 7)

#calculate angles (degrees from the y axis) from each fielder's x/y coordinates
for i in fielders:
    df[f'a{i}'] = np.arctan2(df[f'x{i}'].values, df[f'y{i}'].values)*180/np.pi

#calculate depths (straight-line distance of each fielder from the origin)
for i in fielders:
    df[f'd{i}'] = np.sqrt(df[f'x{i}'].values**2 + df[f'y{i}'].values**2)

#calculate angular distance (signed: fielder angle minus hit direction)
for i in infielders:
    df[f'ad{i}'] = df[f'a{i}'].values - df.hit_direction.values

#get the closest infielder by angle, their angle, and their depth. We're going to ignore the catcher and pitcher because obviously their positions are fixed
#(argmin index 0 corresponds to fielder 3, hence the + 3)
df['closest_infielder'] = np.argmin(np.abs(df[[f'ad{i}' for i in infielders]].values),axis=1)  + 3
#get the corresponding angle, depth, and angular distance for that infielder
closest_infielder_cols = ['closest_infielder_angle', 'closest_infielder_depth', 'closest_infielder_angle_diff']
for col in closest_infielder_cols:
    df[col] = np.nan
for i in infielders:
    is_closest = df.closest_infielder == i
    df.loc[is_closest, closest_infielder_cols] = df.loc[is_closest, [f'a{i}', f'd{i}', f'ad{i}']].values

#calculate distance to landing spot of ball
for i in fielders:
    df[f'dist{i}'] = np.sqrt((df[f'x{i}'].values - df.landing_x.values)**2 + (df[f'y{i}'].values - df.landing_y.values)**2)

#get the closest fielder by landing spot
df['closest_fielder'] = np.argmin(df[[f'dist{i}' for i in fielders]].values, axis = 1) + 3
closest_fielder_cols = ['closest_fielder_x', 'closest_fielder_y', 'closest_fielder_angle', 'closest_fielder_depth', 'closest_fielder_coords_diff']
for col in closest_fielder_cols:
    df[col] = np.nan
for i in fielders:
    is_closest = df.closest_fielder == i
    df.loc[is_closest, closest_fielder_cols] = df.loc[is_closest, [f'x{i}', f'y{i}', f'a{i}', f'd{i}', f'dist{i}']].values

#outcome codes: -1 = anything not listed, 0 = in play out, 1 = single, 2 = double, 3 = triple, 4 = hr
#note that field_error is counted with the outs
out_events = ['field_out', 'force_out', 'field_error', 'grounded_into_double_play', 'sac_fly', 'fielders_choice',
              'fielders_choice_out', 'double_play', 'other_out', 'triple_play', 'sac_bunt',
              'sac_fly_double_play']
df['outcome'] = -1
df.loc[df.events.isin(out_events), 'outcome'] = 0 # in play out
for outcome_code, event in enumerate(['single', 'double', 'triple', 'home_run'], start = 1):
    df.loc[df.events == event, 'outcome'] = outcome_code

df['is_out'] = (df.outcome == 0).astype(int)

In [31]:
#remove the balls that likely get out of Tropicana Field since we optimized for the Trop
#.copy() so the new angle column is written to our own frame, not to a slice of field_dims
trop_dims = field_dims.loc[field_dims.stadium == 'Tropicana Field'].copy()
if trop_dims.empty:
    raise ValueError("field_dims has no rows for stadium 'Tropicana Field'")
#angle (degrees) of each wall point, using the same arctan2(x, y) convention as hit_direction
trop_dims['angle'] = np.arctan2(trop_dims.x.values, trop_dims.y.values)*180/np.pi

#pull the arrays out once instead of re-extracting them for every batted ball
wall_angles = trop_dims.angle.values
wall_distances = trop_dims.distance.values
#wall distance for a ball = distance at the wall point whose angle is nearest the ball's hit direction
df['wall_distance'] = [wall_distances[np.argmin(np.abs(direction - wall_angles))] for direction in df.hit_direction.values]
#keep balls predicted to land short of the wall; copy so later column assignments don't target a slice
df = df.loc[df.distance < df.wall_distance].copy()

In [32]:
#calculate expected outs given our positioning recommendations
exit_velo = df.launch_speed.values

#ground ball features: angular gap to the closest infielder, and that infielder's depth divided by the ball's speed
df['abs_angle_diff'] = np.abs(df.closest_infielder_angle_diff.values)
df['ev_angle_inter'] = exit_velo * df.abs_angle_diff.values
#NOTE(review): 1.4667 looks like an mph -> ft/s conversion, which would make bttf a travel time in seconds - confirm units
df['bttf'] = df.closest_infielder_depth.values / (exit_velo * 1.4667)

#air ball features: hang time minus the time the closest fielder needs to cover the distance to the landing spot
#NOTE(review): 27 is presumably the assumed fielder running speed - confirm
df['time_diff'] = df.hangtime.values - df.closest_fielder_coords_diff.values/27
#1 when the ball is predicted to travel past the closest fielder's depth
df['closest_fielder_backwards'] = (df.closest_fielder_depth < df.distance).astype(int)
df['back_time_inter'] = df.time_diff.values*df.closest_fielder_backwards.values

#logistic coefficients for the ground ball out model
gb_int =-2.7808253
gb_angle_diff = -0.1463456
gb_bttf = 5.86696302
gb_bttf2 = -1.31835977

#logistic coefficients for the out model applied to everything that is not a ground ball
fb_int = -2.16869748
fb_td_slope = 1.50402427

gb_logit = gb_int + gb_angle_diff * df.abs_angle_diff.values + gb_bttf * df.bttf.values + gb_bttf2 * df.bttf.values**2
fb_logit = fb_int + fb_td_slope*df.time_diff.values
gb_flag = df.is_gb.values
#is_gb is 0/1, so this selects the ground ball model for ground balls and the other model otherwise
df['xout_prob'] = gb_flag * sc.expit(gb_logit) + (1-gb_flag) * sc.expit(fb_logit)

In [33]:
#rows/columns left after the wall filter; the row count should match the other positioning schemes so the comparisons use the same sample size
df.shape

(61200, 170)

In [34]:
#(mean hit probability = 1 - mean out probability, mean of hit probability x xwoba-given-a-hit) under the pitch specific recs
1-df.xout_prob.mean(), ((1-df.xout_prob.values) * df.xwoba.values).mean()

(np.float64(0.30269306676728325), np.float64(0.30633843603239863))

## Direct Comps

In [35]:
#number of batted balls scored under each scheme: MLB average, sloan, pitcher specific, pitch specific (df)
tuple(frame.shape[0] for frame in (avg_df, old_df, pit_df, df))

(61200, 61200, 61200, 61200)

#### xBA

In [36]:
#mean hit probability (1 - mean out probability) under each scheme: MLB average, sloan, pitcher specific, pitch specific (df)
tuple(1 - frame.xout_prob.mean() for frame in (avg_df, old_df, pit_df, df))

(np.float64(0.31961714643666495),
 np.float64(0.30530641353796717),
 np.float64(0.3042366733427807),
 np.float64(0.30269306676728325))

In [37]:
#one-sided test: is the mean hit probability under the sloan recs greater than under the pitcher specific recs?
#NOTE(review): ttest_ind treats the two arrays as independent samples, but both appear to score the same batted balls;
#a paired test on row-aligned data may be more appropriate - confirm
ttest_ind(1-old_df.xout_prob.values, 1-pit_df.xout_prob.values, alternative='greater')

TtestResult(statistic=np.float64(0.7117608921525606), pvalue=np.float64(0.23830710507712255), df=np.float64(122398.0))

In [38]:
#one-sided test: is the mean hit probability under the sloan recs greater than under the pitch specific recs (df)?
#NOTE(review): ttest_ind treats the two arrays as independent samples, but both appear to score the same batted balls;
#a paired test on row-aligned data may be more appropriate - confirm
ttest_ind(1-old_df.xout_prob.values, 1-df.xout_prob.values, alternative='greater')

TtestResult(statistic=np.float64(1.7418366316544562), pvalue=np.float64(0.04076977342285315), df=np.float64(122398.0))

In [39]:
#one-sided test: is the mean hit probability under the pitcher specific recs greater than under the pitch specific recs (df)?
#NOTE(review): ttest_ind treats the two arrays as independent samples, but both appear to score the same batted balls;
#a paired test on row-aligned data may be more appropriate - confirm
ttest_ind(1-pit_df.xout_prob.values, 1-df.xout_prob.values, alternative='greater')

TtestResult(statistic=np.float64(1.0319526916334496), pvalue=np.float64(0.15104815958855267), df=np.float64(122398.0))

In [40]:
#expected hits allowed per team under each scheme: mean hit probability x batted balls, split evenly over the 30 teams
#uses each frame's own row count so the figure stays right if the upstream filters (and so the sample size) change
tuple((1 - frame.xout_prob.mean()) * frame.shape[0] / 30 for frame in (avg_df, old_df, pit_df, df))

(np.float64(652.0189787307966),
 np.float64(622.8250836174531),
 np.float64(620.6428136192727),
 np.float64(617.4938562052578))

#### xwOBA

In [41]:
#mean of (hit probability x xwoba-given-a-hit) under each scheme: MLB average, sloan, pitcher specific, pitch specific (df)
tuple(((1 - frame.xout_prob.values) * frame.xwoba.values).mean() for frame in (avg_df, old_df, pit_df, df))

(np.float64(0.3206047660774199),
 np.float64(0.308395007798304),
 np.float64(0.307637894722835),
 np.float64(0.30633843603239863))

In [42]:
#one-sided test: is the mean expected wOBA under the sloan recs greater than under the pitcher specific recs?
#NOTE(review): ttest_ind treats the two arrays as independent samples, but both appear to score the same batted balls;
#a paired test on row-aligned data may be more appropriate - confirm
ttest_ind((1-old_df.xout_prob.values) * old_df.xwoba.values, (1-pit_df.xout_prob.values) * pit_df.xwoba.values, alternative = 'greater')

TtestResult(statistic=np.float64(0.4761477295751951), pvalue=np.float64(0.3169849925557041), df=np.float64(122398.0))

In [43]:
#one-sided test: is the mean expected wOBA under the sloan recs greater than under the pitch specific recs (df)?
#NOTE(review): ttest_ind treats the two arrays as independent samples, but both appear to score the same batted balls;
#a paired test on row-aligned data may be more appropriate - confirm
ttest_ind((1-old_df.xout_prob.values) * old_df.xwoba.values, (1-df.xout_prob.values) * df.xwoba.values, alternative = 'greater')

TtestResult(statistic=np.float64(1.2944463273657514), pvalue=np.float64(0.09775686966631439), df=np.float64(122398.0))

In [44]:
#one-sided test: is the mean expected wOBA under the pitcher specific recs greater than under the pitch specific recs (df)?
#NOTE(review): ttest_ind treats the two arrays as independent samples, but both appear to score the same batted balls;
#a paired test on row-aligned data may be more appropriate - confirm
ttest_ind((1-pit_df.xout_prob.values) * pit_df.xwoba.values, (1-df.xout_prob.values) * df.xwoba.values, alternative = 'greater')

TtestResult(statistic=np.float64(0.8188127516049278), pvalue=np.float64(0.20644742433596203), df=np.float64(122398.0))

In [45]:
#calculate wRAA
def wraa(woba, pa, lg_woba=0.318, woba_scale=1.204):
    """Return weighted runs above average: ((woba - lg_woba) / woba_scale) * pa.

    woba       -- the wOBA being evaluated
    pa         -- number of plate appearances the wOBA applies to
    lg_woba    -- baseline wOBA to compare against (defaults to the 0.318 this notebook has always used)
    woba_scale -- wOBA scale divisor (defaults to the 1.204 this notebook has always used)

    NOTE(review): the defaults are presumably the 2023 league wOBA and wOBA scale - confirm against the source of the wOBA weights.
    """
    return ((woba - lg_woba) / woba_scale) * pa

In [46]:
#wRAA per team under MLB average positioning (0.3206 is the rounded xwOBA from the comparison above; 61200/30 = batted balls per team)
wraa(0.3206, 61200/30)

4.405315614617925

In [47]:
#wRAA per team under the sloan recs (0.3084 is the rounded xwOBA from the comparison above; 61200/30 = batted balls per team)
wraa(0.3084, 61200/30)

-16.265780730897006

In [48]:
#wRAA per team under the pitcher specific recs (0.3076 is the rounded xwOBA from the comparison above; 61200/30 = batted balls per team)
wraa(0.3076, 61200/30)

-17.621262458471797

In [49]:
#wRAA per team under the pitch specific recs (0.3063 is the rounded xwOBA from the comparison above; 61200/30 = batted balls per team)
wraa(0.3063, 61200/30)

-19.82392026578071