In [1]:
# Suppress Pandas Warnings
import warnings
warnings.simplefilter(action='ignore')

In [2]:
#%pip install pybaseball;
import os
import re
import csv
import math
#import tweepy # %pip install tweepy
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import matplotlib.patches as patches
import seaborn as sns
from datetime import date, timedelta
import datetime as dt
#import statsapi # %pip install MLB-StatsAPI
#from pybaseball import statcast

# data research

In [3]:
# Load the tab-separated export of called pitches, then tally rows per pitch type.
df = pd.read_csv(
    "5 year raw download data with only called pitches and call type",
    sep="\t",
    encoding="utf-8",
)
df["pitch_type"].value_counts()

FF    587488
SI    284039
SL    263410
CH    158516
CU    140128
FC    101956
KC     40095
FS     20794
ST     20067
SV      4850
FA      1513
KN       543
EP       427
CS       307
FO        45
SC        22
Name: pitch_type, dtype: int64

In [4]:
# Tally of the umpire's call per pitch as recorded in "description".
df['description'].value_counts()

ball             1087477
called_strike     536806
Name: description, dtype: int64

In [5]:
# Total number of pitches (rows) in the data set.
len(df)

1624283

In [6]:
# Distribution of the call-quality label before it is recoded to 0/1.
display(df["call"].value_counts())

correct      1500923
incorrect     123360
Name: call, dtype: int64

In [7]:
# Recode the "call" label to a numeric flag: 0 = correct call, 1 = incorrect call.
# Mapping to a real integer dtype (instead of assigning ints into the existing
# string column, which leaves a mixed object column) keeps every later
# .mean()/groupby aggregation on a numeric column.
# The identity entries (0 -> 0, 1 -> 1) make the cell safe to re-run.
call_codes = df["call"].map({"correct": 0, "incorrect": 1, 0: 0, 1: 1})
if call_codes.isna().any():
    # Fail loudly rather than averaging over labels that were never recoded.
    raise ValueError("unexpected value in 'call' column; expected 'correct' or 'incorrect'")
df["call"] = call_codes.astype("int64")

In [8]:
# Overall bad-call ratio (BCR): "call" is coded 0 (correct) / 1 (incorrect) in the
# cell above, so the column mean is the share of incorrect calls.
print('5 year BCR: ',df['call'].mean())

5 year BCR:  0.07594735646435996


In [9]:
# Bad-call rate per pitch type: the group mean of the 0/1 "call" flag is the
# share of incorrect calls within each pitch type.
pt = df.loc[:, ["pitch_type", "call"]]
pt.groupby("pitch_type").mean()

Unnamed: 0_level_0,call
pitch_type,Unnamed: 1_level_1
CH,0.057672
CS,0.042345
CU,0.067717
EP,0.11007
FA,0.07733
FC,0.08029
FF,0.082587
FO,0.066667
FS,0.050976
KC,0.066317


In [10]:
# WORST at calling FF (0.082587) and SI (0.087315)
# BEST at calling CH (0.057672) and FS (0.050976)
# (ignoring the 5 outlier pitch types with the fewest occurrences)

In [11]:
# Unweighted mean of the per-pitch-type bad-call rates, computed from the data
# instead of re-typing rounded values from the table above (no transcription or
# rounding drift, and it stays correct if the input data changes).
# NOTE: every pitch type counts equally here, regardless of its sample size.
bcr_by_pitch_type = pt["call"].astype(float).groupby(pt["pitch_type"]).mean()
print(f"Avg BCR for {len(bcr_by_pitch_type)} pitch types")
float(bcr_by_pitch_type.mean())

Avg BCR for 16 pitch types


0.0819898125

In [12]:
# Drop rare pitch types (fewer than MIN_PITCHES called pitches) before averaging,
# so that tiny samples cannot dominate the unweighted mean. The rare types and
# the average are derived from the data rather than typed in by hand.
MIN_PITCHES = 1000
pitch_counts = pt["pitch_type"].value_counts()  # sorted by descending count
rare_types = list(pitch_counts[pitch_counts < MIN_PITCHES].index)
common_types = pitch_counts[pitch_counts >= MIN_PITCHES].index
pt_common = pt[pt["pitch_type"].isin(common_types)]
bcr_common = pt_common["call"].astype(float).groupby(pt_common["pitch_type"]).mean()
print(f"Remove {len(rare_types)} outliers with less than {MIN_PITCHES} pitches: {','.join(rare_types)}")
print(f"Avg BCR for {len(bcr_common)} pitch types")
float(bcr_common.mean())

Remove 5 outliers with less than 1000 pitches: KN,EP,CS,FO,SC
Avg BCR for 11 pitch types


0.06997436363636363

In [13]:
# Each pitch type's bad-call rate relative to the 11-type average (1.0 = average).
# The divisor is the value printed by the previous cell.
pt.groupby("pitch_type").mean().div(0.06997436363636363)

Unnamed: 0_level_0,call
pitch_type,Unnamed: 1_level_1
CH,0.824193
CS,0.605154
CU,0.967735
EP,1.573008
FA,1.105116
FC,1.147414
FF,1.18025
FO,0.95273
FS,0.728499
KC,0.94774


In [14]:
# Narrative summary of the cells above. The figures are typed in by hand
# (e.g. FF's 18% corresponds to the 1.18 ratio in the previous table), so they
# must be updated manually if the data changes.
print("After removing 5 pitch types with less than 1000 pitches, we get Avg BCR of 0.07 for the 11 pitch types. We see that FF are 18% more likely than average to be called incorrectly and SI is 24.8% more liekly to be called incorrectly.")

After removing 5 pitch types with less than 1000 pitches, we get Avg BCR of 0.07 for the 11 pitch types. We see that FF are 18% more likely than average to be called incorrectly and SI is 24.8% more liekly to be called incorrectly.


# Strike vs ball BCR

In [15]:
# Rows whose "type" is "S" (pitches called strikes), keeping only the columns
# needed for the bad-call ratio.
strike_bcr = df.loc[df["type"] == "S", ["call", "type"]]

In [16]:
# Correct (0) vs. incorrect (1) calls among pitches called strikes.
strike_bcr["call"].value_counts()

0    463997
1     72809
Name: call, dtype: int64

In [17]:
# Share of called strikes that were incorrect, derived from the data rather than
# from a count copied by hand out of the previous cell's output.
print("strike bad call ratio")
float((strike_bcr["call"] == 1).mean())

strike bad call ratio


0.13563372987634267

In [18]:
# Rows whose "type" is "B" (pitches called balls), keeping only the columns
# needed for the bad-call ratio.
ball_bcr = df.loc[df["type"] == "B", ["call", "type"]]

In [19]:
# Correct (0) vs. incorrect (1) calls among pitches called balls.
ball_bcr["call"].value_counts()

0    1036926
1      50551
Name: call, dtype: int64

In [20]:
# Share of called balls that were incorrect, derived from the data rather than
# from a count copied by hand out of the previous cell's output.
print("ball bad call ratio")
float((ball_bcr["call"] == 1).mean())

ball bad call ratio


0.04648466128479039

# Pitch zone

In [21]:
# List the available columns (used to locate the zone-related fields below).
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'pitch_type', 'game_date',
       'release_speed', 'release_pos_x', 'release_pos_z', 'player_name',
       'batter', 'pitcher', 'events', 'description', 'spin_dir',
       'spin_rate_deprecated', 'break_angle_deprecated',
       'break_length_deprecated', 'zone', 'des', 'game_type', 'stand',
       'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type',
       'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z',
       'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot',
       'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2',
       'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top',
       'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle',
       'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk',
       'pitcher.1', 'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5',
       'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', '

In [22]:
# Number of called pitches recorded for each zone code.
df['zone'].value_counts()

14.0    391404
13.0    265341
11.0    260498
12.0    194770
9.0      75555
8.0      66955
7.0      62657
6.0      62589
4.0      58596
5.0      54990
1.0      47305
3.0      42896
2.0      40727
Name: zone, dtype: int64

In [23]:
# Narrow the frame to the 0/1 call flag and the zone code for the per-zone analysis.
zonebcr = df.loc[:, ["call", "zone"]]

In [24]:
# Bad-call rate per zone (mean of the 0/1 "call" flag within each zone code).
zonebcr.groupby("zone").mean()

Unnamed: 0_level_0,call
zone,Unnamed: 1_level_1
1.0,0.203636
2.0,0.097626
3.0,0.208971
4.0,0.029866
5.0,0.000655
6.0,0.036987
7.0,0.114895
8.0,0.07753
9.0,0.152035
11.0,0.055528


In [25]:
# Unweighted mean of the per-zone bad-call rates, computed from the data instead
# of re-typing rounded values from the table above.
# NOTE: every zone counts equally here, regardless of how many pitches it received.
bcr_by_zone = zonebcr["call"].astype(float).groupby(zonebcr["zone"]).mean()
print(f"Avg BCR for {len(bcr_by_zone)} strike zones")
float(bcr_by_zone.mean())

Avg BCR for 13 strike zones


0.09153123076923078

In [26]:
# Each zone's bad-call rate relative to the 13-zone average (1.0 = average).
# The divisor is the value printed by the previous cell.
zonebcr.groupby("zone").mean().div(0.09153123076923078)

Unnamed: 0_level_0,call
zone,Unnamed: 1_level_1
1.0,2.22477
2.0,1.066583
3.0,2.283052
4.0,0.326288
5.0,0.007152
6.0,0.404095
7.0,1.255259
8.0,0.84703
9.0,1.661017
11.0,0.606659


In [27]:
# Narrative summary of the ratio table above. The figures are typed in by hand
# from that table (zones 1, 3, 7, 9 -> 2.22, 2.28, 1.26, 1.66) and must be
# updated manually if the data changes.
print("Zone 1 and 3 are 2.22 times and 2.28 times more likely than average to be called incorrectly")
print("Zone 7 and 9 are 26% and 66% more likely than average to be called incorrectly")

Zone 1 and 3 are 2.22 times and 2.28 times more likely than average to be called incorrectly
Zone 7 and 9 are 26% and 66% more likely than average to be called incorrectly


# Sinkers vs pitch zone

In [28]:
# Keep only the rows whose pitch type is "SI" (sinkers).
sinker = df[df["pitch_type"] == "SI"]

In [29]:
# Number of sinkers recorded for each zone code.
sinker['zone'].value_counts()

14.0    55841
13.0    53710
11.0    43250
12.0    29648
9.0     16277
8.0     15066
7.0     13515
6.0     13287
5.0     10537
4.0     10442
3.0      7944
1.0      7352
2.0      7170
Name: zone, dtype: int64

In [30]:
# Narrow the sinker rows to the 0/1 call flag and the zone code.
sibcr = sinker.loc[:, ["call", "zone"]]

In [31]:
# Bad-call rate per zone for sinkers only (mean of the 0/1 "call" flag).
sibcr.groupby("zone").mean()

Unnamed: 0_level_0,call
zone,Unnamed: 1_level_1
1.0,0.207291
2.0,0.107671
3.0,0.203046
4.0,0.035625
5.0,0.000854
6.0,0.045533
7.0,0.112616
8.0,0.085159
9.0,0.152424
11.0,0.056694


# FF vs Pitch zone

In [32]:
# Keep only the rows whose pitch type is "FF", then count them per zone code.
ff = df[df["pitch_type"] == "FF"]
ff["zone"].value_counts()

11.0    131258
12.0    103808
14.0     92131
13.0     65765
9.0      27653
6.0      24959
8.0      24065
7.0      23194
4.0      22627
5.0      19654
1.0      19382
3.0      18229
2.0      14763
Name: zone, dtype: int64

In [33]:
# Bad-call rate per zone for FF pitches only (mean of the 0/1 "call" flag).
ffbcr = ff.loc[:, ["call", "zone"]]
ffbcr.groupby("zone").mean()

Unnamed: 0_level_0,call
zone,Unnamed: 1_level_1
1.0,0.246363
2.0,0.1329
3.0,0.254978
4.0,0.031909
5.0,0.000509
6.0,0.040827
7.0,0.095025
8.0,0.062913
9.0,0.131306
11.0,0.045986


In [34]:
# Bad-call rate for FF pitches split by called result ("B" vs "S").
# Note: this rebinds `ffbcr`, which previously held the call/zone columns.
ffbcr = ff.loc[:, ["call", "type"]]
ffbcr.groupby("type").mean()

Unnamed: 0_level_0,call
type,Unnamed: 1_level_1
B,0.053259
S,0.138195
