In [2]:
# import statements
import numpy as np
import pandas as pd
import pybaseball as pyball
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
import statsmodels.formula.api as smf
import math
from scipy import stats

# importing what we need for the logistic regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Switch on pybaseball's cache before any queries are issued.
# NOTE(review): presumably this keeps repeated statcast pulls from being
# re-downloaded on every run -- confirm against the pybaseball docs.
pyball.cache.enable()

In [3]:
# Pull statcast data for the two date windows that get compared
# (the frames are tagged pre/post crackdown in the next cell).
before_window = ('2021-04-04', '2021-05-30')
after_window = ('2021-06-22', '2021-09-30')

statcast_before_raw = pyball.statcast(*before_window)
statcast_after_raw = pyball.statcast(*after_window)

This is a large query, it may take a moment to complete


100%|██████████| 57/57 [00:07<00:00,  7.89it/s]


This is a large query, it may take a moment to complete


100%|██████████| 101/101 [00:09<00:00, 11.12it/s]


In [4]:
## Tag each raw frame with a post_crackdown indicator (0 = before, 1 = after),
## then stack the two frames row-wise into a single data set.
for frame, flag in ((statcast_before_raw, 0), (statcast_after_raw, 1)):
    frame['post_crackdown'] = flag

data = pd.concat([statcast_before_raw, statcast_after_raw], axis=0)

In [5]:
# Keep only the columns needed to regress fastball spin rates and drop any
# row that is missing one of them.
keep_cols = ['pitch_type', 'release_spin_rate', 'player_name', 'post_crackdown']
data = data.loc[:, keep_cols].dropna(how='any')

# Cast every column except the two text columns to 32-bit integers.
numeric_cols = [c for c in data.columns if c not in ('pitch_type', 'player_name')]
data = data.astype(dict.fromkeys(numeric_cols, np.int32))

# Rows whose pitch_type is "FF" (the fastball subset used below).
fb_data = data.loc[data['pitch_type'] == "FF"]

In [6]:
# Logit specification shared by every per-pitcher fit.
formula = "post_crackdown ~ release_spin_rate"


def get_logit_model(data, name):
    """Fit the module-level logit ``formula`` for a single pitcher.

    Parameters
    ----------
    data : DataFrame with ``player_name``, ``release_spin_rate`` and
        ``post_crackdown`` columns.
    name : value matched against ``player_name``.

    Returns
    -------
    The fitted statsmodels result for that pitcher's rows, after
    ``rm_outliers`` has trimmed ``release_spin_rate``.
    """
    is_player = data['player_name'] == name
    trimmed = rm_outliers(data[is_player], "release_spin_rate")
    return smf.logit(formula, trimmed).fit()

In [12]:
# Print a pitcher's mean spin rate before, then after, the crackdown.
def print_averages(data, name):
    """Print the mean ``release_spin_rate`` for ``name`` in each period.

    Two lines are written to stdout: first the mean over rows with
    ``post_crackdown == 0``, then the mean over rows with
    ``post_crackdown == 1``.
    """
    pitches = data.loc[data['player_name'] == name]
    for flag in (0, 1):
        in_period = pitches['post_crackdown'] == flag
        print(pitches.loc[in_period, 'release_spin_rate'].mean())

In [8]:
# function to remove outliers
def rm_outliers(df, col, lower=0.01, upper=0.99):
    """Return the rows of ``df`` whose ``col`` value lies strictly between two quantiles.

    Parameters
    ----------
    df : DataFrame to trim.
    col : name of the column the quantiles are computed on.
    lower, upper : quantile levels in [0, 1] used as the cut-offs. The
        defaults keep the 1st-99th percentile trimming.

    Returns
    -------
    DataFrame containing the rows with ``q_low < df[col] < q_hi`` (both
    bounds exclusive), with any row that still has a missing value dropped.

    Raises
    ------
    ValueError
        If ``lower`` is not smaller than ``upper``.
    """
    if not lower < upper:
        raise ValueError("lower quantile must be smaller than upper quantile")

    q_low = df[col].quantile(lower)
    q_hi = df[col].quantile(upper)

    # Strict inequalities: values equal to either cut-off are removed as well.
    in_range = (df[col] > q_low) & (df[col] < q_hi)
    return df[in_range].dropna()

In [9]:
# print results of a model

def print_logit_model(data, name):
    """Fit the per-pitcher logit on ``data`` and plot it over the raw pitches.

    Parameters
    ----------
    data : DataFrame with ``player_name``, ``release_spin_rate`` and
        ``post_crackdown`` columns. Both the model fit and the scatter are
        drawn from this frame.
    name : value matched against ``player_name``; also used as the plot title.

    Shows a matplotlib figure; returns nothing.
    """
    # Fit on the frame that was passed in, so the curve and the points
    # always describe the same pitches.
    model = get_logit_model(data, name)

    df = rm_outliers(data[data['player_name'] == name], "release_spin_rate")
    spin = df['release_spin_rate']

    # 1000 evenly spaced spin rates across the trimmed range to draw the curve
    pred_data = pd.DataFrame({
        'release_spin_rate': np.linspace(spin.min(), spin.max(), 1000)
    })
    probs = model.predict(pred_data)

    # plotting
    fig, ax = plt.subplots(figsize=(7, 7), facecolor="white")
    ax.scatter(spin, df['post_crackdown'], color='b', alpha=0.05)
    # pass the column itself (1-D) rather than the whole one-column frame
    ax.scatter(pred_data['release_spin_rate'], probs, color="black", s=4)
    ax.set_ylabel('Pre (0) Vs. Post (1) Crackdown')
    ax.set_xlabel('FB Spin Rate (RPMs)')
    ax.set_title(name)
    plt.show()

In [None]:
# Iterate over every pitcher, fit the logit on their fastballs, and collect
# the spin-rate slope, its p-value and the pseudo R-squared.

pitcher_list = data['player_name'].unique()

fb_pvalues = []
fb_coef = []
fb_names = []
fb_rs = []

for pitcher in pitcher_list:

    player_fb = fb_data[fb_data['player_name'] == pitcher]

    # Only fit pitchers with more than 100 fastballs in BOTH periods.
    n_before = int((player_fb['post_crackdown'] == 0).sum())
    n_after = int((player_fb['post_crackdown'] == 1).sum())

    if n_before > 100 and n_after > 100:

        player_logreg = get_logit_model(player_fb, pitcher)

        # Look the slope up by its term name: pvalues/params are labelled by
        # model term, and integer keys on a labelled Series are deprecated
        # in pandas.
        fb_pvalues.append(player_logreg.pvalues['release_spin_rate'])
        fb_coef.append(player_logreg.params['release_spin_rate'])
        fb_names.append(pitcher)
        fb_rs.append(player_logreg.prsquared)

# creating dfs from our final results
fb_results = pd.DataFrame({
    'p_value': fb_pvalues,
    'coef': fb_coef,
    'r_squared': fb_rs,
    'name': fb_names
})

In [11]:
# get list of cheaters
# A pitcher is flagged only when all three conditions hold: a significant
# fit (p < 0.001), a negative spin-rate coefficient, and a pseudo R-squared
# above 0.1.
is_cheater = (
    (fb_results['p_value'] < 0.001)
    & (fb_results['coef'] < 0)
    & (fb_results['r_squared'] > 0.1)
)
cheaters = fb_results[is_cheater]

# Everyone else is the exact complement of the mask, so each pitcher lands
# in exactly one of the two frames.
not_cheaters = fb_results[~is_cheater]

cheaters.to_clipboard()