In [None]:
%%capture
%pip install pybaseball

# General Packages
import numpy as np
import pandas as pd
import pybaseball as pyb
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

In [None]:
pitches = pyb.statcast('2022-04-01', '2022-10-01')



This is a large query, it may take a moment to complete


100%|██████████| 184/184 [13:48<00:00,  4.50s/it]


##Machine Learning

###Decision Tree

In [None]:
bieber_stats = pyb.statcast_pitcher('2022-04-01', '2022-10-01', 669456)

print('Our Dataset has {0} Pitches'.format(len(bieber_stats)))

Gathering Player Data
Our Dataset has 2825 Pitches


In [None]:
def decision_tree(data,fastball_group):
    data = data.loc[:,['pitch_name',
                       'release_speed',
                       'release_spin_rate',
                       'vx0','vy0','vz0',
                       'ax','ay','az']]
    
    data = data.dropna()
    
    def pitch_filter(x):
        if x=='2-Seam Fastball' or x=='4-Seam Fastball' or x=='Sinker':
            return 'Fastball_group'
        return x
    
    if fastball_group == True:
        data['pitch_name'] = data['pitch_name'].apply(pitch_filter)
    
    X = data.loc[:,['release_speed','vx0','vy0','vz0','ax','ay','az','release_spin_rate']]
    y = data.loc[:,['pitch_name']]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
    
    dt_model = tree.DecisionTreeClassifier(max_depth=10, min_samples_split=50)
    
    dt_model.fit(X_train, y_train)
    
    predictions = dt_model.predict(X_train)
    print('Training Score Accuracy {0}'.format(accuracy_score(predictions, y_train)))
    
    predictions = dt_model.predict(X_test)
    print('Test Score Accuracy {0}'.format(accuracy_score(predictions, y_test)))
    
    print(classification_report(predictions, y_test))

In [None]:
decision_tree(bieber_stats, fastball_group = False)

Training Score Accuracy 0.9619686800894854
Test Score Accuracy 0.9481216457960644
                 precision    recall  f1-score   support

4-Seam Fastball       0.99      1.00      0.99       194
       Changeup       1.00      0.75      0.86        12
         Cutter       0.85      0.86      0.86        86
  Knuckle Curve       0.99      0.99      0.99       111
         Slider       0.92      0.92      0.92       156

       accuracy                           0.95       559
      macro avg       0.95      0.90      0.92       559
   weighted avg       0.95      0.95      0.95       559



In [None]:
# Data = Bieber in 2022
# Treat Four-Seam, Two-Seam, and Sinker as a general fastball

decision_tree(bieber_stats, fastball_group = True)

Training Score Accuracy 0.9619686800894854
Test Score Accuracy 0.9481216457960644
                precision    recall  f1-score   support

      Changeup       1.00      0.75      0.86        12
        Cutter       0.85      0.86      0.86        86
Fastball_group       0.99      1.00      0.99       194
 Knuckle Curve       0.99      0.99      0.99       111
        Slider       0.92      0.92      0.92       156

      accuracy                           0.95       559
     macro avg       0.95      0.90      0.92       559
  weighted avg       0.95      0.95      0.95       559



###Random Foest


In [None]:
from sklearn.preprocessing import LabelEncoder
def random_forest(data,fastball_group):
    data = data.loc[:,['pitch_name',
                       'release_speed',
                       'release_spin_rate',
                       'vx0','vy0','vz0',
                       'ax','ay','az']]
    
    data = data.dropna()
    
    def pitch_filter(x):
        if x=='2-Seam Fastball' or x=='4-Seam Fastball' or x=='Sinker':
            return 'Fastball_group'
        return x
    
    if fastball_group == True:
        data['pitch_name'] = data['pitch_name'].apply(pitch_filter)
    
    X = data.loc[:,['release_speed','vx0','vy0','vz0','ax','ay','az','release_spin_rate']]
    y = data.loc[:,['pitch_name']]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
    
    dt_model = RandomForestClassifier(max_depth=10, min_samples_split=50)
    
    dt_model.fit(X_train, y_train)
    
    predictions = dt_model.predict(X_train)
    print('Training Score Accuracy {0}'.format(accuracy_score(predictions, y_train)))
    
    predictions = dt_model.predict(X_test)
    print('Test Score Accuracy {0}'.format(accuracy_score(predictions, y_test)))
    
    print(classification_report(predictions, y_test))

In [None]:
random_forest(bieber_stats, fastball_group = False)

Training Score Accuracy 0.959079283887468
Test Score Accuracy 0.964243146603099
                 precision    recall  f1-score   support

4-Seam Fastball       1.00      1.00      1.00       304
       Changeup       1.00      0.95      0.97        19
         Cutter       0.91      0.89      0.90       144
  Knuckle Curve       1.00      0.99      1.00       158
         Slider       0.92      0.94      0.93       214

       accuracy                           0.96       839
      macro avg       0.97      0.95      0.96       839
   weighted avg       0.96      0.96      0.96       839



In [None]:
random_forest(bieber_stats, fastball_group = True)

Training Score Accuracy 0.9616368286445013
Test Score Accuracy 0.9630512514898689
                precision    recall  f1-score   support

      Changeup       0.94      0.94      0.94        18
        Cutter       0.91      0.89      0.90       144
Fastball_group       1.00      1.00      1.00       305
 Knuckle Curve       1.00      0.99      1.00       158
        Slider       0.92      0.94      0.93       214

      accuracy                           0.96       839
     macro avg       0.96      0.95      0.95       839
  weighted avg       0.96      0.96      0.96       839



###SVC

In [None]:
def svc(data,fastball_group):
    data = data.loc[:,['pitch_name',
                       'release_speed',
                       'release_spin_rate',
                       'vx0','vy0','vz0',
                       'ax','ay','az']]
    
    data = data.dropna()
    
    def pitch_filter(x):
        if x=='2-Seam Fastball' or x=='4-Seam Fastball' or x=='Sinker':
            return 'Fastball_group'
        return x
    
    if fastball_group == True:
        data['pitch_name'] = data['pitch_name'].apply(pitch_filter)
    
    X = data.loc[:,['release_speed','vx0','vy0','vz0','ax','ay','az','release_spin_rate']]
    y = data.loc[:,['pitch_name']]
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
    
    dt_model = SVC(degree=3)
    
    dt_model.fit(X_train, y_train)
    
    predictions = dt_model.predict(X_train)
    print('Training Score Accuracy {0}'.format(accuracy_score(predictions, y_train)))
    
    predictions = dt_model.predict(X_test)
    print('Test Score Accuracy {0}'.format(accuracy_score(predictions, y_test)))
    
    print(classification_report(predictions, y_test))

In [None]:
svc(bieber_stats, fastball_group=False)

Training Score Accuracy 0.636241610738255
Test Score Accuracy 0.6225402504472272
                 precision    recall  f1-score   support

4-Seam Fastball       0.97      0.63      0.77       300
       Changeup       0.78      0.78      0.78         9
         Cutter       0.00      0.00      0.00         0
  Knuckle Curve       0.00      0.00      0.00         0
         Slider       0.97      0.60      0.74       250

       accuracy                           0.62       559
      macro avg       0.54      0.40      0.46       559
   weighted avg       0.97      0.62      0.76       559



In [None]:
svc(bieber_stats, fastball_group=True)

Training Score Accuracy 0.6371364653243848
Test Score Accuracy 0.6261180679785331
                precision    recall  f1-score   support

      Changeup       1.00      0.82      0.90        11
        Cutter       0.00      0.00      0.00         0
Fastball_group       0.97      0.64      0.77       298
 Knuckle Curve       0.00      0.00      0.00         0
        Slider       0.97      0.60      0.74       250

      accuracy                           0.63       559
     macro avg       0.59      0.41      0.48       559
  weighted avg       0.97      0.63      0.76       559

