# Modeling

In [115]:
# Import libraries
import re 
import ast 
import json 
import pickle 
from collections import Counter 
import datetime as dt
import pybaseball
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from pybaseball import pitching_stats_bref
from pybaseball import statcast
from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup
from pybaseball import playerid_reverse_lookup

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

- XGBoost
- Random Forest
- RNN (LSTM or GRU or both)

RNN will likely take too long depending on how the number of features.

In [116]:
data = pd.read_csv('../data/final/pitch_by_pitch_2023_ml.csv')

In [118]:
data.drop(columns=['game_type', 'inning_topbot', 'if_fielding_alignment', 'of_fielding_alignment', 'count', 'game_pitcher_id'], inplace=True)

In [119]:
data['prev_pitch_type'] = data['prev_pitch_type'].astype(str)

In [120]:
data['prev_pitch_type'] = data['prev_pitch_type'].replace('nan', 'Unknown')

In [121]:
# Drop all rows where prev_pitch_type is UN
data = data[data.prev_pitch_type != 'Unknown']

In [122]:
data['pitch_type'].value_counts()

pitch_type
Fastball         286240
Breaking Ball    150978
Off-Speed         55266
Specialty            66
Name: count, dtype: int64

In [123]:
data['prev_pitch_type'].value_counts()

prev_pitch_type
Fastball         283970
Breaking Ball    152355
Off-Speed         56159
Specialty            66
Name: count, dtype: int64

In [124]:
data['pitch_type'].isna().sum()

0

In [198]:
def preprocess_data(data):

    data.drop(columns=['game_type', 'inning_topbot', 'if_fielding_alignment', 'of_fielding_alignment', 'count', 'game_pitcher_id'], inplace=True)
    
    data['prev_pitch_type'] = data['prev_pitch_type'].replace('nan', 'Unknown')
    data = data[data.prev_pitch_type != 'Unknown']
    
    data['pitch_type'] = data['pitch_type'].astype(str)
    data['prev_pitch_type'] = data['prev_pitch_type'].astype(str)

    data = data[data.pitch_type != 'Specialty']
    data = data[data.prev_pitch_type != 'Specialty']
    
    return data

In [131]:
data2 = pd.read_csv('../data/final/five_2023_ml.csv')

In [132]:
data2.drop(columns=['game_type', 'inning_topbot', 'if_fielding_alignment', 'of_fielding_alignment', 'count', 'game_pitcher_id'], inplace=True)

In [149]:
data2['prev_pitch_type'] = data2['prev_pitch_type'].astype(str)

In [150]:
data2['pitch_type'] = data2['pitch_type'].astype(str)

In [134]:
data2['prev_pitch_type'] = data2['prev_pitch_type'].replace('nan', 'Unknown')

In [135]:
# Drop all rows where prev_pitch_type is UN
data2 = data2[data2.prev_pitch_type != 'Unknown']

In [200]:
# Drop Specialty from both pitch_type and prev_pitch_type
data2 = data2[data2.pitch_type != 'Specialty']
data2 = data2[data2.prev_pitch_type != 'Specialty']

In [172]:
data2['pitch_type'].value_counts()

pitch_type
PFastball        162370
Breaking Ball    150978
MFastball        123870
Off-Speed         55266
Specialty            66
Name: count, dtype: int64

In [173]:
data2['prev_pitch_type'].value_counts()

prev_pitch_type
PFastball        160349
Breaking Ball    152355
MFastball        123621
Off-Speed         56159
Specialty            66
Name: count, dtype: int64

## 1. XGBoost

In [184]:
def train_models(train_data, pitch_count_cutoff=1000):
    '''
    Function to train and test models for pitch prediction for individual pitchers
    
    train_data: a cleaned data frame
    pitch_count_cutoff: a minimum number of pitches thrown  
    
    returns a pickled file for each pitcher that contains the model and some metadata for that pitcher
    '''

    # build a dict with pitch_id as key and total pitch count as value
    pitcher_count_dict = dict(Counter(train_data['pitcher']))

    # drop pitchers that don't have enough pitches to build a reliable model
    pitcher_count_dict = {k:v for k, v in pitcher_count_dict.items() if v > pitch_count_cutoff}

    # list of pitchers
    pitcher_list = pitcher_count_dict.keys()
    print(f"Number of pitchers that make the cut: {len(pitcher_count_dict)}")

    # loop through the list of pitchers and train models
    accuracy_list = []
    naive_accuracy_list = []
    num_skipped = 0
    for i, pitcher in enumerate(pitcher_list):

        # Start timer
        start = dt.datetime.now()

        df_pitcher = train_data[train_data['pitcher'] == pitcher].copy()
        df_pitcher.drop('pitcher', axis=1, inplace=True)

        # Get unique pitch types for the pitcher
        pitch_types = list(set(df_pitcher['prev_pitch_type'].unique()) | set(df_pitcher['pitch_type'].unique()))
        pitch_map = {pitch_types[i]: i for i in range(len(pitch_types))}
        pitch_unmap = {v: k for k, v in pitch_map.items()}

        # Map pitch types to integers
        df_pitcher['pitch_type'] = df_pitcher['pitch_type'].map(pitch_map)
        df_pitcher['prev_pitch_type'] = df_pitcher['prev_pitch_type'].map(pitch_map)

        # Features and target
        X = df_pitcher.drop('pitch_type', axis=1)
        y = df_pitcher['pitch_type']

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4256)

        # Determine the classification type
        num_classes = len(pitch_types)
        if num_classes == 2:
            objective = 'binary:logistic'
        else:
            objective = 'multi:softmax'

        # XGBoost parameters
        xgb_params = {"max_depth": (2, 5, 20),
                  "learning_rate": (0.01, 0.1, 0.4)}

        # GridSearchCV with appropriate objective
        xgb_opt = GridSearchCV(
            XGBClassifier(objective=objective, num_class=num_classes if num_classes > 2 else None),
            param_grid=xgb_params,
            cv=5,
            scoring='accuracy',
            verbose=0,
            n_jobs=-1
        )

        # Train the model
        xgb_opt.fit(X_train, y_train)
        y_pred = xgb_opt.predict(X_test)

        # Compute accuracies
        accuracy = round(accuracy_score(y_test, y_pred) * 100, 1)
        accuracy_list.append(accuracy)

        # Compute naive accuracy
        pitch_type_counts = Counter(y_train)
        naive_accuracy = round(max(pitch_type_counts.values()) / sum(pitch_type_counts.values()) * 100, 1)
        naive_accuracy_list.append(naive_accuracy)

        # Print progress for every 10th pitcher
        if i % 10 == 0:
            print(f"\nPitcher ID: {pitcher}")
            print(f"Pitch Types: {pitch_map}")
            print(f"Objective: {objective}")
            print(f"Train Size: {X_train.shape[0]}, Test Size: {X_test.shape[0]}")
            print(f"Best Params: {xgb_opt.best_params_}")
            print(f"Naive Accuracy: {naive_accuracy}")
            print(f"XGBoost Accuracy: {accuracy}")
            print(f"Training Time: {dt.datetime.now() - start}")

        # Save the model and metadata
        model_out = {
            "pitcherID": pitcher,
            "pitch_map": pitch_map,
            "pitch_unmap": pitch_unmap,
            "model": xgb_opt,
            "model_accuracy": accuracy
        }
        fpath = f"../data/pitcher_models/{pitcher}.pkl"
        with open(fpath, 'wb') as fobj:
            pickle.dump(model_out, fobj)
           
    # return the accuracy lists so we can perform assessment 
    return accuracy_list, naive_accuracy_list

In [206]:
def train_xgb_models(train_data, pitch_count_cutoff=1000):
    '''
    Function to train and test models for pitch prediction for individual pitchers
    
    train_data: a cleaned data frame
    pitch_count_cutoff: a minimum number of pitches thrown  
    
    returns a pickled file for each pitcher that contains the model and some metadata for that pitcher
    '''

    # build a dict with pitch_id as key and total pitch count as value
    pitcher_count_dict = dict(Counter(train_data['pitcher']))

    # drop pitchers that don't have enough pitches to build a reliable model
    pitcher_count_dict = {k:v for k, v in pitcher_count_dict.items() if v > pitch_count_cutoff}

    # list of pitchers
    pitcher_list = pitcher_count_dict.keys()
    print(f"Number of pitchers that make the cut: {len(pitcher_count_dict)}")

    # loop through the list of pitchers and train models
    accuracy_list = []
    naive_accuracy_list = []
    num_skipped = 0
    for i, pitcher in enumerate(pitcher_list):

        # start a timer
        start = dt.datetime.now()

        df_pitcher = train_data[train_data['pitcher'] == pitcher]
        df_pitcher.drop('pitcher', axis=1, inplace=True)

        # get a unique list of the pitcher's pitches
        pitch_types = list(set(list(df_pitcher['prev_pitch_type'].unique()) + list(df_pitcher['pitch_type'].unique())))
        pitch_type_counts = Counter(df_pitcher['prev_pitch_type'])

        # build maps for pitches to ints and ints back to pitches
        pitch_map = {pitch_types[i]: i for i in range(len(pitch_types))}
        pitch_unmap = {v: k for k, v in pitch_map.items()}

        # map pitch types to ints
        df_pitcher['pitch_type'] = df_pitcher['pitch_type'].apply(lambda x: pitch_map[x])
        df_pitcher['prev_pitch_type'] = df_pitcher['prev_pitch_type'].apply(lambda x: pitch_map[x])

        # split the dataframe into a feature set and an outcome column
        X = df_pitcher.drop('pitch_type', axis=1)
        y = df_pitcher['pitch_type']

        # split the data into train/test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        """
        if len(np.unique(y_train)) > len(np.unique(y_test)):
            unique_classes = len(y_train.unique())
        else:
            unique_classes = len(y_test.unique())
        
        if len(np.unique(y_train)) > len(np.unique(y_test)):
            # Get all unique classes from y_train
            all_classes = np.unique(y_train)

            # Align the test labels with the training classes
            y_test = pd.Categorical(y_test, categories=all_classes).codes
        else:
            # Get all unique classes from y_test
            all_classes = np.unique(y_test)

            # Align the training labels with the test classes
            y_train = pd.Categorical(y_train, categories=all_classes).codes
        """
        # ----------------------
        # train an XGBoost model
        # ----------------------

        # small set of hyperparameters to optimize over
        xgb_params = {"max_depth": (2, 5, 20),
                      "learning_rate": (0.01, 0.1, 0.4)}
        
        # perform the paramater grid search using 5-fold cross validation
        xgb_opt = GridSearchCV(XGBClassifier(objective='multi:softprob', num_class=len(pitch_type_counts)), 
                               param_grid=xgb_params, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

        # perform fit and make predictions
        xgb_opt.fit(X_train, y_train)
        y_pred = xgb_opt.predict(X_test)
        y_prob = xgb_opt.predict_proba(X_test)

        # compute accuracy and store in a list for analyzing results later
        accuracy = round(accuracy_score(y_test, y_pred) * 100, 1)
        accuracy_list.append(accuracy)

        # get and store the naive accuracy (accuracy from just predicting the most thrown pitch)
        naive_accuracy = round(max(pitch_type_counts.values()) / sum(pitch_type_counts.values()) * 100., 1)
        naive_accuracy_list.append(naive_accuracy)

        # print some input/results for every 10th pitcher
        if i % 10 == 0:
            print()
            print(f"Pitcher ID: {pitcher}")
            print(f"Pitcher's pitch map: {pitch_map}")
            print(f"Pitcher's pitch counter: {dict(pitch_type_counts)}")
            print(f"Number of data points in training: {X_train.shape[0]}")
            print(f"Number of data points in testing: {X_test.shape[0]}")
            print(f"Best params: {xgb_opt.best_params_}")
            print(f"Total training time: {dt.datetime.now()-start}")
            print(f"Naive accuracy: {naive_accuracy}")
            print(f"XGBooost accuracy: {accuracy}")


        # ----------------------------------------------------------
        # write out the pitchers model and metadata to a pickle file
        # ----------------------------------------------------------

        # things to store in the pitcher's model file:
        #  1) the map and unmap for pitches (used for data clean-up in the prediction process)
        #  2) trained model (used to make prediction)
        #  3) accuracy on the test data (to include with pitch predictions so user can see how confident the model is)
        model_out = {
            "pitcherID": pitcher,
            "pitch_map": pitch_map,
            "pitch_unmap": pitch_unmap,
            "model": xgb_opt,
            "model_accuracy": accuracy
        }

        # pickle up the pitcher's model file
        fpath = "../data/pitcher_models/" + str(pitcher) + ".pkl"
        with open(fpath, 'wb') as fobj:
            pickle.dump(model_out, fobj)
           
    # return the accuracy lists so we can perform assessment 
    return accuracy_list, naive_accuracy_list

---

### Four categories

In [None]:
accuracy_list, naive_accuracy_list = train_xgb_models(data, pitch_count_cutoff=2000)

---

### Five categories 2023 season

In [212]:
accuracy_list, naive_accuracy_list = train_xgb_models(data2, pitch_count_cutoff=2400)

Number of pitchers that make the cut: 65

Pitcher ID: 622491
Pitcher's pitch map: {'MFastball': 0, 'Breaking Ball': 1, 'Off-Speed': 2, 'PFastball': 3}
Pitcher's pitch counter: {'MFastball': 556, 'PFastball': 1408, 'Breaking Ball': 710, 'Off-Speed': 500}
Number of data points in training: 2539
Number of data points in testing: 635
Best params: {'learning_rate': 0.1, 'max_depth': 5}
Total training time: 0:00:01.896866
Naive accuracy: 44.4
XGBooost accuracy: 80.6

Pitcher ID: 605483
Pitcher's pitch map: {'Breaking Ball': 0, 'Off-Speed': 1, 'PFastball': 2}
Pitcher's pitch counter: {'Breaking Ball': 1042, 'PFastball': 1510, 'Off-Speed': 584}
Number of data points in training: 2508
Number of data points in testing: 628
Best params: {'learning_rate': 0.1, 'max_depth': 5}
Total training time: 0:00:00.992865
Naive accuracy: 48.2
XGBooost accuracy: 94.9

Pitcher ID: 669302
Pitcher's pitch map: {'Breaking Ball': 0, 'PFastball': 1, 'MFastball': 2}
Pitcher's pitch counter: {'PFastball': 1192, 'Brea

---

### Five categories 2024 season

In [195]:
data3 = pd.read_csv('../data/final/five_2024_ml.csv')

In [197]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500605 entries, 0 to 500604
Data columns (total 32 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   pitch_type                         500605 non-null  object 
 1   release_speed                      500605 non-null  float64
 2   batter                             500605 non-null  int64  
 3   pitcher                            500605 non-null  int64  
 4   game_type                          500605 non-null  object 
 5   balls                              500605 non-null  int64  
 6   strikes                            500605 non-null  int64  
 7   on_3b                              500605 non-null  int64  
 8   on_2b                              500605 non-null  int64  
 9   on_1b                              500605 non-null  int64  
 10  outs_when_up                       500605 non-null  int64  
 11  inning                             5006

In [None]:
preprocess_data(data3)

In [219]:
data3 = data3[data3.pitch_type != 'Specialty']
data3 = data3[data3.prev_pitch_type != 'Specialty']

In [None]:
def train_xgb_models_2024(train_data, pitch_count_cutoff=1000):
    '''
    Function to train and test models for pitch prediction for individual pitchers
    
    train_data: a cleaned data frame
    pitch_count_cutoff: a minimum number of pitches thrown  
    
    returns a pickled file for each pitcher that contains the model and some metadata for that pitcher
    '''

    # build a dict with pitch_id as key and total pitch count as value
    pitcher_count_dict = dict(Counter(train_data['pitcher']))

    # drop pitchers that don't have enough pitches to build a reliable model
    pitcher_count_dict = {k:v for k, v in pitcher_count_dict.items() if v > pitch_count_cutoff}

    # list of pitchers
    pitcher_list = pitcher_count_dict.keys()
    print(f"Number of pitchers that make the cut: {len(pitcher_count_dict)}")

    # loop through the list of pitchers and train models
    accuracy_list = []
    naive_accuracy_list = []
    num_skipped = 0
    for i, pitcher in enumerate(pitcher_list):

        # start a timer
        start = dt.datetime.now()

        df_pitcher = train_data[train_data['pitcher'] == pitcher]
        df_pitcher.drop('pitcher', axis=1, inplace=True)

        # get a unique list of the pitcher's pitches
        pitch_types = list(set(list(df_pitcher['prev_pitch_type'].unique()) + list(df_pitcher['pitch_type'].unique())))
        pitch_type_counts = Counter(df_pitcher['prev_pitch_type'])

        # build maps for pitches to ints and ints back to pitches
        pitch_map = {pitch_types[i]: i for i in range(len(pitch_types))}
        pitch_unmap = {v: k for k, v in pitch_map.items()}

        # map pitch types to ints
        df_pitcher['pitch_type'] = df_pitcher['pitch_type'].apply(lambda x: pitch_map[x])
        df_pitcher['prev_pitch_type'] = df_pitcher['prev_pitch_type'].apply(lambda x: pitch_map[x])

        # split the dataframe into a feature set and an outcome column
        X = df_pitcher.drop('pitch_type', axis=1)
        y = df_pitcher['pitch_type']

        # split the data into train/test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        """
        if len(np.unique(y_train)) > len(np.unique(y_test)):
            unique_classes = len(y_train.unique())
        else:
            unique_classes = len(y_test.unique())
        
        if len(np.unique(y_train)) > len(np.unique(y_test)):
            # Get all unique classes from y_train
            all_classes = np.unique(y_train)

            # Align the test labels with the training classes
            y_test = pd.Categorical(y_test, categories=all_classes).codes
        else:
            # Get all unique classes from y_test
            all_classes = np.unique(y_test)

            # Align the training labels with the test classes
            y_train = pd.Categorical(y_train, categories=all_classes).codes
        """
        # ----------------------
        # train an XGBoost model
        # ----------------------

        # small set of hyperparameters to optimize over
        xgb_params = {"max_depth": (2, 5, 20),
                      "learning_rate": (0.01, 0.1, 0.4)}
        
        # perform the paramater grid search using 5-fold cross validation
        xgb_opt = GridSearchCV(XGBClassifier(objective='multi:softprob', num_class=len(pitch_type_counts)), 
                               param_grid=xgb_params, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

        # perform fit and make predictions
        xgb_opt.fit(X_train, y_train)
        y_pred = xgb_opt.predict(X_test)
        y_prob = xgb_opt.predict_proba(X_test)

        # compute accuracy and store in a list for analyzing results later
        accuracy = round(accuracy_score(y_test, y_pred) * 100, 1)
        accuracy_list.append(accuracy)

        # get and store the naive accuracy (accuracy from just predicting the most thrown pitch)
        naive_accuracy = round(max(pitch_type_counts.values()) / sum(pitch_type_counts.values()) * 100., 1)
        naive_accuracy_list.append(naive_accuracy)

        # print some input/results for every 10th pitcher
        if i % 10 == 0:
            print()
            print(f"Pitcher ID: {pitcher}")
            print(f"Pitcher's pitch map: {pitch_map}")
            print(f"Pitcher's pitch counter: {dict(pitch_type_counts)}")
            print(f"Number of data points in training: {X_train.shape[0]}")
            print(f"Number of data points in testing: {X_test.shape[0]}")
            print(f"Best params: {xgb_opt.best_params_}")
            print(f"Total training time: {dt.datetime.now()-start}")
            print(f"Naive accuracy: {naive_accuracy}")
            print(f"XGBooost accuracy: {accuracy}")


        # ----------------------------------------------------------
        # write out the pitchers model and metadata to a pickle file
        # ----------------------------------------------------------

        # things to store in the pitcher's model file:
        #  1) the map and unmap for pitches (used for data clean-up in the prediction process)
        #  2) trained model (used to make prediction)
        #  3) accuracy on the test data (to include with pitch predictions so user can see how confident the model is)
        model_out = {
            "pitcherID": pitcher,
            "pitch_map": pitch_map,
            "pitch_unmap": pitch_unmap,
            "model": xgb_opt,
            "model_accuracy": accuracy
        }

        # pickle up the pitcher's model file
        fpath = "../data/pitcher_models/2024/" + str(pitcher) + ".pkl"
        with open(fpath, 'wb') as fobj:
            pickle.dump(model_out, fobj)
           
    # return the accuracy lists so we can perform assessment 
    return accuracy_list, naive_accuracy_list

In [224]:
accuracy_list, naive_accuracy_list = train_xgb_models_2024(data3, pitch_count_cutoff=2400)

Number of pitchers that make the cut: 74


ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/learn-env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/learn-env/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/learn-env/lib/python3.12/site-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [0 2 3]


---

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Encode target (pitch_type)
target = 'pitch_type'
features = [col for col in data.columns if col not in [target, 'pitcher']]  # Exclude 'pitcher' for looping

label_encoder = LabelEncoder()
data[target] = label_encoder.fit_transform(data[target])
data['prev_pitch_type'] = label_encoder.fit_transform(data['prev_pitch_type'])

def train_xgboost_with_gridsearch(data, target, features, min_pitches=0):
    """
    Train XGBoost models with GridSearchCV for each pitcher above a certain number of pitches.
    
    Args:
    - data (DataFrame): The full dataset.
    - target (str): The target variable (e.g., 'pitch_type').
    - features (list): List of feature columns.
    - min_pitches (int): Minimum number of pitches required for a pitcher to be included.
    
    Returns:
    - pitcher_results (list): List of dictionaries with results for each pitcher.
    """
    # Filter pitchers based on the number of pitches
    pitcher_counts = data['pitcher'].value_counts()
    valid_pitchers = pitcher_counts[pitcher_counts >= min_pitches].index
    filtered_data = data[data['pitcher'].isin(valid_pitchers)]
    
    pitcher_results = []
    for pitcher_id in filtered_data['pitcher'].unique():
        print(f"\nProcessing pitcher: {pitcher_id}")
        
        # Filter data for the current pitcher
        pitcher_data = filtered_data[filtered_data['pitcher'] == pitcher_id]
        
        # Split data into train and test
        X = pitcher_data[features]
        y = pitcher_data[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        unique_classes_in_test = np.unique(y_test)
        
        # Define the pipeline
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Feature scaling
            ('xgb', XGBClassifier(eval_metric='mlogloss'))  # XGBoost model
        ])
        
        # Define the hyperparameter grid
        param_grid = {
            'xgb__n_estimators': [50, 100, 150],
            'xgb__max_depth': [3, 5, 7],
            'xgb__learning_rate': [0.01, 0.1, 0.2],
            'xgb__subsample': [0.8, 1.0],
            'xgb__colsample_bytree': [0.8, 1.0],
        }
        
        # GridSearchCV
        grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        # Best parameters and performance
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
        print(f"Best Params for Pitcher {pitcher_id}: {best_params}")
        
        # Evaluate the best model on the test set
        y_pred = best_model.predict(X_test)
        target_names = [str(cls) for cls in label_encoder.classes_[unique_classes_in_test]]
        report = classification_report(y_test, y_pred, target_names=target_names, labels=unique_classes_in_test, zero_division=0)
        print(report)
        
        # Append results
        pitcher_results.append({
            'pitcher_id': pitcher_id,
            'num_pitches': len(pitcher_data),
            'best_params': best_params,
            'classification_report': report
        })
    
    return pitcher_results

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='xgboost')

In [None]:
results = train_xgboost_with_gridsearch(data, target, features, min_pitches=2000)