# First Baseline Model - Random Forest

#### ``Objectives``
1. Implement a Decision Forest for run value prediction
2. Move on to a Random Forest as another baseline model

### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# decision tree
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# random forest

# misc
import os
import glob


#### Clear output and stored data:



In [None]:
# Runs the shell `clear` command in a subprocess; the return code is discarded.
# NOTE(review): this presumably does not clear Jupyter cell output or any
# variables stored in the kernel — confirm whether a kernel restart is intended.
os.system('clear') 

---
#### <span style="color:chocolate">  Step 1: Data ingestion </span>

I already created the training data in another file from the <span style="color:gray">TrackMan data of the 2024 spring season</span>.
The `load_data` function below merges the raw CSV files according to the following guidelines:

 a) Read all the csv files in the directory and merge them into a single dataframe \
 b) Save the dataframe to a csv file

In [10]:
# dont need to run this again since already created the training data

def load_data(path: str, num_columns=60,
              output_path: str = "/Users/tommayer/Desktop/games_test.csv") -> pd.DataFrame:
    """
    Loads and merges CSV files from the specified directory, excluding files with
    'player positioning' in their file names, and optionally saves the result.

    Parameters:
    path (str): The directory path containing the CSV files.
    num_columns (int): Number of leading columns (by position) to read from each
        file. Every file must have at least this many columns.
    output_path (str or None): Where to write the merged CSV. Pass None to skip
        saving. Defaults to the location used historically by this notebook.

    Returns:
    pandas.DataFrame: The merged DataFrame containing data from the selected CSV files,
        concatenated in sorted file-name order with a fresh integer index.

    Raises:
    FileNotFoundError: If `path` is not an existing directory.
    ValueError: If the directory contains no CSV files left after filtering.
    """
    # Fail loudly: previously errors were printed and None was returned, which only
    # surfaced later as an unrelated AttributeError on the "DataFrame".
    if not os.path.isdir(path):
        raise FileNotFoundError(f"The directory '{path}' does not exist.")

    # Match on the file name only, so a parent directory that happens to contain
    # 'player positioning' does not exclude every file. glob.escape protects against
    # special characters such as '[' in the directory path. Sorting makes the row
    # order of the merged frame reproducible (glob order is unspecified).
    pattern = os.path.join(glob.escape(path), "*.csv")
    all_files = sorted(
        file for file in glob.glob(pattern)
        if 'player positioning' not in os.path.basename(file)
    )

    # Raise an exception if no valid files are found
    if not all_files:
        raise ValueError(f"No valid CSV files found in the directory '{path}'.")

    # Positional indices of the columns to keep
    columns_to_keep = list(range(num_columns))

    # Read and merge the filtered files with the specified columns
    df_list = [pd.read_csv(filename, usecols=columns_to_keep) for filename in all_files]
    merged_df = pd.concat(df_list, ignore_index=True)

    # Save the merged DataFrame to a CSV unless the caller opted out
    if output_path is not None:
        merged_df.to_csv(output_path, index=False)

    return merged_df


#### NOTE: 
I'm not sure it is wise to reload and concatenate every CSV file on each run. An alternative would be to append only the rows from newly added files to the existing dataframe.

In [None]:
# path = "/Users/tommayer/desktop/Tman/03/CSV"
# data = load_data(path)

In [6]:
# Load the previously created training data from disk into `data`.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# not run elsewhere without editing it.
data = pd.read_csv("/Users/tommayer/Desktop/MIDS/207/talkin-ball-with-Santerre/training_data.csv")

  data = pd.read_csv("/Users/tommayer/Desktop/MIDS/207/talkin-ball-with-Santerre/training_data.csv")


Data shape: (1068473, 167)


In [12]:
# Preview the first five rows to get a sense of the columns.
data.head(5)
# Optional: uncomment to also print the (rows, columns) shape.
#print(f'Data shape: {data.shape}')

Unnamed: 0,PitchNo,Date,Time,PAofInning,PitchofPA,Pitcher,PitcherId,PitcherThrows,PitcherTeam,Batter,...,ThrowTrajectoryZc1,ThrowTrajectoryZc2,PitchReleaseConfidence,PitchLocationConfidence,PitchMovementConfidence,HitLaunchConfidence,HitLandingConfidence,CatcherThrowCatchConfidence,CatcherThrowReleaseConfidence,CatcherThrowLocationConfidence
0,1,2024-03-02,13:32:53.75,1.0,1.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,High,High,High,,,,,
1,2,2024-03-02,13:33:10.55,1.0,2.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,High,High,High,,,,,
2,3,2024-03-02,13:33:23.01,1.0,3.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,High,High,High,Medium,Low,,,
3,4,2024-03-02,13:33:44.31,1.0,4.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,High,High,High,High,Low,,,
4,5,2024-03-02,13:34:08.96,1.0,5.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,High,High,High,,,,,


---
#### <span style="color:chocolate"> Step 2: Exploratory data analysis (EDA) </span>
- check for missing values
- check for duplicates
- check for outliers
- check for class imbalance


In [14]:
# Keep only the first 60 columns by position (the same count as load_data's
# default num_columns) and preview the result.
# NOTE(review): this rebinds `data`, so the full-width frame is no longer
# available afterwards without re-reading the CSV.
data = data.iloc[:, :60]
data.head(5)

Unnamed: 0,PitchNo,Date,Time,PAofInning,PitchofPA,Pitcher,PitcherId,PitcherThrows,PitcherTeam,Batter,...,PositionAt110X,PositionAt110Y,PositionAt110Z,Distance,LastTrackedDistance,Bearing,HangTime,pfxx,pfxz,x0
0,1,2024-03-02,13:32:53.75,1.0,1.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,,,,,,3.27315,1.04732,-0.22805
1,2,2024-03-02,13:33:10.55,1.0,2.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,,,,,,-1.92154,11.56257,-0.42614
2,3,2024-03-02,13:33:23.01,1.0,3.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,,,72.1932,,,-0.12626,12.74284,-0.12171
3,4,2024-03-02,13:33:44.31,1.0,4.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,,,75.48044,,,3.04536,1.83437,-0.0475
4,5,2024-03-02,13:34:08.96,1.0,5.0,"Quinn, JT",1000234000.0,Right,OLE_REB,"Wilmes, Ben",...,,,,,,,,1.86701,1.6478,-0.03339


Rows to be dropped if N/A: 
- our target variables
- name, date, location, team??


In [15]:
# Drop rows that are missing a value in any of the columns required downstream.
# (The original list named 'RelHeight' twice; each column is listed once here.)
required_columns = [
    'TaggedPitchType', 'AutoPitchType', 'PitchCall', 'KorBB', 'TaggedHitType',
    'PlayResult', 'RunsScored', 'RelSpeed', 'RelHeight', 'VertRelAngle', 'HorzRelAngle',
    'SpinRate', 'SpinAxis', 'Tilt', 'RelSide', 'Extension', 'InducedVertBreak',
    'HorzBreak', 'VertApprAngle', 'HorzApprAngle',
]

# Record the row count BEFORE filtering; once `data` is rebound the original
# size can no longer be recovered, so the drop count must be computed here.
rows_before_dropna = len(data)
data = data.dropna(subset=required_columns)
print(f'Number of rows dropped: {rows_before_dropna - len(data)}')

In [17]:
# check how many rows were dropped
# NOTE(review): data.shape[0] and len(data) are both the row count of the
# current (already filtered) frame, so this always prints 0. To report a real
# number, the row count has to be captured before the dropna call.
print(f'Number of rows dropped: {data.shape[0] - len(data)}')


Number of rows dropped: 0


Check data types:

In [24]:
# Print each remaining column name together with its pandas dtype.
print(data.dtypes)

PitchNo                  int64
Date                    object
Time                    object
PAofInning             float64
PitchofPA              float64
Pitcher                 object
PitcherId              float64
PitcherThrows           object
PitcherTeam             object
Batter                  object
BatterId               float64
BatterSide              object
BatterTeam              object
PitcherSet              object
Inning                   int64
Top/Bottom              object
Outs                     int64
Balls                    int64
Strikes                  int64
TaggedPitchType         object
AutoPitchType           object
PitchCall               object
KorBB                   object
TaggedHitType           object
PlayResult              object
OutsOnPlay               int64
RunsScored               int64
Notes                   object
RelSpeed               float64
VertRelAngle           float64
HorzRelAngle           float64
SpinRate               float64
SpinAxis

---
#### <span style="color:chocolate"> Step 3: Data Preprocessing </span>
- drop columns that are not useful?
- encode labels 
- split into training and testing data
- standardize data

Working with certain data types: \
a) numerical data (float, int)  \
    - scale data \
    - RelSpeed, SpinRate, InducedVertBreak, HorzBreak, ExitSpeed, etc \
    \
b) categorical data (object/string) \
    - encode data (integer label encoding with sklearn `LabelEncoder`, or one-hot encoding with `pd.get_dummies`) \
    - TaggedPitchType, AutoPitchType, PitchCall, KorBB, TaggedHitType, PlayResult


In [26]:
def preprocess_data(data: pd.DataFrame, return_validation: bool = False) -> tuple:
    """
    Encodes categorical columns, splits the data, and scales selected numerical columns.

    Steps:
      1. Label-encode PitchCall, TaggedPitchType, AutoPitchType and KorBB into new
         '<col>_encoded' columns.
      2. One-hot encode Pitcher and TaggedHitType.
      3. Split into train/test (80/20), then split train again into
         train/validation (80/20), both with random_state=0.
      4. Standardize RelSpeed, RelHeight, SpinRate and Extension using statistics
         learned from the training split only, so that no information from the
         validation or test rows leaks into the scaler.

    Parameters:
    data (pandas.DataFrame): Input frame. It is not modified; it must contain the
        columns named above plus the target column 'RunsScored'.
    return_validation (bool): If True, also return the validation split.

    Returns:
    tuple: (X_train, X_test, y_train, y_test) by default, or
        (X_train, X_val, X_test, y_train, y_val, y_test) when return_validation=True.

    Notes:
    - Only 'RunsScored' is removed from the features; the original un-encoded
      object columns are still present in X and must be dropped or encoded
      before fitting a model that requires numeric input.
    - Label encoding imposes an arbitrary integer order on nominal categories.
    """
    label_encoded_cols = ['PitchCall', 'TaggedPitchType', 'AutoPitchType', 'KorBB']
    one_hot_cols = ['Pitcher', 'TaggedHitType']
    numerical_features = ['RelSpeed', 'RelHeight', 'SpinRate', 'Extension']
    target = 'RunsScored'

    # 1. Label-encode simple categorical variables. `assign` builds a new frame,
    #    so the caller's DataFrame is left untouched.
    encoded = {
        f'{col}_encoded': LabelEncoder().fit_transform(data[col])
        for col in label_encoded_cols
    }
    data = data.assign(**encoded)

    # 2. One-hot encode nominal variables
    data = pd.get_dummies(data, columns=one_hot_cols)

    # 3. Split into features and target
    X = data.drop(columns=[target])
    y = data[target]

    # 4. Train/test split, then carve a validation set out of the training data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

    # 5. Fit the scaler on the training rows only, then apply it to every split.
    #    Fitting on the full dataset before splitting would leak test statistics.
    scaler = StandardScaler().fit(X_train[numerical_features])

    def _scaled(frame: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so we never write into a view of another frame.
        scaled = frame.copy()
        scaled[numerical_features] = scaler.transform(frame[numerical_features])
        return scaled

    X_train, X_val, X_test = _scaled(X_train), _scaled(X_val), _scaled(X_test)

    print(f'X_train shape: {X_train.shape}')
    print(f'X_val shape: {X_val.shape}')
    print(f'X_test shape: {X_test.shape}')
    print(f'y_train shape: {y_train.shape}')
    print(f'y_val shape: {y_val.shape}')
    print(f'y_test shape: {y_test.shape}')

    if return_validation:
        return X_train, X_val, X_test, y_train, y_val, y_test
    return X_train, X_test, y_train, y_test

---
#### <span style="color:chocolate"> Step 4: Modeling </span>
- train a decision tree
- train a random forest
- train a gradient boosting machine (XGBoost)
- compare the three models