# First Baseline Model - Random Forest

#### ``Objectives``
1. Implement a Decision Forest for run value prediction
2. Move on to a Random Forest as another baseline model

### Import Libraries

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# decision tree
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestRegressor


# random forest

# misc
import os
import glob


#### Clear output and stored data:



In [81]:
# Runs the shell `clear` command; the captured output below is just the terminal
# escape sequence plus the exit status 0.
# NOTE(review): this does not appear to reset notebook variables or cell outputs — confirm intent.
os.system('clear') 

[H[2J

0

---
#### <span style="color:chocolate">  Step 1: Data ingestion </span>

I already created the training data in another file from the
 <span style="color:gray">TrackMan data of the 2024 spring season</span>. The `load_data` function below is kept for reference and works according to the following guidelines:

 a) Read all the csv files in the directory and merge them into a single dataframe \
 b) Save the dataframe to a csv file

In [52]:
# dont need to run this again since already created the training data

def load_data(path: str, num_columns=60,
              output_path="/Users/tommayer/Desktop/games_test.csv") -> pd.DataFrame:
    """
    Loads and merges the CSV files found directly inside a directory, then optionally
    saves the merged result to disk.

    Files whose *file name* contains 'player positioning' are skipped. The check is made
    on the file name only, so a parent directory that happens to contain that phrase no
    longer causes every file to be excluded.

    Parameters:
    path (str): The directory path containing the CSV files.
    num_columns (int): How many leading columns (by position) to read from each file.
    output_path (str or None): Where to write the merged CSV. Defaults to the location the
        function has always written to; pass None to skip writing.

    Returns:
    pandas.DataFrame: The merged DataFrame containing data from the selected CSV files.
    Errors are reported with print() rather than raised; in that case None is returned.
    """
    try:
        # Ensure the directory exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"The directory '{path}' does not exist.")

        # glob.escape keeps characters such as '[' in the directory name from being
        # interpreted as wildcards; sorting makes the row order of the merge reproducible.
        pattern = os.path.join(glob.escape(path), "*.csv")
        all_files = sorted(
            file for file in glob.glob(pattern)
            if 'player positioning' not in os.path.basename(file)
        )

        # Raise an exception if no valid files are found
        if not all_files:
            raise ValueError(f"No valid CSV files found in the directory '{path}'.")

        # Positional indices of the columns to keep from every file
        columns_to_keep = list(range(num_columns))

        # Read each file once and concatenate in a single step
        merged_df = pd.concat(
            (pd.read_csv(filename, usecols=columns_to_keep) for filename in all_files),
            ignore_index=True,
        )

        # Save the merged DataFrame to a CSV unless the caller opted out
        if output_path is not None:
            merged_df.to_csv(output_path, index=False)

        return merged_df

    except (FileNotFoundError, ValueError) as known_error:
        print(f"Error: {known_error}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    # Reached only after an error was printed above
    return None


#### NOTE: 
I don't know if it's smart to load the data and concatenate all rows every time.  I could make it more like appending rows to the dataframe.

In [82]:
# Columns read from the training CSV; the same list is later used as the dropna subset.
# Each name appears once ('RelHeight' was previously listed twice).
required_columns = [
    # categorical descriptors and the target
    'TaggedPitchType', 'AutoPitchType', 'PitchCall', 'KorBB', 'TaggedHitType',
    'PlayResult', 'RunsScored',
    # release and movement measurements
    'RelSpeed', 'RelHeight', 'VertRelAngle', 'HorzRelAngle',
    'SpinRate', 'SpinAxis', 'Tilt', 'RelSide', 'Extension', 'InducedVertBreak',
    'HorzBreak', 'VertApprAngle', 'HorzApprAngle',
    # plate location and batted-ball measurements
    'PlateLocHeight', 'PlateLocSide', 'ExitSpeed', 'VertExitAngle', 'HorzExitAngle',
]

In [83]:
# Location of the pre-built training CSV. Set the TRAINING_DATA_PATH environment variable
# to point somewhere else; without it the original location is used.
path = os.environ.get("TRAINING_DATA_PATH", "/Users/tommayer/Desktop/training_data.csv")
# load_data() is not called here: it expects a directory of per-game CSVs, and the merged
# training file already exists.
data = pd.read_csv(path, usecols=required_columns)
## usecols loads only the listed columns -> far fewer columns held in memory

In [84]:
# Preview the first 90 rows to get a feel for the columns and their values
data.iloc[:90]
#print(f'Data shape: {data.shape}')

Unnamed: 0,TaggedPitchType,AutoPitchType,PitchCall,KorBB,TaggedHitType,PlayResult,RunsScored,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,Tilt,RelHeight,RelSide,Extension,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle
0,Slider,Slider,BallCalled,Undefined,Undefined,Undefined,0,86.34831,-5.087035,-0.556059,2514.190308,69.694698,8:15,6.89596,0.28503,5.20061,-1.26150,-6.71201,-12.231122,-1.751516
1,Fastball,Four-Seam,StrikeCalled,Undefined,Undefined,Undefined,0,94.49974,-3.133086,-0.492520,2095.787589,190.374426,12:15,6.87165,0.46429,5.83655,20.20828,3.49654,-5.365324,0.134391
2,Fastball,Four-Seam,FoulBallNotFieldable,Undefined,Undefined,Undefined,0,94.81021,-3.910073,-1.135525,1996.806823,178.803234,12:00,6.94572,0.21782,5.67326,22.06875,-0.43740,-5.815582,-1.213668
3,Slider,Slider,FoulBallNotFieldable,Undefined,Undefined,Undefined,0,86.30865,-1.385858,-0.791508,3480.483920,100.930240,9:15,6.96039,0.12225,5.43599,2.34610,-6.38485,-7.862841,-1.929632
4,Slider,Slider,BallCalled,Undefined,Undefined,Undefined,0,87.45870,-4.605749,-1.323250,1287.761851,79.042003,8:45,6.99938,0.15737,5.28786,0.27646,-4.37162,-11.264591,-2.102031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,Fastball,Four-Seam,InPlay,Undefined,LineDrive,Single,0,94.63629,-4.517301,-1.507178,2029.538262,197.456106,12:30,6.92356,0.56487,5.57393,20.69584,6.20142,-6.591597,-0.401267
86,Slider,Splitter,StrikeCalled,Undefined,Undefined,Undefined,0,83.52705,-0.505401,-0.216829,1488.775131,318.435956,4:30,6.89951,0.19015,5.18325,-0.98284,1.99748,-8.127062,0.137755
87,Fastball,Four-Seam,BallCalled,Undefined,Undefined,Undefined,0,94.57738,-4.566557,-0.182542,1896.476301,192.931207,12:30,6.94888,0.45423,5.48461,19.09977,4.17524,-6.898638,0.561292
88,Slider,Slider,BallCalled,Undefined,Undefined,Undefined,0,85.29116,-3.496083,-0.297355,2337.115344,125.167777,10:15,6.95349,0.18628,5.32034,2.91461,-2.70288,-9.927804,-0.778875


---
#### <span style="color:chocolate"> Step 2: Exploratory data analysis (EDA) </span>
- check for missing values
- check for duplicates
- check for outliers
- check for class imbalance


Rows to be dropped if N/A: 
- our target variables
- name, date, location, team??


In [85]:
# drop rows without certain columns
data = data.dropna(subset=required_columns)

#$# DO NOT get rid of ExitSpeed n/a's bc only on a batted ball

In [86]:
# check how many rows were dropped
print(f'Number of rows dropped: {data.shape[0] - len(data)}')


Number of rows dropped: 0


Check data types:

In [87]:
print(data.dtypes)
# Per the output below, the object (string) columns are TaggedPitchType, AutoPitchType, PitchCall,
# KorBB, TaggedHitType, PlayResult and Tilt; the remaining columns are int64/float64.

TaggedPitchType      object
AutoPitchType        object
PitchCall            object
KorBB                object
TaggedHitType        object
PlayResult           object
RunsScored            int64
RelSpeed            float64
VertRelAngle        float64
HorzRelAngle        float64
SpinRate            float64
SpinAxis            float64
Tilt                 object
RelHeight           float64
RelSide             float64
Extension           float64
InducedVertBreak    float64
HorzBreak           float64
VertApprAngle       float64
HorzApprAngle       float64
dtype: object


---
#### <span style="color:chocolate"> Step 3: Data Preprocessing </span>
- drop columns that are not useful?
- encode labels 
- split into training and testing data
- standardize data

Working with certain data types: \
a) numerical data (float, int)  \
    - scale data \
    - RelSpeed, SpinRate, InducedVertBreak, HorzBreak, ExitSpeed, etc \
    \
b) categorical data (object/string) \
    - encode data (one-hot encoding with `pd.get_dummies`, which assumes no order between categories) \
    - TaggedPitchType, AutoPitchType, PitchCall, KorBB, TaggedHitType, PlayResult, Tilt


In [88]:
def preprocess_data(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """
    One-hot encodes the categorical columns, splits the rows into train/validation/test
    sets and standardises the numerical pitch features.

    The scaler is fitted on the training split only and then applied to the validation and
    test splits, so no statistics from held-out rows influence the training features.

    Parameters:
    data (pd.DataFrame): Must contain 'RunsScored' (the target) plus the categorical and
        numerical columns listed at the top of the function body. The input frame is not modified.

    Returns:
    X_train, X_test, X_val, y_train, y_test, y_val — note that test comes before validation.
    The splits are 64% train, 16% validation and 20% test of the input rows.

    Raises:
    KeyError: if any of the expected columns is missing from `data`.
    """
    target = 'RunsScored'

    # Nominal columns: one-hot encoded because pitch types, calls, etc. have no rank order.
    # NOTE(review): PitchCall, KorBB, TaggedHitType and PlayResult look like outcomes of the
    # pitch rather than inputs; keeping them as features may leak the target — confirm.
    categorical_features = ['TaggedPitchType', 'AutoPitchType', 'PitchCall', 'KorBB',
                            'TaggedHitType', 'PlayResult', 'Tilt']

    # Continuous measurements that get standardised
    numerical_features = [
        'RelSpeed', 'RelHeight', 'SpinRate', 'SpinAxis', 'Extension', 'VertRelAngle', 'HorzRelAngle', 'RelSide',
        'InducedVertBreak', 'HorzBreak', 'VertApprAngle', 'HorzApprAngle']

    # Fail early with a readable message instead of a KeyError from deep inside pandas
    missing = [col for col in categorical_features + numerical_features + [target]
               if col not in data.columns]
    if missing:
        raise KeyError(f"preprocess_data: missing expected columns: {missing}")

    # 1. One-hot encode (returns a new frame, so the caller's DataFrame is left untouched)
    encoded = pd.get_dummies(data, columns=categorical_features)

    # 2. Split into features and target
    X = encoded.drop([target], axis=1)
    y = encoded[target]

    # 3. Split into train/test/validation sets (same seeds and proportions as before)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

    # 4. Scale numerical features using statistics from the training rows only
    scaler = StandardScaler().fit(X_train[numerical_features])

    def _scale(split: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so assigning the scaled block never writes into a view
        scaled = split.copy()
        scaled[numerical_features] = scaler.transform(split[numerical_features])
        return scaled

    X_train, X_val, X_test = _scale(X_train), _scale(X_val), _scale(X_test)

    print(f'X_train shape: {X_train.shape}')
    print(f'X_val shape: {X_val.shape}')
    print(f'X_test shape: {X_test.shape}')
    print(f'y_train shape: {y_train.shape}')
    print(f'y_val shape: {y_val.shape}')
    print(f'y_test shape: {y_test.shape}')

    return X_train, X_test, X_val, y_train, y_test, y_val

In [89]:
# Build the splits. preprocess_data returns them in the order train, test, val
# (test before validation) for both the features and the target.
X_train, X_test, X_val, y_train, y_test, y_val = preprocess_data(data)

X_train shape: (673708, 119)
X_val shape: (168428, 119)
X_test shape: (210534, 119)
y_train shape: (673708,)
y_val shape: (168428,)
y_test shape: (210534,)


---
#### <span style="color:chocolate"> Step 4: Modeling </span>
- train a decision tree
- train a random forest
- train a gradient boosting machine (XGBoost)
- compare the three models

#### Random Forest:
- ensemble method - combines multiple decision trees to make a prediction
- uses bootstrap aggregating (bagging) - trains each tree on a different bootstrap sample of the data
- uses random subspace method - trains each tree on a different random subset of the features
- reduces variance and avoids overfitting
- can handle both numerical and categorical data
- easy to understand and interpret
- prone to overfitting if not tuned properly


In [90]:
# Estimator and regression metrics used in this cell
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Create and train the model
model_rf = RandomForestRegressor(
    n_estimators=100,      # number of trees in the forest
    max_depth=8,           # cap on tree depth; deeper trees fit more detail but overfit more easily
    min_samples_split=2,   # minimum samples required to split an internal node
    random_state=1,        # fixed seed so the fitted forest is reproducible
    n_jobs=-1              # use all available CPU cores
)
# Fit on the training split only
model_rf.fit(X_train, y_train)

# Predict on the validation split
predictions_rf = model_rf.predict(X_val)

# Evaluate the model (regression metrics, not accuracy)
mse = mean_squared_error(y_val, predictions_rf)
# Take the root of the MSE directly: the `squared=False` argument was deprecated and then
# removed in newer scikit-learn releases, so this form works on every version.
rmse = np.sqrt(mse)
r2 = r2_score(y_val, predictions_rf)

print('Random Forest Performance:')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')
print(f'R² Score: {r2:.4f}')

# Feature importance: one row per feature, largest importance first
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model_rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

Random Forest Performance:
Mean Squared Error: 0.0347
Root Mean Squared Error: 0.1862
R² Score: 0.4719

Top 10 Most Important Features:
                    feature  importance
63       PlayResult_HomeRun    0.645169
64           PlayResult_Out    0.138225
58  TaggedHitType_Undefined    0.105433
60        PlayResult_Double    0.017518
51    TaggedHitType_FlyBall    0.013981
65     PlayResult_Sacrifice    0.013486
68        PlayResult_Triple    0.006691
1              VertRelAngle    0.005273
5                 RelHeight    0.004510
0                  RelSpeed    0.004456




Hyper-Parameter Tuning:
- n_estimators
- max_depth
- min_samples_split
- min_samples_leaf
- max_features


In [92]:
# Grid search is currently disabled: the whole cell is a single string literal, so none of
# the code inside it runs and the notebook only echoes the text as the cell output.
# NOTE(review): if re-enabled, the grid is 3 * 4 * 3 * 3 = 108 parameter combinations, each
# fitted with cv=5 (540 fits), and it uses random_state=42 whereas model_rf uses random_state=1.
"""from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")"""

'from sklearn.model_selection import GridSearchCV\n\nparam_grid = {\n    \'n_estimators\': [50, 100, 200],\n    \'max_depth\': [None, 10, 20, 30],\n    \'min_samples_split\': [2, 5, 10],\n    \'min_samples_leaf\': [1, 2, 4]\n}\n\ngrid_search = GridSearchCV(\n    RandomForestRegressor(random_state=42),\n    param_grid,\n    cv=5,\n    scoring=\'neg_mean_squared_error\',\n    n_jobs=-1\n)\n\ngrid_search.fit(X_train, y_train)\nprint(f"Best parameters: {grid_search.best_params_}")'

Cross-Validation:
- use cross-validation to evaluate the model's performance on the training data
- use the validation set to tune the hyperparameters
- use the test set to evaluate the final model's performance

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the configured forest on the training split.
# The scorer returns negated MSE values, one per fold.
cv_scores = cross_val_score(model_rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Flip the sign back and take the root to get one RMSE per fold
rmse_scores = np.sqrt(np.negative(cv_scores))
print(f"Cross-validation RMSE: {rmse_scores.mean():.4f} (+/- {rmse_scores.std() * 2:.4f})")