# LSTM

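Experimenting with an LSTM that predicts the next pitch type from a window of preceding pitches, using the 2023 pitch-by-pitch dataset.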

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Location of the pitch-by-pitch CSV, relative to the notebook's working directory.
file_path = "../data/final/pitch_by_pitch_2023_lstm.csv"

# Read the whole file into a single DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503292 entries, 0 to 503291
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   pitch_type                         503260 non-null  object 
 1   release_speed                      503292 non-null  float64
 2   batter                             503292 non-null  int64  
 3   pitcher                            503292 non-null  int64  
 4   zone                               503292 non-null  float64
 5   game_type                          503292 non-null  object 
 6   type                               503292 non-null  object 
 7   balls                              503292 non-null  int64  
 8   strikes                            503292 non-null  int64  
 9   on_3b                              503292 non-null  int64  
 10  on_2b                              503292 non-null  int64  
 11  on_1b                              5032

In [5]:
# Remove these columns from the frame.
# errors='ignore' keeps the cell re-runnable: the trimmed frame is later written
# back over the same CSV it was loaded from, so on a subsequent fresh run the
# columns are already gone and a plain drop would raise KeyError.
data = data.drop(
    columns=['game_type', 'type', 'inning_topbot', 'if_fielding_alignment', 'of_fielding_alignment', 'count'],
    errors='ignore',
)

In [6]:
# Persist the trimmed frame without the index column.
# NOTE: this is the same path the notebook reads its input from, so the source
# CSV is overwritten in place.
data.to_csv(path_or_buf="../data/final/pitch_by_pitch_2023_lstm.csv", index=False)

In [None]:
# Preprocess the dataset
# 'pitch_type' is the prediction target; every other column is used as a feature.
target = 'pitch_type'
features = [col for col in data.columns if col != target]

# Drop rows with no recorded pitch type. data.info() reports fewer non-null
# pitch_type values than rows; depending on the scikit-learn version a missing
# label would either fail inside LabelEncoder or be encoded as a spurious class.
data = data.dropna(subset=[target]).reset_index(drop=True)

# Encode the target variable as integer class ids
label_encoder = LabelEncoder()
data[target] = label_encoder.fit_transform(data[target])

# Normalize numerical features
# NOTE(review): the scaler is fit on every row, before the train/val/test split
# further down, so statistics of the held-out rows leak into training. Consider
# fitting on the training portion only.
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences for LSTM
# Each sample is `sequence_length` consecutive rows of features; its label is
# the target value of the row immediately after that window.
sequence_length = 10

# Pull the columns out of pandas once. Selecting data[features] inside a
# per-row loop repeats a full-frame column selection on every iteration, which
# makes this step needlessly slow on the full dataset.
feature_matrix = data[features].to_numpy()
target_codes = data[target].to_numpy()

# sliding_window_view appends the window axis last, giving
# (n_windows, n_features, sequence_length). The final window has no following
# row to use as a label, so drop it, then reorder to (samples, timesteps, features).
windows = np.lib.stride_tricks.sliding_window_view(feature_matrix, sequence_length, axis=0)
X = np.ascontiguousarray(windows[:-1].transpose(0, 2, 1))
y = target_codes[sequence_length:]

# Convert target variable to categorical
y = to_categorical(y)

# Split the dataset (70% train / 15% validation / 15% test)
# Neighbouring windows differ by a single row, so a shuffled split would place
# near-identical sequences on both sides of the split and inflate the
# validation/test scores. shuffle=False keeps each partition a contiguous run
# of windows instead.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

# Build the LSTM model: two stacked LSTM layers, each followed by dropout,
# ending in a softmax over the one-hot encoded target classes.
timesteps, n_features = X_train.shape[1], X_train.shape[2]
n_classes = y.shape[1]
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(64, input_shape=(timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model, tracking the validation set after every epoch
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
)

# Evaluate the model on the held-out test windows
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Save the model (optional)
model.save("pitch_type_prediction_lstm.h5")

With this configuration and the full 2023 pitch-by-pitch dataset, the model takes too long to train locally on my laptop CPU, so the next cell uploads the data to Google Colab instead.

In [None]:
# Colab-specific cell: `google.colab` is presumably only importable inside a
# Colab runtime, so expect this import to fail when run locally — TODO confirm.
from google.colab import files
uploaded = files.upload()  # Prompts file upload; the result is kept in `uploaded`