DS340 Final Project: Beating the Books Authors: Easwer Raman and Victor Verma


In [1]:
!pip install pro-football-reference-web-scraper
!pip install -U scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-macosx_10_9_x86_64.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.0
    Uninstalling scikit-learn-1.4.0:
      Successfully uninstalled scikit-learn-1.4.0
Successfully installed scikit-learn-1.4.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1

In [3]:
# Imports
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, SimpleRNN
import numpy as np
import pandas as pd
from pro_football_reference_web_scraper import player_game_log as p
from sklearn.preprocessing import MinMaxScaler
import time

In [6]:
passing_def_df = pd.read_csv("passing_def_wk14.csv")
rushing_def_df = pd.read_csv("rushing_def_wk14.csv")

In [33]:
def apply_defense(position, data, team_playing):
    # only includes defensive stats from weeks 1 - 14 in the 2023 season.
    if position == "WR" or position == "TE":
        final = data
        final["Snap %"] = float(data["Snap %"].iloc[0]) * 100
        return final
    if position == "QB":
        team_stats = passing_def_df[passing_def_df["Tm"] == str(team_playing)]
        final = data
        final["Cmp"] = float(data["Cmp"].iloc[0]) * float(
            team_stats["Avg diff in Cmp"].iloc[0]
        )
        final["Att"] = float(data["Att"].iloc[0]) * float(
            team_stats["Avg diff in Att"].iloc[0]
        )
        final["Pass Yds"] = float(data["Pass Yds"].iloc[0]) * float(
            team_stats["Avg diff in pass yards"].iloc[0]
        )
        final["Pass TDs"] = float(data["Pass TDs"].iloc[0]) * float(
            team_stats["Avg diff in TD%"].iloc[0]
        )
        final["Int"] = float(data["Int"].iloc[0]) * float(
            team_stats["Avg diff in Int%"].iloc[0]
        )
        final["Rating"] = float(data["Rating"].iloc[0]) * float(
            team_stats["Avg diff in Rate"].iloc[0]
        )
        return final
    if position == "RB":
        team_stats = rushing_def_df[rushing_def_df["Tm"] == str(team_playing)]
        final = data
        final["Att"] = float(data["Att"].iloc[0]) * float(
            team_stats["Avg diff in Att"].iloc[0]
        )
        final["Rush Yds"] = float(data["Rush Yds"].iloc[0]) * float(
            team_stats["Avg diff in Yds"].iloc[0]
        )
        final["Rush TDs"] = float(data["Rush TDs"].iloc[0]) * float(
            team_stats["Avg diff in TD"].iloc[0]
        )
        final["Snap %"] = float(data["Snap %"].iloc[0]) * 100
        return final
    else:
        return

In [12]:
# supports QB, RB, WR, TE
def get_stats(player, position, season):
    start = time.perf_counter()
    stats = p.get_player_game_log(player, position, season)
    end = time.perf_counter()
    drop_columns = [
        "date",
        "game_location",
        "team",
        "opp",
        "result",
        "team_pts",
        "opp_pts",
    ]
    for column in drop_columns:
        stats = stats.drop(column, axis=1)
    stats.to_csv("game_log.csv", index=False)
    print(f"gathered stats in {end - start} seconds")
    return stats

In [13]:
def prepare_data(stats, end_training_week, time_steps):
    data_rows = []
    for index, row in stats.iterrows():
        if row["week"] <= end_training_week:
            data_rows.append(row)
    data_df = pd.concat(data_rows, axis=1).transpose().drop("week", axis=1)
    time_series_data = data_df.to_numpy()

    scaler = MinMaxScaler()
    time_series_data_scaled = scaler.fit_transform(
        time_series_data
    )  # scale data between 0 and 1
    if len(time_series_data_scaled) < 6:
        raise Exception("Player must have played at least 6 games")

    output_dimension = len(time_series_data_scaled[0])

    data = []
    target = []
    for i in range(len(time_series_data_scaled) - time_steps):
        data.append(time_series_data_scaled[i : i + time_steps])
        target.append(time_series_data_scaled[i + time_steps])

    data = np.array(data)
    target = np.array(target)

    train_size = int(0.8 * len(data))
    X_train, y_train = data[:train_size], target[:train_size]
    X_test, y_test = data[train_size:], target[train_size:]

    return X_train, y_train, X_test, y_test, output_dimension, scaler

In [14]:
def train_rnn(
    X_train,
    y_train,
    X_test,
    y_test,
    time_steps,
    output_dimension,
    position,
    display_model_stats,
):
    epoch_map = {"QB": 20, "RB": 8, "WR": 8, "TE": 8}
    verbose_map = {"Y": 2, "N": 0}
    rnn = Sequential()
    rnn.add(
        SimpleRNN(128, activation="relu", input_shape=(time_steps, output_dimension))
    )
    rnn.add(Dense(128, activation="relu"))
    rnn.add(Dropout(0.2))
    rnn.add(Dense(128, activation="relu"))
    rnn.add(Dropout(0.2))
    rnn.add(Dense(units=output_dimension))
    rnn.compile(
        loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"]
    )
    if display_model_stats == "Y":
        rnn.summary()
    history = rnn.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epoch_map[position],
        batch_size=4,
        verbose=verbose_map[display_model_stats],
        shuffle=True,
    )
    return rnn


def train_lstm(
    X_train,
    y_train,
    X_test,
    y_test,
    time_steps,
    output_dimension,
    position,
    display_model_stats,
):
    epoch_map = {"QB": 40, "RB": 20, "WR": 20, "TE": 20}
    verbose_map = {"Y": 2, "N": 0}
    lstm = Sequential()
    if position == "QB":
        lstm.add(
            LSTM(
                units=64,
                return_sequences=True,
                input_shape=(time_steps, output_dimension),
            )
        )
        lstm.add(Dropout(0.2))
        lstm.add(
            LSTM(
                units=128,
                return_sequences=True,
                input_shape=(time_steps, output_dimension),
            )
        )
        lstm.add(Dropout(0.2))
        lstm.add(LSTM(units=256, input_shape=(time_steps, output_dimension)))
        lstm.add(Dropout(0.2))
    elif position == "RB":
        lstm.add(
            LSTM(
                units=64,
                return_sequences=False,
                input_shape=(time_steps, output_dimension),
            )
        )
        lstm.add(Dropout(0.2))
    elif position == "WR" or position == "TE":
        lstm.add(
            LSTM(
                units=64,
                return_sequences=False,
                input_shape=(time_steps, output_dimension),
            )
        )
        lstm.add(Dropout(0.2))
    lstm.add(Dense(units=output_dimension))
    lstm.compile(
        loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"]
    )
    if display_model_stats == "Y":
        lstm.summary()
    history = lstm.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epoch_map[position],
        batch_size=4,
        verbose=verbose_map[display_model_stats],
        shuffle=True,
    )
    return lstm

In [15]:
def predict(model, X_train, time_steps, output_dimension, scaler, num_predictions):
    last_sequence = X_train[-1]
    last_sequence = last_sequence.reshape((1, time_steps, output_dimension))

    future_predictions = []
    for _ in range(num_predictions):
        next_prediction = model.predict(last_sequence)
        future_predictions.append(next_prediction)
        last_sequence = np.concatenate(
            (last_sequence[:, 1:, :], next_prediction.reshape(1, 1, output_dimension)),
            axis=1,
        )

    scaled_predictions = np.array(future_predictions)
    scaled_predictions = scaled_predictions.squeeze(axis=1)
    scaled_predictions = scaler.inverse_transform(
        scaled_predictions
    )  # Inverse transform the predictions to the original scale for model

    decimal_predictions = [
        ["%f" % value for value in week] for week in scaled_predictions
    ]

    return decimal_predictions

In [31]:
def get_predictions(
    player,
    position,
    season,
    end_training_week,
    time_steps,
    num_predictions,
    display_model_stats,
    team_playing,
):
    if end_training_week < 1 or end_training_week > 17:
        raise Exception(
            "end_training_week (current week - 1) must be between 1 and 17 (inclusive)"
        )
    if position not in ["QB", "RB", "WR", "TE"]:
        raise Exception("position must be set to QB, RB, WR, or TE")
    if display_model_stats not in ["Y", "N"]:
        raise Exception("display_model_stats must be set to Y or N")

    stats = get_stats(player, position, season)
    X_train, y_train, X_test, y_test, output_dimension, scaler = prepare_data(
        stats, end_training_week, time_steps
    )

    rnn = train_rnn(
        X_train,
        y_train,
        X_test,
        y_test,
        time_steps,
        output_dimension,
        position,
        display_model_stats,
    )
    rnn_predictions = predict(
        rnn, X_train, time_steps, output_dimension, scaler, num_predictions
    )
    lstm = train_lstm(
        X_train,
        y_train,
        X_test,
        y_test,
        time_steps,
        output_dimension,
        position,
        display_model_stats,
    )
    lstm_predictions = predict(
        lstm, X_train, time_steps, output_dimension, scaler, num_predictions
    )

    positional_features = {
        "QB": [
            "Cmp",
            "Att",
            "Pass Yds",
            "Pass TDs",
            "Int",
            "Rating",
            "Sacks",
            "Rush Att",
            "Rush Yds",
            "Rush TDs",
        ],
        "RB": ["Att", "Rush Yds", "Rush TDs", "Targets", "Receiving Yds", "Snap %"],
        "TE": ["Targets", "Receptions", "Receiving Yds", "Receiving TDs", "Snap %"],
        "WR": ["Targets", "Receptions", "Receiving Yds", "Receiving TDs", "Snap %"],
    }

    rnn_predictions_df = pd.DataFrame(
        rnn_predictions, columns=positional_features[position]
    )
    lstm_predictions_df = pd.DataFrame(
        lstm_predictions, columns=positional_features[position]
    )

    rnn_predictions_df = apply_defense(position, rnn_predictions_df, team_playing)
    lstm_predictions_df = apply_defense(position, lstm_predictions_df, team_playing)

    predictions = {}
    predictions["rnn predictions"] = rnn_predictions_df
    predictions["lstm predictions"] = lstm_predictions_df
    print(
        f"\nRNN Predictions\n{rnn_predictions_df}\n\nLSTM Predictions\n{lstm_predictions_df}"
    )

In [34]:
get_predictions(
    player="Patrick Mahomes",
    position="QB",
    season=2023,
    end_training_week=12,
    time_steps=4,
    num_predictions=1,
    display_model_stats="N",
    team_playing="Buffalo Bills",
)

gathered stats in 0.674805907998234 seconds

RNN Predictions
         Cmp        Att    Pass Yds  Pass TDs       Int     Rating     Sacks  \
0  22.934318  29.569528  217.550321  1.675439 -0.004743  105.00259  1.637902   

   Rush Att   Rush Yds   Rush TDs  
0  3.166482  11.990406  -0.092260  

LSTM Predictions
         Cmp        Att    Pass Yds  Pass TDs       Int    Rating     Sacks  \
0  18.606157  29.497588  154.071049   1.06615  0.505799  85.70144  2.028028   

   Rush Att   Rush Yds   Rush TDs  
0  5.608614  21.781343  -0.007585  


In [29]:
def predictions():
    playerInput = str(input("What is the player's name: "))
    positionInput = str(
        input("What position does the player play (QB, RB, TE, or WR): ")
    )
    seasonInput = int(input("What season do you want to use (current: 2023): "))
    end_tw = int(input("What week is it: ")) - 1
    team_playing = str(input("What team are they playing: "))
    num_predictions = int(input("How many games should the models predict: "))
    display_model_stats = str(input("Display model accuracy stats (Y or N): "))
    extra_player = "Y"

    while extra_player == "Y":
        get_predictions(
            player=playerInput,
            position=positionInput,
            season=seasonInput,
            end_training_week=end_tw,
            time_steps=4,
            num_predictions=num_predictions,
            display_model_stats=display_model_stats,
            team_playing=team_playing,
        )
        extra_player = str(input("Do you have more players to check for (Y or N)? "))
        if extra_player != "Y":
            break
        playerInput = str(input("What is the player's name: "))
        positionInput = str(
            input("What position does the player play (QB, RB, TE, or WR): ")
        )
        team_playing = str(input("What team are they playing: "))


predictions()

gathered stats in 0.588822646997869 seconds


  final["Cmp"] = float(data["Cmp"].iloc[0]) * float(team_stats["Avg diff in Cmp"])
  final["Att"] = float(data["Att"].iloc[0]) * float(team_stats["Avg diff in Att"])
  final["Pass Yds"] = float(data["Pass Yds"].iloc[0]) * float(
  final["Pass TDs"] = float(data["Pass TDs"].iloc[0]) * float(
  final["Int"] = float(data["Int"].iloc[0]) * float(
  final["Rating"] = float(data["Rating"].iloc[0]) * float(



RNN Predictions
         Cmp        Att    Pass Yds  Pass TDs     Int     Rating     Sacks  \
0  23.116378  36.471318  193.464701   1.90394  0.9716  76.749122  1.165916   

   Rush Att   Rush Yds   Rush TDs  
0  3.883745  19.643223  -0.007577  

LSTM Predictions
         Cmp        Att    Pass Yds  Pass TDs       Int     Rating     Sacks  \
0  22.341483  34.567874  187.519697  1.690316  0.814888  85.681169  1.771690   

   Rush Att   Rush Yds  Rush TDs  
0  5.346336  28.402117  0.013983  
