In [3]:
import pandas as pd

In [2]:
df = pd.read_csv('filtered3_df_full.csv')

In [3]:
df.columns.values

array(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Season',
       'Game_ID', 'PLAYER_NAME', 'Team', 'GAME_DATE', 'MATCHUP', 'WL',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE', 'DD', 'TD',
       'total_fantasy_points', 'salary', 'POS'], dtype=object)

In [4]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [6]:
df = df[['PLAYER_NAME', 'POS', 'Team', 'GAME_DATE', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
       'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'PLUS_MINUS',
       'total_fantasy_points']]

# encode player positions
le = LabelEncoder()
df["POS"] = le.fit_transform(df["POS"])
df["Team"]= le.fit_transform(df["Team"])

# scale numerical features
numerical_features = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
                      'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
                      'TOV', 'PF', 'PTS', 'PLUS_MINUS']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# sequences of games for each player
sequence_length = 50  # Set the number of past games to consider for each player
input_features = len(numerical_features) + 1  # Number of input features + position encoding
X = []
y = []

for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
    if len(player_data) >= sequence_length:  # Check if the player has at least 50 games
        for i in range(len(player_data) - sequence_length):
            X.append(player_data.loc[i:i + sequence_length - 1, numerical_features + ['POS']].values)
            y.append(player_data.loc[i + sequence_length, 'total_fantasy_points'])

X = np.array(X)
y = np.array(y)


# train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# LSTM model 
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])
        return out

# parameters
hidden_size = 64
num_layers = 2
output_size = 1

# model, loss function, optimizer
model = LSTMModel(input_features, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [8]:
def regression_accuracy(y_true, y_pred, threshold=0.3):
    assert y_true.shape == y_pred.shape, "y_true and y_pred must have the same shape"
    within_threshold = torch.abs(y_true - y_pred) <= (threshold * y_true)
    accuracy = torch.mean(within_threshold.type(torch.float32))
    return accuracy.item()

In [9]:
num_epochs = 100
batch_size = 8
train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

for epoch in range(num_epochs):
    epoch_loss = 0
    n_batches = 0
    for i in range(0, len(train_tensor), batch_size):
        batch_X = train_tensor[i:i + batch_size]
        batch_y = y_train_tensor[i:i + batch_size]

        # forward
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        epoch_loss += loss.item()
        n_batches += 1

        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # average loss for epoch
    avg_epoch_loss = epoch_loss / n_batches
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_epoch_loss:.4f}')

    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        test_loss = criterion(y_pred, y_test_tensor)
        test_accuracy = regression_accuracy(y_test_tensor, y_pred)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Average Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy:.4f}')
    # train
    model.train()


Epoch [1/100], Average Loss: 268.9942
Epoch [1/100], Average Test Loss: 151.7928, Test Accuracy: 0.4635
Epoch [2/100], Average Loss: 127.9306
Epoch [2/100], Average Test Loss: 129.2812, Test Accuracy: 0.4556
Epoch [3/100], Average Loss: 122.2461
Epoch [3/100], Average Test Loss: 127.5970, Test Accuracy: 0.4556
Epoch [4/100], Average Loss: 87.5710
Epoch [4/100], Average Test Loss: 72.1480, Test Accuracy: 0.6765
Epoch [5/100], Average Loss: 61.4448
Epoch [5/100], Average Test Loss: 59.1960, Test Accuracy: 0.7160
Epoch [6/100], Average Loss: 52.1403
Epoch [6/100], Average Test Loss: 52.3064, Test Accuracy: 0.7337
Epoch [7/100], Average Loss: 46.0020
Epoch [7/100], Average Test Loss: 48.3951, Test Accuracy: 0.7495
Epoch [8/100], Average Loss: 41.9867
Epoch [8/100], Average Test Loss: 45.6569, Test Accuracy: 0.7535
Epoch [9/100], Average Loss: 38.5259
Epoch [9/100], Average Test Loss: 43.6278, Test Accuracy: 0.7535
Epoch [10/100], Average Loss: 36.0169
Epoch [10/100], Average Test Loss: 42.

Epoch [81/100], Average Loss: 1.6088
Epoch [81/100], Average Test Loss: 46.6647, Test Accuracy: 0.7633
Epoch [82/100], Average Loss: 1.4984
Epoch [82/100], Average Test Loss: 45.5971, Test Accuracy: 0.7633
Epoch [83/100], Average Loss: 1.5790
Epoch [83/100], Average Test Loss: 46.8157, Test Accuracy: 0.7475
Epoch [84/100], Average Loss: 1.5177
Epoch [84/100], Average Test Loss: 47.2797, Test Accuracy: 0.7633
Epoch [85/100], Average Loss: 1.4249
Epoch [85/100], Average Test Loss: 44.7262, Test Accuracy: 0.7712
Epoch [86/100], Average Loss: 1.5344
Epoch [86/100], Average Test Loss: 45.6113, Test Accuracy: 0.7613
Epoch [87/100], Average Loss: 1.4599
Epoch [87/100], Average Test Loss: 45.3953, Test Accuracy: 0.7692
Epoch [88/100], Average Loss: 1.4691
Epoch [88/100], Average Test Loss: 48.1244, Test Accuracy: 0.7357
Epoch [89/100], Average Loss: 1.4736
Epoch [89/100], Average Test Loss: 47.9917, Test Accuracy: 0.7357
Epoch [90/100], Average Loss: 1.6073
Epoch [90/100], Average Test Loss: 4

In [17]:
from sklearn.metrics import mean_absolute_error, r2_score

y_pred_np = y_pred.numpy().flatten()
y_test_np = y_test_tensor.numpy().flatten()

mae = mean_absolute_error(y_test_np, y_pred_np)
r2 = r2_score(y_test_np, y_pred_np)

print(f'Mean Absolute Error: {mae:.4f}')
print(f'R^2 Score: {r2:.4f}')

Mean Absolute Error: 4.7932
R^2 Score: 0.6477


## Now that we have trained the LSTM model, let us construct a function that predicts the total fantasy points that each player will generate for their next future game. 

In [18]:
sequence_length = 50

player_names_50 = []
for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
    if len(player_data) >= sequence_length:
        player_names_50.append(player_name)

player_names_50 

['Alex Len',
 'Bam Adebayo',
 'Bismack Biyombo',
 'Brook Lopez',
 'Bruno Fernando',
 'Cameron Payne',
 'Chris Paul',
 'Clint Capela',
 'Damian Jones',
 'Damian Lillard',
 'Daniel Gafford',
 'Darius Bazley',
 'Darius Garland',
 "De'Aaron Fox",
 'DeAndre Jordan',
 'Deandre Ayton',
 'Dejounte Murray',
 'Devin Booker',
 "Devonte' Graham",
 'Drew Eubanks',
 'Dwight Powell',
 'Frank Kaminsky',
 'Isaiah Stewart',
 'Ivica Zubac',
 'Ja Morant',
 'Jakob Poeltl',
 'Jalen McDaniels',
 'James Wiseman',
 'Jarrett Allen',
 'Jaxson Hayes',
 'Jordan McLaughlin',
 'Josh Hart',
 'Justin Holiday',
 'Karl-Anthony Towns',
 'Kawhi Leonard',
 'Kendrick Nunn',
 'Kevin Durant',
 'Kevon Looney',
 'Kyrie Irving',
 'LaMelo Ball',
 'Landry Shamet',
 'Mason Plumlee',
 'Matisse Thybulle',
 'Mikal Bridges',
 'Mike Conley',
 'Mike Muscala',
 'Moses Brown',
 'Nerlens Noel',
 'Pat Connaughton',
 'Patrick Beverley',
 'Paul George',
 'Payton Pritchard',
 'Reggie Jackson',
 'Ricky Rubio',
 'Robin Lopez',
 'Rudy Gobert',
 'R

In [12]:
seq_lengths = []
for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name]
    seq_lengths.append({'PLAYER_NAME': player_name, 'SEQ_LENGTH': len(player_data)})
seq_lengths_df = pd.DataFrame(seq_lengths)
seq_lengths_df

Unnamed: 0,PLAYER_NAME,SEQ_LENGTH
0,Aaron Gordon,12
1,Alec Burks,35
2,Aleksej Pokusevski,31
3,Alex Len,63
4,Amir Coffey,25
...,...,...
255,Trent Forrest,28
256,Troy Brown Jr.,16
257,Ty Jerome,9
258,Udonis Haslem,1


In [7]:
# DO NOT RUN UNLESS NECESSARY
# desired cutoff date
df = pd.read_csv('filtered3_df_full.csv')
cutoff_date = '2021-01-01'

seq_lengths_cut = []

for player_name in df['PLAYER_NAME'].unique():
    # Filter by player_name and game date before the cutoff_date
    player_data = df[(df['PLAYER_NAME'] == player_name) & (df['GAME_DATE'] <= cutoff_date)]

    seq_lengths_cut.append({'PLAYER_NAME': player_name, 'SEQ_LENGTH': len(player_data)})

seq_lengths_cut_df = pd.DataFrame(seq_lengths_cut)
seq_lengths_cut_df


Unnamed: 0,PLAYER_NAME,SEQ_LENGTH
0,Aaron Gordon,5
1,Alec Burks,3
2,Aleksej Pokusevski,4
3,Alex Len,2
4,Amir Coffey,5
...,...,...
255,Trent Forrest,0
256,Troy Brown Jr.,0
257,Ty Jerome,0
258,Udonis Haslem,0


In [19]:
player_name = 'Alex Len'
player_points_sequence = df.loc[df['PLAYER_NAME'] == player_name, 'total_fantasy_points'].tolist()
player_points_sequence

[14.75,
 13.5,
 13.75,
 22.0,
 33.25,
 22.75,
 11.0,
 8.0,
 10.25,
 24.0,
 5.0,
 20.0,
 11.75,
 5.75,
 20.75,
 11.0,
 20.5,
 5.75,
 5.5,
 8.5,
 21.75,
 17.0,
 20.75,
 24.25,
 7.25,
 4.5,
 8.0,
 10.25,
 9.0,
 28.25,
 5.75,
 15.75,
 28.0,
 15.25,
 26.5,
 15.0,
 10.5,
 13.0,
 7.25,
 28.75,
 15.5,
 28.0,
 17.75,
 8.0,
 16.75,
 18.25,
 11.75,
 16.75,
 23.25,
 10.0,
 9.75,
 14.0,
 7.0,
 18.75,
 19.5,
 15.25,
 10.75,
 10.5,
 29.0,
 27.0,
 17.75,
 19.75,
 15.75]

## example of one player's predicted total fantasy points for their next game

In [20]:
# drop na values
seq_lengths_df = seq_lengths_df[(seq_lengths_df != 0).all(1)].dropna()

# most recent sequence of games
player_name = 'Alex Len'
player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)

# player max sequence length from seq_lengths_df
sequence_length = seq_lengths_df.loc[seq_lengths_df['PLAYER_NAME'] == player_name, 'SEQ_LENGTH'].item()
last_sequence = player_data.iloc[-sequence_length:, :]

# numerical features scaled
last_sequence[numerical_features] = scaler.transform(last_sequence[numerical_features])

# convert to tensor and predict future total fantasy points for next game
with torch.no_grad():
    input_tensor = torch.tensor(last_sequence[numerical_features + ['POS']].values.reshape(1, -1, input_features),
                                dtype=torch.float32)
    pred = model(input_tensor).item()
    print(f'Predicted total fantasy points for {player_name} in the next game: {pred:.2f}')


Predicted total fantasy points for Alex Len in the next game: 6.68


## function that predicts all players total fantasy points for their next game

In [21]:
def predict_next_game_scores(df, seq_lengths_df, numerical_features, input_features, scaler, model, unique_players):
    player_names = []
    predicted_scores = []
    
    for player_name in unique_players:
        player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
        
        if not player_data.empty:
            sequence_length = seq_lengths_df.loc[seq_lengths_df['PLAYER_NAME'] == player_name, 'SEQ_LENGTH'].item()
            last_sequence = player_data.iloc[-sequence_length:, :]
            last_sequence[numerical_features] = scaler.transform(last_sequence[numerical_features])
            
            
            with torch.no_grad():
                input_tensor = torch.tensor(last_sequence[numerical_features + ['POS']].values.reshape(1, -1, input_features),
                                            dtype=torch.float32)
                pred = model(input_tensor).item()
                
            player_names.append(player_name)
            predicted_scores.append(pred)
        else:
            player_names.append(player_name)
            predicted_scores.append(None)

    predictions_df = pd.DataFrame({'PLAYER_NAME': player_names, 'PRED_SCORE': predicted_scores})
    return predictions_df


unique_players = df['PLAYER_NAME'].unique()


player_predictions_df = predict_next_game_scores(df, seq_lengths_df, numerical_features, input_features, scaler, model, unique_players)


print(player_predictions_df)


            PLAYER_NAME  PRED_SCORE
0          Aaron Gordon    4.076872
1            Alec Burks    9.093205
2    Aleksej Pokusevski    7.573839
3              Alex Len    6.675406
4           Amir Coffey    4.800491
..                  ...         ...
255       Trent Forrest    7.911955
256      Troy Brown Jr.    5.695612
257           Ty Jerome    4.046981
258       Udonis Haslem    3.057429
259         Will Barton    5.190966

[260 rows x 2 columns]


In [None]:
# do not run unless necessary
cutoff_date = '2021-01-01'

def predict_next_game_scores(df, seq_lengths_cut_df, numerical_features, input_features, scaler, model, unique_players, cutoff_date):
    player_names = []
    predicted_scores = []
    
    for player_name in unique_players:
        # Filter player_data by player_name and game date before the cutoff_date
        player_data = df[(df['PLAYER_NAME'] == player_name) & (df['GAME_DATE'] <= cutoff_date)].reset_index(drop=True)
        
        if not player_data.empty:
            sequence_length = seq_lengths_cut_df.loc[seq_lengths_cut_df['PLAYER_NAME'] == player_name, 'SEQ_LENGTH'].item()
            last_sequence = player_data.iloc[-sequence_length:, :]
            last_sequence[numerical_features] = scaler.transform(last_sequence[numerical_features])
            
            with torch.no_grad():
                input_tensor = torch.tensor(last_sequence[numerical_features + ['POS']].values.reshape(1, -1, input_features),
                                            dtype=torch.float32)
                pred = model(input_tensor).item()
                
            player_names.append(player_name)
            predicted_scores.append(pred)
        else:
            player_names.append(player_name)
            predicted_scores.append(None)

    predictions_df = pd.DataFrame({'PLAYER_NAME': player_names, 'PRED_SCORE': predicted_scores})
    return predictions_df

unique_players = df['PLAYER_NAME'].unique()


player_predictions_df = predict_next_game_scores(df, seq_lengths_cut_df, numerical_features, input_features, scaler, model, unique_players, cutoff_date)

print(player_predictions_df)

In [22]:
filtered3_prediction = player_predictions_df
print(filtered3_prediction)
filtered3_prediction.to_csv('filtered3_prediction.csv')

            PLAYER_NAME  PRED_SCORE
0          Aaron Gordon    4.076872
1            Alec Burks    9.093205
2    Aleksej Pokusevski    7.573839
3              Alex Len    6.675406
4           Amir Coffey    4.800491
..                  ...         ...
255       Trent Forrest    7.911955
256      Troy Brown Jr.    5.695612
257           Ty Jerome    4.046981
258       Udonis Haslem    3.057429
259         Will Barton    5.190966

[260 rows x 2 columns]
