In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('filtered1_df.csv')

In [3]:
df.columns.values

array(['Unnamed: 0', 'Unnamed: 0.1', 'Season', 'Game_ID', 'PLAYER_NAME',
       'POS', 'Team', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PLUS_MINUS', 'VIDEO_AVAILABLE', 'DD', 'TD',
       'total_fantasy_points'], dtype=object)

In [4]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [6]:
df = df[['PLAYER_NAME', 'POS', 'Team', 'GAME_DATE', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
       'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'PLUS_MINUS',
       'total_fantasy_points']]

# encode player positions
le = LabelEncoder()
df["POS"] = le.fit_transform(df["POS"])
df["Team"]= le.fit_transform(df["Team"])

# scale numerical features
numerical_features = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
                      'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
                      'TOV', 'PF', 'PTS', 'PLUS_MINUS']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# sequences of games for each player
sequence_length = 50  # Set the number of past games to consider for each player
input_features = len(numerical_features) + 1  # Number of input features + position encoding
X = []
y = []

for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
    if len(player_data) >= sequence_length:  # Check if the player has at least 50 games
        for i in range(len(player_data) - sequence_length):
            X.append(player_data.loc[i:i + sequence_length - 1, numerical_features + ['POS']].values)
            y.append(player_data.loc[i + sequence_length, 'total_fantasy_points'])

X = np.array(X)
y = np.array(y)


# train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# LSTM model 
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])
        return out

# parameters
hidden_size = 64
num_layers = 2
output_size = 1

# model, loss function, optimizer
model = LSTMModel(input_features, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [11]:
def regression_accuracy(y_true, y_pred, threshold=0.3):
    assert y_true.shape == y_pred.shape, "y_true and y_pred must have the same shape"
    within_threshold = torch.abs(y_true - y_pred) <= (threshold * y_true)
    accuracy = torch.mean(within_threshold.type(torch.float32))
    return accuracy.item()

In [12]:
num_epochs = 100
batch_size = 8
train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

for epoch in range(num_epochs):
    epoch_loss = 0
    n_batches = 0
    for i in range(0, len(train_tensor), batch_size):
        batch_X = train_tensor[i:i + batch_size]
        batch_y = y_train_tensor[i:i + batch_size]

        # forward
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        epoch_loss += loss.item()
        n_batches += 1

        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # average loss for epoch
    avg_epoch_loss = epoch_loss / n_batches
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_epoch_loss:.4f}')

    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        test_loss = criterion(y_pred, y_test_tensor)
        test_accuracy = regression_accuracy(y_test_tensor, y_pred)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Average Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy:.4f}')
    # train
    model.train()


Epoch [1/100], Average Loss: 36.3246
Epoch [1/100], Average Test Loss: 34.8222, Test Accuracy: 0.7196
Epoch [2/100], Average Loss: 35.2600
Epoch [2/100], Average Test Loss: 34.1994, Test Accuracy: 0.7244
Epoch [3/100], Average Loss: 34.5694
Epoch [3/100], Average Test Loss: 34.1610, Test Accuracy: 0.7234
Epoch [4/100], Average Loss: 33.7447
Epoch [4/100], Average Test Loss: 34.1679, Test Accuracy: 0.7224
Epoch [5/100], Average Loss: 33.0013
Epoch [5/100], Average Test Loss: 34.0324, Test Accuracy: 0.7234
Epoch [6/100], Average Loss: 32.0541
Epoch [6/100], Average Test Loss: 33.4868, Test Accuracy: 0.7244
Epoch [7/100], Average Loss: 30.8025
Epoch [7/100], Average Test Loss: 32.7092, Test Accuracy: 0.7349
Epoch [8/100], Average Loss: 29.3517
Epoch [8/100], Average Test Loss: 31.3153, Test Accuracy: 0.7474
Epoch [9/100], Average Loss: 28.6437
Epoch [9/100], Average Test Loss: 30.8867, Test Accuracy: 0.7589
Epoch [10/100], Average Loss: 26.9617
Epoch [10/100], Average Test Loss: 30.5350, 

Epoch [81/100], Average Loss: 2.2749
Epoch [81/100], Average Test Loss: 46.1694, Test Accuracy: 0.6932
Epoch [82/100], Average Loss: 2.2139
Epoch [82/100], Average Test Loss: 45.5356, Test Accuracy: 0.7052
Epoch [83/100], Average Loss: 2.2469
Epoch [83/100], Average Test Loss: 45.2809, Test Accuracy: 0.6994
Epoch [84/100], Average Loss: 2.3656
Epoch [84/100], Average Test Loss: 45.6118, Test Accuracy: 0.6965
Epoch [85/100], Average Loss: 2.1279
Epoch [85/100], Average Test Loss: 45.5921, Test Accuracy: 0.6918
Epoch [86/100], Average Loss: 2.1585
Epoch [86/100], Average Test Loss: 45.8845, Test Accuracy: 0.6975
Epoch [87/100], Average Loss: 2.0840
Epoch [87/100], Average Test Loss: 44.5511, Test Accuracy: 0.7105
Epoch [88/100], Average Loss: 1.8454
Epoch [88/100], Average Test Loss: 45.5362, Test Accuracy: 0.6942
Epoch [89/100], Average Loss: 2.2897
Epoch [89/100], Average Test Loss: 45.7290, Test Accuracy: 0.6989
Epoch [90/100], Average Loss: 2.0740
Epoch [90/100], Average Test Loss: 4

In [13]:
from sklearn.metrics import mean_absolute_error, r2_score

y_pred_np = y_pred.numpy().flatten()
y_test_np = y_test_tensor.numpy().flatten()

mae = mean_absolute_error(y_test_np, y_pred_np)
r2 = r2_score(y_test_np, y_pred_np)

print(f'Mean Absolute Error: {mae:.4f}')
print(f'R^2 Score: {r2:.4f}')

Mean Absolute Error: 4.6281
R^2 Score: 0.4725


## Now that we have trained the LSTM model, let us construct a function that predicts the total fantasy points that each player will generate for their next future game. 

In [14]:
sequence_length = 50

player_names_50 = []
for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
    if len(player_data) >= sequence_length:
        player_names_50.append(player_name)

player_names_50 

['AJ Griffin',
 'Aaron Gordon',
 'Aaron Holiday',
 'Aaron Nesmith',
 'Aaron Wiggins',
 'Al Horford',
 'Alec Burks',
 'Alex Caruso',
 'Andre Drummond',
 'Andrew Nembhard',
 'Anfernee Simons',
 'Anthony Davis',
 'Anthony Edwards',
 'Anthony Gill',
 'Anthony Lamb',
 'Austin Reaves',
 'Austin Rivers',
 'Ayo Dosunmu',
 'Bam Adebayo',
 'Bennedict Mathurin',
 'Bismack Biyombo',
 'Bobby Portis',
 'Bol Bol',
 'Bones Hyland',
 'Bradley Beal',
 'Brandon Clarke',
 'Brook Lopez',
 'Bruce Brown',
 'Bruno Fernando',
 'Buddy Hield',
 'CJ McCollum',
 'Caleb Martin',
 'Cam Reddish',
 'Cam Thomas',
 'Cameron Johnson',
 'Caris LeVert',
 'Cedi Osman',
 'Chimezie Metu',
 'Chris Boucher',
 'Chris Paul',
 'Christian Braun',
 'Christian Koloko',
 'Christian Wood',
 'Clint Capela',
 'Coby White',
 'Cole Anthony',
 'Corey Kispert',
 'Cory Joseph',
 "D'Angelo Russell",
 'Daishen Nix',
 'Damian Jones',
 'Damian Lillard',
 'Damion Lee',
 'Daniel Gafford',
 'Danuel House Jr.',
 'Darius Bazley',
 'Darius Garland',
 '

In [15]:
seq_lengths = []
for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name]
    seq_lengths.append({'PLAYER_NAME': player_name, 'SEQ_LENGTH': len(player_data)})
seq_lengths_df = pd.DataFrame(seq_lengths)
seq_lengths_df

Unnamed: 0,PLAYER_NAME,SEQ_LENGTH
0,A.J. Green,34
1,A.J. Lawson,39
2,AJ Griffin,71
3,Aaron Gordon,67
4,Aaron Holiday,61
...,...,...
497,Skylar Mays,3
498,Stanley Umude,1
499,Sterling Brown,4
500,Xavier Moon,3


In [16]:
player_name = 'Aaron Gordon'
player_points_sequence = df.loc[df['PLAYER_NAME'] == player_name, 'total_fantasy_points'].tolist()
player_points_sequence

[24.5,
 24.75,
 21.5,
 29.5,
 21.75,
 36.0,
 38.25,
 11.25,
 20.25,
 18.25,
 28.75,
 21.25,
 32.5,
 34.25,
 21.0,
 37.75,
 28.75,
 19.0,
 31.25,
 40.5,
 22.75,
 17.75,
 24.75,
 24.0,
 10.25,
 29.25,
 26.5,
 33.5,
 16.25,
 25.0,
 13.75,
 31.25,
 31.5,
 35.25,
 39.5,
 36.5,
 27.25,
 25.0,
 17.0,
 24.25,
 21.5,
 28.75,
 18.0,
 22.75,
 24.0,
 21.5,
 27.0,
 39.75,
 44.75,
 23.0,
 25.75,
 15.75,
 32.5,
 27.5,
 23.25,
 43.0,
 19.0,
 16.25,
 29.75,
 28.0,
 34.0,
 39.0,
 22.25,
 23.0,
 27.5,
 15.5,
 23.75]

## example of one player's predicted total fantasy points for their next game

In [31]:
# drop na values
seq_lengths_df = seq_lengths_df[(seq_lengths_df != 0).all(1)].dropna()

# most recent sequence of games
player_name = 'AJ Griffin'
player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)

# player max sequence length from seq_lengths_df
sequence_length = seq_lengths_df.loc[seq_lengths_df['PLAYER_NAME'] == player_name, 'SEQ_LENGTH'].item()
last_sequence = player_data.iloc[-sequence_length:, :]

# numerical features scaled
last_sequence[numerical_features] = scaler.transform(last_sequence[numerical_features])

# convert to tensor and predict future total fantasy points for next game
with torch.no_grad():
    input_tensor = torch.tensor(last_sequence[numerical_features + ['POS']].values.reshape(1, -1, input_features),
                                dtype=torch.float32)
    pred = model(input_tensor).item()
    print(f'Predicted total fantasy points for {player_name} in the next game: {pred:.2f}')


Predicted total fantasy points for AJ Griffin in the next game: 6.57


## function that predicts all players total fantasy points for their next game

In [41]:
def predict_next_game_scores(df, seq_lengths_df, numerical_features, input_features, scaler, model, unique_players):
    player_names = []
    predicted_scores = []
    
    for player_name in unique_players:
        player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
        
        if not player_data.empty:
            sequence_length = seq_lengths_df.loc[seq_lengths_df['PLAYER_NAME'] == player_name, 'SEQ_LENGTH'].item()
            last_sequence = player_data.iloc[-sequence_length:, :]
            last_sequence[numerical_features] = scaler.transform(last_sequence[numerical_features])
            
            
            with torch.no_grad():
                input_tensor = torch.tensor(last_sequence[numerical_features + ['POS']].values.reshape(1, -1, input_features),
                                            dtype=torch.float32)
                pred = model(input_tensor).item()
                
            player_names.append(player_name)
            predicted_scores.append(pred)
        else:
            player_names.append(player_name)
            predicted_scores.append(None)

    predictions_df = pd.DataFrame({'PLAYER_NAME': player_names, 'PRED_SCORE': predicted_scores})
    return predictions_df


unique_players = df['PLAYER_NAME'].unique()


player_predictions_df = predict_next_game_scores(df, seq_lengths_df, numerical_features, input_features, scaler, model, unique_players)


print(player_predictions_df)


        PLAYER_NAME  PRED_SCORE
0        A.J. Green    4.583989
1       A.J. Lawson    5.719961
2        AJ Griffin    6.571924
3      Aaron Gordon    4.256675
4     Aaron Holiday   10.490354
..              ...         ...
497     Skylar Mays    6.152487
498   Stanley Umude    4.287522
499  Sterling Brown    4.801435
500     Xavier Moon    5.593311
501    Xavier Sneed    2.977818

[502 rows x 2 columns]


In [42]:
filtered1_prediction = player_predictions_df
print(filtered1_prediction)
filtered1_prediction.to_csv('filtered1_prediction.csv')

        PLAYER_NAME  PRED_SCORE
0        A.J. Green    4.583989
1       A.J. Lawson    5.719961
2        AJ Griffin    6.571924
3      Aaron Gordon    4.256675
4     Aaron Holiday   10.490354
..              ...         ...
497     Skylar Mays    6.152487
498   Stanley Umude    4.287522
499  Sterling Brown    4.801435
500     Xavier Moon    5.593311
501    Xavier Sneed    2.977818

[502 rows x 2 columns]
