In [14]:
import pandas as pd

In [15]:
df = pd.read_csv('filtered3_df_full.csv')

In [16]:
df.columns.values

array(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Season',
       'Game_ID', 'PLAYER_NAME', 'POS', 'Team', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
       'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE', 'DD', 'TD',
       'total_fantasy_points', 'salary'], dtype=object)

In [17]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [6]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [52]:
df = df[['PLAYER_NAME', 'POS', 'Team', 'GAME_DATE', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
       'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'PLUS_MINUS',
       'total_fantasy_points']]

# encode player positions
le = LabelEncoder()
df["POS"] = le.fit_transform(df["POS"])
df["Team"]= le.fit_transform(df["Team"])

# scale numerical features
numerical_features = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
                      'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
                      'TOV', 'PF', 'PTS', 'PLUS_MINUS']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# sequences of games for each player
sequence_length = 50  # Set the number of past games to consider for each player
input_features = len(numerical_features) + 1  # Number of input features + position encoding
X = []
y = []

for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
    if len(player_data) >= sequence_length:  # Check if the player has at least 50 games
        for i in range(len(player_data) - sequence_length):
            X.append(player_data.loc[i:i + sequence_length - 1, numerical_features + ['POS']].values)
            y.append(player_data.loc[i + sequence_length, 'total_fantasy_points'])

X = np.array(X)
y = np.array(y)


# train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [53]:
# LSTM model 
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])
        return out

# parameters
hidden_size = 64
num_layers = 2
output_size = 1

# model, loss function, optimizer
model = LSTMModel(input_features, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [72]:
# train LSTM
num_epochs = 100
batch_size = 8

train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

for epoch in range(num_epochs):
    epoch_loss = 0
    n_batches = 0
    for i in range(0, len(train_tensor), batch_size):
        batch_X = train_tensor[i:i + batch_size]
        batch_y = y_train_tensor[i:i + batch_size]

        # forward
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        epoch_loss += loss.item()
        n_batches += 1

        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # average loss for epoch
    avg_epoch_loss = epoch_loss / n_batches
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_epoch_loss:.4f}')


# evaluate on test set
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f'Test Loss: {test_loss.item():.4f}')


Epoch [1/100], Average Loss: 8.7138
Epoch [2/100], Average Loss: 5.3341
Epoch [3/100], Average Loss: 3.9257
Epoch [4/100], Average Loss: 3.2849
Epoch [5/100], Average Loss: 3.5004
Epoch [6/100], Average Loss: 3.7753
Epoch [7/100], Average Loss: 3.6330
Epoch [8/100], Average Loss: 4.1911
Epoch [9/100], Average Loss: 2.4873
Epoch [10/100], Average Loss: 2.1905
Epoch [11/100], Average Loss: 2.7406
Epoch [12/100], Average Loss: 2.4761
Epoch [13/100], Average Loss: 3.0387
Epoch [14/100], Average Loss: 2.5272
Epoch [15/100], Average Loss: 2.3000
Epoch [16/100], Average Loss: 2.2918
Epoch [17/100], Average Loss: 3.2168
Epoch [18/100], Average Loss: 2.9406
Epoch [19/100], Average Loss: 1.9547
Epoch [20/100], Average Loss: 1.6281
Epoch [21/100], Average Loss: 1.7522
Epoch [22/100], Average Loss: 2.5008
Epoch [23/100], Average Loss: 3.3955
Epoch [24/100], Average Loss: 2.1136
Epoch [25/100], Average Loss: 1.6738
Epoch [26/100], Average Loss: 1.7188
Epoch [27/100], Average Loss: 2.1064
Epoch [28/

In [73]:
from sklearn.metrics import mean_absolute_error, r2_score

y_pred_np = y_pred.numpy().flatten()
y_test_np = y_test_tensor.numpy().flatten()

mae = mean_absolute_error(y_test_np, y_pred_np)
r2 = r2_score(y_test_np, y_pred_np)

print(f'Mean Absolute Error: {mae:.4f}')
print(f'R^2 Score: {r2:.4f}')

Mean Absolute Error: 4.3687
R^2 Score: 0.5566


In [74]:
def regression_accuracy(y_true, y_pred, threshold=0.3):
    assert y_true.shape == y_pred.shape, "y_true and y_pred must have the same shape"
    within_threshold = torch.abs(y_true - y_pred) <= (threshold * y_true)
    accuracy = torch.mean(within_threshold.type(torch.float32))
    return accuracy.item()

In [75]:
num_epochs = 100
for epoch in range(num_epochs):
    epoch_loss = 0
    n_batches = 0
    for i in range(0, len(train_tensor), batch_size):
        batch_X = train_tensor[i:i + batch_size]
        batch_y = y_train_tensor[i:i + batch_size]

        # forward
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        epoch_loss += loss.item()
        n_batches += 1

        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # average loss for epoch
    avg_epoch_loss = epoch_loss / n_batches
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_epoch_loss:.4f}')

    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        test_loss = criterion(y_pred, y_test_tensor)
        test_accuracy = regression_accuracy(y_test_tensor, y_pred)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Average Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy:.4f}')
    # train
    model.train()


Epoch [1/100], Average Loss: 0.6475
Epoch [1/100], Average Test Loss: 38.4667, Test Accuracy: 0.7365
Epoch [2/100], Average Loss: 0.7308
Epoch [2/100], Average Test Loss: 38.5079, Test Accuracy: 0.7212
Epoch [3/100], Average Loss: 0.6735
Epoch [3/100], Average Test Loss: 38.5203, Test Accuracy: 0.7289
Epoch [4/100], Average Loss: 0.5580
Epoch [4/100], Average Test Loss: 38.6110, Test Accuracy: 0.7303
Epoch [5/100], Average Loss: 0.5431
Epoch [5/100], Average Test Loss: 40.1865, Test Accuracy: 0.7268
Epoch [6/100], Average Loss: 0.6739
Epoch [6/100], Average Test Loss: 39.0241, Test Accuracy: 0.7358
Epoch [7/100], Average Loss: 0.7306
Epoch [7/100], Average Test Loss: 38.3214, Test Accuracy: 0.7289
Epoch [8/100], Average Loss: 0.6509
Epoch [8/100], Average Test Loss: 38.6052, Test Accuracy: 0.7303
Epoch [9/100], Average Loss: 0.5665
Epoch [9/100], Average Test Loss: 38.6745, Test Accuracy: 0.7303
Epoch [10/100], Average Loss: 0.7695
Epoch [10/100], Average Test Loss: 39.1109, Test Accur

Epoch [81/100], Average Loss: 0.2105
Epoch [81/100], Average Test Loss: 37.9279, Test Accuracy: 0.7358
Epoch [82/100], Average Loss: 0.2437
Epoch [82/100], Average Test Loss: 37.5111, Test Accuracy: 0.7345
Epoch [83/100], Average Loss: 0.4936
Epoch [83/100], Average Test Loss: 38.3585, Test Accuracy: 0.7261
Epoch [84/100], Average Loss: 0.6044
Epoch [84/100], Average Test Loss: 37.8643, Test Accuracy: 0.7345
Epoch [85/100], Average Loss: 0.4276
Epoch [85/100], Average Test Loss: 38.1203, Test Accuracy: 0.7324
Epoch [86/100], Average Loss: 0.3103
Epoch [86/100], Average Test Loss: 37.3107, Test Accuracy: 0.7386
Epoch [87/100], Average Loss: 0.2945
Epoch [87/100], Average Test Loss: 37.8706, Test Accuracy: 0.7289
Epoch [88/100], Average Loss: 0.3436
Epoch [88/100], Average Test Loss: 37.7269, Test Accuracy: 0.7358
Epoch [89/100], Average Loss: 0.4102
Epoch [89/100], Average Test Loss: 36.8823, Test Accuracy: 0.7352
Epoch [90/100], Average Loss: 0.4642
Epoch [90/100], Average Test Loss: 3

## Now that we have trained the LSTM model, let us construct a function that predicts the total fantasy points that each player will generate for their next future game. 

In [90]:
sequence_length = 50

player_names_50 = []
for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
    if len(player_data) >= sequence_length:
        player_names_50.append(player_name)

player_names_50 

['Aaron Gordon',
 'Aaron Holiday',
 'Alex Caruso',
 'Alex Len',
 'Andre Iguodala',
 'Andrew Wiggins',
 'Anfernee Simons',
 'Anthony Edwards',
 'Bam Adebayo',
 'Ben Simmons',
 'Bismack Biyombo',
 'Bobby Portis',
 'Bradley Beal',
 'Brandon Clarke',
 'Brandon Ingram',
 'Brook Lopez',
 'Bruce Brown',
 'Bruno Fernando',
 'Bryn Forbes',
 'Buddy Hield',
 'Caleb Martin',
 'Cam Reddish',
 'Cameron Johnson',
 'Cameron Payne',
 'Cedi Osman',
 'Chris Boucher',
 'Chris Paul',
 'Clint Capela',
 'Coby White',
 'Cody Martin',
 'Collin Sexton',
 'Cory Joseph',
 "D'Angelo Russell",
 'Damian Jones',
 'Damian Lillard',
 'Damion Lee',
 'Daniel Gafford',
 'Daniel Theis',
 'Danny Green',
 'Darius Bazley',
 'Darius Garland',
 "De'Aaron Fox",
 "De'Anthony Melton",
 'DeAndre Jordan',
 'DeMar DeRozan',
 'Dean Wade',
 'Deandre Ayton',
 'Dejounte Murray',
 'Delon Wright',
 'Deni Avdija',
 'Derrick Jones Jr.',
 'Derrick Rose',
 'Desmond Bane',
 'Devin Booker',
 'Devin Vassell',
 "Devonte' Graham",
 'Dillon Brooks',

In [95]:
seq_lengths = []
for player_name in df['PLAYER_NAME'].unique():
    player_data = df[df['PLAYER_NAME'] == player_name]
    seq_lengths.append({'PLAYER_NAME': player_name, 'SEQ_LENGTH': len(player_data)})
seq_lengths_df = pd.DataFrame(seq_lengths)
seq_lengths_df

Unnamed: 0,PLAYER_NAME,SEQ_LENGTH
0,Aaron Gordon,50
1,Aaron Holiday,66
2,Aaron Nesmith,46
3,Al Horford,28
4,Alec Burks,49
...,...,...
333,Taj Gibson,45
334,Trent Forrest,30
335,Ty Jerome,33
336,Udonis Haslem,1


In [93]:
player_name = 'Aaron Gordon'
player_points_sequence = df.loc[df['PLAYER_NAME'] == player_name, 'total_fantasy_points'].tolist()
player_points_sequence

[26.75,
 13.0,
 21.25,
 13.75,
 28.75,
 29.0,
 20.0,
 11.25,
 22.5,
 33.25,
 38.25,
 37.75,
 49.75,
 27.5,
 8.0,
 22.25,
 17.0,
 36.25,
 26.5,
 12.0,
 20.5,
 16.0,
 30.25,
 37.0,
 14.5,
 17.0,
 25.75,
 26.25,
 22.5,
 34.0,
 27.75,
 15.75,
 21.5,
 27.25,
 14.5,
 26.5,
 32.75,
 28.5,
 17.75,
 19.75,
 21.75,
 20.75,
 10.75,
 24.25,
 15.25,
 22.75,
 13.0,
 23.0,
 24.5,
 21.0]

## example of one player's predicted total fantasy points for their next game

In [99]:
# most recent sequence of games
player_name = 'Dwight Powell'  
player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)

# player max sequence length from seq_lengths_df
sequence_length = seq_lengths_df.loc[seq_lengths_df['PLAYER_NAME'] == player_name, 'SEQ_LENGTH'].item()
last_sequence = player_data.iloc[-sequence_length:, :]

# position and team encoded
last_sequence['POS'] = le.transform(last_sequence['POS'])
last_sequence['Team'] = le.transform(last_sequence['Team'])

# numerical features scaled
last_sequence[numerical_features] = scaler.transform(last_sequence[numerical_features])

# convert to tensor and predict future total fantasy points for next game
with torch.no_grad():
    input_tensor = torch.tensor(last_sequence[numerical_features + ['POS']].values.reshape(1, -1, input_features),
                                dtype=torch.float32)
    pred = model(input_tensor).item()
    print(f'Predicted total fantasy points for {player_name} in the next game: {pred:.2f}')


Predicted total fantasy points for Dwight Powell in the next game: 19.17


## function that predicts all players total fantasy points for their next game

In [102]:
def predict_next_game_scores(df, model, seq_lengths_df, le, scaler, numerical_features, input_features, sequence_length):
    prediction = []
    for player_name in df['PLAYER_NAME'].unique():
        player_data = df[df['PLAYER_NAME'] == player_name].reset_index(drop=True)
        sequence_length = seq_lengths_df.loc[seq_lengths_df['PLAYER_NAME'] == player_name, 'SEQ_LENGTH'].item()
        if sequence_length >= sequence_length:
            last_sequence = player_data.iloc[-sequence_length:, :]
            last_sequence['POS'] = le.transform(last_sequence['POS'])
            last_sequence['Team'] = le.transform(last_sequence['Team'])
            last_sequence[numerical_features] = scaler.transform(last_sequence[numerical_features])
            with torch.no_grad():
                input_tensor = torch.tensor(last_sequence[numerical_features + ['POS']].values.reshape(1, -1, input_features),
                                            dtype=torch.float32)
                pred = model(input_tensor).item()
                prediction.append({'PLAYER_NAME': player_name, 'PRED_SCORE': pred})
    return pd.DataFrame(prediction)


In [106]:
filtered3_prediction = predict_next_game_scores(df, model, seq_lengths_df, le, scaler, numerical_features, input_features, sequence_length)
print(filtered3_prediction)
filtered3_prediction.to_csv('filtered3_prediction.csv')

        PLAYER_NAME  PRED_SCORE
0      Aaron Gordon   23.547430
1     Aaron Holiday   10.056620
2     Aaron Nesmith   11.241659
3        Al Horford   37.758564
4        Alec Burks   17.684090
..              ...         ...
333      Taj Gibson   16.436916
334   Trent Forrest    6.989388
335       Ty Jerome   17.515474
336   Udonis Haslem    1.829797
337  Wenyen Gabriel   25.223473

[338 rows x 2 columns]
