In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import torch

In [2]:
conn = sqlite3.connect('fantasy_football_data.db')

In [3]:
# query to get average stats per game over last 2 years
# I tested both 3 years and 2 years of stats and 2 years performed better so that's why I switched to 2 years
player_stats_pg = pd.read_sql('''
    WITH sum_stats AS (
        SELECT 
            Player, Year, Tm, FantPos, Age, 
            SUM(G) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "G",
            SUM(GS) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "GS",
            SUM(Cmp) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Cmp",
            SUM(Pass_Att) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Pass_Att",
            SUM(Pass_Yds) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Pass_Yds",
            SUM(Pass_TD) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Pass_TD",
            SUM(Int) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Int",
            SUM(Rush_Att) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Rush_Att",
            SUM(Rush_Yds) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Rush_Yds",
            SUM("Y/A") OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Y/A",
            SUM(Rush_TD) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Rush_TD",
            SUM(Tgt) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Tgt",
            SUM(Rec) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Rec",
            SUM(Rec_Yds) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Rec_Yds",
            SUM("Y/R") OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Y/R",
            SUM(Rec_TD) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Rec_TD",
            SUM(Fmb) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Fmb",
            SUM(FL) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "FL",
            SUM(Tot_TD) OVER (PARTITION BY Player ORDER BY Year ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS "Tot_TD",
            PPR/G,
            PPR,
            LAG(Tm, 1) OVER (PARTITION BY Player ORDER BY Year) AS Tm_Last_Yr
        FROM
            player_stats)
        
    SELECT
        Player, Year, Tm, FantPos, Age, G, GS, GS/G, Cmp/G, Pass_Att/G, Pass_Yds/G, Pass_TD/G, Int/G, Rush_Att/G, Rush_Yds/G, "Y/A",
        Rush_TD/G, Tgt/G, Rec/G, Rec_Yds/G, "Y/R", Rec_TD/G, Fmb/G, FL/G, Tot_TD/G, "PPR/G", PPR, Tm_Last_Yr
    FROM
        sum_stats
        '''
, conn)
player_stats_pg['Changed_Teams'] = np.where(player_stats_pg['Tm'] != player_stats_pg['Tm_Last_Yr'], 1, 0)
player_stats_pg['PPR/G'].fillna(0, inplace=True)
player_stats_pg['GS/G'] = player_stats_pg['GS']/player_stats_pg['G']
player_stats_pg.query('Year > 2016', inplace=True)

rookies = player_stats_pg[player_stats_pg['G'].isnull()] # saving rookies for later
player_stats_pg.dropna(subset=['G'], inplace=True)

player_stats_pg_2024 = player_stats_pg[player_stats_pg['Year']==2024] # saving 2024 stats for final predictions

player_stats_pg.head()

Unnamed: 0,Player,Year,Tm,FantPos,Age,G,GS,GS/G,Cmp/G,Pass_Att/G,...,Rec_Yds/G,Y/R,Rec_TD/G,Fmb/G,FL/G,Tot_TD/G,PPR/G,PPR,Tm_Last_Yr,Changed_Teams
1,A.J. Brown,2020,TEN,WR,23.0,16.0,11.0,0.6875,0.0,0.0,...,65.6875,20.21,0.5,0.0625,0.0,0.5625,17.678571,247.5,TEN,0
2,A.J. Brown,2021,TEN,WR,24.0,30.0,23.0,0.766667,0.0,0.0,...,70.866667,35.57,0.633333,0.1,0.033333,0.7,13.915385,180.9,TEN,0
3,A.J. Brown,2022,PHI,WR,25.0,27.0,25.0,0.925926,0.0,0.074074,...,72.0,29.15,0.592593,0.074074,0.037037,0.62963,17.623529,299.6,TEN,1
4,A.J. Brown,2023,PHI,WR,26.0,30.0,29.0,0.966667,0.0,0.066667,...,78.833333,30.79,0.533333,0.066667,0.066667,0.533333,17.035294,289.6,PHI,0
5,A.J. Brown,2024,PHI,WR,27.0,34.0,33.0,0.970588,0.0,0.0,...,86.823529,30.74,0.529412,0.117647,0.117647,0.529412,0.0,,PHI,0


In [4]:
# splitting by position
# even though before this didn't seem to make much of a difference, so I'm going to still try it
qb_stats_pg = player_stats_pg[player_stats_pg['FantPos']=='QB'].dropna().reset_index(drop=True) # dropna drops 2024 data
wr_stats_pg = player_stats_pg[player_stats_pg['FantPos']=='WR'].dropna().reset_index(drop=True)
rb_stats_pg = player_stats_pg[player_stats_pg['FantPos']=='RB'].dropna().reset_index(drop=True)
te_stats_pg = player_stats_pg[player_stats_pg['FantPos']=='TE'].dropna().reset_index(drop=True)

In [5]:
qb_stats_pg.head()

Unnamed: 0,Player,Year,Tm,FantPos,Age,G,GS,GS/G,Cmp/G,Pass_Att/G,...,Rec_Yds/G,Y/R,Rec_TD/G,Fmb/G,FL/G,Tot_TD/G,PPR/G,PPR,Tm_Last_Yr,Changed_Teams
0,A.J. McCarron,2017,CIN,QB,27.0,8.0,3.0,0.375,9.875,14.875,...,0.0,0.0,0.0,0.125,0.125,0.0,0.866667,2.6,CIN,0
1,A.J. McCarron,2018,OAK,QB,28.0,4.0,0.0,0.0,1.75,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.1,CIN,1
2,A.J. McCarron,2019,HOU,QB,29.0,5.0,0.0,0.0,1.6,3.4,...,0.0,0.0,0.0,0.2,0.0,0.0,8.45,16.9,OAK,1
3,A.J. McCarron,2020,HOU,QB,30.0,4.0,1.0,0.25,5.5,10.0,...,0.0,0.0,0.0,0.25,0.0,0.25,0.4,0.8,HOU,0
4,A.J. McCarron,2023,CIN,QB,33.0,4.0,1.0,0.25,5.5,9.5,...,0.0,0.0,0.0,0.0,0.0,0.25,0.4,0.8,HOU,1


In [6]:
corr_mat = qb_stats_pg.dropna().drop(['Player', 'Year', 'Tm', 'FantPos', 'Tm_Last_Yr'], axis=1).corr()
corr_mat['PPR/G'].sort_values(ascending=False)
# much better correlation scores than when the stats weren't per game

PPR/G            1.000000
PPR              0.878407
Pass_TD/G        0.618939
GS               0.618009
Pass_Yds/G       0.595282
G                0.576521
Cmp/G            0.561965
GS/G             0.557919
Pass_Att/G       0.544045
Rush_Att/G       0.451839
Rush_Yds/G       0.380412
Rush_TD/G        0.363639
Tot_TD/G         0.357026
Fmb/G            0.317349
Y/A              0.247419
FL/G             0.218502
Int/G            0.210047
Y/R              0.068179
Rec_TD/G        -0.020003
Age             -0.031431
Tgt/G           -0.039316
Rec/G           -0.044036
Rec_Yds/G       -0.046480
Changed_Teams   -0.331969
Name: PPR/G, dtype: float64

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# qb features
qb_features_pg = qb_stats_pg[['Year', 'GS/G', 'Cmp/G', 'Pass_Att/G', 'Pass_Yds/G', 'Pass_TD/G', 'Int/G', 'Rush_Att/G', 'Rush_Yds/G', 'Rush_TD/G', 'Changed_Teams']]
qb_info_pg = qb_stats_pg[['Player', 'Year', 'Tm', 'FantPos', 'Age']]
qb_PPR_pg = qb_stats_pg[['Year','PPR/G']]

# scaling the data
scaler_qb = StandardScaler()
qb_features_pg = pd.DataFrame(scaler_qb.fit_transform(qb_features_pg), columns=qb_features_pg.columns)
scalerPPR_qb = StandardScaler() # need this to unscale the data later
qb_PPR_pg = pd.DataFrame(scalerPPR_qb.fit_transform(qb_PPR_pg['PPR/G'].values.reshape(-1,1)), columns=['PPR/G'])
qb_features_pg['Year'] = qb_stats_pg['Year']
qb_features_pg['Changed_Teams'] = qb_stats_pg['Changed_Teams']
qb_PPR_pg['Year'] = qb_stats_pg['Year']

In [9]:
qb_features_pg.head()

Unnamed: 0,Year,GS/G,Cmp/G,Pass_Att/G,Pass_Yds/G,Pass_TD/G,Int/G,Rush_Att/G,Rush_Yds/G,Rush_TD/G,Changed_Teams
0,2017,-0.811949,-0.654155,-0.739144,-0.669438,-0.387358,-0.937957,-0.490122,-0.56697,-0.789042,0
1,2018,-1.814809,-1.674574,-1.676053,-1.650139,-1.488015,-1.627074,-1.446103,-0.885309,-0.789042,1
2,2019,-1.814809,-1.693412,-1.68429,-1.668612,-1.488015,-1.627074,-1.118338,-0.918169,-0.789042,1
3,2020,-1.146236,-1.203611,-1.140676,-1.196463,-1.488015,-0.937957,-0.353553,-0.125403,1.016209,0
4,2023,-1.146236,-1.203611,-1.181859,-1.163863,-1.488015,-0.937957,-0.763259,-0.084327,1.016209,1


In [10]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [11]:
# 12 fold cross validation
kf = KFold(n_splits=12, random_state=42, shuffle=True)

In [12]:
# linear regression only qbs
lr = LinearRegression()
results = cross_validate(lr, qb_features_pg.drop('Year', axis=1), qb_PPR_pg['PPR/G'], cv=kf, scoring='neg_mean_squared_error', return_train_score=True)

print(f"Train MSE: {-results['train_score']}")
print(f"Test MSE: {-results['test_score']}")
print(f"Mean Train MSE: {np.mean(-results['train_score'])}")
print(f"Mean Test MSE: {np.mean(-results['test_score'])}")

Train MSE: [0.50186344 0.47996184 0.49810038 0.50988136 0.49522335 0.51136986
 0.48427076 0.49363875 0.50163865 0.49064234 0.50382126 0.49282461]
Test MSE: [0.46242629 0.71360307 0.50915104 0.38367375 0.55229533 0.35857879
 0.6658281  0.56705995 0.47003911 0.59273539 0.43978869 0.58063219]
Mean Train MSE: 0.4969363832713755
Mean Test MSE: 0.5246509762142048


In [13]:
from sklearn.ensemble import GradientBoostingRegressor

In [14]:
# gradient boosted regression only qbs
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.001,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)
results = cross_validate(gbr, qb_features_pg.drop('Year', axis=1), qb_PPR_pg['PPR/G'], cv=kf, scoring='neg_mean_squared_error', return_train_score=True)

print(f"Train MSE: {-results['train_score']}")
print(f"Test MSE: {-results['test_score']}")
print(f"Mean Train MSE: {np.mean(-results['train_score'])}")
print(f"Mean Test MSE: {np.mean(-results['test_score'])}")

Train MSE: [0.28796646 0.26439254 0.27715836 0.29321368 0.28771454 0.29047384
 0.28014601 0.27969279 0.28117211 0.2832954  0.27833937 0.26927007]
Test MSE: [0.43357522 0.76408332 0.68691851 0.39607378 0.54880681 0.35883399
 0.65397365 0.67472089 0.65762243 0.61445652 0.52339951 0.70443705]
Mean Train MSE: 0.28106959617304134
Mean Test MSE: 0.5847418076614639


In [15]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [16]:
# same neural network as before
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out
    
def train(model, criterion, optmizer, num_epochs, X_train, y_train):
    model.train()
    for epoch in range(num_epochs):
        # Forward pass
        outputs = model(X_train)
        loss = criterion(outputs, y_train)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return

def evaluate(model, X_train, y_train, X_test, y_test):
    model.eval()  
    with torch.no_grad():  # No need to compute gradients for evaluation
        train_predict = model(X_train)
        test_predict = model(X_test)

        train_mse = mean_squared_error(y_train.numpy(), train_predict.numpy())
        test_mse = mean_squared_error(y_test.numpy(), test_predict.numpy())
    return train_mse, test_mse

In [17]:
# setting up data and parameters
X_tensor = torch.tensor(qb_features_pg.drop('Year', axis=1).values, dtype=torch.float32)
y_tensor = torch.tensor(qb_PPR_pg.drop('Year', axis=1).values, dtype=torch.float32)
learning_rate = 0.001
num_epochs = 1000
input_size = X_tensor.shape[1]
hidden_size = 64
output_size = 1
criterion = nn.MSELoss()
model = SimpleNN(input_size, hidden_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# cross validation
fold_results = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_tensor)):
    X_train = X_tensor[train_idx]
    y_train = y_tensor[train_idx]
    X_test = X_tensor[test_idx]
    y_test = y_tensor[test_idx]
    
    # initialize the model, loss function, and optimizer
    model = SimpleNN(input_size, hidden_size, output_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # train the model
    train(model, criterion, optimizer, num_epochs, X_train, y_train)
    
    # validate the model
    val_loss = evaluate(model, X_train, y_train, X_test, y_test)
    print(f'Fold {fold + 1}')
    print(f'Train MSE: {val_loss[0]}')
    print(f'Test MSE: {val_loss[1]}\n')
    fold_results.append(val_loss)

fold_results = np.array(fold_results)
print(f'Mean Train Loss: {np.mean(fold_results[:,0])}')
print(f'Mean Test Loss: {np.mean(fold_results[:,1])}')

Fold 1
Train MSE: 0.2838749885559082
Test MSE: 0.47391828894615173

Fold 2
Train MSE: 0.335738867521286
Test MSE: 0.5960904359817505

Fold 3
Train MSE: 0.32540568709373474
Test MSE: 0.7102637887001038

Fold 4
Train MSE: 0.32125604152679443
Test MSE: 0.5290007591247559

Fold 5
Train MSE: 0.30266889929771423
Test MSE: 0.547470211982727

Fold 6
Train MSE: 0.3155335485935211
Test MSE: 0.4852880537509918

Fold 7
Train MSE: 0.3119930326938629
Test MSE: 0.7652067542076111

Fold 8
Train MSE: 0.3045644164085388
Test MSE: 0.8119980096817017

Fold 9
Train MSE: 0.2957719564437866
Test MSE: 0.7250003814697266

Fold 10
Train MSE: 0.32048916816711426
Test MSE: 0.6670612096786499

Fold 11
Train MSE: 0.2889745235443115
Test MSE: 0.5215308666229248

Fold 12
Train MSE: 0.2580851912498474
Test MSE: 0.738976776599884

Mean Train Loss: 0.3053630590438843
Mean Test Loss: 0.6309837698936462


In [18]:
# 2023 data
qb_features_pg_2023 = qb_features_pg[qb_features_pg['Year']==2023].drop('Year', axis=1)
qb_PPR_pg_2023 = qb_PPR_pg[qb_PPR_pg['Year']==2023].drop('Year', axis=1)
qb_info_pg_2023 = qb_info_pg[qb_info_pg['Year']==2023]

In [19]:
# train and test data
X_train = torch.tensor(qb_features_pg[qb_features_pg['Year'] < 2023].drop('Year', axis=1).values, dtype=torch.float32)
y_train = torch.tensor(qb_PPR_pg[qb_PPR_pg['Year'] < 2023].drop('Year', axis=1).values, dtype=torch.float32)
X_test = torch.tensor(qb_features_pg_2023.values, dtype=torch.float32)
y_test = torch.tensor(qb_PPR_pg_2023.values, dtype=torch.float32)

# parameters
input_size = X_train.shape[1]
hidden_size = 64
output_size = 1
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 1000

model.train()
for epoch in range(num_epochs):
    # forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

model.eval()  
with torch.no_grad():  # don't need to compute gradients for evaluation
    train_predict = model(X_train)
    test_predict = model(X_test)

    train_mse = mean_squared_error(y_train.numpy(), train_predict.numpy())
    test_mse = mean_squared_error(y_test.numpy(), test_predict.numpy())

print(f'Train Loss: {train_mse}')
print(f'Test Loss: {test_mse}')

Epoch [100/1000], Loss: 0.4784
Epoch [200/1000], Loss: 0.4397
Epoch [300/1000], Loss: 0.4185
Epoch [400/1000], Loss: 0.4029
Epoch [500/1000], Loss: 0.3880
Epoch [600/1000], Loss: 0.3744
Epoch [700/1000], Loss: 0.3599
Epoch [800/1000], Loss: 0.3456
Epoch [900/1000], Loss: 0.3313
Epoch [1000/1000], Loss: 0.3178
Train Loss: 0.31770047545433044
Test Loss: 0.6619703769683838


In [20]:
# saving predictions
result = pd.DataFrame({'Actual PPR/G': qb_PPR_pg_2023.values.flatten(), 'Predicted PPR/G': test_predict.flatten()})
result = pd.concat([qb_info_pg_2023.reset_index(drop=True), result.reset_index(drop=True)], axis=1)
result['Actual PPR/G'] = qb_stats_pg['PPR/G'][qb_stats_pg['Year']==2023].reset_index(drop=True)
result['Predicted PPR/G'] = scalerPPR_qb.inverse_transform(result['Predicted PPR/G'].values.reshape(-1,1))
result['Actual PPR'] = qb_stats_pg['PPR'][qb_stats_pg['Year']==2023].reset_index(drop=True)
result['Predicted PPR'] = result['Predicted PPR/G']*qb_stats_pg['GS/G'][qb_stats_pg['Year']==2023].reset_index(drop=True)*16
result['Predicted Rank'] = result['Predicted PPR/G'].rank(ascending=False, method='min').astype(int)
result['Actual Rank'] = result['Actual PPR/G'].rank(ascending=False, method='min').astype(int)
result.sort_values('Predicted Rank', inplace=True)

result.head()

Unnamed: 0,Player,Year,Tm,FantPos,Age,Actual PPR/G,Predicted PPR/G,Actual PPR,Predicted PPR,Predicted Rank,Actual Rank
24,Jalen Hurts,2023,PHI,QB,25.0,20.988235,24.172295,356.8,386.756714,1,2
60,Sam Howell,2023,WAS,QB,23.0,15.147059,21.489937,257.5,343.838989,2,21
17,Deshaun Watson,2023,CLE,QB,28.0,14.466667,20.99724,86.8,335.955841,3,23
55,Patrick Mahomes,2023,KAN,QB,28.0,17.5125,20.981453,280.2,335.703247,4,12
33,Josh Allen,2023,BUF,QB,27.0,23.094118,20.301723,392.6,324.827576,5,1


The models trained on data split by position were all overfit and performed quite bad on the test data. In addition the cross validation was inconsistent. I think this may be due to the fact that splitting up by position results in less train and test data. I'm going predicting PPR/G without splitting up by position.

In [21]:
corr_mat = player_stats_pg.drop(['Player', 'Year', 'Tm', 'FantPos', 'Tm_Last_Yr'], axis=1).corr()
corr_mat['PPR/G'].sort_values(ascending=False)

PPR/G            1.000000
PPR              0.924478
GS/G             0.506338
GS               0.486816
Tot_TD/G         0.472692
Rec/G            0.375526
Rec_Yds/G        0.366811
Tgt/G            0.358360
Pass_TD/G        0.355057
Pass_Yds/G       0.342726
Fmb/G            0.342721
Cmp/G            0.333658
Pass_Att/G       0.329776
Rec_TD/G         0.323269
FL/G             0.304385
Rush_TD/G        0.301223
Rush_Yds/G       0.296987
Rush_Att/G       0.285529
Int/G            0.263266
Y/A              0.232757
G                0.229862
Age              0.056670
Y/R              0.034939
Changed_Teams   -0.193056
Name: PPR/G, dtype: float64

In [22]:
# player features
player_features_pg = player_stats_pg.drop(['Player', 'Tm', 'FantPos', 'Age', 'PPR', 'PPR/G', 'Tm_Last_Yr'], axis=1)
player_info_pg = player_stats_pg[['Player', 'Year', 'Tm', 'FantPos', 'Age']]
player_PPR_pg = player_stats_pg[['Year', 'PPR/G']][player_stats_pg['Year'] < 2024].reset_index(drop=True)

# scaling the data
scaler = StandardScaler()
player_features_pg = pd.DataFrame(scaler.fit_transform(player_features_pg), columns=player_features_pg.columns)
scalerPPR = StandardScaler() # using a different scaler so I can unscale the data later
player_PPR_pg = pd.DataFrame(scalerPPR.fit_transform(player_PPR_pg['PPR/G'].values.reshape(-1,1)), columns=['PPR/G'])
player_features_pg['Year'] = player_stats_pg['Year'].reset_index(drop=True)
player_features_pg['Changed_Teams'] = player_stats_pg['Changed_Teams'].reset_index(drop=True)
player_PPR_pg['Year'] = player_stats_pg['Year'][player_stats_pg['Year'] < 2024].reset_index(drop=True)

# saving 2024 data
player_features_pg_2024 = player_features_pg[player_features_pg['Year'] == 2024].reset_index(drop=True).drop('Year', axis=1)
player_features_pg = player_features_pg[player_features_pg['Year'] < 2024].reset_index(drop=True)
player_info_pg_2024 = player_info_pg[player_info_pg['Year'] == 2024].reset_index(drop=True)
player_info_pg = player_info_pg[player_info_pg['Year'] < 2024].reset_index(drop=True)

In [23]:
# linear regression all players
lr = LinearRegression()
results = cross_validate(lr, player_features_pg.drop('Year', axis=1), player_PPR_pg['PPR/G'], cv=kf, scoring='neg_mean_squared_error', return_train_score=True)

print(f"Train MSE: {-results['train_score']}")
print(f"Test MSE: {-results['test_score']}")
print(f"Mean Train MSE: {np.mean(-results['train_score'])}")
print(f"Mean Test MSE: {np.mean(-results['test_score'])}")

Train MSE: [0.37238625 0.3687807  0.36888109 0.35848415 0.36333834 0.36590893
 0.36375157 0.36628208 0.3663944  0.36180454 0.36902771 0.36758561]
Test MSE: [0.30228664 0.34556049 0.34286202 0.45771646 0.4041706  0.37650123
 0.39857489 0.3695095  0.37136084 0.42655277 0.33986181 0.36105637]
Mean Train MSE: 0.3660521139397966
Mean Test MSE: 0.374667801262327


In [None]:
# gradient boosted regression all players
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.001,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)
results = cross_validate(gbr, player_features_pg.drop('Year', axis=1), player_PPR_pg['PPR/G'], cv=kf, scoring='neg_mean_squared_error', return_train_score=True)

print(f"Train MSE: {-results['train_score']}")
print(f"Test MSE: {-results['test_score']}")
print(f"Mean Train MSE: {np.mean(-results['train_score'])}")
print(f"Mean Test MSE: {np.mean(-results['test_score'])}")

In [None]:
# setting up data and paramters
X_tensor = torch.tensor(player_features_pg.drop('Year', axis=1).values, dtype=torch.float32)
y_tensor = torch.tensor(player_PPR_pg.drop('Year', axis=1).values, dtype=torch.float32)
learning_rate = 0.001
num_epochs = 1000
input_size = X_tensor.shape[1]
hidden_size = int(X_train.shape[1]*2/3)
output_size = 1
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# cross validation
fold_results = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_tensor)):
    X_train = X_tensor[train_idx]
    y_train = y_tensor[train_idx]
    X_test = X_tensor[test_idx]
    y_test = y_tensor[test_idx]
    
    # initialize the model, loss function, and optimizer
    model = SimpleNN(input_size, hidden_size, output_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # train the model
    train(model, criterion, optimizer, num_epochs, X_train, y_train)
    
    # validate the model
    val_loss = evaluate(model, X_train, y_train, X_test, y_test)
    print(f'Fold {fold + 1}')
    print(f'Train MSE: {val_loss[0]}')
    print(f'Test MSE: {val_loss[1]}\n')
    fold_results.append(val_loss)
    
fold_results = np.array(fold_results)
print(f'Mean Train Loss: {np.mean(fold_results[:,0])}')
print(f'Mean Test Loss: {np.mean(fold_results[:,1])}')

In [None]:
# 2023 data
player_features_pg_2023 = player_features_pg[player_features_pg['Year']==2023].drop('Year', axis=1)
player_PPR_pg_2023 = player_PPR_pg[player_PPR_pg['Year']==2023].drop('Year', axis=1)
player_info_pg_2023 = player_info_pg[player_info_pg['Year']==2023]

In [None]:
# train and test data
X_train = torch.tensor(player_features_pg[player_features_pg['Year'] < 2023].drop('Year', axis=1).values, dtype=torch.float32)
y_train = torch.tensor(player_PPR_pg[player_PPR_pg['Year'] < 2023].drop('Year', axis=1).values, dtype=torch.float32)
X_test = torch.tensor(player_features_pg_2023.values, dtype=torch.float32)
y_test = torch.tensor(player_PPR_pg_2023.values, dtype=torch.float32)

# parameters
input_size = X_train.shape[1]
hidden_size = int(X_train.shape[1]*2/3)
output_size = 1
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 1000

# training loop
model.train()
for epoch in range(num_epochs):
    # forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# evaluating the model
model.eval()  
with torch.no_grad():
    train_predict = model(X_train)
    test_predict = model(X_test)

    train_mse = mean_squared_error(y_train.numpy(), train_predict.numpy())
    test_mse = mean_squared_error(y_test.numpy(), test_predict.numpy())

print(f'Train Loss: {train_mse}')
print(f'Test Loss: {test_mse}')

In [None]:
# saving predictions
result = pd.DataFrame({'Actual PPR/G': player_PPR_pg_2023.values.flatten(), 'Predicted PPR/G': test_predict.flatten()})
result = pd.concat([player_info_pg_2023.reset_index(drop=True), result.reset_index(drop=True)], axis=1)
result['Actual PPR/G'] = player_stats_pg['PPR/G'][player_stats_pg['Year']==2023].reset_index(drop=True)
result['Predicted PPR/G'] = scalerPPR.inverse_transform(result['Predicted PPR/G'].values.reshape(-1,1))
result['Actual PPR'] = player_stats_pg['PPR'][player_stats_pg['Year']==2023].reset_index(drop=True)
result['Predicted PPR'] = result['Predicted PPR/G']*16
# for qbs I'm multiplying PPR/G * G/GS because qbs generally only play when they start
result.loc[result['FantPos'] == 'QB', 'Predicted PPR'] = (result['Predicted PPR/G'] * 
    player_stats_pg.loc[player_stats_pg['Year'] == 2023, 'GS/G'].reset_index(drop=True) * 16)
result['Predicted Rank'] = result['Predicted PPR'].rank(ascending=False, method='min').astype(int)
result['Actual Rank'] = result['Actual PPR'].rank(ascending=False, method='min').astype(int)
result.sort_values('Predicted Rank', inplace=True)

pred_2023 = result.copy()
pred_2023

In [None]:
plt.scatter(result['Actual Rank'], result['Actual PPR/G'])
plt.scatter(result['Actual Rank'], result['Predicted PPR/G'])

# Rookies

In [None]:
rookies.head()

In [None]:
rookies.to_sql('rookies', conn, if_exists='replace', index=False)

In [None]:
# joining rookies with college stats
rookie_stats_pg = pd.read_sql('''
    WITH temp AS (
        SELECT 
            Player, Year, Tm, FantPos, Age, "PPR/G", PPR FROM rookies
    )
    SELECT 
        t.Player, t.Tm, t.Year, c.Pick, t.FantPos, t.Age, c.School, c.Conf, c.G as G, c.Cmp/G as "Cmp/G", 
        c.Pass_Att/G as "Pass_Att/G", c.Cmp_Pct, c.Pass_Yds/G as "Pass_Yds/G", c."Pass_Y/A", c.Pass_TD/G as "Pass_TD/G",
        c.Int/G as "Int/G", c.PR, c.Rush_Att/G as "Rush_Att/G", c.Rush_Yds/G as "Rush_Yds/G", c."Rush_Y/A", 
        c.Rush_TD/G as "Rush_TD/G", c.Rec/G as "Rec/G", c.Rec_Yds/G as "Rec_Yds/G", c."Y/R", c.Rec_TD/G as "Rec_TD/G", 
        c.Plays/G as "Plays/G", c.Tot_Yds/G as "Tot_Yds/G", c."Y/P", c.Tot_TD/G as "Tot_TD/G", t."PPR/G", t.PPR
    FROM 
        temp t
    LEFT JOIN 
        college_stats c
    ON
        t.Player = c.Player'''
    , conn)
rookie_stats_pg.dropna(subset=['G'], inplace=True) # na rows are undrafted players
rookie_stats_pg

In [None]:
corr_mat = rookie_stats_pg.drop(['Player', 'Year', 'Tm', 'FantPos', 'School', 'Conf'], axis=1).corr()
corr_mat['PPR/G'].sort_values(ascending=False)
# unsurprisingly none of the stats have that good of a correlation with PPR/G
# where the player was picked is the best feature

In [None]:
# rookies features
rookie_features_pg = rookie_stats_pg.drop(['Player', 'Tm', 'FantPos', 'School', 'Conf', 'PPR', 'PPR/G'], axis=1)
rookie_info_pg = rookie_stats_pg[['Player', 'Year', 'Tm', 'FantPos', 'Age', 'School']]
rookie_PPR_pg = rookie_stats_pg[['Year', 'PPR/G']][rookie_stats_pg['Year'] < 2024].reset_index(drop=True)

# scaling the data
scaler_rookies = StandardScaler()
rookie_features_pg = pd.DataFrame(scaler_rookies.fit_transform(rookie_features_pg), columns=rookie_features_pg.columns)
scalerPPR_rookies = StandardScaler() 
rookie_PPR_pg = pd.DataFrame(scalerPPR_rookies.fit_transform(rookie_PPR_pg['PPR/G'].values.reshape(-1,1)), columns=['PPR/G'])
rookie_features_pg['Year'] = rookie_stats_pg['Year'].reset_index(drop=True)
rookie_PPR_pg['Year'] = rookie_stats_pg['Year'][rookie_stats_pg['Year'] < 2024].reset_index(drop=True)

# saving 2024 data
rookie_features_pg_2024 = rookie_features_pg[rookie_features_pg['Year'] == 2024].reset_index(drop=True).drop('Year', axis=1)
rookie_features_pg = rookie_features_pg[rookie_features_pg['Year'] < 2024].reset_index(drop=True)
rookie_info_pg_2024 = rookie_info_pg[rookie_info_pg['Year'] == 2024].reset_index(drop=True)
rookie_info_pg = rookie_info_pg[rookie_info_pg['Year'] < 2024].reset_index(drop=True)

In [None]:
# linear regression rookies
lr = LinearRegression()
results = cross_validate(lr, rookie_features_pg.drop('Year', axis=1), rookie_PPR_pg['PPR/G'], cv=kf, scoring='neg_mean_squared_error', return_train_score=True)

print(f"Train MSE: {-results['train_score']}")
print(f"Test MSE: {-results['test_score']}")
print(f"Mean Train MSE: {np.mean(-results['train_score'])}")
print(f"Mean Test MSE: {np.mean(-results['test_score'])}")
# as expected the fit is bad because the data is not great

In [None]:
# the data is overfit a little bit and since most of the stats aren't highly correlated I'm going to use less features
# I chose PR, Tot_Yds/G, and Tot_TD/G because they are combined stats and represent all of the other stats
# PR (passer rating) is calculated from attempts, completions, yards, tds, and ints
lr = LinearRegression()
results = cross_validate(lr, rookie_features_pg[['Pick', 'Age', 'PR', 'Tot_Yds/G', 'Tot_TD/G']], rookie_PPR_pg['PPR/G'], cv=kf, scoring='neg_mean_squared_error', return_train_score=True)

print(f"Train MSE: {-results['train_score']}")
print(f"Test MSE: {-results['test_score']}")
print(f"Mean Train MSE: {np.mean(-results['train_score'])}")
print(f"Mean Test MSE: {np.mean(-results['test_score'])}")

In [None]:
# fit was a little better so I'm going to continue using only those stats
rookie_features_pg = rookie_features_pg[['Year', 'Pick', 'Age', 'PR', 'Tot_Yds/G', 'Tot_TD/G']]
rookie_features_pg.head()

In [None]:
# gradient boosted regression rookies
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.001,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)
results = cross_validate(gbr, rookie_features_pg.drop('Year', axis=1), rookie_PPR_pg['PPR/G'], cv=kf, scoring='neg_mean_squared_error', return_train_score=True)

print(f"Train MSE: {-results['train_score']}")
print(f"Test MSE: {-results['test_score']}")
print(f"Mean Train MSE: {np.mean(-results['train_score'])}")
print(f"Mean Test MSE: {np.mean(-results['test_score'])}")

In [None]:
# data and parameters
X_tensor = torch.tensor(rookie_features_pg.drop('Year', axis=1).values, dtype=torch.float32)
y_tensor = torch.tensor(rookie_PPR_pg.drop('Year', axis=1).values, dtype=torch.float32)
learning_rate = 0.001
num_epochs = 1000
input_size = X_tensor.shape[1]
hidden_size = int(X_tensor.shape[1]*2/3)
output_size = 1
criterion = nn.MSELoss()
model = SimpleNN(input_size, hidden_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# cross validation
fold_results = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_tensor)):
    X_train = X_tensor[train_idx]
    y_train = y_tensor[train_idx]
    X_test = X_tensor[test_idx]
    y_test = y_tensor[test_idx]
    
    # initialize the model, loss function, and optimizer
    model = SimpleNN(input_size, hidden_size, output_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # train the model
    train(model, criterion, optimizer, num_epochs, X_train, y_train)
    
    # validate the model
    val_loss = evaluate(model, X_train, y_train, X_test, y_test)
    print(f'Fold {fold + 1}')
    print(f'Train MSE: {val_loss[0]}')
    print(f'Test MSE: {val_loss[1]}\n')
    fold_results.append(val_loss)

fold_results = np.array(fold_results)
print(f'Mean Train Loss: {np.mean(fold_results[:,0])}')
print(f'Mean Test Loss: {np.mean(fold_results[:,1])}')

In [None]:
# 2023 data
rookie_features_pg_2023 = rookie_features_pg[rookie_features_pg['Year']==2023].drop('Year', axis=1)
rookie_PPR_pg_2023 = rookie_PPR_pg[rookie_PPR_pg['Year']==2023].drop('Year', axis=1)
rookie_info_pg_2023 = rookie_info_pg[rookie_info_pg['Year']==2023]

In [None]:
# train and test data
X_train = torch.tensor(rookie_features_pg[rookie_features_pg['Year'] < 2023].drop('Year', axis=1).values, dtype=torch.float32)
y_train = torch.tensor(rookie_PPR_pg[rookie_PPR_pg['Year'] < 2023].drop('Year', axis=1).values, dtype=torch.float32)
X_test = torch.tensor(rookie_features_pg_2023.values, dtype=torch.float32)
y_test = torch.tensor(rookie_PPR_pg_2023.values, dtype=torch.float32)

# parameters
input_size = X_train.shape[1]
hidden_size = int(X_train.shape[1]*2/3)
output_size = 1
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 1000

# training loop
model.train()
for epoch in range(num_epochs):
    # forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# evaluating the model
model.eval()  
with torch.no_grad():  
    train_predict = model(X_train)
    test_predict = model(X_test)

    train_mse = mean_squared_error(y_train.numpy(), train_predict.numpy())
    test_mse = mean_squared_error(y_test.numpy(), test_predict.numpy())

print(f'Train Loss: {train_mse}')
print(f'Test Loss: {test_mse}')

In [None]:
# I think this good for now, try other models later like ridge regression because data is overfit

In [None]:
# saving predictions
result = pd.DataFrame({'Actual PPR/G': rookie_PPR_pg_2023.values.flatten(), 'Predicted PPR/G': test_predict.flatten()})
result = pd.concat([rookie_info_pg_2023.reset_index(drop=True), result.reset_index(drop=True)], axis=1)
result['Actual PPR/G'] = rookie_stats_pg['PPR/G'][rookie_stats_pg['Year']==2023].reset_index(drop=True)
result['Predicted PPR/G'] = scalerPPR_rookies.inverse_transform(result['Predicted PPR/G'].values.reshape(-1,1))
result['Actual PPR'] = rookie_stats_pg['PPR'][rookie_stats_pg['Year']==2023].reset_index(drop=True)
result['Predicted PPR'] = result['Predicted PPR/G']*16
result['Predicted Rank'] = result['Predicted PPR'].rank(ascending=False, method='min').astype(int)
result['Actual Rank'] = result['Actual PPR'].rank(ascending=False, method='min').astype(int)
result.sort_values('Predicted Rank', inplace=True)

pred_rookies_2023 = result.copy()
pred_rookies_2023

In [None]:
# combining non rookies and rookies
pred_all_2023 = pd.concat([pred_2023, pred_rookies_2023.drop('School', axis=1)]).reset_index(drop=True)
pred_all_2023.drop(['Predicted Rank', 'Actual Rank'], axis=1, inplace=True)
pred_all_2023.head()

In standard fantasy league formats (which is what I play with) each team starts 1 qb, 2 rbs, 2 wr, 1 te, and 1 flex (rb/wr/te). Because of this different positions are valued differently. Rbs are generally the most valued because each team needs 2 and they are relatively the most scarce. Qbs are the last valued because even though they are the highest scoring position, each team only needs and they are relatively the most scarce. Wrs are valued slightly less than rbs because they score around the same as rbs but there are more of them. Tes are valued similarly to rbs because they don't score as much but they are very scarce. If I make my overall rankings by sorting all the players by predicted PPR, my rankings will have qbs too high, so I need to account for this in my overall rankings. I did some research online and found that qbs are generally 70% as valuable as other positions so I'll start from there and see how it looks. I think I will also make some slight adjustments to rbs, wrs, and tes. 

In [None]:
# creating Adj PPR that adjusts for positional value
pred_all_2023['Adj Predicted PPR'] = pred_all_2023['Predicted PPR']
pred_all_2023.loc[pred_all_2023['FantPos']=='QB', 'Adj Predicted PPR'] = pred_all_2023.loc[pred_all_2023['FantPos']=='QB', 'Predicted PPR']*0.7
pred_all_2023.loc[pred_all_2023['FantPos']=='WR', 'Adj Predicted PPR'] = pred_all_2023.loc[pred_all_2023['FantPos']=='WR', 'Predicted PPR']*0.95
pred_all_2023.loc[pred_all_2023['FantPos']=='TE', 'Adj Predicted PPR'] = pred_all_2023.loc[pred_all_2023['FantPos']=='TE', 'Predicted PPR']*1.05

pred_all_2023['Adj Actual PPR'] = pred_all_2023['Actual PPR']
pred_all_2023.loc[pred_all_2023['FantPos']=='QB', 'Adj Actual PPR'] = pred_all_2023.loc[pred_all_2023['FantPos']=='QB', 'Actual PPR']*0.7
pred_all_2023.loc[pred_all_2023['FantPos']=='WR', 'Adj Actual PPR'] = pred_all_2023.loc[pred_all_2023['FantPos']=='WR', 'Actual PPR']*0.95
pred_all_2023.loc[pred_all_2023['FantPos']=='TE', 'Adj Actual PPR'] = pred_all_2023.loc[pred_all_2023['FantPos']=='TE', 'Actual PPR']*1.05

pred_all_2023['Predicted OvRank'] = pred_all_2023['Adj Predicted PPR'].rank(ascending=False, method='min').astype(int)
pred_all_2023['Actual OvRank'] = pred_all_2023['Adj Actual PPR'].rank(ascending=False, method='min').astype(int)

pred_all_2023.sort_values('Predicted OvRank', inplace=True)
pred_all_2023.head()

In [None]:
# splitting up by position
pred_qb_2023 = pred_all_2023[pred_all_2023['FantPos']=='QB'].reset_index(drop=True)
pred_rb_2023 = pred_all_2023[pred_all_2023['FantPos']=='RB'].reset_index(drop=True)
pred_wr_2023 = pred_all_2023[pred_all_2023['FantPos']=='WR'].reset_index(drop=True)
pred_te_2023 = pred_all_2023[pred_all_2023['FantPos']=='TE'].reset_index(drop=True)

pred_qb_2023['Predicted PosRank'] = pred_qb_2023['Predicted PPR'].rank(ascending=False, method='min').astype(int)
pred_qb_2023['Actual PosRank'] = pred_qb_2023['Actual PPR'].rank(ascending=False, method='min').astype(int)
pred_rb_2023['Predicted PosRank'] = pred_rb_2023['Predicted PPR'].rank(ascending=False, method='min').astype(int)
pred_rb_2023['Actual PosRank'] = pred_rb_2023['Actual PPR'].rank(ascending=False, method='min').astype(int)
pred_wr_2023['Predicted PosRank'] = pred_wr_2023['Predicted PPR'].rank(ascending=False, method='min').astype(int)
pred_wr_2023['Actual PosRank'] = pred_wr_2023['Actual PPR'].rank(ascending=False, method='min').astype(int)
pred_te_2023['Predicted PosRank'] = pred_te_2023['Predicted PPR'].rank(ascending=False, method='min').astype(int)
pred_te_2023['Actual PosRank'] = pred_te_2023['Actual PPR'].rank(ascending=False, method='min').astype(int)

pred_qb_2023.sort_values('Predicted PosRank', inplace=True)
pred_rb_2023.sort_values('Predicted PosRank', inplace=True)
pred_wr_2023.sort_values('Predicted PosRank', inplace=True)
pred_te_2023.sort_values('Predicted PosRank', inplace=True)

pred_qb_2023.head()

In [None]:
# recombining positions
pred_all_2023 = pd.concat([pred_qb_2023, pred_rb_2023, pred_wr_2023, pred_te_2023]).reset_index(drop=True)
pred_all_2023.sort_values('Predicted OvRank', inplace=True)
pred_all_2023

I'm happy with my model's predictions for now so I'm going to train the models and predict for the upcoming 2024 season. I will come back and try other models and finetune new and existing models

# 2024 Predictions

In [None]:
# I made player_features 2024 and player_info_2024 earlier
player_features_pg_2024.head()

In [None]:
# train and test data
X_train = torch.tensor(player_features_pg.drop('Year', axis=1).values, dtype=torch.float32)
y_train = torch.tensor(player_PPR_pg.drop('Year', axis=1).values, dtype=torch.float32)
X_test = torch.tensor(player_features_pg_2024.values, dtype=torch.float32)

# parameters
input_size = X_train.shape[1]
hidden_size = int(X_train.shape[1]*2/3)
output_size = 1
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 1000

# training loop
model.train()
for epoch in range(num_epochs):
    # forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# evaluating the model
model.eval()  
with torch.no_grad():  
    train_predict = model(X_train)
    test_predict = model(X_test)
    train_mse = mean_squared_error(y_train.numpy(), train_predict.numpy())

print(f'Train Loss: {train_mse}')

In [None]:
# saving predictions
result = pd.DataFrame({'PPR/G': test_predict.flatten()})
result = pd.concat([player_info_pg_2024.reset_index(drop=True), result.reset_index(drop=True)], axis=1)
result['PPR/G'] = scalerPPR.inverse_transform(result['PPR/G'].values.reshape(-1,1))
result['PPR'] = result['PPR/G']*16
result.loc[result['FantPos'] == 'QB', 'PPR'] = (result['PPR/G'] * player_stats_pg_2024['GS/G'].reset_index(drop=True) * 16)
result.sort_values('PPR', ascending=False, inplace=True)

pred_2024 = result.copy()
pred_2024.head()

In [None]:
# I also made rookie_features_2024 and rookie_info_2024 earlier
rookie_features_pg_2024 = rookie_features_pg_2024[['Pick', 'Age', 'PR', 'Tot_Yds/G', 'Tot_TD/G']]
rookie_features_pg_2024.head()

In [None]:
# train and test data
X_train = torch.tensor(rookie_features_pg.drop('Year', axis=1).values, dtype=torch.float32)
y_train = torch.tensor(rookie_PPR_pg.drop('Year', axis=1).values, dtype=torch.float32)
X_test = torch.tensor(rookie_features_pg_2024.values, dtype=torch.float32)

# paramters
input_size = X_train.shape[1]
hidden_size = int(X_train.shape[1]*2/3)
output_size = 1
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 1000

# training loop
model.train()
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# evaluating the model
model.eval()  
with torch.no_grad(): 
    train_predict = model(X_train)
    test_predict = model(X_test)
    train_mse = mean_squared_error(y_train.numpy(), train_predict.numpy())

print(f'Train Loss: {train_mse}')

In [None]:
# saving predictions
result = pd.DataFrame({'PPR/G': test_predict.flatten()})
result = pd.concat([rookie_info_pg_2024.reset_index(drop=True), result.reset_index(drop=True)], axis=1)
result['PPR/G'] = scalerPPR_rookies.inverse_transform(result['PPR/G'].values.reshape(-1,1))
result['PPR'] = result['PPR/G']*16
result.sort_values('PPR', ascending=False, inplace=True)

pred_rookies_2024 = result.copy()
pred_rookies_2024.head()

In [None]:
# combining both predictions (non rookies and rookies)
pred_all_2024 = pd.concat([pred_2024, pred_rookies_2024.drop('School', axis=1)]).reset_index(drop=True)
pred_all_2024['PPR/G'] = pred_all_2024['PPR/G'].astype('float64')
pred_all_2024['Age'] = pred_all_2024['Age'].astype('int')

# adjusting PPR by position
pred_all_2024['Adj PPR'] = pred_all_2024['PPR']
pred_all_2024.loc[pred_all_2024['FantPos']=='QB', 'Adj PPR'] = pred_all_2024.loc[pred_all_2024['FantPos']=='QB', 'PPR']*0.7
pred_all_2024.loc[pred_all_2024['FantPos']=='WR', 'Adj PPR'] = pred_all_2024.loc[pred_all_2024['FantPos']=='WR', 'PPR']*0.95
pred_all_2024.loc[pred_all_2024['FantPos']=='TE', 'Adj PPR'] = pred_all_2024.loc[pred_all_2024['FantPos']=='TE', 'PPR']*1.05
pred_all_2024['OvRank'] = pred_all_2024['Adj PPR'].rank(ascending=False, method='min').astype(int)
pred_all_2024.sort_values('OvRank', inplace=True)
pred_all_2024.drop('Adj PPR', axis=1, inplace=True)

# splitting up by position
pred_qb_2024 = pred_all_2024[pred_all_2024['FantPos']=='QB'].reset_index(drop=True)
pred_rb_2024 = pred_all_2024[pred_all_2024['FantPos']=='RB'].reset_index(drop=True)
pred_wr_2024 = pred_all_2024[pred_all_2024['FantPos']=='WR'].reset_index(drop=True)
pred_te_2024 = pred_all_2024[pred_all_2024['FantPos']=='TE'].reset_index(drop=True)

# making the positional rankings
pred_qb_2024['PosRank'] = pred_qb_2024['PPR'].rank(ascending=False, method='min').astype(int)
pred_rb_2024['PosRank'] = pred_rb_2024['PPR'].rank(ascending=False, method='min').astype(int)
pred_wr_2024['PosRank'] = pred_wr_2024['PPR'].rank(ascending=False, method='min').astype(int)
pred_te_2024['PosRank'] = pred_te_2024['PPR'].rank(ascending=False, method='min').astype(int)

pred_qb_2024.sort_values('PosRank', inplace=True)
pred_rb_2024.sort_values('PosRank', inplace=True)
pred_wr_2024.sort_values('PosRank', inplace=True)
pred_te_2024.sort_values('PosRank', inplace=True)

# recombining the positions into one dataframe
pred_all_2024 = pd.concat([pred_qb_2024, pred_rb_2024, pred_wr_2024, pred_te_2024]).reset_index(drop=True)
pred_all_2024.sort_values('OvRank', inplace=True)

In [None]:
# reordering and rounding columns
new_order = ['OvRank', 'PosRank', 'Player', 'Year', 'Tm', 'FantPos', 'Age', 'PPR/G', 'PPR']
pred_all_2024 = pred_all_2024[new_order].round(2)
pred_qb_2024 = pred_qb_2024[new_order].round(2)
pred_rb_2024 = pred_rb_2024[new_order].round(2)
pred_wr_2024 = pred_wr_2024[new_order].round(2)
pred_te_2024 = pred_te_2024[new_order].round(2)

pred_all_2024.head()

In [None]:
pred_qb_2024.head()

In [None]:
pred_rb_2024.head()

In [None]:
pred_wr_2024.head()

In [None]:
pred_te_2024.head()

In [None]:
# exporting to csv
pred_all_2024.to_csv('all_players.csv', index=False)
pred_qb_2024.to_csv('qbs.csv', index=False)
pred_rb_2024.to_csv('rbs.csv', index=False)
pred_wr_2024.to_csv('wrs.csv', index=False)
pred_te_2024.to_csv('tes.csv', index=False)

In [None]:
conn.close()