Load Data

In [1]:
#import libraries
import os

import pandas as pd
from sqlalchemy import create_engine
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, max_error

In [2]:
# create connection engine to the mysql db
# The connection URL is taken from the NBAMVP_DB_URL environment variable so
# that credentials do not have to be stored in the notebook. The original local
# development URL is kept as the fallback so existing setups keep working.
connection_string = os.environ.get(
    'NBAMVP_DB_URL',
    'mysql://lucab:lucab@127.0.0.1:3306/nbamvp_pred',
)
engine = create_engine(connection_string)

In [3]:
# read the table 'player_mvp_stats_tbl' from the MySQL database into a DataFrame
stats = pd.read_sql_table(table_name='player_mvp_stats_tbl', con=engine)

In [4]:
# preview the first three rows of the loaded table
stats.head(n=3)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Latitude,Longitude
0,Alan Henderson,PF,27,ATL,82,82,33.8,5.2,11.3,0.461,...,Atlanta Hawks,28,54,0.341,28.0,94.3,99.7,-5.41,33.75722,-84.39639
1,Bimbo Coles,PG,31,ATL,80,54,24.1,3.5,7.6,0.455,...,Atlanta Hawks,28,54,0.341,28.0,94.3,99.7,-5.41,33.75722,-84.39639
2,Cal Bowdler,SF,22,ATL,46,0,9.2,1.1,2.5,0.426,...,Atlanta Hawks,28,54,0.341,28.0,94.3,99.7,-5.41,33.75722,-84.39639


In [5]:
# list every column of the loaded table (the predictors below are chosen from these)
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS', 'Latitude', 'Longitude'],
      dtype='object')

Prediction using Ridge

In [6]:
# columns used as model inputs when predicting the MVP "Share"
predictors = [
    "Age", "G", "GS", "MP",
    "FG", "FGA", "FG%",
    "3P", "3PA", "3P%",
    "2P", "2PA", "2P%", "eFG%",
    "FT", "FTA", "FT%",
    "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS",
    "W", "L", "PS/G", "PA/G", "SRS",
]

In [7]:
# split by the "Year" column: rows up to 2022 train the model, 2023 rows are the
# test set, and 2024 rows are the data the final prediction is made for
season = stats["Year"]
train = stats[season <= 2022]
test = stats[season == 2023]
predict24 = stats[season == 2024]

In [8]:
# show the three test rows with the highest "Share"
(
    test
    .sort_values(by="Share", ascending=False)
    .head(n=3)
)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Latitude,Longitude
9081,Joel Embiid,C,28,PHI,66,66,34.6,11.0,20.1,0.548,...,Philadelphia 76ers,54,28,0.659,3.0,115.2,110.9,4.37,39.90111,-75.17194
3002,Nikola Jokić,C,27,DEN,69,69,33.7,9.4,14.8,0.632,...,Denver Nuggets,53,29,0.646,0.0,115.8,112.5,3.04,39.74861,-105.0075
6631,Giannis Antetokounmpo,PF,28,MIL,63,63,32.1,11.2,20.3,0.553,...,Milwaukee Bucks,58,24,0.707,0.0,116.9,113.3,3.61,43.045028,-87.918167


In [9]:
# instantiate the Ridge regression model with alpha = 0.1
reg = Ridge(alpha=0.1)

In [10]:
# fit the Ridge model: the predictor columns are the inputs, "Share" is the target
reg.fit(X=train[predictors], y=train["Share"])

In [11]:
# predict the "Share" of the players in the test data (Year == 2023)
# The raw model output gets its own name: the original cell read an undefined
# variable `predictions` here, which raises NameError on a fresh kernel.
share_pred_test = reg.predict(test[predictors])

# combine the predicted "Share" with the player name and the actual "Share";
# the result keeps the index of `test` and has the columns
# "Player", "Share" and "predictions"
prediction_test = test[["Player", "Share"]].assign(predictions=share_pred_test)

In [13]:
# test rows ranked by the actual "Share", highest first
(
    prediction_test
    .sort_values(by="Share", ascending=False)
    .head(n=5)
)

Unnamed: 0,Player,Share,predictions
9081,Joel Embiid,0.915,0.185013
3002,Nikola Jokić,0.674,0.173004
6631,Giannis Antetokounmpo,0.606,0.210209
776,Jayson Tatum,0.28,0.129409
8281,Shai Gilgeous-Alexander,0.046,0.132419


In [14]:
# test rows ranked by the predicted "Share", highest first
(
    prediction_test
    .sort_values(by="predictions", ascending=False)
    .head(n=5)
)

Unnamed: 0,Player,Share,predictions
6631,Giannis Antetokounmpo,0.606,0.210209
2605,Luka Dončić,0.01,0.186478
9081,Joel Embiid,0.915,0.185013
3002,Nikola Jokić,0.674,0.173004
9860,Damian Lillard,0.0,0.151607


In [24]:
# predict the "Share" for the 2024 rows and store it in the column "predictions",
# next to the player, team, actual "Share" and location columns
share_pred_24 = reg.predict(predict24[predictors])
prediction_24 = (
    predict24[["Player", "Team", "Share", "Latitude", "Longitude"]]
    .assign(predictions=share_pred_24)
)

In [36]:
# 2024 rows ranked by the predicted "Share", highest first
(
    prediction_24
    .sort_values(by="predictions", ascending=False)
    .head(n=5)
)

Unnamed: 0,Player,Team,Share,Latitude,Longitude,predictions
6653,Giannis Antetokounmpo,Milwaukee Bucks,0.0,43.045028,-87.918167,0.205706
9100,Joel Embiid,Philadelphia 76ers,0.0,39.90111,-75.17194,0.194538
2626,Luka Dončić,Dallas Mavericks,0.0,32.79056,-96.81028,0.193568
3021,Nikola Jokić,Denver Nuggets,0.0,39.74861,-105.0075,0.170602
8300,Shai Gilgeous-Alexander,Oklahoma City Thunder,0.0,35.46333,-97.515,0.152924


Evaluation of the model

In [17]:
# Calculate evaluation metrics on the test predictions
y_true = prediction_test["Share"]
y_pred = prediction_test["predictions"]

evs = explained_variance_score(y_true, y_pred)
me = max_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument is deprecated
# and later removed in newer scikit-learn releases, which would break this cell.
rmse = mse ** 0.5
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("Explained Variance (EVS):", evs)
print("Max Error (ME):", me)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R^2 Score:", r2)

Explained Variance (EVS): 0.1625739340507817
Max Error (ME): 0.7299868927421154
Mean Squared Error (MSE): 0.0026936882887538378
Root Mean Squared Error (RMSE): 0.051900754221435336
Mean Absolute Error (MAE): 0.02206809905243676
R^2 Score: 0.1600147108574298




Test hypothesis: "Je mehr Punkte pro Spiel ein Spieler erzielt, desto höher ist der 'Share'-Wert." (English: the more points per game a player scores, the higher the "Share" value.)

In [35]:
# predict the "Share" for Steph Curry from his 2024 row
# The row filter is built from predict24 itself. The original mask came from
# `stats`, whose index does not match the predict24 subset, so pandas had to
# reindex the mask and emitted a warning for this line.
steph_curry_24 = predict24[predict24['Player'] == "Stephen Curry"].copy()
predict_sc_24 = reg.predict(steph_curry_24[predictors])
print(predict_sc_24)

[0.08479382]


  steph_curry_24 = predict24[stats['Player'] == "Stephen Curry"].copy()


In [38]:
# predict the "Share" for Steph Curry when points per game is increased by 5
# The row filter is built from predict24 itself; a mask built from `stats` has a
# different index than the predict24 subset and makes pandas emit a warning.
PTS_INCREASE = 5
steph_curry_24_hyp = predict24[predict24['Player'] == "Stephen Curry"].copy()
# Derive the hypothetical value from the stored "PTS" so the cell stays correct
# when the underlying data changes.
# NOTE(review): the cell previously hard-coded 31.4 here; confirm that this
# equals his stored "PTS" plus 5.
steph_curry_24_hyp["PTS"] = steph_curry_24_hyp["PTS"] + PTS_INCREASE
predict_sc_24_hyp = reg.predict(steph_curry_24_hyp[predictors])
print(predict_sc_24_hyp)

[0.12898886]


  steph_curry_24_hyp = predict24[stats['Player'] == "Stephen Curry"].copy()


Add predicted MVP to predictedMVPs

In [18]:
# create a dataframe based on some data of the predicted MVP
# Rank the 2024 predictions once and keep only the top row; every field below
# is read from that same row instead of re-sorting the frame for each field.
top_prediction = prediction_24.sort_values("predictions", ascending=False).head(1)
mvp_name = top_prediction["Player"].iloc[0]
mvp_team = top_prediction["Team"].iloc[0]
mvp_latitude = top_prediction["Latitude"].iloc[0]
mvp_longitude = top_prediction["Longitude"].iloc[0]

# single-row frame describing the player this Ridge model ranks first
predictedMVPs = pd.DataFrame({
    'Model': "Ridge",
    'Name': mvp_name,
    'Team': mvp_team,
    'Latitude': mvp_latitude,
    'Longitude': mvp_longitude
}, index=[0])

In [19]:
# inspect the row that is about to be written to the database
predictedMVPs.head(n=5)

Unnamed: 0,Model,Name,Team,Latitude,Longitude
0,Ridge,Giannis Antetokounmpo,Milwaukee Bucks,43.045028,-87.918167


In [20]:
# write "predictedMVPs" to the MySQL table 'predictedMVPs_tbl' without the index
# NOTE(review): if_exists='replace' discards whatever the table held before;
# confirm that no other notebook stores its own rows in this table.
predictedMVPs.to_sql(
    name='predictedMVPs_tbl',
    con=engine,
    if_exists='replace',
    index=False,
)

1

In [21]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# print a short footer describing the environment this notebook was run in:
# OS family, platform name and release, current timestamp and Python version
separator = '-----------------------------------'
print(separator)
print(os.name.upper())
print(f"{platform.system()} | {platform.release()}")
print(f"Datetime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Python Version: {python_version()}")
print(separator)

-----------------------------------
NT
Windows | 10
Datetime: 2024-05-07 20:10:03
Python Version: 3.10.13
-----------------------------------
