In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

In [3]:
df=pd.read_csv("stats.csv")

#Same data cleaning as from nba_eda
df["2P%"] = df["2P%"].fillna(0)
df["3P%"] = df["3P%"].fillna(0)
df["FT%"] = df["FT%"].fillna(0)
df["eFG%"] = df["eFG%"].fillna(0)
df["FG%"] = df["FG%"].fillna(0)
core_players = df[(df["MP"] >= 20) & (df["G"] >= 40)]

- From data exploration, I found that the most important features that correlate with points per game are Pos, FTA, 3PA, 2PA, and MP.
- Position is a categorical variable, so I will encode it to integers

In [12]:
df_encoded = pd.get_dummies(df, columns=["Pos"], drop_first=False)

In [20]:
features = ["MP", "FGA", "FTA", "3PA", "AST", "TOV"]
position_columns = [col for col in df_encoded.columns if col.startswith("Pos_")]
features = features + position_columns
X = df_encoded[features]
y = df_encoded["PTS"]
X.head()

Unnamed: 0,MP,FGA,FTA,3PA,AST,TOV,Pos_C,Pos_PF,Pos_PG,Pos_SF,Pos_SG
0,33.6,21.8,11.6,3.6,5.6,3.8,True,False,False,False,False
1,37.5,23.6,8.7,10.6,9.8,4.0,False,False,True,False,False
2,35.2,18.8,10.7,1.7,6.5,3.4,False,True,False,False,False
3,34.0,19.8,8.7,3.6,6.2,2.2,False,False,True,False,False
4,35.4,21.4,6.5,6.8,6.7,2.4,False,False,True,False,False


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
models = {
    "Linear Regression" : LinearRegression(),
    "Random Forest" : RandomForestRegressor(),
    "XGboost" : XGBRegressor(),
}

In [21]:
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    mae=mean_absolute_error(y_test, y_pred)
    rmse=root_mean_squared_error(y_test, y_pred)
    r2=r2_score(y_test, y_pred)
    results.append({
        "Model" : name,
        "MAE" : mae,
        "RMSE" : rmse,
        "R2 score" : r2
    })

df_results=pd.DataFrame(results)
df_results


Unnamed: 0,Model,MAE,RMSE,R2 score
0,Linear Regression,0.649984,0.88058,0.981849
1,Random Forest,0.707061,0.99327,0.976906
2,XGboost,0.692015,0.940437,0.979297


- The linear regression seemed to perform the best according to the metrics that I used, which could indicate that my dataset was fairly linear and complex models did not add that much value
- The R-Squared scores were all pretty high, so I suspect that there might be overfitting happening. To tell if this is the case, I will test the model on a set of test data.

In [23]:
r2_test = r2_score(y_test, y_pred)
r2_train=r2_score(y_train, y_train_pred)
r2_test,r2_train

(0.9792971588211172, 0.9999864987048749)

Since the R2 score of the training and testing data were both high and similar, it seems that there is no overfitting or underfitting. The cause of the high R2 score is likely the linear relationship between the features and the prediction.

In [17]:
import joblib

In [18]:
model = LinearRegression()
model.fit(X_train, y_train)

In [19]:
joblib.dump(model, "ppg_model_1")
joblib.dump(X_train.columns.tolist(), "feature_names")

['feature_names']