In [1]:
import sys
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2
from data import ApiFetcher
from model import TeamEmbeddings

In [3]:
api = ApiFetcher(2015, 2025)
df = api.df_with_id()
print(df.columns)

Index(['home_fga', 'away_fga', 'home_fg_pct', 'away_fg_pct', 'home_fg3a',
       'away_fg3a', 'home_fg3_pct', 'away_fg3_pct', 'home_oreb', 'away_oreb',
       'home_dreb', 'away_dreb', 'home_ast', 'away_ast', 'home_stl',
       'away_stl', 'home_blk', 'away_blk', 'home_tov', 'away_tov', 'home_pf',
       'away_pf', 'home_pts', 'away_pts', 'home_team_id', 'away_team_id'],
      dtype='object')


In [4]:
def rolling_window_features(df, cols, windows=[3,5,10]):
    for team_type in ['home_team_id', 'away_team_id']:
        for window in windows:
            for col in cols:
                if col.startswith('home_') and team_type == 'home_team_id':
                    df[f'{col}_rolling_mean_{window}'] = df.groupby(team_type)[col].rolling(window).mean().reset_index(0, drop=True)
                elif col.startswith('away_') and team_type == 'away_team_id':
                    df[f'{col}_rolling_mean_{window}'] = df.groupby(team_type)[col].rolling(window).mean().reset_index(0, drop=True)
    return df

In [5]:
statistics = ['home_fga', 'away_fga', 'home_fg_pct', 'away_fg_pct', 'home_fg3a',
       'away_fg3a', 'home_fg3_pct', 'away_fg3_pct', 'home_oreb', 'away_oreb',
       'home_dreb', 'away_dreb', 'home_ast', 'away_ast', 'home_stl',
       'away_stl', 'home_blk', 'away_blk', 'home_tov', 'away_tov', 'home_pf',
       'away_pf']
results = ['home_pts', 'away_pts']
df = rolling_window_features(df, statistics, windows=[3,5,10])
df = df.drop(columns=statistics)
df.dropna(inplace=True)


X = df.drop(columns=results)
y = df[results]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test)       

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

# Linear Regression

In [6]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
print(f"Model R^2: {model.score(X_test_scaled, y_test)}")
print(f"Model MSE: {np.mean((y_pred - y_test) ** 2)}")

coef_df = pd.DataFrame({
    'feature': X.columns,
    'home_pts_coef': model.coef_[0],
    'away_pts_coef': model.coef_[1],
    'abs_home_coef': np.abs(model.coef_[0]),
    'abs_away_coef': np.abs(model.coef_[1])
})

print("\nNajważniejsze cechy dla home_pts:")
print(coef_df.nlargest(5, 'abs_home_coef')[['feature', 'home_pts_coef']])

print("\nNajważniejsze cechy dla away_pts:")
print(coef_df.nlargest(5, 'abs_away_coef')[['feature', 'away_pts_coef']])

Model R^2: 0.4849443429546855
Model MSE: 85.82145952865373

Najważniejsze cechy dla home_pts:
                       feature  home_pts_coef
3   home_fg_pct_rolling_mean_3       4.748461
40    away_dreb_rolling_mean_3      -3.604619
35     away_fga_rolling_mean_3       3.312689
4     home_fg3a_rolling_mean_3       2.363626
42     away_stl_rolling_mean_3      -1.639108

Najważniejsze cechy dla away_pts:
                       feature  away_pts_coef
36  away_fg_pct_rolling_mean_3       4.492955
7     home_dreb_rolling_mean_3      -3.049784
2      home_fga_rolling_mean_3       2.975361
35     away_fga_rolling_mean_3       2.436373
37    away_fg3a_rolling_mean_3       2.161280


In [7]:
model = Ridge()
grid_search = GridSearchCV(model, param_grid={'alpha': [0.1, 1.0, 10.0, 100.0]})
grid_search.fit(X_train_scaled, y_train)
model = grid_search.best_estimator_
y_pred = model.predict(X_test_scaled)
print(f"Model R^2: {model.score(X_test_scaled, y_test)}")
print(f"Model MSE: {np.mean((y_pred - y_test) ** 2)}")

coef_df = pd.DataFrame({
    'feature': X.columns,
    'home_pts_coef': model.coef_[0],
    'away_pts_coef': model.coef_[1],
    'abs_home_coef': np.abs(model.coef_[0]),
    'abs_away_coef': np.abs(model.coef_[1])
})

print("\nNajważniejsze cechy dla home_pts:")
print(coef_df.nlargest(5, 'abs_home_coef')[['feature', 'home_pts_coef']])

print("\nNajważniejsze cechy dla away_pts:")
print(coef_df.nlargest(5, 'abs_away_coef')[['feature', 'away_pts_coef']])

Model R^2: 0.4848513486532238
Model MSE: 85.83709999989706

Najważniejsze cechy dla home_pts:
                       feature  home_pts_coef
3   home_fg_pct_rolling_mean_3       4.437416
40    away_dreb_rolling_mean_3      -3.407579
35     away_fga_rolling_mean_3       3.011151
4     home_fg3a_rolling_mean_3       2.057133
42     away_stl_rolling_mean_3      -1.519809

Najważniejsze cechy dla away_pts:
                       feature  away_pts_coef
36  away_fg_pct_rolling_mean_3       4.202400
7     home_dreb_rolling_mean_3      -2.903308
2      home_fga_rolling_mean_3       2.756562
35     away_fga_rolling_mean_3       2.294409
37    away_fg3a_rolling_mean_3       1.878572


In [8]:
model = Lasso()
grid_search = GridSearchCV(model, param_grid={'alpha': [0.1, 1.0, 10.0, 100.0]})
grid_search.fit(X_train_scaled, y_train)
model = grid_search.best_estimator_
y_pred = model.predict(X_test_scaled)
print(f"Model R^2: {model.score(X_test_scaled, y_test)}")
print(f"Model MSE: {np.mean((y_pred - y_test) ** 2)}")

coef_df = pd.DataFrame({
    'feature': X.columns,
    'home_pts_coef': model.coef_[0],
    'away_pts_coef': model.coef_[1],
    'abs_home_coef': np.abs(model.coef_[0]),
    'abs_away_coef': np.abs(model.coef_[1])
})

print("\nNajważniejsze cechy dla home_pts:")
print(coef_df.nlargest(5, 'abs_home_coef')[['feature', 'home_pts_coef']])

print("\nNajważniejsze cechy dla away_pts:")
print(coef_df.nlargest(5, 'abs_away_coef')[['feature', 'away_pts_coef']])

Model R^2: 0.48448863688623234
Model MSE: 85.89921734719888

Najważniejsze cechy dla home_pts:
                       feature  home_pts_coef
3   home_fg_pct_rolling_mean_3       4.867772
40    away_dreb_rolling_mean_3      -3.085939
35     away_fga_rolling_mean_3       3.039338
4     home_fg3a_rolling_mean_3       2.058345
2      home_fga_rolling_mean_3       1.782198

Najważniejsze cechy dla away_pts:
                       feature  away_pts_coef
36  away_fg_pct_rolling_mean_3       4.582878
2      home_fga_rolling_mean_3       3.097066
7     home_dreb_rolling_mean_3      -3.021530
35     away_fga_rolling_mean_3       2.508614
37    away_fg3a_rolling_mean_3       1.832661


# Nonlinear Regression