# Model Creation

## Introduction

Welcome to the prediction model creation portion of my basketball prediction project. This will be the last notebook in the step to having a complete prediction model with manually scraped and pruned data. In the future, I will need to create the failure model that will be able to take these predictions and use a confidence interval to flag a team as a fail if they lose too many games and not failed if they win enough.

## Methods

In [163]:
# imports
import pandas as pd

# machine learning imports
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

# model selection
from sklearn.model_selection import GridSearchCV

# data preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

### Data

In [164]:
# load data
master_df = pd.read_csv("/Users/trustanprice/Desktop/Personal/Basketball-Predictions/data/raw/master-stats/master_df.csv")
print(master_df.head())

   Season                   Team  GP     W     L   WIN%   Min    PTS   FGM  \
0    2025  Oklahoma City Thunder  82  68.0  14.0  0.829  48.1  120.5  44.6   
1    2025  Oklahoma City Thunder  82  68.0  14.0  0.829  48.1  120.5  44.6   
2    2025  Oklahoma City Thunder  82  68.0  14.0  0.829  48.1  120.5  44.6   
3    2025    Cleveland Cavaliers  82  64.0  18.0  0.780  48.2  121.9  44.5   
4    2025    Cleveland Cavaliers  82  64.0  18.0  0.780  48.2  121.9  44.5   

    FGA  ...            Coach  Yw/Franch  YOverall  CareerW  CareerL  \
0  92.7  ...  Mark Daigneault          5         5      211      189   
1  92.7  ...  Mark Daigneault          5         5      211      189   
2  92.7  ...  Mark Daigneault          5         5      211      189   
3  90.8  ...   Kenny Atkinson          1         5      182      208   
4  90.8  ...   Kenny Atkinson          1         5      182      208   

   CareerW%  Pk  Coach_Count      Payroll  NWins  
0     0.528  15            1  166418720.0    Na

I will now create the training and testing data, making the training data 2016-2023 seasons (8 seasons) and the testing data 2024/2025 seasons (2 seasons). I am doing it like this because I am predicting for the seasons to come; therefore, I am trying to replicate the traditional 80/20 split while making it a time-based split.

In [165]:
# train/test split
master_test = master_df[master_df["Season"].isin([2024, 2025])]
master_train = master_df[~master_df["Season"].isin([2024, 2025])]

print("Train shape:", master_train.shape)
print("Test shape:", master_test.shape)

Train shape: (515, 56)
Test shape: (132, 56)


In [166]:
# Numeric features (continuous or counts)
numeric_features = [
    "GP", "W", "L", "WIN%", "Min", "PTS", "FGM", "FGA", "FG%",
    "3PM", "3PA", "3P%", "FTM", "FTA", "FT%", "OREB", "DREB",
    "REB", "AST", "TOV", "STL", "BLK", "BLKA", "PF", "PFD",
    "PLUS_MINUS", "Home_W", "Home_L", "Road_W", "Road_L",
    "E_W", "E_L", "W_W", "W_L", "Pre-ASG_W", "Pre-ASG_L",
    "Post-ASG_W", "Post-ASG_L", "SOS", "Yw/Franch", "YOverall",
    "CareerW", "CareerL", "CareerW%", "Pk", "Coach_Count", "Payroll",
    
    # Player-aggregated features
    "avg_age", "avg_pts_top10", "avg_production_score", "injury_rate"
]

# Categorical features (labels, identifiers, strings)
categorical_features = [
    "Season"
]

# Define target column (predict next season’s wins)
target_column = "NWins"

### Models

In [167]:
# --- Training data ---
X_train = master_train.drop(columns=["NWins"])   # drop target column
y_train = master_train["NWins"]                  # target = next season wins

# --- Testing data ---
X_test = master_test.drop(columns=["NWins"])     # drop target column
y_test = master_test["NWins"]                    # target = next season wins (NaN for 2025)

In [168]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# --- Transformers ---
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    RobustScaler()
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", drop="first")
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder="drop"
)

# --- Elastic Net Model ---
enet = ElasticNet(max_iter=10000)

# --- Grid Search Hyperparameters ---
param_grid = {
    "elasticnet__alpha": [0.01, 0.1, 1.0, 10.0],   # Regularization strength
    "elasticnet__l1_ratio": [0.1, 0.5, 0.9]        # Balance: 0 = Ridge, 1 = Lasso
}

pipeline = make_pipeline(
    preprocessor,
    enet
)

model = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring="neg_mean_absolute_error",  # you can also try r2
    n_jobs=-1,
    verbose=1
)

# --- Fit the model ---
model.fit(X_train, y_train)

print("Best Params:", model.best_params_)
print("Best Score:", model.best_score_)


Fitting 10 folds for each of 12 candidates, totalling 120 fits
Best Params: {'elasticnet__alpha': 1.0, 'elasticnet__l1_ratio': 0.9}
Best Score: -8.844172257019858


In [169]:
# --- Get feature importances (coefficients) ---
# Get the trained pipeline
best_pipeline = model.best_estimator_

# 1. Get one-hot encoder feature names
cat_ohe = best_pipeline.named_steps['columntransformer'] \
    .named_transformers_['pipeline-2'] \
    .named_steps['onehotencoder'] \
    .get_feature_names_out(categorical_features)

# 2. Combine numeric + categorical feature names
all_features = numeric_features + list(cat_ohe)

# 3. Get coefficients
coefficients = best_pipeline.named_steps['elasticnet'].coef_

feature_importance = pd.DataFrame({
    "Feature": all_features,
    "Coefficient": coefficients
}).sort_values(by="Coefficient", key=abs, ascending=False)

print(feature_importance.head(30))  # top 15 features


                 Feature  Coefficient
25            PLUS_MINUS     3.011835
38                   SOS    -1.342478
2                      L    -1.220410
1                      W     1.219661
3                   WIN%     1.219532
20                   STL     0.565105
4                    Min     0.333147
12                   FTM     0.150562
27                Home_L    -0.074596
43              CareerW%    -0.000000
37            Post-ASG_L     0.000000
36            Post-ASG_W    -0.000000
35             Pre-ASG_L    -0.000000
39             Yw/Franch    -0.000000
40              YOverall     0.000000
41               CareerW    -0.000000
42               CareerL     0.000000
0                     GP    -0.000000
46               Payroll    -0.000000
44                    Pk     0.000000
45           Coach_Count    -0.000000
33                   W_L    -0.000000
47               avg_age    -0.000000
48         avg_pts_top10    -0.000000
49  avg_production_score     0.000000
50          

In [170]:
# --- Step 2: Keep only non-zero features ---
important_features = [
    f for f, c in zip(all_features, coefficients) if abs(c) > 1e-6
]

print("Selected features:", important_features)

# --- Step 3: Reduce train/test sets ---
X_train_reduced = X_train[important_features].copy()
X_test_reduced = X_test[important_features].copy()

print("Reduced train shape:", X_train_reduced.shape)
print("Reduced test shape:", X_test_reduced.shape)

Selected features: ['W', 'L', 'WIN%', 'Min', 'FTM', 'STL', 'PLUS_MINUS', 'Home_L', 'SOS']
Reduced train shape: (515, 9)
Reduced test shape: (132, 9)


In [171]:
# train models
reduced_numeric_features = [col for col in X_train_reduced.columns if col in numeric_features]
reduced_categorical_features = [col for col in X_train_reduced.columns if col in categorical_features]

# define transformers
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    RobustScaler()
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="infrequent_if_exist", drop="first")
)

# update preprocessor
preprocessor_reduced = make_column_transformer(
    (numeric_transformer, reduced_numeric_features),
    (categorical_transformer, reduced_categorical_features),
    remainder="drop"
)

knn = KNeighborsRegressor()

param_grid = {
    'kneighborsregressor__n_neighbors': [3, 5, 7, 10], 
    'kneighborsregressor__weights': ['uniform', 'distance'],
    'kneighborsregressor__metric': ['euclidean', 'manhattan', 'minkowski'], 
    'kneighborsregressor__p': [1, 2]
}

pipeline = make_pipeline(
    preprocessor_reduced,
    knn,
)

model = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=10, 
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=1)

model.fit(X_train_reduced, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'kneighborsregressor__metric': ['euclidean', 'manhattan', ...], 'kneighborsregressor__n_neighbors': [3, 5, ...], 'kneighborsregressor__p': [1, 2], 'kneighborsregressor__weights': ['uniform', 'distance']}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,10
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'infrequent_if_exist'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,10
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,1
,metric,'euclidean'
,metric_params,
,n_jobs,


## Results

In [172]:
# report model metrics

In [173]:
# summary figure

In [174]:
# serialize model

## Discussion