In [66]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn import set_config
set_config(transform_output="pandas")
# read data
df_regular = pd.read_csv("2021-2022 NBA Player Stats - Regular.csv",encoding='latin-1', delimiter=";")
df_playoffs = pd.read_csv("2021-2022 NBA Player Stats - Playoffs.csv",encoding='latin-1', delimiter=";")

# rename columns for better interpretability.  Took directly from kaggle dataset page.

def rename_columns(df):
    df.rename(columns={'Rk' : 'Rank', 'Player' : 'Players_name', 'Pos' : 'Position', 'Age' : 'Players_age', 'Tm' : 'Team', 'G' : 'Games_played', 'GS' : 'Games_started', 'MP' : 'Minutes_played_per_game', 'FG' : 'Field_goals_per_game', 'FGA' : 'Field_goal_attempts_per_game', 'FG%' : 'Field_goal_percentage', '3P' : '3_point_field_goals_per_game', '3PA' : '3_point_field_goal_attempts_per_game', '3P%' : '3_point_field_goal_percentage', '2P' : '2_point_field_goals_per_game', '2PA' : '2_point_field_goal_attempts_per_game', '2P%' : '2_point_field_goal_percentage', 'eFG%' : 'Effective_field_goal_percentage', 'FT' : 'Free_throws_per_game', 'FTA' : 'Free_throw_attempts_per_game', 'FT%' : 'Free_throw_percentage', 'ORB' : 'Offensive_rebounds_per_game', 'DRB' : 'Defensive_rebounds_per_game', 'TRB' : 'Total_rebounds_per_game', 'AST' : 'Assists_per_game', 'STL' : 'Steals_per_game', 'BLK' : 'Blocks_per_game', 'TOV' : 'Turnovers_per_game', 'PF' : 'Personal_fouls_per_game', 'PTS' : 'Points_per_game'}, inplace=True)
    return df

df_regular = rename_columns(df_regular)
df_playoffs = rename_columns(df_playoffs)
# convert position column to only have guard, forward, and center (3 classes)

def convert_position(df):
    df['Class'] = df['Position'].map({'SG': 'Guard','SF' : 'Forward', 'PG' : 'Guard', 'PF' : 'Forward', 'C' : 'Center','SG-SF' : 'Guard', 'SF-SG' : 'Forward', 'SG-PG' : 'Guard', 'C-PF' : 'Center', 'PF-SF' : 'Forward', 'PG-SG' : 'Guard'})
    df.drop(columns=['Position'], inplace=True)
    return df

df_regular = convert_position(df_regular)
df_playoffs = convert_position(df_playoffs)
# encode class column as numeric using LabelEncoder

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df_regular['Class'] = le.fit_transform(df_regular['Class'])
df_playoffs['Class'] = le.fit_transform(df_playoffs['Class'])
# create extra features.  My hopethesis is that the ratios of some statistics relative to the sum of those stats might be more predictive than the absolute stats in certain instances.  I've followed basketball (and played) my entire life so my intuition is that we can use these ratios to predict a player's position.

# df_regular['Total_stats'] = df_regular['Points_per_game'] + df_regular['Total_rebounds_per_game'] + df_regular['Assists_per_game'] + df_regular['Steals_per_game'] + df_regular['Blocks_per_game'] + df_regular['Turnovers_per_game'] + df_regular['3_point_field_goals_per_game']

# df_regular['Total_stats'].describe()
# df_regular[df_regular.Total_stats == 0]

# df_regular['Points_per_game_percentage'] = df_regular['Points_per_game'] / df_regular['Total_stats']
# df_regular['Total_rebounds_per_game_percentage'] = df_regular['Total_rebounds_per_game'] / df_regular['Total_stats']
# df_regular['Assists_per_game_percentage'] = df_regular['Assists_per_game'] / df_regular['Total_stats']
# df_regular['Steals_per_game_percentage'] = df_regular['Steals_per_game'] / df_regular['Total_stats']
# df_regular['Blocks_per_game_percentage'] = df_regular['Blocks_per_game'] / df_regular['Total_stats']
# df_regular['Turnovers_per_game_percentage'] = df_regular['Turnovers_per_game'] / df_regular['Total_stats']
# df_regular['3_point_field_goals_per_game_percentage'] = df_regular['3_point_field_goals_per_game'] / df_regular['Total_stats']

# df_regular = df_regular[df_regular['Total_stats'] != 0]

# # create the same extra features for the playoffs dataset

# df_playoffs['Total_stats'] = df_playoffs['Points_per_game'] + df_playoffs['Total_rebounds_per_game'] + df_playoffs['Assists_per_game'] + df_playoffs['Steals_per_game'] + df_playoffs['Blocks_per_game'] + df_playoffs['Turnovers_per_game'] + df_playoffs['3_point_field_goals_per_game']

# df_playoffs['Points_per_game_percentage'] = df_playoffs['Points_per_game'] / df_playoffs['Total_stats']
# df_playoffs['Total_rebounds_per_game_percentage'] = df_playoffs['Total_rebounds_per_game'] / df_playoffs['Total_stats']
# df_playoffs['Assists_per_game_percentage'] = df_playoffs['Assists_per_game'] / df_playoffs['Total_stats']
# df_playoffs['Steals_per_game_percentage'] = df_playoffs['Steals_per_game'] / df_playoffs['Total_stats']
# df_playoffs['Blocks_per_game_percentage'] = df_playoffs['Blocks_per_game'] / df_playoffs['Total_stats']
# df_playoffs['Turnovers_per_game_percentage'] = df_playoffs['Turnovers_per_game'] / df_playoffs['Total_stats']
# df_playoffs['3_point_field_goals_per_game_percentage'] = df_playoffs['3_point_field_goals_per_game'] / df_playoffs['Total_stats']

# df_playoffs = df_playoffs[df_playoffs['Total_stats'] != 0]



In [67]:
df_regular['2_point_vs_3_point_attempts'] = df_regular['2_point_field_goals_per_game'].div(df_regular['3_point_field_goal_attempts_per_game'].values, fill_value=None)


In [68]:
# Use minmax scaler to scale the following columns: Defensive Rebounds, Steals, Blocks

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_regular['Defensive_rebounds_per_game'] = scaler.fit_transform(df_regular[['Defensive_rebounds_per_game']])
df_regular['Steals_per_game'] = scaler.fit_transform(df_regular[['Steals_per_game']])
df_regular['Blocks_per_game'] = scaler.fit_transform(df_regular[['Blocks_per_game']])

# sum the three scaled columns to create a new column called 'Defensive_stats'

df_regular['Defensive_stats'] = df_regular['Defensive_rebounds_per_game'] + df_regular['Steals_per_game'] + df_regular['Blocks_per_game']

# get the percentage of the three scaled columns relative to the sum of the three scaled columns

df_regular['Defensive_rebounds_per_game_percentage'] = df_regular['Defensive_rebounds_per_game'] / df_regular['Defensive_stats']
df_regular['Steals_per_game_percentage'] = df_regular['Steals_per_game'] / df_regular['Defensive_stats']
df_regular['Blocks_per_game_percentage'] = df_regular['Blocks_per_game'] / df_regular['Defensive_stats']

In [69]:
# repeat the same process for the following columns Assists, Offensive_Rebounds,Turnovers, 3_point_field_goals

df_regular['Assists_per_game'] = scaler.fit_transform(df_regular[['Assists_per_game']])
df_regular['Offensive_rebounds_per_game'] = scaler.fit_transform(df_regular[['Offensive_rebounds_per_game']])
df_regular['Turnovers_per_game'] = scaler.fit_transform(df_regular[['Turnovers_per_game']])
df_regular['3_point_field_goals_per_game'] = scaler.fit_transform(df_regular[['3_point_field_goals_per_game']])
df_regular['Offensive_stats'] = df_regular['Assists_per_game'] + df_regular['Offensive_rebounds_per_game'] + df_regular['Turnovers_per_game'] + df_regular['3_point_field_goals_per_game']

# get the percentage of the four scaled columns relative to the sum of the four scaled columns

df_regular['Assists_per_game_percentage'] = df_regular['Assists_per_game'] / df_regular['Offensive_stats']
df_regular['Offensive_rebounds_per_game_percentage'] = df_regular['Offensive_rebounds_per_game'] / df_regular['Offensive_stats']
df_regular['Turnovers_per_game_percentage'] = df_regular['Turnovers_per_game'] / df_regular['Offensive_stats']
df_regular['3_point_field_goals_per_game_percentage'] = df_regular['3_point_field_goals_per_game'] / df_regular['Offensive_stats']



In [70]:
# replace inf values with missing values for the new columns

df_regular.replace([np.inf, -np.inf], np.nan, inplace=True)

In [75]:

# split data into train and test

from sklearn.model_selection import train_test_split

X = df_regular.drop(columns=['Class'])
y = df_regular['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# separate categorical columns from numeric
passthrough = ['Players_name', 'Team']
X_train.drop(columns=passthrough, inplace=True)
X_test.drop(columns=passthrough, inplace=True)
mask = X_train.dtypes == 'object'
categorical = X_train.columns[mask]
numeric = X_train.columns[~mask]

print('There are {} numeric columns and {} categorical columns'.format(len(numeric), len(categorical)))

# create new X_train with only the new columns

new_features = ['Defensive_rebounds_per_game_percentage', 'Steals_per_game_percentage', 'Blocks_per_game_percentage', 'Assists_per_game_percentage', 'Offensive_rebounds_per_game_percentage', 'Turnovers_per_game_percentage', '3_point_field_goals_per_game_percentage']

There are 37 numeric columns and 0 categorical columns


In [87]:
# create a logistic regression pipeline using only the new_features with a column transformer and a logistic regression classifier and impute missing values and infinities

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# how many categorical columns are there in X_train?
print('There are {} categorical columns in X_train'.format(len(categorical)))

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric),
        ('cat', categorical_transformer, categorical)])
        

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])

# fit the model
clf.fit(X_train, y_train)

# predict the test set
y_pred = clf.predict(X_test)

# print the accuracy score
print('Accuracy score: {}'.format(accuracy_score(y_test, y_pred)))



There are 0 categorical columns in X_train
Accuracy score: 0.7791411042944786


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
# get the most important features using the coefficients of the logistic regression model

# get the coefficients of the logistic regression model
coefficients = clf.named_steps['classifier'].coef_[0]

# get the feature names
feature_names = clf.named_steps['preprocessor'].transformers_[0][2]

# create a dataframe with the coefficients and feature names
df_coefficients = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

# sort the dataframe by the absolute value of the coefficients
df_coefficients = df_coefficients.sort_values(by='coefficient', key=abs, ascending=False)

# print the top 10 features
print(df_coefficients.head(10))

                                 feature  coefficient
4                Minutes_played_per_game    -1.694657
21                      Assists_per_game    -1.268568
19           Defensive_rebounds_per_game     0.824962
20               Total_rebounds_per_game     0.780394
12  2_point_field_goal_attempts_per_game    -0.764404
6           Field_goal_attempts_per_game    -0.707078
11          2_point_field_goals_per_game     0.643750
8           3_point_field_goals_per_game     0.564947
5                   Field_goals_per_game     0.542063
9   3_point_field_goal_attempts_per_game    -0.464601


ValueError: Invalid parameter 'C' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Rank', 'Players_age', 'Games_played', 'Games_started',
       'Minutes_played_per_game', 'Field_goals_per_game',
       'Field_goal_attempts_per_game', 'Field_goal_percentage',
       '3_point_field_goals_per_...
       'Assists_per_game_percentage', 'Offensive_rebounds_per_game_percentage',
       'Turnovers_per_game_percentage',
       '3_point_field_goals_per_game_percentage'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index([], dtype='object'))])),
                ('classifier', LogisticRegression(max_iter=10000))]). Valid parameters are: ['memory', 'steps', 'verbose'].