In [None]:
# Import Part
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [37]:
# Load File
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
DATA_PATH = '/Users/xlu/Desktop/DATA1030/Final Project/data/exports/players_2015_2024.csv'

# Read the exported player table into a DataFrame.
df_train = pd.read_csv(DATA_PATH)

# Preview the first rows (last expression of the cell -> rich display).
df_train.head()

Unnamed: 0,season_end,Player,Age,Tm,GP,W,L,W_PCT,MP,FGM,...,AST%,TOV%,USG%,PER,WS,OWS,DWS,WS/48,BPM,VORP
0,2015,Quincy Acy,24.0,NYK,,,,,1287.0,,...,0.087,0.138,0.155,11.9,1.7,1.0,0.7,0.063,-3.1,-0.3
1,2015,Jordan Adams,20.0,MEM,,,,,248.0,,...,0.101,0.127,0.204,12.8,0.4,0.0,0.4,0.073,-0.6,0.1
2,2015,Steven Adams,21.0,OKC,,,,,1771.0,,...,0.055,0.168,0.143,14.1,4.1,1.9,2.2,0.111,0.4,1.1
3,2015,Jeff Adrien,28.0,MIN,,,,,215.0,,...,0.105,0.129,0.143,14.2,0.4,0.2,0.2,0.087,-2.2,0.0
4,2015,Arron Afflalo,29.0,DEN,,,,,1750.0,,...,0.093,0.103,0.199,11.7,1.8,1.4,0.4,0.05,-1.6,0.2


In [None]:
# Setting Features
# Target Variable 'y': High USG% (>= 0.25)
# `sd` is the usage-rate threshold at or above which a row is labelled positive.
sd = 0.25

# Binary label: 1 when USG% >= threshold, else 0.
# NOTE(review): a missing USG% compares as False and is therefore labelled 0
# rather than dropped — confirm USG% has no NaN values in this dataset.
df_train['is_high_usg'] = (df_train['USG%'] >= sd).astype(int)

# Drop identifier / team-record columns.
# errors='ignore' keeps this cell re-runnable: because `df_train` is reassigned,
# a second execution would otherwise raise KeyError on the already-removed columns.
df_train = df_train.drop(columns=['Player', 'Tm', 'GP', 'W', 'L'], errors='ignore')

# Target vector and feature matrix (every column except the label).
# NOTE(review): X still contains 'USG%', the column the label is derived from.
# Any model given X wholesale would see the answer; select features explicitly
# (or drop 'USG%') before fitting.
y = df_train['is_high_usg']
X = df_train.loc[:, df_train.columns != 'is_high_usg']

print(y)
print(X.head())

0       0
1       0
2       0
3       0
4       0
       ..
5495    0
5496    0
5497    0
5498    0
5499    1
Name: is_high_usg, Length: 5500, dtype: int64
   season_end   Age  W_PCT      MP  FGM    FGA  FG3M  FG3A  FTM    FTA  ...  \
0        2015  24.0    NaN  1287.0  NaN  331.0   NaN   NaN  NaN   97.0  ...   
1        2015  20.0    NaN   248.0  NaN   86.0   NaN   NaN  NaN   23.0  ...   
2        2015  21.0    NaN  1771.0  NaN  399.0   NaN   NaN  NaN  205.0  ...   
3        2015  28.0    NaN   215.0  NaN   44.0   NaN   NaN  NaN   38.0  ...   
4        2015  29.0    NaN  1750.0  NaN  657.0   NaN   NaN  NaN  151.0  ...   

    AST%   TOV%   USG%   PER   WS  OWS  DWS  WS/48  BPM  VORP  
0  0.087  0.138  0.155  11.9  1.7  1.0  0.7  0.063 -3.1  -0.3  
1  0.101  0.127  0.204  12.8  0.4  0.0  0.4  0.073 -0.6   0.1  
2  0.055  0.168  0.143  14.1  4.1  1.9  2.2  0.111  0.4   1.1  
3  0.105  0.129  0.143  14.2  0.4  0.2  0.2  0.087 -2.2   0.0  
4  0.093  0.103  0.199  11.7  1.8  1.4  0.4  0.05

In [49]:
# Splitting Part (60/20/20)
# Chronological split on `season_end`: pick two year boundaries so that
# train / validation / test are close to 60/20/20 in size while each split's
# positive rate stays close to the global positive rate.

# Setting Required Variable Before Splitting
s = pd.to_numeric(X["season_end"])

# Per-season row count (n) and positive count (pos), ordered by season.
g = (
    pd.DataFrame({"y": y.values, "s": s})
    .groupby("s")["y"]
    .agg(n="count", pos="sum")
    .sort_index()
)
yrs, n_y, p_y = g.index.to_numpy(), g["n"].to_numpy(), g["pos"].to_numpy()
cum_n, cum_p  = np.cumsum(n_y), np.cumsum(p_y)
# N: total rows, P: total positives, glob: overall positive rate.
N, P, glob    = int(cum_n[-1]), int(cum_p[-1]), float(cum_p[-1]/cum_n[-1])

# Create Naive Cuts + Small Window
# i1 / i2: first season index whose cumulative row count reaches 60% / 80%.
i1, i2 = np.searchsorted(cum_n, [0.60*N, 0.80*N])
# Candidate last-train-season indices (I) and last-validation-season indices (J)
# in a small window around the naive cuts; upper bounds leave room for later splits.
I = range(max(0, i1-1), min(len(yrs)-2, i1+2))
J = range(max(i1+1, i2-1), min(len(yrs)-1, i2+2))

_target_sizes = np.array([.6, .2, .2])

best = None
for i in I:
    n_tr, p_tr = cum_n[i], cum_p[i]
    for j in J:
        # J's lower bound is derived from i1, not from the current i, so j can
        # be <= i; that would give an empty (or negative-sized) validation split.
        if j <= i:
            continue
        n_va, p_va = cum_n[j]-cum_n[i], cum_p[j]-cum_p[i]
        n_te, p_te = N-cum_n[j],        P-cum_p[j]
        sizes = np.array([n_tr, n_va, n_te]) / N
        rates = np.array([p_tr/max(n_tr,1), p_va/max(n_va,1), p_te/max(n_te,1)])

        # Lower is better: class-rate drift plus half-weighted size deviation.
        score = np.abs(rates-glob).sum() + 0.5*np.abs(sizes-_target_sizes).sum()
        if (best is None) or (score < best[0]):
            best = (score, yrs[i], yrs[j])

# Fail loudly instead of with "'NoneType' object is not subscriptable" when the
# candidate windows are empty (e.g. too few distinct seasons).
if best is None:
    raise ValueError(
        "No valid train/val/test year boundaries found; "
        "need enough distinct 'season_end' values for a three-way split."
    )

y_tr_end, y_va_end = best[1], best[2]

# Boolean masks over rows by season.
idx_tr = s <= y_tr_end
idx_va = (s > y_tr_end) & (s <= y_va_end)
idx_te = s > y_va_end

X_train, y_train = X.loc[idx_tr], y.loc[idx_tr]
X_val,   y_val   = X.loc[idx_va], y.loc[idx_va]
X_test,  y_test  = X.loc[idx_te], y.loc[idx_te]

# Print Splitting Results
print(f"Years → train: ≤{y_tr_end},  val: {y_tr_end+1}..{y_va_end},  test: >{y_va_end}")
print(np.unique(y_train, return_counts=True))
print(np.unique(y_val,   return_counts=True))
print(np.unique(y_test,  return_counts=True))

Years → train: ≤2020,  val: 2021..2022,  test: >2022
(array([0, 1]), array([2877,  367]))
(array([0, 1]), array([1025,  120]))
(array([0, 1]), array([979, 132]))


In [None]:
# Preprocessing Part
# Setting Feature
# Continuous feature names (declared here; not used further within this cell).
FEAT_CONTI = [
    'Age', 'MP', 'PTS', 'AST', 'AST%', 'TRB%', 'DRB%', 'ORB%', 'STL', 'BLK', 'TOV',
    'FGA', 'FGM', '3PAr', 'FTA', 'FTM', 'TS%', 'FTr', 'TOV%', 'W_PCT', 'FG3A', 'FG3M',
    'DWS', 'OWS', 'PER', 'VORP',
]

# Columns treated as categorical.
FEAT_CAT = ['season_end']

# Categorical Variable
# Dense one-hot encoder, fitted on the training split only.
# NOTE(review): with handle_unknown='ignore', any season not seen during fit is
# encoded as an all-zero row. If validation/test hold only later seasons (as a
# chronological split produces), this feature carries no signal there.
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(X_train[FEAT_CAT])
print('categories:', enc.categories_)
print('feature names:', enc.get_feature_names_out(FEAT_CAT))

# Transform
# Apply the fitted encoder to each split in turn.
onehot_train, onehot_val, onehot_test = (
    enc.transform(part[FEAT_CAT]) for part in (X_train, X_val, X_test)
)

categories: [array([2015, 2016, 2017, 2018, 2019, 2020])]
feature names: ['season_end_2015' 'season_end_2016' 'season_end_2017' 'season_end_2018'
 'season_end_2019' 'season_end_2020']


In [51]:
# Continuous Variable
# 1. MinMaxScaler
# Columns rescaled with min-max scaling.
minmax_ftrs = ['TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%']

# Learn per-column min/max from the training split only.
scaler_mm = MinMaxScaler().fit(X_train[minmax_ftrs])

# Show the scaled values for train, validation and test (in that order).
# The transformed arrays are only printed here, not stored.
for _split_frame in (X_train, X_val, X_test):
    print(scaler_mm.transform(_split_frame[minmax_ftrs]))

[[0.533      0.181      0.293      ... 0.205      0.23356401 0.11082803]
 [0.489      0.291      0.267      ... 0.087      0.11072664 0.12866242]
 [0.549      0.005      0.514      ... 0.193      0.2733564  0.07006369]
 ...
 [0.313      0.27272727 0.36363636 ... 0.08       0.05363322 0.10573248]
 [0.616      0.03888889 0.49444444 ... 0.114      0.1799308  0.15159236]
 [0.667      0.11111111 0.         ... 0.128      0.16435986 0.11974522]]
[[0.547      0.35270541 0.29859719 ... 0.15       0.17647059 0.21019108]
 [0.503      0.41743119 0.19036697 ... 0.06       0.06228374 0.17707006]
 [0.573      0.60674157 0.15730337 ... 0.146      0.16089965 0.05987261]
 ...
 [0.629      0.43010753 0.34946237 ... 0.141      0.18512111 0.04713376]
 [0.559      0.57075472 0.12971698 ... 0.071      0.07612457 0.07898089]
 [0.         0.66666667 0.         ... 0.         0.         0.        ]]
[[0.589      0.56818182 0.18181818 ... 0.152      0.15743945 0.04076433]
 [0.607      0.84       0.032      ... 

In [52]:
# 2. StandardScaler
# Columns standardised to zero mean / unit variance.
std_ftrs = [
    'Age', 'MP', 'PTS', 'AST', 'REB',
    'DREB', 'OREB', 'STL', 'BLK', 'TOV',
    'FGA', 'FGM', 'FTA', 'FTM',
]

scaler_std = StandardScaler()

# Fit on the training split and show its standardised values.
# The transformed arrays are only printed here, not stored.
train_scaled = scaler_std.fit_transform(X_train[std_ftrs])
print(train_scaled)

# Reuse the training statistics for validation and test (in that order).
for _holdout_frame in (X_val, X_test):
    print(scaler_std.transform(_holdout_frame[std_ftrs]))

[[-0.5738083   0.25787307 -0.16220712 ...         nan -0.03874242
          nan]
 [-1.52038044 -1.00144184 -0.83098693 ...         nan -0.66376602
          nan]
 [-1.28373741  0.84450292  0.14358364 ...         nan  0.87345418
          nan]
 ...
 [-1.28373741 -1.2628416  -1.02018122 ... -1.04389051 -0.82424506
  -0.80555297]
 [-1.75702348 -0.49221117  0.15018344 ...  0.1890103   0.64540503
   0.40650223]
 [-0.5738083  -1.24016222 -1.01138148 ... -1.02602238 -0.85803012
  -0.82719681]]
[[-0.33716526  0.3751748   0.32177826 ...  0.31408719  0.40046335
   0.22252957]
 [-0.5738083   0.1234411   0.00718776 ... -0.0492314  -0.15699013
  -0.09130615]
 [-1.28373741 -0.49149607 -0.55819517 ... -0.59718732 -0.6215347
  -0.58911454]
 ...
 [-1.28373741 -0.45649624 -0.44379862 ... -0.48997855 -0.3090229
  -0.38349803]
 [-1.52038044  0.32994935  0.06438603 ...  0.07584549 -0.39348555
  -0.36185419]
 [-0.10052223 -1.29574661 -1.03778069 ... -1.06175863 -0.85803012
  -0.82719681]]
[[-1.04709437 -1.1