In [1]:
from pycaret.classification import setup, compare_models, evaluate_model
import pandas as pd

In [2]:
# Load the treated train and test data sets from their parquet files,
# then preview the first rows of the training set.
train_set, test_set = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/05_model_input/train_dataset_kobe_dev.parquet',
        '../data/05_model_input/test_dataset_kobe_dev.parquet',
    )
)
train_set.head()

Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance,shot_made_flag
26131,33.8633,-118.2818,8,3,0,18,1.0
16057,34.0443,-118.2698,9,3,0,0,1.0
1103,34.0443,-118.2698,6,3,0,0,0.0
26435,34.0443,-118.2698,5,2,1,0,1.0
21046,34.0433,-118.2748,3,1,0,0,1.0


In [3]:
# Print each column's dtype and non-null count to check for missing values.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16228 entries, 26131 to 14158
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   lat                16228 non-null  float64
 1   lon                16228 non-null  float64
 2   minutes_remaining  16228 non-null  int64  
 3   period             16228 non-null  int64  
 4   playoffs           16228 non-null  int64  
 5   shot_distance      16228 non-null  int64  
 6   shot_made_flag     16228 non-null  float64
dtypes: float64(3), int64(4)
memory usage: 1014.2 KB


In [4]:
# Check basic summary statistics (count, mean, std, min, quartiles, max) for each column.
train_set.describe()

Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance,shot_made_flag
count,16228.0,16228.0,16228.0,16228.0,16228.0,16228.0,16228.0
mean,33.98086,-118.262196,5.100136,2.475228,0.149864,10.214013,0.477323
std,0.065865,0.093494,3.422079,1.149379,0.356949,7.554856,0.499501
min,33.8093,-118.4878,0.0,1.0,0.0,0.0,0.0
25%,33.9263,-118.3138,2.0,1.0,0.0,1.0,0.0
50%,33.9998,-118.2698,5.0,3.0,0.0,12.0,0.0
75%,34.0443,-118.1948,8.0,3.0,0.0,17.0,1.0
max,34.0883,-118.0498,11.0,7.0,1.0,26.0,1.0


In [5]:
# Set up the PyCaret classification experiment on the training data.
# NOTE(review): fix_imbalance=True asks PyCaret to resample the training classes;
# confirm this is needed given the class counts of the target column.
target_column = 'shot_made_flag'
s = setup(
    train_set,
    target=target_column,
    session_id=123,  # fixed session id so repeated runs are reproducible
    fix_imbalance=True,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(16228, 7)"
4,Transformed data shape,"(16743, 7)"
5,Transformed train set shape,"(11874, 7)"
6,Transformed test set shape,"(4869, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


In [6]:
# Check class imbalance: count the rows in each class of the target column.
train_set[target_column].value_counts()

shot_made_flag
0.0    8482
1.0    7746
Name: count, dtype: int64

In [7]:
# Compare the candidate classifiers of the experiment; the score grid is shown as
# the cell output and the model returned by compare_models() is kept as `best`.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5861,0.5977,0.417,0.5951,0.4901,0.1597,0.167,0.427
ada,Ada Boost Classifier,0.5855,0.5965,0.434,0.5897,0.4995,0.1597,0.1654,0.135
lr,Logistic Regression,0.5735,0.5957,0.5367,0.5554,0.5458,0.144,0.1441,0.48
ridge,Ridge Classifier,0.5728,0.5979,0.5389,0.5543,0.5463,0.1428,0.1429,0.024
lda,Linear Discriminant Analysis,0.5728,0.5978,0.5391,0.5543,0.5464,0.1428,0.1429,0.02
nb,Naive Bayes,0.5709,0.5927,0.5231,0.5537,0.5378,0.138,0.1383,0.016
qda,Quadratic Discriminant Analysis,0.5704,0.5995,0.5651,0.5488,0.5567,0.1401,0.1403,0.012
lightgbm,Light Gradient Boosting Machine,0.5627,0.5881,0.5203,0.5441,0.5317,0.122,0.1222,0.389
rf,Random Forest Classifier,0.5495,0.5664,0.5339,0.528,0.5307,0.0977,0.0977,0.208
et,Extra Trees Classifier,0.5491,0.5611,0.5402,0.527,0.5334,0.0973,0.0974,0.186


In [8]:
# Inspect the performance of the selected model through PyCaret's interactive
# plot selector (rendered as a widget with a "Plot Type" toggle).
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…