In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree, preprocessing, metrics, model_selection, linear_model
from sklearn import model_selection

# Location of the raw shot log (CSV)
file_path = "data/01_raw/kobe_shot.csv"

# Read the CSV into a pandas DataFrame
data = pd.read_csv(file_path)

# Count the rows whose target column (shot_made_flag) is missing
num_null_values = data['shot_made_flag'].isna().sum()
print(num_null_values)

# Preview the first six rows
data.head(n=6)

5000


Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


In [9]:
# Feature engineering.
# Outputs of this cell:
#   data    - rows with a known target, plus the derived time/date columns
#   target  - copy of the shot_made_flag labels
#   data_cl - feature frame with helper and redundant columns removed

# Keep only rows whose target is known. drop=True discards the old index
# instead of materialising it as an 'index' column, so the cell can be re-run
# without failing on "cannot insert index, already exists".
data = data[data['shot_made_flag'].notnull()].reset_index(drop=True)

# Regulation periods (1-4) last 12 minutes, overtime periods (5+) last 5.
is_overtime = data['period'] > 4
period_minutes = is_overtime.map({False: 12, True: 5})

# The clock counts down, so whole minutes elapsed in the current period are
# (period length - 1) - minutes_remaining; the running minute contributes
# (60 - seconds_remaining) seconds.
data['minutes_from_period_start'] = period_minutes - 1 - data['minutes_remaining']
data['seconds_from_period_start'] = (
    60 * data['minutes_from_period_start'] + (60 - data['seconds_remaining'])
)

# Minutes played before the current period started: 12 per completed
# regulation period, then 5 per completed overtime period.
minutes_before_period = ((data['period'] - 1) * 12).where(
    ~is_overtime, 4 * 12 + (data['period'] - 5) * 5
)
data['minutes_from_game_start'] = minutes_before_period + data['minutes_from_period_start']
# Seconds are built from seconds_from_period_start in both branches (the
# overtime branch previously added minutes to a seconds total).
data['seconds_from_game_start'] = 60 * minutes_before_period + data['seconds_from_period_start']

data['game_date'] = pd.to_datetime(data['game_date'])
data['game_day'] = data['game_date'].dt.weekday  # Monday=0 ... Sunday=6

# Labels are taken before the target column is removed from the features
target = data['shot_made_flag'].copy()
data_cl = data.copy()

# Calendar features
data_cl['game_year'] = data_cl['game_date'].dt.year
data_cl['game_month'] = data_cl['game_date'].dt.month

# Replace the 20 least common action types with the value 'Other'
rare_action_types = data_cl['action_type'].value_counts().sort_values().index.values[:20]
data_cl.loc[data_cl['action_type'].isin(rare_action_types), 'action_type'] = 'Other'

# Bin loc_x and loc_y into 25 equal-width intervals so nearby court positions
# are grouped instead of being treated as a seemingly continuous variable
data_cl['loc_x'] = pd.cut(data_cl['loc_x'], 25)
data_cl['loc_y'] = pd.cut(data_cl['loc_y'], 25)

# Home/away flag derived from 'matchup' (1 when the string contains 'vs')
data_cl['home_play'] = data_cl['matchup'].str.contains('vs').astype('int')

# Remove the target, identifiers, constants and columns superseded by the
# engineered features
data_cl = data_cl.drop(columns=[
    'matchup',                    # replaced by home_play
    'team_id',                    # always one number
    'lat',                        # correlated with loc_y
    'lon',                        # correlated with loc_x
    'game_id',                    # identifier, not used as a feature
    'game_event_id',              # identifier, not used as a feature
    'team_name',                  # always LA Lakers
    'shot_made_flag',             # target, kept separately in `target`
    'minutes_remaining',          # covered by seconds_from_game_start
    'seconds_remaining',          # covered by seconds_from_game_start
    'minutes_from_period_start',  # covered by seconds_from_game_start
    'minutes_from_game_start',    # covered by seconds_from_game_start
    'seconds_from_period_start',  # covered by seconds_from_game_start + period
    'game_date',                  # covered by game_year, game_month, game_day
])


In [48]:
# Show the first row as a column so every feature name is visible
data_cl.iloc[:1].transpose()

Unnamed: 0,0
action_type,Jump Shot
combined_shot_type,Jump Shot
loc_x,"(-170.32, -150.4]"
loc_y,"(-10.6, 22.8]"
period,1
playoffs,0
season,2000-01
shot_distance,15
shot_type,2PT Field Goal
shot_zone_area,Left Side(L)


In [2]:
from pycaret.classification import setup, compare_models, predict_model, create_model,predict_model
import numpy as np

# Build the modelling frame and initialise the PyCaret experiment.

# Drop rows with an unknown target. dropna returns a new frame, so `data`
# itself is not modified here.
# NOTE: if `data` has already been filtered on shot_made_flag by another cell,
# the two counts printed below are identical.
data_transformed = data.dropna(subset=['shot_made_flag'])

print(f"Número de linhas antes da remoção: {len(data)}")
print(f"Número de linhas após a remoção: {len(data_transformed)}")

# session_id seeds the experiment (train/test split, CV folds, model seeds).
# Without it PyCaret picks a random id on every run and the results cannot be
# reproduced. 7067 is the id reported by the recorded run of this cell.
setup(
    data_transformed,
    target='shot_made_flag',
    categorical_features=['action_type', 'combined_shot_type'],
    session_id=7067,
)

Número de linhas antes da remoção: 30697
Número de linhas após a remoção: 25697


Unnamed: 0,Description,Value
0,Session id,7067
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(25697, 25)"
4,Transformed data shape,"(25697, 30)"
5,Transformed train set shape,"(17987, 30)"
6,Transformed test set shape,"(7710, 30)"
7,Numeric features,13
8,Categorical features,2
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x12f4a2ffd60>

In [6]:
def eval_metrics(pred):
    """Compute precision, recall and F1 for a table of scored rows.

    Args:
        pred: Object indexable by column name (e.g. a DataFrame) that holds
            the true labels under 'shot_made_flag' and the predicted labels
            under 'prediction_label'.

    Returns:
        Tuple ``(precision, recall, f1)`` with the values returned by
        ``metrics.precision_score``, ``metrics.recall_score`` and
        ``metrics.f1_score`` respectively.
    """
    y_true, y_hat = pred['shot_made_flag'], pred['prediction_label']
    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    # Each scorer is called as scorer(true_labels, predicted_labels)
    return tuple(scorer(y_true, y_hat) for scorer in scorers)

In [7]:
# Model selection and evaluation.

# Compare the available models with cross-validation and keep the best one
best_model = compare_models()

# Train the best model (displays its per-fold cross-validation scores)
final_model = create_model(best_model)

# Score every labelled row with the trained model
predictions = predict_model(final_model, data_transformed)

# Evaluate on the hold-out split only. `data_transformed` includes the rows
# the model was fitted on, so metrics computed on it are optimistically
# biased. Called without `data`, predict_model scores the hold-out set that
# setup() reserved.
holdout_predictions = predict_model(final_model)

(precision, recall, f1) = eval_metrics(holdout_predictions)
cm = metrics.confusion_matrix(
    holdout_predictions["shot_made_flag"],
    holdout_predictions['prediction_label'],
)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.6567,0.0,0.4846,0.6562,0.5574,0.2872,0.2961,0.221
dummy,Dummy Classifier,0.5538,0.0,0.0,0.0,0.0,0.0,0.0,0.04


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6509,0.0,0.4751,0.648,0.5482,0.2746,0.2834
1,0.6715,0.0,0.505,0.6761,0.5782,0.3183,0.3274
2,0.6598,0.0,0.4956,0.6579,0.5653,0.2949,0.3028
3,0.6637,0.0,0.4919,0.6672,0.5663,0.3018,0.3112
4,0.6504,0.0,0.4819,0.645,0.5517,0.2748,0.2827
5,0.6565,0.0,0.4682,0.6631,0.5489,0.2846,0.2958
6,0.6543,0.0,0.4944,0.6476,0.5607,0.284,0.2911
7,0.6385,0.0,0.4701,0.6262,0.537,0.2502,0.2572
8,0.6707,0.0,0.49,0.6823,0.5704,0.3149,0.3263
9,0.6507,0.0,0.4738,0.6485,0.5476,0.2742,0.2831


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.7445,0.8176,0.5973,0.7785,0.676,0.4711,0.4824
