# 직구, 변화구로 나눠서 학습하기

In [1]:
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor

In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The value is the square root of scikit-learn's ``mean_squared_error``
    called with its default settings.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Thin wrapper around scikit-learn's ``mean_squared_error`` with its
    default settings.
    """
    return mean_squared_error(y_true=y_true, y_pred=y_pred)

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Thin wrapper around scikit-learn's ``mean_absolute_error`` with its
    default settings.
    """
    return mean_absolute_error(y_true=y_true, y_pred=y_pred)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to float NumPy arrays first, so plain lists,
    arrays, Series and DataFrames are all accepted and the result is always
    a single scalar averaged over every element.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        Mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.

    Note:
        Zero entries in ``y_true`` divide by zero and produce ``inf``/``nan``
        in the result; no masking is applied here.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100  # convert ratio to percent


In [7]:
# Load the dataset from the Excel workbook into a DataFrame.
data = pd.read_excel("mlb_dp_20.xlsx")
# Preview the first rows to inspect the available columns.
data.head()

Unnamed: 0.1,Unnamed: 0,id,ball_type,pitch_type,velocity,exit_velocity,hit_dist,zone_num,pitcher_hand,batter_hand,...,temp,wind,rain,theta_p,theta_n,distance,player_age,slg_percent,isolated_power,babip
0,0,676356,0,2,91.2,88.5,3,14,0,0,...,23.0,0.0,0,115.306158,-25.306158,91.654771,24,0.429,0.167,0.281
1,1,676356,1,2,86.3,105.0,228,9,0,0,...,21.3,11.6,0,103.751442,-13.751442,189.305002,24,0.429,0.167,0.281
2,2,676356,0,1,95.7,91.9,100,8,0,0,...,23.0,0.0,0,41.540762,48.459238,96.804764,24,0.429,0.167,0.281
3,3,676356,0,5,83.3,80.7,208,6,0,0,...,23.0,0.0,0,91.562115,-1.562115,168.446016,24,0.429,0.167,0.281
4,4,676356,0,2,91.2,88.5,3,14,0,0,...,23.0,0.0,0,116.40834,-26.40834,92.322138,24,0.429,0.167,0.281


In [9]:
# Remove the "Unnamed: 0" column from the DataFrame in place.
data.drop(columns=["Unnamed: 0"], inplace=True)

In [11]:
# Preview the first rows after the column drop.
data.head()

Unnamed: 0,id,ball_type,pitch_type,velocity,exit_velocity,hit_dist,zone_num,pitcher_hand,batter_hand,ball,...,temp,wind,rain,theta_p,theta_n,distance,player_age,slg_percent,isolated_power,babip
0,676356,0,2,91.2,88.5,3,14,0,0,1,...,23.0,0.0,0,115.306158,-25.306158,91.654771,24,0.429,0.167,0.281
1,676356,1,2,86.3,105.0,228,9,0,0,0,...,21.3,11.6,0,103.751442,-13.751442,189.305002,24,0.429,0.167,0.281
2,676356,0,1,95.7,91.9,100,8,0,0,0,...,23.0,0.0,0,41.540762,48.459238,96.804764,24,0.429,0.167,0.281
3,676356,0,5,83.3,80.7,208,6,0,0,1,...,23.0,0.0,0,91.562115,-1.562115,168.446016,24,0.429,0.167,0.281
4,676356,0,2,91.2,88.5,3,14,0,0,1,...,23.0,0.0,0,116.40834,-26.40834,92.322138,24,0.429,0.167,0.281


In [13]:
# Encode pitch type: fastball -> 0, breaking ball -> 1
def pitch_type_enc(type):
    """Collapse a raw pitch-type code into a binary class.

    Codes 0, 1, 4 and 12 map to 0 (fastball); any other value maps to
    1 (breaking ball).
    """
    # NOTE: the parameter name shadows the builtin ``type``; it is kept
    # unchanged so existing callers keep working.
    fastball_codes = (0, 1, 4, 12)
    return 0 if type in fastball_codes else 1

In [21]:
# Encode each pitch as fastball (0) or breaking ball (1) in a new column.
# The encoder is passed directly; wrapping it in a lambda adds nothing.
data["pitch_type2"] = data["pitch_type"].apply(pitch_type_enc)

In [23]:
# Preview the first rows to check the new "pitch_type2" column.
data.head()

Unnamed: 0,id,ball_type,pitch_type,velocity,exit_velocity,hit_dist,zone_num,pitcher_hand,batter_hand,ball,...,wind,rain,theta_p,theta_n,distance,player_age,slg_percent,isolated_power,babip,pitch_type2
0,676356,0,2,91.2,88.5,3,14,0,0,1,...,0.0,0,115.306158,-25.306158,91.654771,24,0.429,0.167,0.281,1
1,676356,1,2,86.3,105.0,228,9,0,0,0,...,11.6,0,103.751442,-13.751442,189.305002,24,0.429,0.167,0.281,1
2,676356,0,1,95.7,91.9,100,8,0,0,0,...,0.0,0,41.540762,48.459238,96.804764,24,0.429,0.167,0.281,0
3,676356,0,5,83.3,80.7,208,6,0,0,1,...,0.0,0,91.562115,-1.562115,168.446016,24,0.429,0.167,0.281,1
4,676356,0,2,91.2,88.5,3,14,0,0,1,...,0.0,0,116.40834,-26.40834,92.322138,24,0.429,0.167,0.281,1


In [25]:
# Targets: the two columns the model predicts.
y = data[["theta_p", "distance"]]
# Features: every remaining column except the targets, "theta_n", and the raw
# "pitch_type" code (its binary encoding "pitch_type2" stays in).
# NOTE(review): the preview above still lists "Unnamed: 0" and "id", so both
# end up in X — confirm they are intended as model features.
X = data.drop(["theta_p", "theta_n", "distance", "pitch_type"], axis=1)

In [27]:
# Stage 1: hold out 20% of all rows as the test set; 80% remains for
# training and validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

# Stage 2: split the remaining 80% again; 25% of it becomes the validation
# set (20% of all rows), leaving 60% of all rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=1234,
)

In [29]:
# Wrap LightGBM in MultiOutputRegressor so the two target columns are each
# fitted by their own regressor.
lgbm_model = MultiOutputRegressor(LGBMRegressor(n_jobs=-1, random_state=1234))
lgbm_model.fit(X_train, y_train)

# Predict on the validation split once and reuse the result for both metrics
# instead of running inference twice.
val_pred = lgbm_model.predict(X_val)
print("lightGBM RMSE : ", rmse(y_val, val_pred))
print("lightGBM MAPE : ", mape(y_val, val_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2203
[LightGBM] [Info] Number of data points in the train set: 65982, number of used features: 19
[LightGBM] [Info] Start training from score 92.026926
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2203
[LightGBM] [Info] Number of data points in the train set: 65982, number of used features: 19
[LightGBM] [Info] Start training from score 166.914255
lightGBM RMSE :  22.99493472691974
lightGBM MAPE :  24.90160074604369
