In [1]:
import pandas as pd
import numpy as np

In [2]:
!gdown 1yOZjTvvqq9RAhvzBHZsk7WxAbwad82qT
file_path = "/content/mlb_final_zone.xlsx"

Downloading...
From: https://drive.google.com/uc?id=1yOZjTvvqq9RAhvzBHZsk7WxAbwad82qT
To: /content/mlb_final_zone.xlsx
100% 11.5M/11.5M [00:00<00:00, 36.1MB/s]


In [3]:
# 엑셀 파일 불러오기
df = pd.read_excel(file_path)

# 불필요한 컬럼 제거
df = df.drop(columns=["theta_p", "distance"])

# X, y 설정
X = df.drop(columns=["zone"])
y = df["zone"]
df

Unnamed: 0,id,ball_type,pitch_type,velocity,exit_velocity,hit_dist,zone_num,pitcher_hand,batter_hand,ball,strike,place,temp,wind,rain,player_age,slg_percent,isolated_power,babip,zone
0,676356,0,2,91.2,88.5,3,14,0,0,1,2,29,23.0,0.0,0,24,0.429,0.167,0.281,6
1,676356,1,2,86.3,105.0,228,9,0,0,0,2,19,21.3,11.6,0,24,0.429,0.167,0.281,11
2,676356,0,1,95.7,91.9,100,8,0,0,0,1,29,23.0,0.0,0,24,0.429,0.167,0.281,4
3,676356,0,5,83.3,80.7,208,6,0,0,1,2,29,23.0,0.0,0,24,0.429,0.167,0.281,8
4,676356,0,2,91.2,88.5,3,14,0,0,1,2,29,23.0,0.0,0,24,0.429,0.167,0.281,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109966,642715,0,0,92.4,100.8,322,5,1,0,2,2,14,23.0,0.0,0,27,0.407,0.190,0.259,19
109967,642715,1,1,91.1,107.9,132,8,0,0,0,0,14,23.0,0.0,0,27,0.407,0.190,0.259,5
109968,642715,1,0,97.6,93.1,37,5,1,0,0,1,14,23.0,0.0,0,27,0.407,0.190,0.259,6
109969,642715,0,0,93.7,54.6,1,4,0,0,1,2,14,23.0,0.0,0,27,0.407,0.190,0.259,2


# KNN

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# KNN 모델 파이프라인 (표준화 포함)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# 5-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 하이퍼파라미터 튜닝: K 값 1 ~ 20
param_grid = {
    'knn__n_neighbors': list(range(1, 21))
}

# GridSearchCV로 최적의 K 찾기
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='accuracy')
grid_search.fit(X, y)

# 결과 출력
print("최적의 K:", grid_search.best_params_['knn__n_neighbors'])
print("최고 평균 정확도:", grid_search.best_score_)

최적의 K: 1
최고 평균 정확도: 0.4162642712101337


#Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import randint

# 파이프라인 설정 (표준화 + 랜덤포레스트)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

# 하이퍼파라미터 범위 설정 (난수 분포 사용)
param_distributions = {
    'rf__n_estimators': randint(50, 300),
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': randint(2, 10),
    'rf__min_samples_leaf': randint(1, 5)
}

# 5-Fold 교차검증
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# RandomizedSearchCV 실행 (n_iter = 시도할 조합 수)
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=20,                # 시도할 조합 수
    cv=kf,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# 학습
random_search.fit(X, y)

# 결과 출력
print("최적 하이퍼파라미터:", random_search.best_params_)
print("최고 평균 정확도:", random_search.best_score_)



최적 하이퍼파라미터: {'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 4, 'rf__n_estimators': 108}
최고 평균 정확도: 0.621600230181192


#lightGBM

In [4]:
pip install lightgbm



In [5]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform

# 파이프라인 (표준화 + LightGBM)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lgbm', LGBMClassifier(random_state=42))
])

# 하이퍼파라미터 분포 설정
param_dist = {
    'lgbm__n_estimators': randint(100, 200),
    'lgbm__max_depth': randint(3, 10),
    'lgbm__num_leaves': randint(20, 80),
    'lgbm__learning_rate': uniform(0.01, 0.3),
    'lgbm__min_child_samples': randint(10, 100)
}

# 5-Fold 교차검증
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# RandomizedSearchCV 설정
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,               # 시도할 조합 수
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# 학습
random_search.fit(X, y)

# 결과 출력
print("최적 하이퍼파라미터:", random_search.best_params_)
print("최고 평균 정확도:", random_search.best_score_)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2178
[LightGBM] [Info] Number of data points in the train set: 109971, number of used features: 19
[LightGBM] [Info] Start training from score -3.160987
[LightGBM] [Info] Start training from score -3.363375
[LightGBM] [Info] Start training from score -2.987861
[LightGBM] [Info] Start training from score -2.402142
[LightGBM] [Info] Start training from score -1.698950
[LightGBM] [Info] Start training from score -2.278427
[LightGBM] [Info] Start training from score -3.859512
[LightGBM] [Info] Start training from score -3.457793
[LightGBM] [Info] Start training from score -3.865570
[LightGBM] [Info] Start training from score -3.669883
[LightGBM] [Info] Start training from score -3.138919
[LightGBM] [Info] Start training from score -3.73095

#XGBoost

In [4]:
pip install xgboost



In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from scipy.stats import randint, uniform

# y 값 조정 (최소값을 0으로 변경)
y = np.array(y) - y.min()

# 파이프라인 (표준화 + XGBoost)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(random_state=42, use_label_encoder=False))  # 수정
])

# 하이퍼파라미터 분포 설정
param_dist = {
    'xgb__n_estimators': randint(100, 300),
    'xgb__max_depth': randint(3, 15),
    'xgb__learning_rate': uniform(0.01, 0.3),
    'xgb__subsample': uniform(0.5, 0.5),
    'xgb__colsample_bytree': uniform(0.5, 0.5),
    'xgb__min_child_weight': randint(1, 10)
}

# 5-Fold 교차검증
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# RandomizedSearchCV 설정
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# 학습
random_search.fit(X, y)

# 결과 출력
print("최적 하이퍼파라미터:", random_search.best_params_)
print("최고 평균 정확도:", random_search.best_score_)

Parameters: { "use_label_encoder" } are not used.



최적 하이퍼파라미터: {'xgb__colsample_bytree': np.float64(0.8059264473611898), 'xgb__learning_rate': np.float64(0.05184815819561255), 'xgb__max_depth': 14, 'xgb__min_child_weight': 3, 'xgb__n_estimators': 207, 'xgb__subsample': np.float64(0.7571172192068059)}
최고 평균 정확도: 0.6360040258143105
