In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/finaldata.csv', encoding='latin1')

# 데이터 확인
print(data.info())
print(data.head())
print(data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1735 entries, 0 to 1734
Data columns (total 39 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    1735 non-null   object 
 1   debut   1735 non-null   object 
 2   start   1735 non-null   int64  
 3   end     1735 non-null   int64  
 4   salary  1735 non-null   float64
 5   birth   1735 non-null   int64  
 6   target  1735 non-null   float64
 7   Pos     1735 non-null   object 
 8   G       1735 non-null   float64
 9   GS      1735 non-null   float64
 10  MP      1735 non-null   float64
 11  FG      1735 non-null   float64
 12  FGA     1735 non-null   float64
 13  FG%     1735 non-null   float64
 14  3P      1735 non-null   float64
 15  3PA     1735 non-null   float64
 16  3P%     1735 non-null   float64
 17  2P      1735 non-null   float64
 18  2PA     1735 non-null   float64
 19  2P%     1735 non-null   float64
 20  eFG%    1735 non-null   float64
 21  FT      1735 non-null   float64
 22  

In [4]:
# 'debut' 열에 '-' 값이 있는 행 제거
data = data[data['debut'] != '-']
# 'debut' 열을 int 타입으로 변환
data['debut'] = data['debut'].astype(int)

# 결측치가 있는 모든 행 제거
data_cleaned = data.drop(columns=['Pos', 'name'])


print(data_cleaned.info())


<class 'pandas.core.frame.DataFrame'>
Index: 1734 entries, 0 to 1734
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   debut   1734 non-null   int64  
 1   start   1734 non-null   int64  
 2   end     1734 non-null   int64  
 3   salary  1734 non-null   float64
 4   birth   1734 non-null   int64  
 5   target  1734 non-null   float64
 6   G       1734 non-null   float64
 7   GS      1734 non-null   float64
 8   MP      1734 non-null   float64
 9   FG      1734 non-null   float64
 10  FGA     1734 non-null   float64
 11  FG%     1734 non-null   float64
 12  3P      1734 non-null   float64
 13  3PA     1734 non-null   float64
 14  3P%     1734 non-null   float64
 15  2P      1734 non-null   float64
 16  2PA     1734 non-null   float64
 17  2P%     1734 non-null   float64
 18  eFG%    1734 non-null   float64
 19  FT      1734 non-null   float64
 20  FTA     1734 non-null   float64
 21  FT%     1734 non-null   float64
 22  ORB  

In [5]:
# target feature (목표 변수)
target = data_cleaned['target']

# input feature (입력 변수들)
input_features = data_cleaned.drop(columns=['target'])

# 결과 데이터 확인
print(f"target.shape -> {target.shape}")
print(target.head())


target.shape -> (1734,)
0    0.013971
1    0.032143
2    0.028680
3    0.021363
4    0.018167
Name: target, dtype: float64


In [6]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 훈련/검증 데이터셋 분리
X_train, X_test, y_train, y_test = train_test_split(input_features, target, test_size=0.2, random_state=42)

# XGBoost 모델 학습 및 평가(표준화 X)
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_test_mse = mean_squared_error(y_test, xgb_predictions)
xgb_test_r2 = r2_score(y_test, xgb_predictions)
xgb_train_mse = mean_squared_error(y_train, xgb_model.predict(X_train))
xgb_train_r2 = r2_score(y_train, xgb_model.predict(X_train))

# AdaBoost 모델 학습 및 평가(표준화 X)
ada_model = AdaBoostRegressor()
ada_model.fit(X_train, y_train)
ada_predictions = ada_model.predict(X_test)
ada_test_mse = mean_squared_error(y_test, ada_predictions)
ada_test_r2 = r2_score(y_test, ada_predictions)
ada_train_mse = mean_squared_error(y_train, ada_model.predict(X_train))
ada_train_r2 = r2_score(y_train, ada_model.predict(X_train))

print("표준화 안한 모델")
print(f"""
          xgb_test_mse: {xgb_test_mse}
          xgb_train_mse: {xgb_train_mse}
          xgb_test_r2: {xgb_test_r2}
          xgb_train_r2: {xgb_train_r2}

          ada_test_mse: {ada_test_mse}
          ada_train_mse: {ada_train_mse}
          ada_test_r2: {ada_test_r2}
          ada_train_r2: {ada_train_r2}
      """
     )


표준화 안한 모델

          xgb_test_mse: 6.893317031478567e-05
          xgb_train_mse: 3.7344312792603566e-07
          xgb_test_r2: 0.9893601110776246
          xgb_train_r2: 0.99994187460462

          ada_test_mse: 0.0002615655145532
          ada_train_mse: 0.0001776191896123923
          ada_test_r2: 0.9596271576069805
          ada_train_r2: 0.9723540618336516
      


In [7]:
from sklearn.preprocessing import StandardScaler
#표준화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost 모델 학습 및 평가(표준화 O)
xgb_scaled= XGBRegressor()
xgb_scaled.fit(X_train_scaled, y_train)
xgb_scaled_predictions = xgb_scaled.predict(X_test_scaled)
xgb_scaled_test_mse = mean_squared_error(y_test, xgb_scaled_predictions)
xgb_scaled_test_r2 = r2_score(y_test, xgb_scaled_predictions)
xgb_scaled_train_mse = mean_squared_error(y_train, xgb_scaled.predict(X_train_scaled))
xgb_scaled_train_r2 = r2_score(y_train, xgb_scaled.predict(X_train_scaled))

# AdaBoost 모델 학습 및 평가(표준화 O)
ada_scaled = AdaBoostRegressor()
ada_scaled.fit(X_train_scaled, y_train)
ada_scaled_predictions = ada_scaled.predict(X_test_scaled)
ada_scaled_test_mse = mean_squared_error(y_test, ada_scaled_predictions)
ada_scaled_test_r2 = r2_score(y_test, ada_scaled_predictions)
ada_scaled_train_mse = mean_squared_error(y_train, ada_scaled.predict(X_train_scaled))
ada_scaled_train_r2 = r2_score(y_train, ada_scaled.predict(X_train_scaled))

print("표준화 한 모델")
print(f"""
          xgb_scaled_test_mse: {xgb_scaled_test_mse}
          xgb_scaled_train_mse: {xgb_scaled_train_mse}
          xgb_scaled_test_r2: {xgb_scaled_test_r2}
          xgb_scaled_train_r2: {xgb_scaled_train_r2}

          ada_scaled_test_mse: {ada_scaled_test_mse}
          ada_scaled_train_mse: {ada_scaled_train_mse}
          ada_scaled_test_r2: {ada_scaled_test_r2}
          ada_scaled_train_r2: {ada_scaled_train_r2}
      """
     )

표준화 한 모델

          xgb_scaled_test_mse: 6.893317031478567e-05
          xgb_scaled_train_mse: 3.7344312792603566e-07
          xgb_scaled_test_r2: 0.9893601110776246
          xgb_scaled_train_r2: 0.99994187460462

          ada_scaled_test_mse: 0.00026645332981072516
          ada_scaled_train_mse: 0.00018686597863652548
          ada_scaled_test_r2: 0.9588727194870497
          ada_scaled_train_r2: 0.9709148245634202
      
