In [21]:
import os
import random

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
import statsmodels.api as sm

In [22]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

# 시드 고정
seed_everything(42)

In [23]:
# train data 가져오기

df2 = pd.read_csv('K-League-data2.csv').drop(columns='Unnamed: 0')
print(df2.shape)
df2.head(1)

(666, 99)


Unnamed: 0,Rnd.,대회,홈 득점,홈 도움,홈 슈팅,홈 유효 슈팅,홈 블락된슈팅,홈 벗어난슈팅,홈 PA내 슈팅,홈 PA외 슈팅,...,원정 피파울,원정 경고,원정 퇴장,원정 팀,원정 경기결과,원정 소속 선수 수,원정 평균 나이,원정 용병 수,원정 평균 가치,원정 총 가치
0,1,대구vs수원FC,1,1,13,4,2,7,5,8,...,13,3,0,suwonFC,D,36,25.6,5,350.0,12.6


In [24]:
# train data 처리

X = df2.drop(columns=['대회', '홈 경기결과', '원정 경기결과']) # 필요 없거나 중복되는 열 제거
X = pd.get_dummies(X, columns=['홈 팀', '원정 팀'], drop_first=False) # 문자 숫자로 바꾸기

y = df2['홈 경기결과']  # 홈 팀 기준으로 승부 예측. (홈 승? >> 원정 패, 홈 패? >> 원정 승리) // 원정 경기결과를 넣어도 예측 결과 동일

# 경기 결과 (W, D, L)를 숫자로 
le = LabelEncoder()
y = le.fit_transform(y)

# train/test 분리. 데이터셋의 개수가 작아서 9:1로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# scaler = StandardScaler()  
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
# X.columns



(599, 122) (67, 122)
(599,) (67,)


In [25]:
# test data 가져오기

test_df = pd.read_csv('K-League-Test3536.csv')

predict_df = (test_df[['Rnd.', '홈 팀','원정 팀']])

test_df = test_df.drop(columns=['Unnamed: 0', '대회', '홈 경기결과', '원정 경기결과'])
test_df = pd.get_dummies(test_df, columns=['홈 팀', '원정 팀'], drop_first=False)
test_df.shape #(12,114)

# 결과확인을 위한 df
predict_df

Unnamed: 0,Rnd.,홈 팀,원정 팀
0,36,ulsan,pohang
1,36,suwonFC,suwon
2,36,daegu,gwangju
3,36,jeju,seoul
4,36,daejeon,gangwon
5,36,incheon,jeonbuk
6,37,jeonbuk,gwangju
7,37,gangwon,suwonFC
8,37,incheon,ulsan
9,37,seoul,suwon


### 개수가 122개 / 114개로 안맞음

- 왜? ) train에는 21시즌 or 22시즌에 있었던 성남, 김천 데이터가 포함되어있음.
또한 test에는 2 라운드밖에 없으니 없는 데이터가 있음 

In [26]:
# 김천/성남은 2023시즌에 없기 때문에 드랍.
# 홈-광주, 원정-인천, 원정-제주, 홈-수원FC는 36,37라운드에 포함되지 않아 드랍.

drop_list = ['홈 팀_gimcheon', '원정 팀_gimcheon',  '홈 팀_seongnam','원정 팀_seongnam', 
            '홈 팀_gwangju', '원정 팀_incheon', '원정 팀_jeju', '홈 팀_suwon']

In [27]:
# train_data 다시 처리

X = df2.drop(columns=['대회', '홈 경기결과', '원정 경기결과']) # 필요 없거나 중복되는 열 제거
X = pd.get_dummies(X, columns=['홈 팀', '원정 팀'], drop_first=False) # 문자 숫자로 바꾸기

X = X.drop(columns=drop_list)


y = df2['홈 경기결과']  
print(y.value_counts())

# 경기 결과 (W, D, L)를 숫자로 
le = LabelEncoder()
y = le.fit_transform(y)
# print(y)

# train/test 분리. 데이터셋의 개수가 작아서 9:1로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# scaler = StandardScaler()  
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)  # 114개의 열로 맞춤

W    260
L    209
D    197
Name: 홈 경기결과, dtype: int64
(599, 114) (67, 114)
(599,) (67,)


# 변수 선택 전,  12개 모델 사용

In [28]:
predict_map = {0:'D', 1: "L", 2:'W'}  # 0,1,2로 바꾼걸 다시 DLW로 매칭

# result_amp = [W, L, D, D, L, D,  W, W, W, L,L, W]
result_map = [2,1,0,0,1,0, 2,2,2,1,1,2]

In [49]:
# 다항 로지스틱 회귀

model = LogisticRegression(random_state=42, solver='newton-cg',max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 0 0 0 0 0 0 0 0 0 0 0]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      D
5     36  incheon  jeonbuk      D
6     37  jeonbuk  gwangju      D
7     37  gangwon  suwonFC      D
8     37  incheon    ulsan      D
9     37    seoul    suwon      D
10    37     jeju  daejeon      D
11    37   pohang    daegu      D
Accuracy: 0.25
              precision    recall  f1-score   support

           0       0.25      1.00      0.40         3
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         5

    accuracy                           0.25        12
   macro avg       0.08      0.33      0.13        12
weighted avg       0.06      0.25      0.10        12

[[3 0 0]
 [4 0 0]
 [5 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[0 0 0 0 0 1 0 0 1 0 0 1]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      D
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      D
7     37  gangwon  suwonFC      D
8     37  incheon    ulsan      L
9     37    seoul    suwon      D
10    37     jeju  daejeon      D
11    37   pohang    daegu      L
Accuracy: 0.16666666666666666
              precision    recall  f1-score   support

           0       0.22      0.67      0.33         3
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         5

    accuracy                           0.17        12
   macro avg       0.07      0.22      0.11        12
weighted avg       0.06      0.17      0.08        12

[[2 1 0]
 [4 0 0]
 [3 2 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
# LDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 1 1 1 1 1 1 1 1 1 1 1]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      L
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      L
7     37  gangwon  suwonFC      L
8     37  incheon    ulsan      L
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      L
Accuracy: 0.3333333333333333
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.33      1.00      0.50         4
           2       0.00      0.00      0.00         5

    accuracy                           0.33        12
   macro avg       0.11      0.33      0.17        12
weighted avg       0.11      0.33      0.17        12

[[0 3 0]
 [0 4 0]
 [0 5 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
# QDA

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[1 1 1 1 1 1 1 1 1 1 1 1]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      L
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      L
7     37  gangwon  suwonFC      L
8     37  incheon    ulsan      L
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      L
Accuracy: 0.3333333333333333
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.33      1.00      0.50         4
           2       0.00      0.00      0.00         5

    accuracy                           0.33        12
   macro avg       0.11      0.33      0.17        12
weighted avg       0.11      0.33      0.17        12

[[0 3 0]
 [0 4 0]
 [0 5 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
from sklearn import svm

# train data로 모델 학습
model = svm.SVC(random_state=42, kernel='linear')
model.fit(X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 1 0 0 0 0 0 0 0 0 1 0]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      D
5     36  incheon  jeonbuk      D
6     37  jeonbuk  gwangju      D
7     37  gangwon  suwonFC      D
8     37  incheon    ulsan      D
9     37    seoul    suwon      D
10    37     jeju  daejeon      L
11    37   pohang    daegu      D
Accuracy: 0.4166666666666667
              precision    recall  f1-score   support

           0       0.30      1.00      0.46         3
           1       1.00      0.50      0.67         4
           2       0.00      0.00      0.00         5

    accuracy                           0.42        12
   macro avg       0.43      0.50      0.38        12
weighted avg       0.41      0.42      0.34        12

[[3 0 0]
 [2 2 0]
 [5 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# 그래디언트 부스팅

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=500, )
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 0 0 0 1 0 0 0 0 1 1 0]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      D
6     37  jeonbuk  gwangju      D
7     37  gangwon  suwonFC      D
8     37  incheon    ulsan      D
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      D
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.33      1.00      0.50         3
           1       1.00      0.75      0.86         4
           2       0.00      0.00      0.00         5

    accuracy                           0.50        12
   macro avg       0.44      0.58      0.45        12
weighted avg       0.42      0.50      0.41        12

[[3 0 0]
 [1 3 0]
 [5 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
# 에이다 부스트

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=42,)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 0 0 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.50      0.67      0.57         3
           1       0.75      0.75      0.75         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.75      0.74      0.74        12
weighted avg       0.79      0.75      0.76        12

[[2 1 0]
 [1 3 0]
 [1 0 4]]




In [54]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(random_state=42, max_depth=3, n_estimators=500)
lgb_model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5235
[LightGBM] [Info] Number of data points in the train set: 599, number of used features: 111
[LightGBM] [Info] Start training from score -1.191255
[LightGBM] [Info] Start training from score -1.180326
[LightGBM] [Info] Start training from score -0.944223
[0 0 0 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
   



In [56]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 0 0 1 0 0 2 2 2 1 1 0]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      L
4     36  daejeon  gangwon      D
5     36  incheon  jeonbuk      D
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      D
Accuracy: 0.5833333333333334
              precision    recall  f1-score   support

           0       0.33      0.67      0.44         3
           1       0.67      0.50      0.57         4
           2       1.00      0.60      0.75         5

    accuracy                           0.58        12
   macro avg       0.67      0.59      0.59        12
weighted avg       0.72      0.58      0.61        12

[[2 1 0]
 [2 2 0]
 [2 0 3]]




In [33]:
# xgboost

from xgboost import XGBClassifier

model = XGBClassifier()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 0 1 1 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      L
3     36     jeju    seoul      L
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.5833333333333334
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.50      0.75      0.60         4
           2       1.00      0.80      0.89         5

    accuracy                           0.58        12
   macro avg       0.50      0.52      0.50        12
weighted avg       0.58      0.58      0.57        12

[[0 3 0]
 [1 3 0]
 [1 0 4]]


In [34]:
from catboost import CatBoostClassifier


model = CatBoostClassifier(random_seed=42, metric_period=100, depth=4)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

y_pred = y_pred.reshape(-1)
pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


Learning rate set to 0.077085
0:	learn: 1.0291243	total: 4.95ms	remaining: 4.95s
100:	learn: 0.1718829	total: 305ms	remaining: 2.71s
200:	learn: 0.1019578	total: 631ms	remaining: 2.51s
300:	learn: 0.0754662	total: 1.09s	remaining: 2.54s
400:	learn: 0.0533483	total: 1.62s	remaining: 2.43s
500:	learn: 0.0425894	total: 2.1s	remaining: 2.09s
600:	learn: 0.0342938	total: 2.54s	remaining: 1.69s
700:	learn: 0.0289653	total: 3.04s	remaining: 1.3s
800:	learn: 0.0243735	total: 3.56s	remaining: 885ms
900:	learn: 0.0211503	total: 4.03s	remaining: 443ms
999:	learn: 0.0189985	total: 4.46s	remaining: 0us
[[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [2]
 [2]
 [0]
 [1]
 [1]
 [2]]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      D
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      D
9 

-------------------------

# 변수 선택 후, 12개 모델 사용

In [38]:
# train_V2.ipynb에서 실행한 변수선택 후 변수들 가져오기

selected_variables =['홈 득점', '원정 득점', '홈 경고', '원정 팀_daejeon']


selected_X_train = X_train[selected_variables]
selected_X_test = X_test[selected_variables]

# 확인
print(selected_X_train.shape, selected_X_test.shape)
print(y_train.shape, y_test.shape)

scaler = RobustScaler()
selected_X_train_scaled = scaler.fit_transform(selected_X_train)
selected_X_test_scaled = scaler.transform(selected_X_test)

(599, 4) (67, 4)
(599,) (67,)


- 다항 로지스틱 회귀

In [39]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=42, solver='newton-cg', max_iter=500)
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.57      1.00      0.73         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.86      0.71      0.71        12
weighted avg       0.86      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [0 1 4]]




In [40]:
# MLP

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, max_iter=500)
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.57      1.00      0.73         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.86      0.71      0.71        12
weighted avg       0.86      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [0 1 4]]




In [57]:
# LDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.57      1.00      0.73         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.86      0.71      0.71        12
weighted avg       0.86      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [0 1 4]]




In [58]:
# QDA

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()
model.fit(selected_X_train_scaled, y_train)

print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.57      1.00      0.73         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.86      0.71      0.71        12
weighted avg       0.86      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [0 1 4]]


In [41]:

# SVM

from sklearn import svm

model = svm.SVC(random_state=42, kernel='linear')
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.57      1.00      0.73         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.86      0.71      0.71        12
weighted avg       0.86      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [0 1 4]]




In [42]:
# 그래디언트 부스팅

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=500, )
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.67      1.00      0.80         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.72      0.71      0.70        12
weighted avg       0.76      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [1 0 4]]




In [59]:
# 에이다 부스트

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=42,)
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 0 0 0 0 1 2 2 1 1 0 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      D
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      L
9     37    seoul    suwon      L
10    37     jeju  daejeon      D
11    37   pohang    daegu      W
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.40      0.67      0.50         3
           1       0.25      0.25      0.25         4
           2       1.00      0.60      0.75         5

    accuracy                           0.50        12
   macro avg       0.55      0.51      0.50        12
weighted avg       0.60      0.50      0.52        12

[[2 1 0]
 [3 1 0]
 [0 2 3]]




In [60]:
# light gbm

import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(random_state=42, max_depth=3, n_estimators=500)
lgb_model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20
[LightGBM] [Info] Number of data points in the train set: 599, number of used features: 3
[LightGBM] [Info] Start training from score -1.191255
[LightGBM] [Info] Start training from score -1.180326
[LightGBM] [Info] Start training from score -0.944223
[1 0 0 0 0 1 2 2 1 1 0 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      L
1     36  suwonFC    suwon      D
2     36    daegu  gwangju      D
3     36     jeju    seoul      D
4     36  daejeon  gangwon      D
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      L
9     37    seoul    suwon      L
10    37     jeju  daejeon      D
11    37   pohang    daegu      W
Accuracy: 0.5
              precision    recall  f1-score   support

           0     



In [61]:
# 랜덤 포레스트

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.67      1.00      0.80         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.72      0.71      0.70        12
weighted avg       0.76      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [1 0 4]]




In [43]:
# XGBoost

model = XGBClassifier()
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[0 1 1 0 1 1 2 2 2 1 1 2]
    Rnd.      홈 팀     원정 팀 홈 경기결과
0     36    ulsan   pohang      D
1     36  suwonFC    suwon      L
2     36    daegu  gwangju      L
3     36     jeju    seoul      D
4     36  daejeon  gangwon      L
5     36  incheon  jeonbuk      L
6     37  jeonbuk  gwangju      W
7     37  gangwon  suwonFC      W
8     37  incheon    ulsan      W
9     37    seoul    suwon      L
10    37     jeju  daejeon      L
11    37   pohang    daegu      W
Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.67      1.00      0.80         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.72      0.71      0.70        12
weighted avg       0.76      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [1 0 4]]


In [48]:
# catboost

model = CatBoostClassifier(random_seed=42, metric_period=250, depth=2)
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
y_pred = y_pred.reshape(-1)
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
# print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


Learning rate set to 0.077085
0:	learn: 1.0361016	total: 647us	remaining: 647ms
250:	learn: 0.1041215	total: 90ms	remaining: 268ms
500:	learn: 0.0681831	total: 170ms	remaining: 170ms
750:	learn: 0.0560720	total: 278ms	remaining: 92ms
999:	learn: 0.0493335	total: 371ms	remaining: 0us
[[0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [2]
 [2]
 [2]
 [1]
 [1]
 [2]]
Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.67      1.00      0.80         4
           2       1.00      0.80      0.89         5

    accuracy                           0.75        12
   macro avg       0.72      0.71      0.70        12
weighted avg       0.76      0.75      0.74        12

[[1 2 0]
 [0 4 0]
 [1 0 4]]
