In [7]:
import os
import random

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.metrics import confusion_matrix, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
import statsmodels.api as sm


def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)


# 시드 고정
seed_everything(42)

In [8]:
train_df = pd.read_csv('K-League-data.csv').drop(columns='Unnamed: 0')
print(train_df.shape)
train_df.head(1)

(684, 99)


Unnamed: 0,Rnd.,대회,홈 득점,홈 도움,홈 슈팅,홈 유효 슈팅,홈 블락된슈팅,홈 벗어난슈팅,홈 PA내 슈팅,홈 PA외 슈팅,...,원정 피파울,원정 경고,원정 퇴장,원정 팀,원정 경기결과,원정 소속 선수 수,원정 평균 나이,원정 용병 수,원정 평균 가치,원정 총 가치
0,1,대구vs수원FC,1,1,13,4,2,7,5,8,...,13,3,0,suwonFC,D,36,25.6,5,350.0,12.6


In [9]:
test_df = pd.read_csv('K-League-Test-data.csv')

predict_df = (test_df[[ '홈 팀','원정 팀']])

# test df의 경기결과 저장
test_result = test_df['홈 경기결과']
test_df = test_df.drop(columns=['Unnamed: 0', '대회', '홈 경기결과', '원정 경기결과'])
test_df = pd.get_dummies(test_df, columns=['홈 팀', '원정 팀'], drop_first=False)

print(test_df.shape)  # shape이 122 // 118로 안맞음 

# 결과확인을 위한 df
# test_df
predict_df

(90, 118)


Unnamed: 0,홈 팀,원정 팀
0,daegu,gimcheon
1,pohang,daegu
2,daegu,suwonFC
3,gwangju,daegu
4,gangwon,daegu
...,...,...
85,suwonFC,pohang
86,pohang,seoul
87,suwonFC,seoul
88,seoul,ulsan


In [10]:
# 2024 시즌에 없는 성남, 수원의 데이터는 날림
drop_columns = ['홈 팀_seongnam', '원정 팀_seongnam', '홈 팀_suwon', '원정 팀_suwon']

# train_data 다시 처리
X = train_df.drop(columns=['대회', '홈 경기결과', '원정 경기결과']) # 필요 없거나 중복되는 열 제거
X = pd.get_dummies(X, columns=['홈 팀', '원정 팀'], drop_first=False) # 문자 숫자로 바꾸기

X = X.drop(columns=drop_columns)

y = train_df['홈 경기결과']  
# print(y.value_counts())

# 경기 결과 (W, D, L)를 숫자로 
le = LabelEncoder()
y = le.fit_transform(y)
# print(y)

# train/test 분리. 데이터셋의 개수가 작아서 9:1로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# scaler = StandardScaler()  
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)  # 118개의 열로 맞춤

(615, 118) (69, 118)
(615,) (69,)


In [11]:
predict_map = {0:'D', 1: "L", 2:'W'}  # 0,1,2로 바꾼걸 다시 DLW로 매칭

test_result

0     L
1     W
2     D
3     L
4     W
     ..
85    W
86    D
87    L
88    L
89    W
Name: 홈 경기결과, Length: 90, dtype: object

In [12]:
predict_map = {0: 'D', 1: 'L', 2: 'W'}

# predict_map을 역으로 변환
reverse_predict_map = {value: key for key, value in predict_map.items()}

# test_result를 역으로 변환
result_map = [reverse_predict_map[res] for res in test_result]

# print(test_result, result_map)

In [13]:
# 다항 로지스틱 회귀

model = LogisticRegression(random_state=42, solver='newton-cg',max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

r2 = r2_score(result_map, y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      D
1    pohang     daegu      D
2     daegu   suwonFC      D
3   gwangju     daegu      D
4   gangwon     daegu      D
..      ...       ...    ...
85  suwonFC    pohang      D
86   pohang     seoul      D
87  suwonFC     seoul      D
88    seoul     ulsan      D
89    ulsan   suwonFC      D

[90 rows x 3 columns]
Accuracy: 0.28888888888888886
              precision    recall  f1-score   support

           0       0.29      1.00      0.45        26
           1       0.00      0.00      0.00        28
           2       0.00      0.00      0.00        36

    accuracy                           0.29        90
   macro avg       0.10      0.33      0.15        90
weighted avg       0.08      0.29      0.13        90

[[26  0  0]
 [28  0  0]
 [36  0  0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
print(r2_score)

<function sklearn.metrics._regression.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', force_finite=True)>

In [9]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      D
1    pohang     daegu      D
2     daegu   suwonFC      D
3   gwangju     daegu      D
4   gangwon     daegu      D
..      ...       ...    ...
85  suwonFC    pohang      D
86   pohang     seoul      D
87  suwonFC     seoul      D
88    seoul     ulsan      D
89    ulsan   suwonFC      D

[90 rows x 3 columns]
Accuracy: 0.28888888888888886
              precision    recall  f1-score   support

           0       0.29      1.00      0.45        26
           1       0.00      0.00      0.00        28
           2       0.00      0.00      0.00        36

    accuracy                           0.29        90
   macro avg       0.10      0.33      0.15        90
weighted avg       0.08      0.29      0.13        90

[[26  0  0]
 [28  0  0]
 [36  0  0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# LDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      L
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      L
..      ...       ...    ...
85  suwonFC    pohang      L
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      L

[90 rows x 3 columns]
Accuracy: 0.3111111111111111
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.31      1.00      0.47        28
           2       0.00      0.00      0.00        36

    accuracy                           0.31        90
   macro avg       0.10      0.33      0.16        90
weighted avg       0.10      0.31      0.15        90

[[ 0 26  0]
 [ 0 28  0]
 [ 0 36  0]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
from sklearn import svm

# train data로 모델 학습
model = svm.SVC(random_state=42, kernel='linear')
model.fit(X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      D
1    pohang     daegu      D
2     daegu   suwonFC      D
3   gwangju     daegu      D
4   gangwon     daegu      D
..      ...       ...    ...
85  suwonFC    pohang      D
86   pohang     seoul      D
87  suwonFC     seoul      D
88    seoul     ulsan      D
89    ulsan   suwonFC      D

[90 rows x 3 columns]
Accuracy: 0.28888888888888886
              precision    recall  f1-score   support

           0       0.29      1.00      0.45        26
           1       0.00      0.00      0.00        28
           2       0.00      0.00      0.00        36

    accuracy                           0.29        90
   macro avg       0.10      0.33      0.15        90
weighted avg       0.08      0.29      0.13        90

[[26  0  0]
 [28  0  0]
 [36  0  0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# 그래디언트 부스팅

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=500, )
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[1 0 1 1 1 0 1 0 1 1 0 0 1 1 1 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 0 1 1 0 0 1 1
 1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 0 2 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1
 0 1 2 1 1 1 1 1 1 1 1 1 1 1 1 0]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      D
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      L
..      ...       ...    ...
85  suwonFC    pohang      L
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      D

[90 rows x 3 columns]
Accuracy: 0.4444444444444444
              precision    recall  f1-score   support

           0       0.40      0.38      0.39        26
           1       0.44      1.00      0.62        28
           2       1.00      0.06      0.11        36

    accuracy                           0.44        90
   macro avg       0.61      0.48      0.37        90
weighted avg       0.65      0.44      0.35        90

[[10 16  0]
 [ 0 28  0]
 [15 19  2]



In [13]:
# 에이다 부스트

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=42,)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))



[1 2 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 2 2 1 2 1 1 2 1 1 2 1 1 1 1 1 2 2 2 1
 1 2 2 1 2 1 2 2 1 1 1 1 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 2 2
 2 1 2 1 1 1 2 1 2 1 1 2 1 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.48      1.00      0.65        28
           2       1.00      0.89      0.94        36

    accuracy                           0.67        90
   macro avg       0.49      0.63      0.53        90
weighted avg       0.55      0.67      0.58        90

[[ 0 26  0]
 [ 0 28  0]
 [ 0  4 32]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# xgboost

from xgboost import XGBClassifier

model = XGBClassifier()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[1 2 1 1 2 0 1 0 0 1 2 0 1 1 2 1 2 1 2 2 1 2 2 0 2 0 1 2 1 1 0 1 0 2 2 2 0
 1 2 2 0 2 1 2 2 1 0 0 0 2 0 0 2 2 0 2 2 1 1 1 1 1 1 0 0 1 0 2 1 1 2 1 2 2
 2 1 2 1 0 0 2 0 2 1 0 2 0 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      D
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7777777777777778
              precision    recall  f1-score   support

           0       0.62      0.58      0.60        26
           1       0.66      0.75      0.70        28
           2       1.00      0.94      0.97        36

    accuracy                           0.78        90
   macro avg       0.76      0.76      0.76        90
weighted avg       0.78      0.78      0.78        90

[[15 11  0]
 [ 7 21  0]
 [ 2  0 34]

In [15]:
from catboost import CatBoostClassifier


model = CatBoostClassifier(random_seed=42, metric_period=100, depth=4)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(test_df)
# print(y_pred)

y_pred = y_pred.reshape(-1)
pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

Learning rate set to 0.077189
0:	learn: 1.0415435	total: 141ms	remaining: 2m 20s
100:	learn: 0.1797570	total: 458ms	remaining: 4.08s
200:	learn: 0.1006592	total: 752ms	remaining: 2.99s
300:	learn: 0.0727082	total: 1.07s	remaining: 2.49s
400:	learn: 0.0544745	total: 1.36s	remaining: 2.04s
500:	learn: 0.0411392	total: 1.66s	remaining: 1.66s
600:	learn: 0.0334891	total: 1.93s	remaining: 1.28s
700:	learn: 0.0272204	total: 2.23s	remaining: 952ms
800:	learn: 0.0225538	total: 2.52s	remaining: 625ms
900:	learn: 0.0191301	total: 2.8s	remaining: 308ms
999:	learn: 0.0167640	total: 3.08s	remaining: 0us
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7555555555555555
  

# 변수 선택 후

In [16]:
# 선택된 변수들만 사용하기 118 -> 8

selected_variables = ['홈 득점', '원정 득점', '홈 경고', '원정 코너킥', '원정 차단', '원정 키패스', '원정 탈압박', '원정 팀_daejeon']

selected_X_train = X_train[selected_variables]
selected_X_test = X_test[selected_variables]

# 확인
print(selected_X_train.shape, selected_X_test.shape)
print(y_train.shape, y_test.shape)

scaler = RobustScaler()
selected_X_train_scaled = scaler.fit_transform(selected_X_train)
selected_X_test_scaled = scaler.transform(selected_X_test)

(615, 8) (69, 8)
(615,) (69,)


In [28]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=42, solver='newton-cg', max_iter=500)
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 2 1 1 2 2 1 2 1 1 2 1 1 1 2 1 2 1 2 2 1 2 2 2 2 1 1 2 1 1 2 1 1 2 2 2 1
 1 2 2 1 2 1 2 2 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 1 2 1 1 2 2 1 1 2 1 2 2
 2 1 2 1 1 1 2 1 2 1 1 2 1 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.60      1.00      0.75        28
           2       0.81      0.97      0.89        36

    accuracy                           0.70        90
   macro avg       0.47      0.66      0.54        90
weighted avg       0.51      0.70      0.59        90

[[ 0 18  8]
 [ 0 28  0]
 [ 0  1 35]]
0.16240875912

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# MLP

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, max_iter=500)
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 2 0 1 2 0 1 0 0 1 2 0 1 1 2 0 2 0 2 2 1 2 2 0 0 1 1 2 1 1 0 0 1 2 2 2 1
 1 2 0 1 2 1 2 0 1 2 1 1 2 0 0 2 2 1 0 0 1 1 1 0 1 1 0 1 1 0 2 1 1 2 1 2 0
 2 0 2 0 1 1 2 1 2 0 1 0 1 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      D
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      D
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7888888888888889
              precision    recall  f1-score   support

           0       0.64      0.62      0.63        26
           1       0.76      1.00      0.86        28
           2       0.96      0.75      0.84        36

    accuracy                           0.79        90
   macro avg       0.79      0.79      0.78        90
weighted avg       0.81      0.79      0.79        90

[[16  9  1]
 [ 0 28  0]
 [ 9  0 27]



In [30]:
# LDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))



[1 2 2 1 2 2 1 2 1 1 2 2 1 1 2 2 2 1 2 2 1 2 2 2 2 1 1 2 1 1 2 1 1 2 2 2 1
 1 2 2 1 2 1 2 2 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 1 2 1 1 2 2 1 2 2 1 2 2
 2 2 2 2 1 1 2 1 2 2 1 2 1 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      W
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7111111111111111
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.70      1.00      0.82        28
           2       0.72      1.00      0.84        36

    accuracy                           0.71        90
   macro avg       0.47      0.67      0.55        90
weighted avg       0.51      0.71      0.59        90

[[ 0 12 14]
 [ 0 28  0]
 [ 0  0 36]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# SVM

from sklearn import svm

model = svm.SVC(random_state=42, kernel='linear')
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))


[1 2 1 1 2 0 1 0 1 1 2 1 1 1 2 1 2 1 2 2 1 2 2 0 0 1 1 2 1 1 0 1 1 2 2 2 1
 1 2 0 1 2 1 2 0 1 0 1 1 2 0 0 2 2 1 0 0 1 1 1 1 1 1 0 1 1 0 2 1 1 2 1 2 0
 2 1 2 1 1 1 2 1 2 1 1 2 1 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7111111111111111
              precision    recall  f1-score   support

           0       0.53      0.31      0.39        26
           1       0.60      1.00      0.75        28
           2       1.00      0.78      0.88        36

    accuracy                           0.71        90
   macro avg       0.71      0.70      0.67        90
weighted avg       0.74      0.71      0.70        90

[[ 8 18  0]
 [ 0 28  0]
 [ 7  1 28]



In [21]:
# 그래디언트 부스팅

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=500, )
model.fit(selected_X_train_scaled, y_train)

# test data로 predict
y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[1 2 1 1 2 0 1 0 1 1 2 1 1 1 2 1 2 1 2 2 1 2 2 0 2 1 1 2 1 1 1 1 1 2 2 2 1
 1 2 2 1 2 1 2 2 1 0 1 1 2 0 0 2 2 1 2 2 1 1 1 1 1 1 0 2 1 0 2 1 1 2 1 2 2
 2 1 2 1 1 1 2 1 2 1 1 2 1 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7777777777777778
              precision    recall  f1-score   support

           0       1.00      0.31      0.47        26
           1       0.60      1.00      0.75        28
           2       0.97      0.94      0.96        36

    accuracy                           0.78        90
   macro avg       0.86      0.75      0.73        90
weighted avg       0.86      0.78      0.75        90

[[ 8 17  1]
 [ 0 28  0]
 [ 0  2 34]



In [22]:
# 에이다 부스트

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=42,)
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[0 2 0 0 2 0 0 0 0 0 2 1 0 0 2 0 2 0 2 2 0 2 2 0 2 1 0 2 0 0 1 1 0 2 2 2 0
 0 2 2 0 2 0 2 2 0 0 0 0 2 0 0 2 2 0 2 2 0 0 0 0 0 0 0 1 0 0 2 0 0 2 0 2 2
 2 0 2 0 0 0 2 0 2 0 0 2 0 0 0 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      D
1    pohang     daegu      W
2     daegu   suwonFC      D
3   gwangju     daegu      D
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      D
87  suwonFC     seoul      D
88    seoul     ulsan      D
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.6333333333333333
              precision    recall  f1-score   support

           0       0.45      0.88      0.60        26
           1       0.00      0.00      0.00        28
           2       1.00      0.94      0.97        36

    accuracy                           0.63        90
   macro avg       0.48      0.61      0.52        90
weighted avg       0.53      0.63      0.56        90

[[23  3  0]
 [28  0  0]
 [ 0  2 34]



In [23]:
# XGBoost

model = XGBClassifier()
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
print(y_pred)

pred_list = []
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

[1 2 1 1 2 0 1 0 1 1 2 1 1 1 2 1 2 1 2 2 1 2 2 0 1 1 1 2 1 1 1 1 1 2 2 2 1
 1 2 1 1 2 1 2 1 1 0 1 1 2 0 0 2 2 1 1 1 1 1 1 1 1 1 0 1 1 0 2 1 1 2 1 2 1
 2 1 2 1 1 1 2 1 2 1 1 2 1 1 1 2]
        홈 팀      원정 팀 홈 경기결과
0     daegu  gimcheon      L
1    pohang     daegu      W
2     daegu   suwonFC      L
3   gwangju     daegu      L
4   gangwon     daegu      W
..      ...       ...    ...
85  suwonFC    pohang      W
86   pohang     seoul      L
87  suwonFC     seoul      L
88    seoul     ulsan      L
89    ulsan   suwonFC      W

[90 rows x 3 columns]
Accuracy: 0.7111111111111111
              precision    recall  f1-score   support

           0       1.00      0.31      0.47        26
           1       0.52      1.00      0.68        28
           2       1.00      0.78      0.88        36

    accuracy                           0.71        90
   macro avg       0.84      0.70      0.68        90
weighted avg       0.85      0.71      0.70        90

[[ 8 18  0]
 [ 0 28  0]
 [ 0  8 28]

In [24]:
# catboost

model = CatBoostClassifier(random_seed=42, metric_period=250, depth=2)
model.fit(selected_X_train_scaled, y_train)

y_pred = model.predict(test_df[selected_variables])
# print(y_pred)

pred_list = []
y_pred = y_pred.reshape(-1)
for i in y_pred:
    pred_list.append(predict_map[i])
predict_df['홈 경기결과'] = pred_list
# print(predict_df)

accuracy = accuracy_score(result_map, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(result_map, y_pred))
print(confusion_matrix(result_map, y_pred))

Learning rate set to 0.077189
0:	learn: 1.0697402	total: 754us	remaining: 753ms
250:	learn: 0.1068122	total: 86.3ms	remaining: 257ms
500:	learn: 0.0687990	total: 173ms	remaining: 172ms
750:	learn: 0.0512261	total: 263ms	remaining: 87.3ms
999:	learn: 0.0407634	total: 342ms	remaining: 0us
Accuracy: 0.7222222222222222
              precision    recall  f1-score   support

           0       1.00      0.31      0.47        26
           1       0.54      1.00      0.70        28
           2       0.97      0.81      0.88        36

    accuracy                           0.72        90
   macro avg       0.84      0.70      0.68        90
weighted avg       0.84      0.72      0.71        90

[[ 8 17  1]
 [ 0 28  0]
 [ 0  7 29]]
