In [1]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import statsmodels.api as sm


df = pd.read_csv('K-League-data.csv').drop(columns='Unnamed: 0')
print(df.shape)
print(df)

(1332, 52)
      Rnd.        대회   H/A  득점  도움  슈팅  유효 슈팅  블락된슈팅  벗어난슈팅  PA내 슈팅  ...  경고  \
0        1  대구vs수원FC  HOME   1   1  13      4      2      7       5  ...   1   
1        1    대구vs서울  HOME   0   0  10      1      3      6       7  ...   4   
2        1    제주vs포항  HOME   0   0   9      1      3      5       5  ...   2   
3        1    서울vs인천  HOME   2   1  13      7      2      4       7  ...   3   
4        1    포항vs대구  AWAY   2   1   9      4      3      2       6  ...   3   
...    ...       ...   ...  ..  ..  ..    ...    ...    ...     ...  ...  ..   
1327    38    포항vs강원  AWAY   0   0  11      0      2      9       6  ...   3   
1328    38  수원FCvs서울  HOME   0   0  10      0      5      5       5  ...   0   
1329    38    울산vs대구  AWAY   0   0   9      4      1      4       3  ...   3   
1330    38  수원FCvs수원  HOME   2   1  12      3      4      5       5  ...   2   
1331    38    울산vs대구  HOME   2   2  19      6      9      4      10  ...   1   

      퇴장       기준    시즌  경기결

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [3]:
X = df.drop(columns=['경기결과', '기준'])
X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

y = df['경기결과']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# scaler = StandardScaler()  
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(1198, 225) (134, 225)
(1198,) (134,)


In [7]:
# 다항 로지스틱 회귀

model = LogisticRegression(random_state=42, solver='newton-cg',max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.55      0.51      0.53        41
           1       0.65      0.71      0.68        52
           2       0.62      0.59      0.60        41

    accuracy                           0.61       134
   macro avg       0.61      0.60      0.60       134
weighted avg       0.61      0.61      0.61       134

[[21 11  9]
 [ 9 37  6]
 [ 8  9 24]]


In [8]:
# OLS

results = sm.OLS(y_train,sm.add_constant(X_train_scaled)).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.452
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     3.600
Date:                Thu, 23 Nov 2023   Prob (F-statistic):           5.37e-43
Time:                        19:41:11   Log-Likelihood:                -1079.9
No. Observations:                1198   AIC:                             2608.
Df Residuals:                     974   BIC:                             3748.
Df Model:                         223                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7069      0.526     -1.344      0.1

In [9]:
# 다층신경망

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.64      0.61      0.62        41
           1       0.60      0.60      0.60        52
           2       0.60      0.63      0.62        41

    accuracy                           0.61       134
   macro avg       0.61      0.61      0.61       134
weighted avg       0.61      0.61      0.61       134

[[25 12  4]
 [ 8 31 13]
 [ 6  9 26]]




In [10]:
# LDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.54      0.54      0.54        41
           1       0.64      0.69      0.67        52
           2       0.65      0.59      0.62        41

    accuracy                           0.61       134
   macro avg       0.61      0.60      0.61       134
weighted avg       0.61      0.61      0.61       134

[[22 12  7]
 [10 36  6]
 [ 9  8 24]]


In [11]:
# QDA

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.35074626865671643
              precision    recall  f1-score   support

           0       0.39      0.95      0.55        41
           1       0.08      0.02      0.03        52
           2       0.32      0.17      0.22        41

    accuracy                           0.35       134
   macro avg       0.26      0.38      0.27       134
weighted avg       0.25      0.35      0.25       134

[[39  1  1]
 [37  1 14]
 [24 10  7]]




In [12]:
# SVM

from sklearn import svm

model = svm.SVC(random_state=42, kernel='linear')
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5970149253731343
              precision    recall  f1-score   support

           0       0.47      0.46      0.47        41
           1       0.67      0.67      0.67        52
           2       0.62      0.63      0.63        41

    accuracy                           0.60       134
   macro avg       0.59      0.59      0.59       134
weighted avg       0.60      0.60      0.60       134

[[19 11 11]
 [12 35  5]
 [ 9  6 26]]


In [22]:
# 그래디언트부스팅

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=500, )
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.5223880597014925
              precision    recall  f1-score   support

           0       0.37      0.32      0.34        41
           1       0.63      0.65      0.64        52
           2       0.51      0.56      0.53        41

    accuracy                           0.52       134
   macro avg       0.50      0.51      0.51       134
weighted avg       0.51      0.52      0.52       134

[[13 17 11]
 [ 7 34 11]
 [15  3 23]]


Unnamed: 0,Feature,Importance
1,득점,0.290005
32,클리어링,0.050301
22,수비진영 패스,0.024132
36,블락,0.022212
46,총 가치,0.021494
31,태클,0.020318
35,획득,0.020107
0,Rnd.,0.019191
27,크로스,0.01822
30,경합 공중,0.017255


In [23]:
# 에이다 부스트

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=42,)
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.5373134328358209
              precision    recall  f1-score   support

           0       0.40      0.41      0.40        41
           1       0.66      0.56      0.60        52
           2       0.55      0.63      0.59        41

    accuracy                           0.54       134
   macro avg       0.54      0.54      0.53       134
weighted avg       0.55      0.54      0.54       134

[[17 11 13]
 [15 29  8]
 [11  4 26]]


Unnamed: 0,Feature,Importance
1,득점,0.16
24,롱패스,0.06
145,대회_수원vs광주,0.06
32,클리어링,0.06
34,차단,0.06
27,크로스,0.04
31,태클,0.04
22,수비진영 패스,0.04
46,총 가치,0.04
41,퇴장,0.02


In [24]:
# lightgbm

import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(random_state=42, max_depth=3, n_estimators=500)
lgb_model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002984 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3637
[LightGBM] [Info] Number of data points in the train set: 1198, number of used features: 50
[LightGBM] [Info] Start training from score -1.221941
[LightGBM] [Info] Start training from score -1.055323
[LightGBM] [Info] Start training from score -1.029286
Accuracy: 0.5373134328358209
              precision    recall  f1-score   support

           0       0.40      0.41      0.40        41
           1       0.66      0.56      0.60        52
           2       0.55      0.63      0.59        41

    accuracy                           0.54       134
   macro avg       0.54      0.54      0.53       134
weighted avg       0.55      0.54      0.54       134

[[17 11 13]
 [15 29  8]
 [11  4 26]]


Unnamed: 0,Feature,Importance
1,득점,0.16
24,롱패스,0.06
145,대회_수원vs광주,0.06
32,클리어링,0.06
34,차단,0.06
27,크로스,0.04
31,태클,0.04
22,수비진영 패스,0.04
46,총 가치,0.04
41,퇴장,0.02


In [25]:
# randomforest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.5597014925373134
              precision    recall  f1-score   support

           0       0.44      0.29      0.35        41
           1       0.58      0.67      0.63        52
           2       0.60      0.68      0.64        41

    accuracy                           0.56       134
   macro avg       0.54      0.55      0.54       134
weighted avg       0.54      0.56      0.55       134

[[12 18 11]
 [ 9 35  8]
 [ 6  7 28]]


Unnamed: 0,Feature,Importance
1,득점,0.082692
2,도움,0.040078
32,클리어링,0.029699
4,유효 슈팅,0.025852
22,수비진영 패스,0.02438
25,중거리패스,0.023771
35,획득,0.023598
30,경합 공중,0.023228
27,크로스,0.023191
24,롱패스,0.023046


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# One-hot encode categorical variables
# X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize XGBoost model
model = XGBClassifier()

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.4701492537313433
              precision    recall  f1-score   support

           0       0.28      0.22      0.25        41
           1       0.59      0.56      0.57        52
           2       0.47      0.61      0.53        41

    accuracy                           0.47       134
   macro avg       0.45      0.46      0.45       134
weighted avg       0.46      0.47      0.46       134

[[ 9 16 16]
 [11 29 12]
 [12  4 25]]


Unnamed: 0,Feature,Importance
1,득점,0.074874
57,대회_강원vs제주,0.044678
213,대회_포항vs대전,0.030367
153,대회_수원vs인천,0.029746
94,대회_대구vs포항,0.028914
144,대회_수원vs강원,0.024555
106,대회_서울vs강원,0.024305
140,대회_수원FCvs인천,0.023635
83,대회_대구vs광주,0.021431
84,대회_대구vs김천,0.020187


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize CatBoost model
model = CatBoostClassifier(random_seed=42, metric_period=100, depth=4)

# Train the model
model.fit(X_train_scaled, y_train)  # Specify categorical features

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Learning rate set to 0.079859
0:	learn: 1.0682594	total: 147ms	remaining: 2m 27s
100:	learn: 0.7512092	total: 376ms	remaining: 3.35s
200:	learn: 0.6486727	total: 578ms	remaining: 2.3s
300:	learn: 0.5769150	total: 786ms	remaining: 1.83s
400:	learn: 0.5197448	total: 1s	remaining: 1.5s
500:	learn: 0.4742173	total: 1.21s	remaining: 1.21s
600:	learn: 0.4358369	total: 1.42s	remaining: 944ms
700:	learn: 0.4012124	total: 1.63s	remaining: 696ms
800:	learn: 0.3671154	total: 1.84s	remaining: 457ms
900:	learn: 0.3389751	total: 2.05s	remaining: 225ms
999:	learn: 0.3124106	total: 2.26s	remaining: 0us
Accuracy: 0.5447761194029851
              precision    recall  f1-score   support

           0       0.41      0.27      0.32        41
           1       0.63      0.71      0.67        52
           2       0.52      0.61      0.56        41

    accuracy                           0.54       134
   macro avg       0.52      0.53      0.52       134
weighted avg       0.53      0.54      0.53       1

Unnamed: 0,Feature,Importance
1,득점,21.418086
32,클리어링,5.379082
35,획득,3.225846
34,차단,3.039019
30,경합 공중,2.870752
22,수비진영 패스,2.714519
24,롱패스,2.646933
31,태클,2.615507
46,총 가치,2.579184
29,경합 지상,2.480436


# PCA


In [28]:
from sklearn.decomposition import PCA

# 가정: df가 DataFrame인 경우
X = df.drop(columns=['경기결과', '기준'])
X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

y = df['경기결과']

# LabelEncoder를 대상 변수 'y'에 직접 적용
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

model = LogisticRegression(random_state=42, solver='newton-cg', max_iter=500)
model.fit(X_train_pca, y_train)

y_pred = model.predict(X_test_pca)

accuracy = accuracy_score(y_test, y_pred)
print(f'정확도: {accuracy}')
print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

정확도: 0.6194029850746269
              precision    recall  f1-score   support

           0       0.57      0.41      0.48        41
           1       0.65      0.75      0.70        52
           2       0.61      0.66      0.64        41

    accuracy                           0.62       134
   macro avg       0.61      0.61      0.60       134
weighted avg       0.61      0.62      0.61       134



In [29]:
pcsSummary_df = pd.DataFrame({'Standard deviation': np.sqrt(pca.explained_variance_),
                           'Proportion of variance': pca.explained_variance_ratio_,
                           'Cumulative proportion': np.cumsum(pca.explained_variance_ratio_)})
pcsSummary_df = pcsSummary_df.transpose()
pcsSummary_df.columns = ['PC{}'.format(i) for i in range(1, len(pcsSummary_df.columns) + 1)]
pcsSummary_df.round(4)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC147,PC148,PC149,PC150,PC151,PC152,PC153,PC154,PC155,PC156
Standard deviation,0.5939,0.5525,0.5108,0.444,0.3455,0.3204,0.3112,0.2964,0.2952,0.2418,...,0.0686,0.0681,0.0677,0.0671,0.066,0.0654,0.065,0.0646,0.0642,0.0639
Proportion of variance,0.1078,0.0933,0.0798,0.0603,0.0365,0.0314,0.0296,0.0269,0.0266,0.0179,...,0.0014,0.0014,0.0014,0.0014,0.0013,0.0013,0.0013,0.0013,0.0013,0.0012
Cumulative proportion,0.1078,0.2012,0.281,0.3412,0.3777,0.4091,0.4387,0.4656,0.4922,0.5101,...,0.9389,0.9403,0.9417,0.9431,0.9444,0.9457,0.947,0.9483,0.9495,0.9508


In [46]:
contributions_df = pd.DataFrame(pca.components_, columns=X.columns)

print(contributions_df.abs().sum(axis=0).sort_values(ascending=False)[:28])
# 서울 vs 성남? 
# 서울 : 1승 1무 2패
# 성남 : 2승 1무 1패 
# 성남 2승의 경기 지표상, 서울이 경기력은 좋았으나 패배한 경우임. 
# 절댓값으로 더했으니 실패한 예측력에 기여했을듯

차단             8.529684
대회_인천vs대구      8.477757
클리어링           8.427554
대회_수원vs대구      8.356949
블락             8.350296
키패스            8.337414
대회_대구vs제주      8.228254
대회_강원vs인천      8.197517
획득             8.194549
대회_전북vs울산      8.132815
대회_수원vs제주      8.127350
대회_전북vs인천      8.077724
대회_강원vs전북      8.014506
대회_수원vs울산      8.002739
대회_제주vs대구      7.962018
공격진영 패스        7.884280
대회_서울vs성남      7.846197
대회_대구vs포항      7.821292
롱패스            7.798662
대회_제주vs수원FC    7.796165
대회_전북vs제주      7.791390
대회_수원vs강원      7.783522
대회_인천vs수원FC    7.774685
대회_대구vs서울      7.769256
대회_수원FCvs강원    7.740753
대회_서울vs울산      7.739460
인터셉트           7.724103
대회_서울vs강원      7.721244
dtype: float64
