In [5]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler


df = pd.read_csv('K-League-data.csv').drop(columns='Unnamed: 0')
print(df.shape)
print(df)

(1332, 52)
      Rnd.        대회   H/A  득점  도움  슈팅  유효 슈팅  블락된슈팅  벗어난슈팅  PA내 슈팅  ...  경고  \
0        1  대구vs수원FC  HOME   1   1  13      4      2      7       5  ...   1   
1        1    대구vs서울  HOME   0   0  10      1      3      6       7  ...   4   
2        1    제주vs포항  HOME   0   0   9      1      3      5       5  ...   2   
3        1    서울vs인천  HOME   2   1  13      7      2      4       7  ...   3   
4        1    포항vs대구  AWAY   2   1   9      4      3      2       6  ...   3   
...    ...       ...   ...  ..  ..  ..    ...    ...    ...     ...  ...  ..   
1327    38    포항vs강원  AWAY   0   0  11      0      2      9       6  ...   3   
1328    38  수원FCvs서울  HOME   0   0  10      0      5      5       5  ...   0   
1329    38    울산vs대구  AWAY   0   0   9      4      1      4       3  ...   3   
1330    38  수원FCvs수원  HOME   2   1  12      3      4      5       5  ...   2   
1331    38    울산vs대구  HOME   2   2  19      6      9      4      10  ...   1   

      퇴장       기준    시즌  경기결

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [23]:
X = df.drop(columns=['경기결과', '기준'])
X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

y = df['경기결과']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# scaler = StandardScaler()  
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(1198, 225) (134, 225)
(1198,) (134,)


In [24]:
model = LogisticRegression(random_state=42, solver='newton-cg',max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.55      0.51      0.53        41
           1       0.65      0.71      0.68        52
           2       0.62      0.59      0.60        41

    accuracy                           0.61       134
   macro avg       0.61      0.60      0.60       134
weighted avg       0.61      0.61      0.61       134



In [9]:
# 다층신경망

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.64      0.61      0.62        41
           1       0.60      0.60      0.60        52
           2       0.60      0.63      0.62        41

    accuracy                           0.61       134
   macro avg       0.61      0.61      0.61       134
weighted avg       0.61      0.61      0.61       134





In [10]:
# LDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.54      0.54      0.54        41
           1       0.64      0.69      0.67        52
           2       0.65      0.59      0.62        41

    accuracy                           0.61       134
   macro avg       0.61      0.60      0.61       134
weighted avg       0.61      0.61      0.61       134



In [11]:
# QDA

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.35074626865671643
              precision    recall  f1-score   support

           0       0.39      0.95      0.55        41
           1       0.08      0.02      0.03        52
           2       0.32      0.17      0.22        41

    accuracy                           0.35       134
   macro avg       0.26      0.38      0.27       134
weighted avg       0.25      0.35      0.25       134





In [12]:
# SVM

from sklearn import svm

model = svm.SVC(random_state=42, kernel='linear')
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.5970149253731343
              precision    recall  f1-score   support

           0       0.47      0.46      0.47        41
           1       0.67      0.67      0.67        52
           2       0.62      0.63      0.63        41

    accuracy                           0.60       134
   macro avg       0.59      0.59      0.59       134
weighted avg       0.60      0.60      0.60       134



In [13]:
# 그래디언트부스팅

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=500, )
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.5223880597014925
              precision    recall  f1-score   support

           0       0.37      0.32      0.34        41
           1       0.63      0.65      0.64        52
           2       0.51      0.56      0.53        41

    accuracy                           0.52       134
   macro avg       0.50      0.51      0.51       134
weighted avg       0.51      0.52      0.52       134



In [14]:
# 에이다 부스트

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=42,)
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.5373134328358209
              precision    recall  f1-score   support

           0       0.40      0.41      0.40        41
           1       0.66      0.56      0.60        52
           2       0.55      0.63      0.59        41

    accuracy                           0.54       134
   macro avg       0.54      0.54      0.53       134
weighted avg       0.55      0.54      0.54       134



In [15]:
# lightgbm

import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(random_state=42, max_depth=3, n_estimators=500)
lgb_model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3637
[LightGBM] [Info] Number of data points in the train set: 1198, number of used features: 50
[LightGBM] [Info] Start training from score -1.221941
[LightGBM] [Info] Start training from score -1.055323
[LightGBM] [Info] Start training from score -1.029286
Accuracy: 0.5373134328358209
              precision    recall  f1-score   support

           0       0.40      0.41      0.40        41
           1       0.66      0.56      0.60        52
           2       0.55      0.63      0.59        41

    accuracy                           0.54       134
   macro avg       0.54      0.54      0.53       134
weighted avg       0.55      0.54      0.54       134



In [16]:
# randomforest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.5597014925373134
              precision    recall  f1-score   support

           0       0.44      0.29      0.35        41
           1       0.58      0.67      0.63        52
           2       0.60      0.68      0.64        41

    accuracy                           0.56       134
   macro avg       0.54      0.55      0.54       134
weighted avg       0.54      0.56      0.55       134



In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# One-hot encode categorical variables
# X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize XGBoost model
model = XGBClassifier()

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


Accuracy: 0.4701492537313433
              precision    recall  f1-score   support

           0       0.28      0.22      0.25        41
           1       0.59      0.56      0.57        52
           2       0.47      0.61      0.53        41

    accuracy                           0.47       134
   macro avg       0.45      0.46      0.45       134
weighted avg       0.46      0.47      0.46       134



In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize CatBoost model
model = CatBoostClassifier(random_seed=42, metric_period=100, depth=4)

# Train the model
model.fit(X_train_scaled, y_train)  # Specify categorical features

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Learning rate set to 0.079859
0:	learn: 1.0682594	total: 142ms	remaining: 2m 22s
100:	learn: 0.7512092	total: 964ms	remaining: 8.58s
200:	learn: 0.6486727	total: 1.98s	remaining: 7.87s
300:	learn: 0.5769150	total: 3.23s	remaining: 7.49s
400:	learn: 0.5197448	total: 4.19s	remaining: 6.25s
500:	learn: 0.4742173	total: 4.55s	remaining: 4.53s
600:	learn: 0.4358369	total: 4.87s	remaining: 3.23s
700:	learn: 0.4012124	total: 5.19s	remaining: 2.21s
800:	learn: 0.3671154	total: 5.49s	remaining: 1.36s
900:	learn: 0.3389751	total: 5.8s	remaining: 638ms
999:	learn: 0.3124106	total: 6.11s	remaining: 0us
Accuracy: 0.5447761194029851
              precision    recall  f1-score   support

           0       0.41      0.27      0.32        41
           1       0.63      0.71      0.67        52
           2       0.52      0.61      0.56        41

    accuracy                           0.54       134
   macro avg       0.52      0.53      0.52       134
weighted avg       0.53      0.54      0.53    

# PCA


In [53]:
from sklearn.decomposition import PCA

# 가정: df가 DataFrame인 경우
X = df.drop(columns=['경기결과', '기준'])
X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

y = df['경기결과']

# LabelEncoder를 대상 변수 'y'에 직접 적용
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA를 적용하여 특성 차원 축소
# n_components = 10  # 주성분의 개수 선택
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 로지스틱 회귀 모델 초기화 및 훈련
model = LogisticRegression(random_state=42, solver='newton-cg', max_iter=500)
model.fit(X_train_pca, y_train)

# 테스트 세트에 대한 예측 수행
y_pred = model.predict(X_test_pca)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'정확도: {accuracy}')
print(classification_report(y_test, y_pred))


정확도: 0.6194029850746269
              precision    recall  f1-score   support

           0       0.57      0.41      0.48        41
           1       0.65      0.75      0.70        52
           2       0.61      0.66      0.64        41

    accuracy                           0.62       134
   macro avg       0.61      0.61      0.60       134
weighted avg       0.61      0.62      0.61       134

[0.35268724 0.30523622 0.26095352 0.19711759 0.11934061 0.1026808
 0.09681667 0.08785402 0.08711624 0.05845146 0.05423579 0.05010706
 0.03517051 0.03302601 0.03212817 0.03067636 0.02944265 0.02670854
 0.02592838 0.02529825 0.02428598 0.02298982 0.02163389 0.02084466
 0.01901294 0.01843404 0.01690513 0.0160761  0.01559683 0.01525151
 0.01392488 0.013466   0.01345599 0.01195732 0.01138097 0.01126064
 0.01099343 0.0108427  0.01016034 0.00999301 0.00990377 0.00984843
 0.00977368 0.00972348 0.0095915  0.00951302 0.00933265 0.00921608
 0.0091163  0.00904299 0.00894746 0.00882872 0.0087967  0.00

In [54]:
pcsSummary_df = pd.DataFrame({'Standard deviation': np.sqrt(pca.explained_variance_),
                           'Proportion of variance': pca.explained_variance_ratio_,
                           'Cumulative proportion': np.cumsum(pca.explained_variance_ratio_)})
pcsSummary_df = pcsSummary_df.transpose()
pcsSummary_df.columns = ['PC{}'.format(i) for i in range(1, len(pcsSummary_df.columns) + 1)]
pcsSummary_df.round(4)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC147,PC148,PC149,PC150,PC151,PC152,PC153,PC154,PC155,PC156
Standard deviation,0.5939,0.5525,0.5108,0.444,0.3455,0.3204,0.3112,0.2964,0.2952,0.2418,...,0.0686,0.0681,0.0677,0.0671,0.066,0.0654,0.065,0.0646,0.0642,0.0639
Proportion of variance,0.1078,0.0933,0.0798,0.0603,0.0365,0.0314,0.0296,0.0269,0.0266,0.0179,...,0.0014,0.0014,0.0014,0.0014,0.0013,0.0013,0.0013,0.0013,0.0013,0.0012
Cumulative proportion,0.1078,0.2012,0.281,0.3412,0.3777,0.4091,0.4387,0.4656,0.4922,0.5101,...,0.9389,0.9403,0.9417,0.9431,0.9444,0.9457,0.947,0.9483,0.9495,0.9508
