In [2]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import statsmodels.api as sm


df = pd.read_csv('K-League-data.csv').drop(columns='Unnamed: 0')
# print(df.shape)
# print(df)

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [4]:
X = df.drop(columns=['경기결과', '기준'])
X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

y = df['경기결과']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# scaler = StandardScaler()  
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(1198, 225) (134, 225)
(1198,) (134,)


In [5]:
# 다항 로지스틱 회귀

model = LogisticRegression(random_state=42, solver='newton-cg',max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.55      0.51      0.53        41
           1       0.65      0.71      0.68        52
           2       0.62      0.59      0.60        41

    accuracy                           0.61       134
   macro avg       0.61      0.60      0.60       134
weighted avg       0.61      0.61      0.61       134

[[21 11  9]
 [ 9 37  6]
 [ 8  9 24]]


In [6]:
# OLS

# results = sm.OLS(y_train,sm.add_constant(X_train_scaled)).fit()
# print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.452
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     3.600
Date:                Sun, 26 Nov 2023   Prob (F-statistic):           5.37e-43
Time:                        14:10:59   Log-Likelihood:                -1079.9
No. Observations:                1198   AIC:                             2608.
Df Residuals:                     974   BIC:                             3748.
Df Model:                         223                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7069      0.526     -1.344      0.1

In [None]:
# y_pred = results.predict(X_test_scaled)
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy}')
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [7]:
# 다층신경망

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.64      0.61      0.62        41
           1       0.60      0.60      0.60        52
           2       0.60      0.63      0.62        41

    accuracy                           0.61       134
   macro avg       0.61      0.61      0.61       134
weighted avg       0.61      0.61      0.61       134

[[25 12  4]
 [ 8 31 13]
 [ 6  9 26]]




In [8]:
# LDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6119402985074627
              precision    recall  f1-score   support

           0       0.54      0.54      0.54        41
           1       0.64      0.69      0.67        52
           2       0.65      0.59      0.62        41

    accuracy                           0.61       134
   macro avg       0.61      0.60      0.61       134
weighted avg       0.61      0.61      0.61       134

[[22 12  7]
 [10 36  6]
 [ 9  8 24]]


In [9]:
# QDA

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.35074626865671643
              precision    recall  f1-score   support

           0       0.39      0.95      0.55        41
           1       0.08      0.02      0.03        52
           2       0.32      0.17      0.22        41

    accuracy                           0.35       134
   macro avg       0.26      0.38      0.27       134
weighted avg       0.25      0.35      0.25       134

[[39  1  1]
 [37  1 14]
 [24 10  7]]




In [10]:
# SVM

from sklearn import svm

model = svm.SVC(random_state=42, kernel='linear')
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5970149253731343
              precision    recall  f1-score   support

           0       0.47      0.46      0.47        41
           1       0.67      0.67      0.67        52
           2       0.62      0.63      0.63        41

    accuracy                           0.60       134
   macro avg       0.59      0.59      0.59       134
weighted avg       0.60      0.60      0.60       134

[[19 11 11]
 [12 35  5]
 [ 9  6 26]]


In [11]:
# 그래디언트부스팅

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=500, )
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.5223880597014925
              precision    recall  f1-score   support

           0       0.37      0.32      0.34        41
           1       0.63      0.65      0.64        52
           2       0.51      0.56      0.53        41

    accuracy                           0.52       134
   macro avg       0.50      0.51      0.51       134
weighted avg       0.51      0.52      0.52       134

[[13 17 11]
 [ 7 34 11]
 [15  3 23]]


Unnamed: 0,Feature,Importance
1,득점,0.290005
32,클리어링,0.050301
22,수비진영 패스,0.024132
36,블락,0.022212
46,총 가치,0.021494
31,태클,0.020318
35,획득,0.020107
0,Rnd.,0.019191
27,크로스,0.01822
30,경합 공중,0.017255


In [12]:
# 에이다 부스트

from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=42,)
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.5373134328358209
              precision    recall  f1-score   support

           0       0.40      0.41      0.40        41
           1       0.66      0.56      0.60        52
           2       0.55      0.63      0.59        41

    accuracy                           0.54       134
   macro avg       0.54      0.54      0.53       134
weighted avg       0.55      0.54      0.54       134

[[17 11 13]
 [15 29  8]
 [11  4 26]]


Unnamed: 0,Feature,Importance
1,득점,0.16
24,롱패스,0.06
145,대회_수원vs광주,0.06
32,클리어링,0.06
34,차단,0.06
27,크로스,0.04
31,태클,0.04
22,수비진영 패스,0.04
46,총 가치,0.04
41,퇴장,0.02


In [13]:
# lightgbm

import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(random_state=42, max_depth=3, n_estimators=500)
lgb_model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3637
[LightGBM] [Info] Number of data points in the train set: 1198, number of used features: 50
[LightGBM] [Info] Start training from score -1.221941
[LightGBM] [Info] Start training from score -1.055323
[LightGBM] [Info] Start training from score -1.029286
Accuracy: 0.5373134328358209
              precision    recall  f1-score   support

           0       0.40      0.41      0.40        41
           1       0.66      0.56      0.60        52
           2       0.55      0.63      0.59        41

    accuracy                           0.54       134
   macro avg       0.54      0.54      0.53       134
weighted avg       0.55      0.54      0.54       134

[[17 11 13]
 [15 29  8]
 [11  4 26]]


Unnamed: 0,Feature,Importance
1,득점,0.16
24,롱패스,0.06
145,대회_수원vs광주,0.06
32,클리어링,0.06
34,차단,0.06
27,크로스,0.04
31,태클,0.04
22,수비진영 패스,0.04
46,총 가치,0.04
41,퇴장,0.02


In [14]:
# randomforest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# 모델 평가
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.5597014925373134
              precision    recall  f1-score   support

           0       0.44      0.29      0.35        41
           1       0.58      0.67      0.63        52
           2       0.60      0.68      0.64        41

    accuracy                           0.56       134
   macro avg       0.54      0.55      0.54       134
weighted avg       0.54      0.56      0.55       134

[[12 18 11]
 [ 9 35  8]
 [ 6  7 28]]


Unnamed: 0,Feature,Importance
1,득점,0.082692
2,도움,0.040078
32,클리어링,0.029699
4,유효 슈팅,0.025852
22,수비진영 패스,0.02438
25,중거리패스,0.023771
35,획득,0.023598
30,경합 공중,0.023228
27,크로스,0.023191
24,롱패스,0.023046


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# One-hot encode categorical variables
# X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize XGBoost model
model = XGBClassifier()

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Accuracy: 0.4701492537313433
              precision    recall  f1-score   support

           0       0.28      0.22      0.25        41
           1       0.59      0.56      0.57        52
           2       0.47      0.61      0.53        41

    accuracy                           0.47       134
   macro avg       0.45      0.46      0.45       134
weighted avg       0.46      0.47      0.46       134

[[ 9 16 16]
 [11 29 12]
 [12  4 25]]


Unnamed: 0,Feature,Importance
1,득점,0.074874
57,대회_강원vs제주,0.044678
213,대회_포항vs대전,0.030367
153,대회_수원vs인천,0.029746
94,대회_대구vs포항,0.028914
144,대회_수원vs강원,0.024555
106,대회_서울vs강원,0.024305
140,대회_수원FCvs인천,0.023635
83,대회_대구vs광주,0.021431
84,대회_대구vs김천,0.020187


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize CatBoost model
model = CatBoostClassifier(random_seed=42, metric_period=100, depth=4)

# Train the model
model.fit(X_train_scaled, y_train)  # Specify categorical features

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# display(feature_importance_df)
feature_importance_df[:20]

Learning rate set to 0.079859
0:	learn: 1.0682594	total: 137ms	remaining: 2m 16s
100:	learn: 0.7512092	total: 411ms	remaining: 3.66s
200:	learn: 0.6486727	total: 678ms	remaining: 2.69s
300:	learn: 0.5769150	total: 933ms	remaining: 2.17s
400:	learn: 0.5197448	total: 1.17s	remaining: 1.75s
500:	learn: 0.4742173	total: 1.41s	remaining: 1.41s
600:	learn: 0.4358369	total: 1.65s	remaining: 1.1s
700:	learn: 0.4012124	total: 1.88s	remaining: 802ms
800:	learn: 0.3671154	total: 2.1s	remaining: 523ms
900:	learn: 0.3389751	total: 2.35s	remaining: 258ms
999:	learn: 0.3124106	total: 2.58s	remaining: 0us
Accuracy: 0.5447761194029851
              precision    recall  f1-score   support

           0       0.41      0.27      0.32        41
           1       0.63      0.71      0.67        52
           2       0.52      0.61      0.56        41

    accuracy                           0.54       134
   macro avg       0.52      0.53      0.52       134
weighted avg       0.53      0.54      0.53     

Unnamed: 0,Feature,Importance
1,득점,21.418086
32,클리어링,5.379082
35,획득,3.225846
34,차단,3.039019
30,경합 공중,2.870752
22,수비진영 패스,2.714519
24,롱패스,2.646933
31,태클,2.615507
46,총 가치,2.579184
29,경합 지상,2.480436


# PCA


In [17]:
from sklearn.decomposition import PCA

# 가정: df가 DataFrame인 경우
X = df.drop(columns=['경기결과', '기준'])
X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

y = df['경기결과']

# LabelEncoder를 대상 변수 'y'에 직접 적용
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# 데이터 스케일 조정
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# pca 적용 : 정확도 95%가 될때까지
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 성능이 제일 괜찮은 회귀기반 모델
model = LogisticRegression(random_state=42, solver='newton-cg', max_iter=500)
model.fit(X_train_pca, y_train)

y_pred = model.predict(X_test_pca)

accuracy = accuracy_score(y_test, y_pred)
print(f'정확도: {accuracy}')
print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

# pca 이후 성능 유지

정확도: 0.6194029850746269
              precision    recall  f1-score   support

           0       0.57      0.41      0.48        41
           1       0.65      0.75      0.70        52
           2       0.61      0.66      0.64        41

    accuracy                           0.62       134
   macro avg       0.61      0.61      0.60       134
weighted avg       0.61      0.62      0.61       134



In [18]:
pcsSummary_df = pd.DataFrame({'Standard deviation': np.sqrt(pca.explained_variance_),
                           'Proportion of variance': pca.explained_variance_ratio_,
                           'Cumulative proportion': np.cumsum(pca.explained_variance_ratio_)})
pcsSummary_df = pcsSummary_df.transpose()
pcsSummary_df.columns = ['PC{}'.format(i) for i in range(1, len(pcsSummary_df.columns) + 1)]
pcsSummary_df.round(4)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC147,PC148,PC149,PC150,PC151,PC152,PC153,PC154,PC155,PC156
Standard deviation,0.5939,0.5525,0.5108,0.444,0.3455,0.3204,0.3112,0.2964,0.2952,0.2418,...,0.0686,0.0681,0.0677,0.0671,0.066,0.0654,0.065,0.0646,0.0642,0.0639
Proportion of variance,0.1078,0.0933,0.0798,0.0603,0.0365,0.0314,0.0296,0.0269,0.0266,0.0179,...,0.0014,0.0014,0.0014,0.0014,0.0013,0.0013,0.0013,0.0013,0.0013,0.0012
Cumulative proportion,0.1078,0.2012,0.281,0.3412,0.3777,0.4091,0.4387,0.4656,0.4922,0.5101,...,0.9389,0.9403,0.9417,0.9431,0.9444,0.9457,0.947,0.9483,0.9495,0.9508


In [19]:
contributions_df = pd.DataFrame(pca.components_, columns=X.columns)

print(contributions_df.abs().sum(axis=0).sort_values(ascending=False)[:28])
# 서울 vs 성남? 
# 서울 : 1승 1무 2패
# 성남 : 2승 1무 1패 
# 성남 2승의 경기 지표상, 서울이 경기력은 좋았으나 패배한 경우임. 
# 절댓값으로 더했으니 실패한 예측력에 기여했을듯

차단             8.529684
대회_인천vs대구      8.477757
클리어링           8.427554
대회_수원vs대구      8.356949
블락             8.350296
키패스            8.337414
대회_대구vs제주      8.228254
대회_강원vs인천      8.197517
획득             8.194549
대회_전북vs울산      8.132815
대회_수원vs제주      8.127350
대회_전북vs인천      8.077724
대회_강원vs전북      8.014506
대회_수원vs울산      8.002739
대회_제주vs대구      7.962018
공격진영 패스        7.884280
대회_서울vs성남      7.846197
대회_대구vs포항      7.821292
롱패스            7.798662
대회_제주vs수원FC    7.796165
대회_전북vs제주      7.791390
대회_수원vs강원      7.783522
대회_인천vs수원FC    7.774685
대회_대구vs서울      7.769256
대회_수원FCvs강원    7.740753
대회_서울vs울산      7.739460
인터셉트           7.724103
대회_서울vs강원      7.721244
dtype: float64


In [20]:
# print(contributions_df.abs().sum(axis=0).sort_values(ascending=False)[28:56])
# 구단 vs 구단이 주로 구성

In [21]:
print(contributions_df.abs().sum(axis=0).sort_values(ascending=False)[56:84])
# 도움이 득점보다 영향력이 높네,,
# 횡패스도 높았음

대회_울산vs강원      7.356615
대회_서울vs대구      7.342539
대회_울산vs수원      7.330036
횡패스            7.321087
대회_수원FCvs울산    7.309764
대회_성남vs울산      7.309452
대회_수원vs전북      7.300302
대회_전북vs서울      7.285003
대회_포항vs수원FC    7.281292
도움             7.275277
대회_성남vs포항      7.249329
득점             7.235037
대회_대구vs수원      7.218230
크로스            7.207202
대회_전북vs강원      7.202164
대회_울산vs대구      7.179829
대회_수원FCvs수원    7.148328
대회_수원FCvs포항    7.139317
대회_강원vs수원FC    7.123498
대회_수원vs포항      7.077885
대회_인천vs전북      7.063415
대회_전북vs수원      7.061838
대회_포항vs수원      7.061286
대회_울산vs서울      6.961195
대회_포항vs서울      6.893744
대회_대구vs인천      6.892823
대회_강원vs수원      6.877159
대회_대구vs울산      6.866363
dtype: float64


In [22]:
print(contributions_df.abs().sum(axis=0).sort_values(ascending=False)[146:156])

소속 선수 수        5.623336
태클             5.452972
탈압박            5.360130
대회_제주vs전북      5.322418
대회_수원FCvs광주    5.282182
용병 수           5.228287
대회_수원vs수원FC    5.001900
퇴장             4.814241
대회_전북vs포항      4.729139
드리블            4.112648
dtype: float64


In [23]:
from sklearn.decomposition import PCA

# 가정: df가 DataFrame인 경우
X = df.drop(columns=['경기결과', '기준'])
X = pd.get_dummies(X, columns=['대회', 'H/A', '시즌'], drop_first=True)

y = df['경기결과']

# LabelEncoder를 대상 변수 'y'에 직접 적용
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# 데이터 스케일 조정
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# pca 적용 : 정확도 95%가 될때까지
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 성능이 제일 괜찮은 트리기반 모델
model = RandomForestClassifier(random_state=42)
model.fit(X_train_pca, y_train)

# 모델 평가
y_pred = model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# pca 이후 성능 유지 못함

Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.55      0.39      0.46        41
           1       0.48      0.52      0.50        52
           2       0.49      0.59      0.53        41

    accuracy                           0.50       134
   macro avg       0.51      0.50      0.50       134
weighted avg       0.51      0.50      0.50       134

[[16 16  9]
 [ 9 27 16]
 [ 4 13 24]]


# 변수 선택법 - 단계적 선택법 ( stepwise selection )

In [33]:
print(type(X.columns))

<class 'pandas.core.indexes.base.Index'>


In [44]:
# 변수선택을 통해 형성한 모델의 aic 를 구함
# aic가 낮을 수록 모델이 좋다고 평가

def processSubset(X, y, feature_set):
    model = sm.OLS(y, sm.add_constant(X[feature_set])) # Modeling
    regr = model.fit() # model fitting
    AIC = regr.aic # model's AIC
    return {"model" : regr, "AIC" : AIC}

processSubset(X=X_train, y= y_train, feature_set=X.columns)


{'model': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x27163cdb640>,
 'AIC': 2607.7895104807712}

In [48]:
import time
import itertools

def processSubset(X, y, feature_set):
    model = sm.OLS(y, sm.add_constant(X[feature_set]))  # feature_set으로 X 선택
    regr = model.fit()  # model fitting
    AIC = regr.aic  # model's AIC
    return {"model": regr, "AIC": AIC}

def getBest(X, y, k):
    tic = time.time()
    results = []  # 결과 저장 공간
    for combo in itertools.combinations(X.columns.difference(['const']), k):
        # 각 변수 조합을 고려한 경우의 수
        combo = (list(combo) + ['const'])
        # 상수항을 추가하여 combo를 결성
        results.append(processSubset(X, y, feature_set=combo))  # 모델링된 것을 저장

        # 만약 k=2이면 여기서 두가지 변수만 뽑아서 경우의 수를 분석하여
        # 저장 후 그 중 AIC가 가장 낮은 모델을 선택하도록 함

    models = pd.DataFrame(results)  # 데이터프레임으로 모델결과 변환
    best_model = models.loc[models['AIC'].idxmin()]  # idxmin은 최소값의 인덱스를 뽑는 함수

    toc = time.time()
    print("Processed", models.shape[0], "models on", k, "predictors in", (toc - tic), "seconds.")

    return best_model

# X_train_scaled을 사용하려면 해당 데이터프레임에 'const' 열을 추가해야 합니다.
X_train['const'] = 1

print(getBest(X=X_train, y=y_train, k=2))


Processed 25200 models on 2 predictors in 67.71621918678284 seconds.
model    <statsmodels.regression.linear_model.Regressio...
AIC                                            2603.971654
Name: 15431, dtype: object


In [49]:
models = pd.DataFrame(columns=["AIC","model"])
tic = time.time()
for i in range(1,4):
	models.loc[i] = getBest(X=X_train, y=y_train,k=i)
toc = time.time()
print("Total elapsed time:",(toc-tic),"seconds.")

# i=3 1시간 걸림. i=2까지만 계산 

Processed 225 models on 1 predictors in 0.5496170520782471 seconds.
Processed 25200 models on 2 predictors in 65.41166257858276 seconds.


KeyboardInterrupt: 

In [50]:
models

Unnamed: 0,AIC,model
1,2616.816186,<statsmodels.regression.linear_model.Regressio...
2,2603.971654,<statsmodels.regression.linear_model.Regressio...


In [52]:
models.loc[2, "model"].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.21
Model:,OLS,Adj. R-squared:,0.208
Method:,Least Squares,F-statistic:,158.6
Date:,"Sun, 26 Nov 2023",Prob (F-statistic):,8.13e-62
Time:,15:41:04,Log-Likelihood:,-1299.0
No. Observations:,1198,AIC:,2604.0
Df Residuals:,1195,BIC:,2619.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
대회_성남vs대구,-1.3869,0.359,-3.860,0.000,-2.092,-0.682
득점,0.3297,0.019,17.561,0.000,0.293,0.367
const,0.6450,0.032,20.345,0.000,0.583,0.707

0,1,2,3
Omnibus:,320.361,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,84.203
Skew:,-0.401,Prob(JB):,5.19e-19
Kurtosis:,1.978,Cond. No.,32.5


In [66]:
# 전진 선택법
def forward(X, y, predictors):
    remaining_predictors = [p for p in X.columns.difference(['const']) if p not in predictors]
    tic = time.time()
    results = []
    for p in remaining_predictors:
        results.append(processSubset(X=X, y=y, feature_set=predictors+[p]+['const']))
    
    # 데이터프레임으로 변환
    models = pd.DataFrame(results)
    
    # AIC가 가장 낮은 것을 선택
    best_model = models.loc[models['AIC'].idxmin()]  # 수정된 부분
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic))
    print("Selected predictors:", best_model["model"].model.exog_names, "AIC: ", best_model['AIC'])  # 수정된 부분
    return best_model

# 후진 선택법 (후진 소거법)
def backward(X, y, predictors):
    tic = time.time()
    results = []
    
    # 데이터 변수들이 미리 정의된 predictors 조합 확인
    for combo in itertools.combinations(predictors, len(predictors) - 1):
        results.append(processSubset(X=X, y=y, feature_set=list(combo)+['const']))
    
    models = pd.DataFrame(results)
    
    # 가장 낮은 AIC를 가진 모델을 선택
    best_model = models.loc[models['AIC'].idxmin()]  # 수정된 부분
    toc = time.time()
    
    print("Processed ", models.shape[0], "models on", len(predictors) - 1, "predictors in", (toc-tic))
    print("Selected predictors:", best_model['model'].model.exog_names, ' AIC:', best_model['AIC'])  # 수정된 부분
    return best_model


In [None]:
def Stepwise_model(X, y):
    Stepmodels = pd.DataFrame(columns=["AIC", "model"])
    tic = time.time()
    predictors = []
    Smodel_before = processSubset(X, y, predictors + ['const'])['AIC']
    
    # 변수 1~10개 0-9 -> 1-10
    for i in range(1, len(X.columns.difference(['const'])) + 1):
        Forward_result = forward(X=X, y=y, predictors=predictors)  # constant added
        print('forward')
        predictors = Stepmodels.loc[i]['model'].model.model.exog_names
        predictors = [k for k in predictors if k != 'const']
        Backward_result = backward(X=X, y=y, predictors=predictors)
        if Backward_result["AIC"] < Forward_result["AIC"]:
            Stepmodels.loc[i] = Backward_result
            predictors = Stepmodels.loc[i]["model"].model.model.exog_names
            Smodel_before = Stepmodels.loc[i]["AIC"]
            predictors = [k for k in predictors if k != "const"]
            print('backward')
        if Stepmodels.loc[i]["AIC"] > Smodel_before:
            break
        else:
            Smodel_before = Stepmodels.loc[i]["AIC"]
    
    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")
    return Stepmodels.loc[Stepmodels['AIC'].idxmin()]["model"].model

Stepwise_best_model = Stepwise_model(X=X_train, y=y_train)

Processed  225 models on 1 predictors in 56.194833755493164
Selected predictors: ['득점', 'const'] AIC:  2616.8161855512703
forward


KeyError: 1

In [None]:
Stepwise_best_model.aic

In [None]:
print(Stepwise_best_model.params.shape)

In [None]:
Stepwise_best_model.predict(X_test[Stepwise_best_model.model.exog_names])

In [None]:
# OLS분석에서 다중공선성의 문제가 있을 수 있다고 했음. 
# 변수 선택법을 통해 변수를 줄이며 모델의 신뢰도를 높일 수 있음.

In [70]:
def stepwise_feature_selection(X_train, y_train, variables=X_train.columns.tolist() ):
    import statsmodels.api as sm
    import matplotlib.pyplot as plt
    import warnings
    warnings.filterwarnings("ignore")
    
    y = y_train ## 반응 변수

    selected_variables = [] ## 선택된 변수들
    sl_enter = 0.05
    sl_remove = 0.05
    
    sv_per_step = [] ## 각 스텝별로 선택된 변수들
    adjusted_r_squared = [] ## 각 스텝별 수정된 결정계수
    steps = [] ## 스텝
    step = 0
    while len(variables) > 0:
        remainder = list(set(variables) - set(selected_variables))
        pval = pd.Series(index=remainder) ## 변수의 p-value
        ## 기존에 포함된 변수와 새로운 변수 하나씩 돌아가면서 
        ## 선형 모형을 적합한다.
        for col in remainder: 
            X = X_train[selected_variables+[col]]
            X = sm.add_constant(X)
            model = sm.OLS(y,X).fit(disp=0)
            pval[col] = model.pvalues[col]
    
        min_pval = pval.min()
        if min_pval < sl_enter: ## 최소 p-value 값이 기준 값보다 작으면 포함
            selected_variables.append(pval.idxmin())
            ## 선택된 변수들에대해서
            ## 어떤 변수를 제거할지 고른다.
            while len(selected_variables) > 0:
                selected_X = X_train[selected_variables]
                selected_X = sm.add_constant(selected_X)
                selected_pval = sm.OLS(y,selected_X).fit(disp=0).pvalues[1:] ## 절편항의 p-value는 뺀다
                max_pval = selected_pval.max()
                if max_pval >= sl_remove: ## 최대 p-value값이 기준값보다 크거나 같으면 제외
                    remove_variable = selected_pval.idxmax()
                    selected_variables.remove(remove_variable)
                else:
                    break
            
            step += 1
            steps.append(step)
            adj_r_squared = sm.OLS(y,sm.add_constant(X_train[selected_variables])).fit(disp=0).rsquared_adj
            adjusted_r_squared.append(adj_r_squared)
            sv_per_step.append(selected_variables.copy())
        else:
            break

    fig = plt.figure(figsize=(100,10))
    fig.set_facecolor('white')
    
    font_size = 15
    plt.xticks(steps,[f'step {s}\n'+'\n'.join(sv_per_step[i]) for i,s in enumerate(steps)], fontsize=12)
    plt.plot(steps,adjusted_r_squared, marker='o')
      
    plt.ylabel('Adjusted R Squared',fontsize=font_size)
    plt.grid(True)
    plt.show()

    return selected_variables
    

selected_variables = stepwise_feature_selection(X_train, y_train)