In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import (VotingClassifier,
                              RandomForestClassifier,
                              GradientBoostingClassifier,
                              AdaBoostClassifier,
                              BaggingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
random_state = 42

In [3]:
# 데이터 로드
train = pd.read_csv('dataset/학업위험 예측(다중분류)/train.csv')
test = pd.read_csv('dataset/학업위험 예측(다중분류)/test.csv')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [5]:
# 데이터 분리
# kfold 수행 시 DataFrame 객체를 numpy 객체로 변환 필수
X = train.drop(['id', 'Target'], axis=1)
y = train['Target']
test_id = test.pop('id')

In [6]:
print(X.shape)
print(y.shape)

(76518, 36)
(76518,)


In [7]:
# float64 타입 컬럼명 추출
float_columns = X.select_dtypes(include=['float64']).columns

In [8]:
# float64 데이터 스케일링
scaler = RobustScaler()
scaled_float_X = scaler.fit_transform(X[float_columns])
scaled_float_X = pd.DataFrame(scaled_float_X, columns=float_columns)
scaled_float_test = scaler.transform(test[float_columns])
scaled_float_test = pd.DataFrame(scaled_float_test, columns=float_columns)

In [9]:
# int 데이터와 스케일링 데이터 합치기
int_columns = X.select_dtypes(include=['int64']).columns
X = pd.concat([scaled_float_X, X[int_columns]], axis=1).to_numpy()
test = pd.concat([scaled_float_test, test[int_columns]], axis=1).to_numpy()

### RandomForestClassifer
- n_estimators : 트리 개수. 일반적으로 값이 클수록 모델의 성능이 향상 됨
- max_depth : 트리 최대 깊이
- min_samples_split : 각 노드에서 분할을 위해 필요한 최소 샘플 수
- min_samples_leaf : 리프 노드에 필요한 최소 샘플 수
- criterion : 불순도 기준

In [10]:
# 모델 객체 생성
rf = RandomForestClassifier(criterion='entropy', random_state=random_state)
lgbm = LGBMClassifier(verbose=-1)
gb = GradientBoostingClassifier()
bagging = BaggingClassifier()
xgb = XGBClassifier()
catboost = CatBoostClassifier(verbose=False)

# ada = AdaBoostClassifier()
# gnb = GaussianNB()
# knn = KNeighborsClassifier()
# svm = SVC(probability=True)
# lg = LogisticRegression()
# dc = DecisionTreeClassifier()

# model = VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('dc', dc),
#                                      ('ada', ada), ('xgb', xgb),('lgbm', lgbm),
#                                      ('catboost', catboost), ('bagging', bagging),
#                                      ('gnb', gnb), ('knn', knn), ('svm', svm), ('lg', lg)],
#                          voting='hard')  # 소프트 보팅을 사용하여 predict_proba를 기반으로 앙상블

model = VotingClassifier(estimators=[('rf', rf), ('gb', gb),('xgb', xgb),('lgbm', lgbm),
                                     ('catboost', catboost), ('bagging', bagging),],
                         voting='soft')  # 소프트 보팅을 사용하여 predict_proba를 기반으로 앙상블

### StratifiedKFold vs ReapeatedStratifiedKFold
- StratifiedKFold
    - 각 클래스 비율을 각 폴드에서 동일하게 유지
    - 각 폴드가 전체 데이터셋의 클래스 분포를 대표함
- ReapeatedStratifiedKFold
    - skf와 동일하나 n_repeats 각 반복마다 데이터를 다르게 분할하여 더 신뢰할 수 있는 결과 확보

In [11]:
# 계층화 K-fold 교차 검증 객체 생성
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

In [12]:
# from sklearn.model_selection import cross_val_score
# cross_val_score(model, X, y, cv=skf, scoring="accuracy")

In [13]:
best_acc = 0.
acc_lst = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    model.fit(X_train, y_train)
    preds_val = model.predict(X_val)
    acc_val = accuracy_score(y_val, preds_val)
    print(acc_val)
    acc_lst.append(acc_val)

0.8335729221118662
0.8322660742289597
0.8309592263460533
0.8290531268378749
0.8304907534470365


In [14]:
print('Mean CV accuracy:', np.mean(acc_lst))

Mean CV accuracy: 0.8312684205943581


In [15]:
# 전체 훈련 데이터로 최종 모델 학습
model.fit(X, y)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(criterion='entropy',
                                                     random_state=42)),
                             ('gb', GradientBoostingClassifier()),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=None,
                                            gpu_id...
                                            max_leaves=None,
                                            mi

In [16]:
# 테스트 데이터 추론
preds_test = model.predict(test)

In [17]:
# 제출
submission = pd.DataFrame({'id': test_id, 'Target': preds_test})
submission.to_csv('submission_skfold_ensemble.csv', index=False)

In [18]:
# skt, MinMaxScaler, voting(일부), only 연속형 데이터 스케일링 : 0.83346
# skt, StandardScaler, voting(일부), only 연속형 데이터 스케일링 : 0.83267
# skt, StandardScaler, voting(전부) : 0.83003
# outlier, oversampling, skt, StandardScaler : 별로
# skt, StandardScaler : 0.82915
# skt, MinMaxScaler : 0.82915
# skt         : 0.82846
# no          : 0.82650

In [13]:
row

(0,
 0    anyio
 1    3.6.2
 Name: 0, dtype: object)