In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# Prefix prepended to every CSV file name; empty means paths are relative to
# the current working directory.
file_path = ''

# Read the labelled training table and the unlabelled test table.
train = pd.read_csv(f"{file_path}./train.csv")
test = pd.read_csv(f"{file_path}./test.csv")

# Target column, and the feature matrix (everything except target and row ID).
y = train["SUBCLASS"]
X = train.drop(columns=["SUBCLASS", "ID"])

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Encode the string labels as integers; the encoder is fitted on the training
# split only and then reused for the validation split.
le_subclass = LabelEncoder()
y_train_encoded = le_subclass.fit_transform(y_train)
y_val_encoded = le_subclass.transform(y_val)




In [None]:
# Columns holding string/categorical values, detected on the training split.
categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns

# Fit the ordinal encoder on the training split only; categories that were not
# seen during fitting are encoded as -1 at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(X_train[categorical_columns])


def _encode_categoricals(frame):
    """Return a copy of `frame` with its categorical columns ordinal-encoded.

    Uses the module-level `ordinal_encoder` and `categorical_columns`; the
    input frame itself is left unmodified.
    """
    encoded = frame.copy()
    encoded[categorical_columns] = ordinal_encoder.transform(frame[categorical_columns])
    return encoded


# Apply the same fitted encoder to the training, validation and test features
# (the test table additionally has its ID column removed first).
train_x_encoded = _encode_categoricals(X_train)
val_x_encoded = _encode_categoricals(X_val)
test_x_encoded = _encode_categoricals(test.drop(columns=['ID']))

In [10]:
# Display the shape of the encoded training split together with the class
# counts of the full (unsplit) training table.
(train_x_encoded.shape, train["SUBCLASS"].value_counts())

((4960, 4384),
 SUBCLASS
 BRCA      786
 KIPAN     515
 GBMLGG    461
 STES      379
 KIRC      334
 THCA      324
 SKCM      276
 PRAD      266
 OV        253
 LGG       229
 HNSC      223
 COAD      223
 UCEC      198
 SARC      198
 LUAD      184
 LUSC      178
 LIHC      158
 LAML      158
 CESC      155
 PCPG      147
 TGCT      124
 PAAD      120
 BLCA      104
 THYM       98
 ACC        72
 DLBC       38
 Name: count, dtype: int64)

## Train

In [3]:
from sklearn.metrics import f1_score, classification_report
import lightgbm as lgb


# Put every frame/label vector on a fresh 0..n-1 index so that the positional
# fold indices, the labels and the sample weights all refer to the same rows.
train_x_encoded = train_x_encoded.reset_index(drop=True)
train_y_encoded = pd.Series(y_train_encoded).reset_index(drop=True)
val_x_encoded = val_x_encoded.reset_index(drop=True)
val_y_encoded = pd.Series(y_val_encoded).reset_index(drop=True)
test_x_encoded = test_x_encoded.reset_index(drop=True)

# NOTE: the encoded data is used under its own names below. The raw
# X_train / y_train from the split cell are deliberately NOT rebound to the
# encoded versions: the encoding cell reads X_train to detect the categorical
# columns, so overwriting it here made that cell fail when re-run.

# Number of target classes present in the training labels.
num_classes = len(np.unique(train_y_encoded))

# Stratified K-fold over the training split.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Accumulators for the fold-averaged class probabilities on the test set and on
# the hold-out validation split.
oof_lgb_test_preds = np.zeros((test_x_encoded.shape[0], num_classes))
ensemble_val_proba = np.zeros((val_x_encoded.shape[0], num_classes))

# LightGBM parameters.
params = {
    'objective': 'multiclass',
    'num_class': num_classes,
    'learning_rate': 0.01786236294491012,
    'random_state': 42,
    'metric': 'multi_logloss',
    'early_stopping_rounds': 100,
    'n_jobs': -1,
    'verbose': -1,
    'num_leaves': 31,
    'min_data_in_leaf': 27,
    'lambda_l1': 0.00046641762342032746,
    'lambda_l2': 8.228074508440626e-06,
    'min_split_gain': 0.0008507877755254931,
    'min_child_weight': 0.00044655605235435
}

# Per-fold bookkeeping: macro F1, per-class F1 and feature importances.
macro_f1_score_list = []
feature_importance_list = []
class_f1_scores = []

# Balanced class weights: n_samples / (n_classes * class_count), so rarer
# classes get proportionally larger weights.
class_counts = np.bincount(train_y_encoded)
total_samples = len(train_y_encoded)
class_weights = total_samples / (len(class_counts) * class_counts)

# Disabled manual per-class boosts:
# class_weights[6] *= 2
# class_weights[2] *= 2
# class_weights[3] *= 2

# One weight per training row, looked up from its class.
weights = class_weights[train_y_encoded.to_numpy()]

# The validation half of each fold split is discarded: every model is trained
# on 9/10 of the training split, while the eval sets passed to lgb.train and
# the scoring below both use the fixed hold-out split (val_x_encoded). The
# per-fold scores are therefore all measured on the same rows.
for idx, (train_idx, _) in enumerate(folds.split(train_x_encoded, train_y_encoded)):
    print('#' * 40, f'폴드 {idx+1} / {folds.n_splits}', "#" * 40)
    # train_idx holds positions, so index positionally with .iloc.
    X_tr = train_x_encoded.iloc[train_idx]
    y_tr = train_y_encoded.iloc[train_idx]
    weights_tr = weights[train_idx]

    d_train = lgb.Dataset(X_tr, label=y_tr, weight=weights_tr)
    d_valid = lgb.Dataset(val_x_encoded, label=val_y_encoded, reference=d_train)

    lgb_model = lgb.train(params=params,
                          train_set=d_train,
                          num_boost_round=2000,
                          valid_sets=[d_train, d_valid])

    # Add this fold's share of the averaged test-set probabilities.
    oof_lgb_test_preds += lgb_model.predict(test_x_encoded, num_iteration=lgb_model.best_iteration) / folds.n_splits

    # Predict the hold-out split, both for this fold's score and for the
    # fold-averaged (ensemble) score reported after the loop.
    val_preds_proba = lgb_model.predict(val_x_encoded, num_iteration=lgb_model.best_iteration)
    ensemble_val_proba += val_preds_proba / folds.n_splits
    val_preds = np.argmax(val_preds_proba, axis=1)

    # Macro-averaged and per-class F1 of this fold's model on the hold-out split.
    macro_f1 = f1_score(val_y_encoded, val_preds, average='macro')
    class_f1 = f1_score(val_y_encoded, val_preds, average=None)
    macro_f1_score_list.append(macro_f1)
    class_f1_scores.append(class_f1)

    print(f'폴드 {idx+1} Macro F1 score: {macro_f1}\n')
    print(f'폴드 {idx+1} 클래스별 F1 score:\n{classification_report(val_y_encoded, val_preds)}\n')

    feature_importance_list.append(lgb_model.feature_importance())

# Mean of the individual fold models' macro F1 on the hold-out split.
print(f'검증 평균 Macro F1 score: {np.mean(macro_f1_score_list)}')

# Macro F1 of the fold-averaged probabilities on the hold-out split. The test
# predictions in oof_lgb_test_preds are averaged the same way, so this is the
# score that corresponds to them.
ensemble_val_preds = np.argmax(ensemble_val_proba, axis=1)
ensemble_macro_f1 = f1_score(val_y_encoded, ensemble_val_preds, average='macro')
print(f'검증 앙상블 Macro F1 score: {ensemble_macro_f1}')

# Experiment log: 3 folds (run time: 3 minutes).
# NOTE(review): recorded with 3 folds while this cell now uses 10 - presumably
# from an earlier configuration; confirm before comparing with current output.
# Mean validation Macro F1 score: 0.7014147468057759  basic
# Mean validation Macro F1 score: 0.7026641057374847  lr
# Mean validation Macro F1 score: 0.6668368970749656  num leaves (basic)
# Mean validation Macro F1 score: 0.7005378207730599  min_data_in_leaf
# Mean validation Macro F1 score: 0.7042535190375319  lambda1
# Mean validation Macro F1 score: 0.6979089872451847  lambda2 (excluded)
# Mean validation Macro F1 score: 0.6972929850759524  gain (excluded)
# Mean validation Macro F1 score: 0.7022703359832209  min_child_weight
# Mean validation Macro F1 score: 0.699685460809067   final

######################################## 폴드 1 / 10 ########################################
폴드 1 Macro F1 score: 0.29123782146058275

폴드 1 클래스별 F1 score:
              precision    recall  f1-score   support

           0       0.38      0.86      0.52        14
           1       0.24      0.19      0.21        21
           2       0.67      0.33      0.44       157
           3       0.12      0.19      0.15        31
           4       0.52      0.56      0.54        45
           5       0.11      0.14      0.12         7
           6       0.30      0.14      0.19        92
           7       0.21      0.24      0.23        45
           8       0.28      0.14      0.18       103
           9       0.35      0.45      0.39        67
          10       0.47      0.50      0.48        32
          11       0.41      0.78      0.54        46
          12       0.11      0.16      0.13        31
          13       0.19      0.14      0.16        37
          14       0.33      0.11  

In [4]:
# Take the fold-averaged class probabilities for the test set, pick the most
# probable class index per row, and map the indices back to the original
# SUBCLASS label strings.
test_preds_proba = oof_lgb_test_preds
test_preds = test_preds_proba.argmax(axis=1)
test_preds_labels = le_subclass.inverse_transform(test_preds)

In [5]:
# Load the submission template.
submisson = pd.read_csv(file_path+"./sample_submission.csv")

# The predicted labels are in test.csv row order and are written into the
# template purely by position, so refuse to continue if the template lists the
# IDs in a different order (that would silently mislabel rows).
assert submisson["ID"].tolist() == test["ID"].tolist(), (
    "sample_submission.csv IDs do not match the row order of test.csv"
)

submisson["SUBCLASS"] = test_preds_labels

# Show how the predictions are distributed over the classes.
submisson["SUBCLASS"].value_counts()

SUBCLASS
COAD      371
HNSC      188
THYM      170
BRCA      147
STES      136
OV        114
UCEC      110
KIRC      104
GBMLGG    100
LAML       98
KIPAN      96
LGG        95
CESC       87
LIHC       82
PCPG       79
THCA       78
ACC        73
TGCT       65
PRAD       57
SKCM       55
LUAD       49
SARC       47
PAAD       44
BLCA       42
DLBC       41
LUSC       18
Name: count, dtype: int64

In [6]:
# Preview the first five rows of the filled-in submission table.
submisson.head(n=5)

Unnamed: 0,ID,SUBCLASS
0,TEST_0000,STES
1,TEST_0001,UCEC
2,TEST_0002,THCA
3,TEST_0003,LGG
4,TEST_0004,LAML


In [7]:


# Write the submission without the DataFrame index, encoded as UTF-8 with a
# byte-order mark.
submisson.to_csv(
    file_path + './oof_lgb_submission.csv',
    index=False,
    encoding='UTF-8-sig',
)