# 암환자 유전체 데이터 기반 암종 분류 AI 모델 개발


- '2024 생명연구자원 AI활용 경진대회'는 바이오 데이터를 기반으로 한 AI 기술의 문제 해결 능력을 탐구하는 것을 목표로 합니다. <br>이 대회는 바이오 분야에서 AI 활용의 저변을 확대하고, 복잡한 바이오 데이터를 효율적으로 분석 및 해석할 수 있는 AI 알고리즘 개발에 초점을 맞추고 있습니다. <br><br>
- 본 대회의 구체적인 과제는 암환자 유전체 데이터의 변이 정보를 활용하여 암종을 분류하는 AI 모델을 개발하는 것입니다. <br>참가자들은 제공된 학습 데이터셋(암환자 유전체 변이 정보)을 사용하여 특정 변이 정보를 바탕으로 암종을 정확하게 분류할 수 있는 AI 알고리즘을 개발해야 합니다. <br><br>
- 이 대회의 궁극적인 목적은 바이오 데이터의 활용도를 높이고, 바이오 분야에서 AI 기술의 적용 가능성을 극대화하며, 인공지능 기술이 실제 바이오 의료 문제 해결에 어떻게 기여할 수 있는지 탐구하는 것입니다.

# VERSIONS #
### GPU 0 : NVIDIA GeForce RTX 3080 Ti Laptop GPU
### GPU 1 : AMD Radeon(TM) Graphics
### CPU : AMD Ryzen 9 6900HX with Radeon Graphics

### -------------------------- Python & library version --------------------------
### Python version: 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]
### pandas version: 2.0.3
### numpy version: 1.21.5
### matplotlib version: 3.5.2
### tqdm version: 4.64.1
### xgboost version: 1.7.2
### lightgbm version: 3.3.3
### catboost version: 1.1.1
### scikit-learn version: 1.0.2
------------------------------------------------------------------------------

In [24]:
# <!-- from google.colab import drive
# drive.mount('/content/drive')# Import library -->

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import xgboost as xgb
from tqdm import tqdm  # tqdm 임포트
import random
import os
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42) # Seed 고정

In [25]:
train = pd.read_csv('1019_final_tr_2.csv')
test = pd.read_csv('1019_final_te_2.csv')

In [27]:
import numpy as np

# train.loc[train['SUBCLASS']=='GBM','SUBCLASS'] = 'GBMLGG'
# train.loc[train['SUBCLASS']=='GBMLGG','SUBCLASS'] = np.nan
# train.loc[train['SUBCLASS']=='KIPAN','SUBCLASS'] =  np.nan

train.loc[train['SUBCLASS']=='READ','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='UVM','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='UCS','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='CHOL','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='KIRP','SUBCLASS'] = 'KIPAN'
train.loc[train['SUBCLASS']=='KICH','SUBCLASS'] = 'KIPAN'
train.loc[train['SUBCLASS']=='KIRC','SUBCLASS'] = 'KIRC'

train.loc[train['SUBCLASS']=='STAD','SUBCLASS'] = 'STES'
train.loc[train['SUBCLASS']=='ESCA','SUBCLASS'] = 'STES'
train = train.dropna(subset=['SUBCLASS'])

train = train.reset_index(drop=True)

In [28]:
wt_indices = train.iloc[:, 2:].apply(lambda row: (row == 'WT').all(), axis=1)
result_indices = train[wt_indices].index
print(len(result_indices))
train = train.drop(result_indices)
train = train.reset_index(drop=True)

94


# Changing data

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
# 1. 아미노산 변환 사전 정의
amino_acid_dict = {
    'A': 'Alanine',
    'R': 'Arginine',
    'N': 'Asparagine',
    'D': 'Aspartic acid',
    'C': 'Cysteine',
    'E': 'Glutamic acid',
    'Q': 'Glutamine',
    'G': 'Glycine',
    'H': 'Histidine',
    'I': 'Isoleucine',
    'L': 'Leucine',
    'K': 'Lysine',
    'M': 'Methionine',
    'F': 'Phenylalanine',
    'P': 'Proline',
    'S': 'Serine',
    'T': 'Threonine',
    'W': 'Tryptophan',
    'Y': 'Tyrosine',
    'V': 'Valine',
    '*' :'Stop Codon',
    'X' : 'UnKnown'
}

def translate_amino_acids(sequence, amino_acid_dict):
    translated = [amino_acid_dict.get(aa, "Unknown") for aa in sequence]
    return ", ".join(translated)

# Function to interpret mutation notation
def interpret_mutation_with_col_name(gene, mutation_code):
    if mutation_code == "WT" or not isinstance(mutation_code, str) or len(mutation_code) < 2:
        return None  # Skip wild-type entries, non-string entries, or too-short strings

    # Split the mutation code by spaces to handle multiple mutations in one cell
    mutations = mutation_code.split(' ')
    descriptions = []

    for mutation in mutations:
        description = None  # Initialize description for each mutation
        if '-' in mutation and 'fs' in mutation:
            # Extract the position (e.g., -5026fs -> position 5026)
            position = mutation.split('fs')[0]  # Split at 'fs' and take the first part
            position = position.replace('-', '')  # Remove '-' to get the numeric position

            # Create description for the frameshift mutation
            description = f" {gene},  frameshift  {position}"

        elif 'fs' in mutation:
            # Handle frameshift mutations
            if mutation[1].isalpha():  # 숫자가 아닐 때 (알파벳일 때)
                original_aa_1 = mutation[0]
                original_aa_1 = amino_acid_dict.get(original_aa_1, "Unknown")

                original_aa_2 = mutation[1]
                original_aa_2 = amino_acid_dict.get(original_aa_2, "Unknown")

                position = mutation[2:-2]  # Exclude 'fs' from position
                description = f" {gene},  frameshift {original_aa_1}  {original_aa_2}  {position}"
            else:
                original_aa = mutation[0]
                original_aa = amino_acid_dict.get(original_aa, "Unknown")

                position = mutation[1:-2]  # Exclude 'fs' from position
                description = f" {gene},  frameshift  {original_aa}  {position}"

        elif '*' in mutation:
            if mutation[0]=='*' :
                position = mutation[1:-1]
                new_aa = mutation[-1]
                new_aa = amino_acid_dict.get(new_aa, "Unknown")
                description = f"{gene}, stop codon {position} changes to {new_aa}"

            else:
                # Handle stop codon mutations
                original_aa = mutation[0]
                original_aa = amino_acid_dict.get(original_aa, "Unknown")

                position = mutation[1:-1]  # Position before the *
                description = f"{gene}, {original_aa} {position} changes to stop codon"

        elif 'delins' in mutation:
            # Handle deletion-insertion mutations
            match = re.match(r"([A-Za-z]+)(\d+)_([A-Za-z]+)(\d+)delins([A-Za-z]+)", mutation)
            if match:
                original_aa_start = match.group(1)
                start_position = match.group(2)
                original_aa_end = match.group(3)
                end_position = match.group(4)
                inserted_seq = match.group(5)

                # Translate amino acids using dictionary
                original_aa_start = amino_acid_dict.get(original_aa_start, "Unknown")
                original_aa_end = amino_acid_dict.get(original_aa_end, "Unknown")
                inserted_seq_translated = translate_amino_acids(inserted_seq, amino_acid_dict)

                description = f"{gene} {start_position} ({original_aa_start}) to {end_position} ({original_aa_end}), a del is followed by ins of {inserted_seq_translated}"

        elif 'ins' in mutation:
            # Handle insertion mutations
            match = re.match(r"([A-Za-z]+)(\d+)ins([A-Za-z]+)", mutation)
            if match:
                original_aa = match.group(1)
                position = match.group(2)
                inserted_seq = match.group(3)

                # Translate amino acids using dictionary
                original_aa = amino_acid_dict.get(original_aa, "Unknown")
                inserted_seq_translated = translate_amino_acids(inserted_seq, amino_acid_dict)

                description = f"{gene}, {inserted_seq_translated} ins {position} {original_aa}"

        elif 'del' in mutation:
            # Handle deletion mutations (modified regex to capture both amino acids and positions)
            match = re.match(r"([A-Za-z]+)(\d+)del", mutation)
            if match:
                original_aa_start = match.group(1)
                start_position = match.group(2)
#                 original_aa_end = match.group(3)
#                 end_position = match.group(4)

                # Translate amino acids using dictionary
                original_aa_start = amino_acid_dict.get(original_aa_start, "Unknown")
#                 original_aa_end = amino_acid_dict.get(original_aa_end, "Unknown")

                description = f"{gene}, {start_position}, ({original_aa_start}) is del"

        elif 'dup' in mutation:
            # Handle duplication mutations (modified regex to capture both amino acids and positions)
            match = re.match(r"([A-Za-z]+)(\d+)dup", mutation)
            if match:
                original_aa_start = match.group(1)
                start_position = match.group(2)
#                 original_aa_end = match.group(3)
#                 end_position = match.group(4)

                # Translate amino acids using dictionary
                original_aa_start = amino_acid_dict.get(original_aa_start, "Unknown")
#                 original_aa_end = amino_acid_dict.get(original_aa_end, "Unknown")

                description = f"{gene} {start_position} ({original_aa_start}) dup"

        elif len(mutation) >= 2 and mutation[0] == mutation[-1]:
            # Handle mutations where the original and new amino acid are the same (e.g., S1866S)
            original_aa = mutation[0]
            original_aa = amino_acid_dict.get(original_aa, "Unknown")

            position = mutation[1:-1]
            description = f"{gene}, {original_aa} {position} changes {original_aa}"

        elif len(mutation) >= 2 and mutation[0] != mutation[-1]:
            # Handle mutations where the original and new amino acid are different
            original_aa = mutation[0]
            original_aa = amino_acid_dict.get(original_aa, "Unknown")

            position = mutation[1:-1]
            new_aa = mutation[-1]
            new_aa = amino_acid_dict.get(new_aa, "Unknown")

            description = f" {gene}, {original_aa}  changes  {new_aa} {position}"

        else:
            # Handle general mutations
            match = re.match(r"([A-Za-z]+)(\d+)", mutation)

            if match:
                original_aa = match.group(1)  # 'T'
                position = match.group(2)     # '218'
                original_aa = amino_acid_dict.get(original_aa, "Unknown")
                description = f"{gene}, {original_aa} is changes del or ins {position}"

        if description:
            descriptions.append(description)

    return "; ".join(descriptions)

# Train 데이터 돌연변이 설명 생성
train['mutation_description'] = ""
for index, row in train.iterrows():
    mutation_descriptions = []
    for gene, mutation in row.items():
        if gene != 'SUBCLASS' and mutation != "WT" and gene != 'ID':
            description = interpret_mutation_with_col_name(gene, mutation)
            if description:
                mutation_descriptions.append(description)
    full_description = "; ".join(mutation_descriptions)
    train.at[index, 'mutation_description'] = full_description

# Test 데이터 돌연변이 설명 생성
test['mutation_description'] = ""
for index, row in test.iterrows():
    mutation_descriptions = []
    for gene, mutation in row.items():
        if gene != 'SUBCLASS' and mutation != "WT" and gene != 'ID':
            description = interpret_mutation_with_col_name(gene, mutation)
            if description:
                mutation_descriptions.append(description)
    full_description = "; ".join(mutation_descriptions)
    test.at[index, 'mutation_description'] = full_description

df_test = test['mutation_description']
df_train = train[["SUBCLASS", "mutation_description"]]

# TF-IDF Vectorizer 초기화 및 학습
vectorizer = TfidfVectorizer(max_features=40000)

# train 데이터를 이용해 TF-IDF 모델을 학습 (fit)
vectorizer.fit(df_train['mutation_description'])

# train 데이터를 변환 (transform)
train_tfidf = vectorizer.transform(df_train['mutation_description'])

# test 데이터를 변환 (transform)
test_tfidf = vectorizer.transform(df_test)

# 결과 출력
print("Train TF-IDF shape:", train_tfidf.shape)
print("Test TF-IDF shape:", test_tfidf.shape)

# 희소 행렬을 밀집 행렬로 변환
train_data = pd.DataFrame(train_tfidf.toarray())
test_data = pd.DataFrame(test_tfidf.toarray())

Train TF-IDF shape: (39874, 28122)
Test TF-IDF shape: (2546, 28122)


In [38]:
# SUBCLASS 범주형 데이터를 숫자로 변환 (LabelEncoder 사용)
train_data['SUBCLASS'] = df_train['SUBCLASS']
le_subclass = LabelEncoder()
train_data['SUBCLASS'] = le_subclass.fit_transform(train_data['SUBCLASS'])

# 변환된 레이블 확인
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

# Feature와 Target 분리
X = train_data.drop(columns=['SUBCLASS'])
y = train_data['SUBCLASS']
X_test = test_data

원래 레이블: ACC, 변환된 숫자: 0
원래 레이블: BLCA, 변환된 숫자: 1
원래 레이블: BRCA, 변환된 숫자: 2
원래 레이블: CESC, 변환된 숫자: 3
원래 레이블: COAD, 변환된 숫자: 4
원래 레이블: DLBC, 변환된 숫자: 5
원래 레이블: GBMLGG, 변환된 숫자: 6
원래 레이블: HNSC, 변환된 숫자: 7
원래 레이블: KIPAN, 변환된 숫자: 8
원래 레이블: KIRC, 변환된 숫자: 9
원래 레이블: LAML, 변환된 숫자: 10
원래 레이블: LGG, 변환된 숫자: 11
원래 레이블: LIHC, 변환된 숫자: 12
원래 레이블: LUAD, 변환된 숫자: 13
원래 레이블: LUSC, 변환된 숫자: 14
원래 레이블: OV, 변환된 숫자: 15
원래 레이블: PAAD, 변환된 숫자: 16
원래 레이블: PCPG, 변환된 숫자: 17
원래 레이블: PRAD, 변환된 숫자: 18
원래 레이블: SARC, 변환된 숫자: 19
원래 레이블: SKCM, 변환된 숫자: 20
원래 레이블: STES, 변환된 숫자: 21
원래 레이블: TGCT, 변환된 숫자: 22
원래 레이블: THCA, 변환된 숫자: 23
원래 레이블: THYM, 변환된 숫자: 24
원래 레이블: UCEC, 변환된 숫자: 25


In [39]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_sample_weight  # 이 부분이 추가됩니다.


# 입력 데이터 설정
trt = X
y = y
models = []

# Stratified K-Fold 설정
for seed in range(0, 1):  # 시드를 여러 개로 설정하여 평균내기

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    evals_result = {}
    test_preds = []

    # Stratified K-Fold를 사용하여 모델 학습
    for train_index, test_index in skf.split(trt, y):
        X_train, X_val = trt.iloc[train_index], trt.iloc[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print(X_train.shape, X_val.shape)

        y_pred_list = []
    #     for seed in range(42, 43):  # 시드를 여러 개로 설정하여 평균내기

        sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

        dtrain = lgbm.Dataset(X_train, y_train)#, weight=sample_weight)
        dvalid = lgbm.Dataset(X_val, y_val)
        print(f"Seed: {seed}")
        params = {
            "objective": "multiclass",
            "num_class": len(np.unique(y)),  # 클래스 수 지정
            "metric": "multi_logloss",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "max_depth": 6,
            "num_leaves": 300,
            "learning_rate": 0.1,
            "lambda_l1": 0,
            "lambda_l2": 0,
            "bagging_fraction": 0.5,
            "feature_fraction": 0.5,
            "early_stopping_round": 20,
            "max_bin": 35,
            "seed": seed ,
            # "is_unbalance": False,  # 직접 가중치를 적용하기 때문에 False로 설정
            # "class_weight": class_weights_dict  # 클래스 가중치 적용
        }

        model = lgbm.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            num_boost_round=4000,
            #early_stopping_rounds=100,  # 조기 종료 라운드
            #verbose_eval=100
        )

        # 예측 수행
        y_pred = model.predict(X_val)
        y_pred_list.append(y_pred)

        # 다중 분류 F1-Score 계산 (average=None 사용)
        y_pred_avg = np.argmax(np.mean(y_pred_list, axis=0), axis=1)
        f1_per_class = f1_score(y_val, y_pred_avg, average=None)

        # 각 클래스별 F1 Score 출력
        for idx, score in enumerate(f1_per_class):
            print(f"F1 Score for class {idx}: {score}")

        # Overall Macro F1 Score
        f1_macro = f1_score(y_val, y_pred_avg, average='macro')
        print(f"Overall F1 Score (Macro): {f1_macro}")

        models.append(model)

(31899, 28122) (7975, 28122)
Seed: 0
[1]	valid_0's multi_logloss: 2.72135
Training until validation scores don't improve for 20 rounds
[2]	valid_0's multi_logloss: 2.53992
[3]	valid_0's multi_logloss: 2.39961
[4]	valid_0's multi_logloss: 2.2933
[5]	valid_0's multi_logloss: 2.20673
[6]	valid_0's multi_logloss: 2.13654
[7]	valid_0's multi_logloss: 2.0753
[8]	valid_0's multi_logloss: 2.01853
[9]	valid_0's multi_logloss: 1.97264
[10]	valid_0's multi_logloss: 1.92628
[11]	valid_0's multi_logloss: 1.8906
[12]	valid_0's multi_logloss: 1.85536
[13]	valid_0's multi_logloss: 1.82146
[14]	valid_0's multi_logloss: 1.7953
[15]	valid_0's multi_logloss: 1.76908
[16]	valid_0's multi_logloss: 1.74505
[17]	valid_0's multi_logloss: 1.7221
[18]	valid_0's multi_logloss: 1.6997
[19]	valid_0's multi_logloss: 1.68025
[20]	valid_0's multi_logloss: 1.66011
[21]	valid_0's multi_logloss: 1.64242
[22]	valid_0's multi_logloss: 1.62317
[23]	valid_0's multi_logloss: 1.60823
[24]	valid_0's multi_logloss: 1.59275
[25]	

[212]	valid_0's multi_logloss: 1.04705
[213]	valid_0's multi_logloss: 1.04611
[214]	valid_0's multi_logloss: 1.04526
[215]	valid_0's multi_logloss: 1.0442
[216]	valid_0's multi_logloss: 1.0431
[217]	valid_0's multi_logloss: 1.04223
[218]	valid_0's multi_logloss: 1.04139
[219]	valid_0's multi_logloss: 1.04054
[220]	valid_0's multi_logloss: 1.03964
[221]	valid_0's multi_logloss: 1.03873
[222]	valid_0's multi_logloss: 1.03775
[223]	valid_0's multi_logloss: 1.03695
[224]	valid_0's multi_logloss: 1.03629
[225]	valid_0's multi_logloss: 1.03543
[226]	valid_0's multi_logloss: 1.03443
[227]	valid_0's multi_logloss: 1.03365
[228]	valid_0's multi_logloss: 1.03272
[229]	valid_0's multi_logloss: 1.03186
[230]	valid_0's multi_logloss: 1.03118
[231]	valid_0's multi_logloss: 1.03041
[232]	valid_0's multi_logloss: 1.02974
[233]	valid_0's multi_logloss: 1.02893
[234]	valid_0's multi_logloss: 1.02841
[235]	valid_0's multi_logloss: 1.02728
[236]	valid_0's multi_logloss: 1.02677
[237]	valid_0's multi_loglo

[420]	valid_0's multi_logloss: 0.930795
[421]	valid_0's multi_logloss: 0.930554
[422]	valid_0's multi_logloss: 0.930256
[423]	valid_0's multi_logloss: 0.930143
[424]	valid_0's multi_logloss: 0.929895
[425]	valid_0's multi_logloss: 0.929632
[426]	valid_0's multi_logloss: 0.929269
[427]	valid_0's multi_logloss: 0.929
[428]	valid_0's multi_logloss: 0.928762
[429]	valid_0's multi_logloss: 0.928419
[430]	valid_0's multi_logloss: 0.928062
[431]	valid_0's multi_logloss: 0.92764
[432]	valid_0's multi_logloss: 0.927185
[433]	valid_0's multi_logloss: 0.926956
[434]	valid_0's multi_logloss: 0.926638
[435]	valid_0's multi_logloss: 0.92614
[436]	valid_0's multi_logloss: 0.925918
[437]	valid_0's multi_logloss: 0.925618
[438]	valid_0's multi_logloss: 0.925235
[439]	valid_0's multi_logloss: 0.924667
[440]	valid_0's multi_logloss: 0.924689
[441]	valid_0's multi_logloss: 0.924514
[442]	valid_0's multi_logloss: 0.924225
[443]	valid_0's multi_logloss: 0.923707
[444]	valid_0's multi_logloss: 0.923365
[445]

[626]	valid_0's multi_logloss: 0.89099
[627]	valid_0's multi_logloss: 0.890806
[628]	valid_0's multi_logloss: 0.890866
[629]	valid_0's multi_logloss: 0.890709
[630]	valid_0's multi_logloss: 0.89073
[631]	valid_0's multi_logloss: 0.890564
[632]	valid_0's multi_logloss: 0.890398
[633]	valid_0's multi_logloss: 0.890336
[634]	valid_0's multi_logloss: 0.890216
[635]	valid_0's multi_logloss: 0.890329
[636]	valid_0's multi_logloss: 0.890326
[637]	valid_0's multi_logloss: 0.890329
[638]	valid_0's multi_logloss: 0.890263
[639]	valid_0's multi_logloss: 0.890446
[640]	valid_0's multi_logloss: 0.89021
[641]	valid_0's multi_logloss: 0.890009
[642]	valid_0's multi_logloss: 0.890063
[643]	valid_0's multi_logloss: 0.889718
[644]	valid_0's multi_logloss: 0.88975
[645]	valid_0's multi_logloss: 0.889717
[646]	valid_0's multi_logloss: 0.88974
[647]	valid_0's multi_logloss: 0.889938
[648]	valid_0's multi_logloss: 0.889877
[649]	valid_0's multi_logloss: 0.889783
[650]	valid_0's multi_logloss: 0.889865
[651]

[29]	valid_0's multi_logloss: 1.52705
[30]	valid_0's multi_logloss: 1.51541
[31]	valid_0's multi_logloss: 1.50516
[32]	valid_0's multi_logloss: 1.49425
[33]	valid_0's multi_logloss: 1.48402
[34]	valid_0's multi_logloss: 1.47503
[35]	valid_0's multi_logloss: 1.46667
[36]	valid_0's multi_logloss: 1.4581
[37]	valid_0's multi_logloss: 1.45038
[38]	valid_0's multi_logloss: 1.44262
[39]	valid_0's multi_logloss: 1.435
[40]	valid_0's multi_logloss: 1.42597
[41]	valid_0's multi_logloss: 1.41933
[42]	valid_0's multi_logloss: 1.41232
[43]	valid_0's multi_logloss: 1.40544
[44]	valid_0's multi_logloss: 1.39836
[45]	valid_0's multi_logloss: 1.39226
[46]	valid_0's multi_logloss: 1.38523
[47]	valid_0's multi_logloss: 1.3792
[48]	valid_0's multi_logloss: 1.37353
[49]	valid_0's multi_logloss: 1.36799
[50]	valid_0's multi_logloss: 1.36221
[51]	valid_0's multi_logloss: 1.35695
[52]	valid_0's multi_logloss: 1.35177
[53]	valid_0's multi_logloss: 1.34705
[54]	valid_0's multi_logloss: 1.3429
[55]	valid_0's mu

[242]	valid_0's multi_logloss: 1.01959
[243]	valid_0's multi_logloss: 1.01907
[244]	valid_0's multi_logloss: 1.01824
[245]	valid_0's multi_logloss: 1.01753
[246]	valid_0's multi_logloss: 1.01684
[247]	valid_0's multi_logloss: 1.01599
[248]	valid_0's multi_logloss: 1.01535
[249]	valid_0's multi_logloss: 1.01459
[250]	valid_0's multi_logloss: 1.01381
[251]	valid_0's multi_logloss: 1.01296
[252]	valid_0's multi_logloss: 1.01174
[253]	valid_0's multi_logloss: 1.01132
[254]	valid_0's multi_logloss: 1.01022
[255]	valid_0's multi_logloss: 1.00968
[256]	valid_0's multi_logloss: 1.00882
[257]	valid_0's multi_logloss: 1.00817
[258]	valid_0's multi_logloss: 1.00754
[259]	valid_0's multi_logloss: 1.00706
[260]	valid_0's multi_logloss: 1.00626
[261]	valid_0's multi_logloss: 1.00568
[262]	valid_0's multi_logloss: 1.00521
[263]	valid_0's multi_logloss: 1.00446
[264]	valid_0's multi_logloss: 1.00379
[265]	valid_0's multi_logloss: 1.0032
[266]	valid_0's multi_logloss: 1.00256
[267]	valid_0's multi_logl

[448]	valid_0's multi_logloss: 0.926294
[449]	valid_0's multi_logloss: 0.926133
[450]	valid_0's multi_logloss: 0.925978
[451]	valid_0's multi_logloss: 0.925879
[452]	valid_0's multi_logloss: 0.925697
[453]	valid_0's multi_logloss: 0.925347
[454]	valid_0's multi_logloss: 0.925113
[455]	valid_0's multi_logloss: 0.924956
[456]	valid_0's multi_logloss: 0.924727
[457]	valid_0's multi_logloss: 0.924458
[458]	valid_0's multi_logloss: 0.924327
[459]	valid_0's multi_logloss: 0.924071
[460]	valid_0's multi_logloss: 0.923707
[461]	valid_0's multi_logloss: 0.923484
[462]	valid_0's multi_logloss: 0.923182
[463]	valid_0's multi_logloss: 0.922884
[464]	valid_0's multi_logloss: 0.922626
[465]	valid_0's multi_logloss: 0.92222
[466]	valid_0's multi_logloss: 0.922039
[467]	valid_0's multi_logloss: 0.921664
[468]	valid_0's multi_logloss: 0.921573
[469]	valid_0's multi_logloss: 0.921151
[470]	valid_0's multi_logloss: 0.9209
[471]	valid_0's multi_logloss: 0.920719
[472]	valid_0's multi_logloss: 0.920627
[47

[654]	valid_0's multi_logloss: 0.896737
[655]	valid_0's multi_logloss: 0.896662
[656]	valid_0's multi_logloss: 0.896543
[657]	valid_0's multi_logloss: 0.896464
[658]	valid_0's multi_logloss: 0.896517
[659]	valid_0's multi_logloss: 0.896524
[660]	valid_0's multi_logloss: 0.89665
[661]	valid_0's multi_logloss: 0.896562
[662]	valid_0's multi_logloss: 0.896497
[663]	valid_0's multi_logloss: 0.896448
[664]	valid_0's multi_logloss: 0.896148
[665]	valid_0's multi_logloss: 0.896185
[666]	valid_0's multi_logloss: 0.896045
[667]	valid_0's multi_logloss: 0.895992
[668]	valid_0's multi_logloss: 0.895873
[669]	valid_0's multi_logloss: 0.895709
[670]	valid_0's multi_logloss: 0.895757
[671]	valid_0's multi_logloss: 0.895662
[672]	valid_0's multi_logloss: 0.895587
[673]	valid_0's multi_logloss: 0.895529
[674]	valid_0's multi_logloss: 0.89545
[675]	valid_0's multi_logloss: 0.89541
[676]	valid_0's multi_logloss: 0.895407
[677]	valid_0's multi_logloss: 0.895356
[678]	valid_0's multi_logloss: 0.895621
[67

[94]	valid_0's multi_logloss: 1.20671
[95]	valid_0's multi_logloss: 1.20423
[96]	valid_0's multi_logloss: 1.20237
[97]	valid_0's multi_logloss: 1.20047
[98]	valid_0's multi_logloss: 1.19879
[99]	valid_0's multi_logloss: 1.19625
[100]	valid_0's multi_logloss: 1.19415
[101]	valid_0's multi_logloss: 1.19135
[102]	valid_0's multi_logloss: 1.18928
[103]	valid_0's multi_logloss: 1.18759
[104]	valid_0's multi_logloss: 1.18513
[105]	valid_0's multi_logloss: 1.18294
[106]	valid_0's multi_logloss: 1.18099
[107]	valid_0's multi_logloss: 1.17923
[108]	valid_0's multi_logloss: 1.17749
[109]	valid_0's multi_logloss: 1.17529
[110]	valid_0's multi_logloss: 1.17342
[111]	valid_0's multi_logloss: 1.17147
[112]	valid_0's multi_logloss: 1.16945
[113]	valid_0's multi_logloss: 1.16743
[114]	valid_0's multi_logloss: 1.1653
[115]	valid_0's multi_logloss: 1.16322
[116]	valid_0's multi_logloss: 1.1612
[117]	valid_0's multi_logloss: 1.15965
[118]	valid_0's multi_logloss: 1.15805
[119]	valid_0's multi_logloss: 1.

[304]	valid_0's multi_logloss: 0.973894
[305]	valid_0's multi_logloss: 0.973226
[306]	valid_0's multi_logloss: 0.972858
[307]	valid_0's multi_logloss: 0.972468
[308]	valid_0's multi_logloss: 0.97181
[309]	valid_0's multi_logloss: 0.971179
[310]	valid_0's multi_logloss: 0.970547
[311]	valid_0's multi_logloss: 0.970211
[312]	valid_0's multi_logloss: 0.96973
[313]	valid_0's multi_logloss: 0.969208
[314]	valid_0's multi_logloss: 0.968857
[315]	valid_0's multi_logloss: 0.968303
[316]	valid_0's multi_logloss: 0.967896
[317]	valid_0's multi_logloss: 0.967331
[318]	valid_0's multi_logloss: 0.966934
[319]	valid_0's multi_logloss: 0.966403
[320]	valid_0's multi_logloss: 0.965834
[321]	valid_0's multi_logloss: 0.965439
[322]	valid_0's multi_logloss: 0.964931
[323]	valid_0's multi_logloss: 0.964376
[324]	valid_0's multi_logloss: 0.964029
[325]	valid_0's multi_logloss: 0.96345
[326]	valid_0's multi_logloss: 0.96311
[327]	valid_0's multi_logloss: 0.962442
[328]	valid_0's multi_logloss: 0.962202
[329

[510]	valid_0's multi_logloss: 0.905385
[511]	valid_0's multi_logloss: 0.905241
[512]	valid_0's multi_logloss: 0.904932
[513]	valid_0's multi_logloss: 0.904666
[514]	valid_0's multi_logloss: 0.904479
[515]	valid_0's multi_logloss: 0.904132
[516]	valid_0's multi_logloss: 0.904052
[517]	valid_0's multi_logloss: 0.904005
[518]	valid_0's multi_logloss: 0.903755
[519]	valid_0's multi_logloss: 0.903547
[520]	valid_0's multi_logloss: 0.903239
[521]	valid_0's multi_logloss: 0.902941
[522]	valid_0's multi_logloss: 0.902704
[523]	valid_0's multi_logloss: 0.902526
[524]	valid_0's multi_logloss: 0.902428
[525]	valid_0's multi_logloss: 0.902176
[526]	valid_0's multi_logloss: 0.901964
[527]	valid_0's multi_logloss: 0.901849
[528]	valid_0's multi_logloss: 0.901346
[529]	valid_0's multi_logloss: 0.901169
[530]	valid_0's multi_logloss: 0.901155
[531]	valid_0's multi_logloss: 0.900966
[532]	valid_0's multi_logloss: 0.900716
[533]	valid_0's multi_logloss: 0.900584
[534]	valid_0's multi_logloss: 0.900383


[716]	valid_0's multi_logloss: 0.882919
[717]	valid_0's multi_logloss: 0.882924
[718]	valid_0's multi_logloss: 0.882921
[719]	valid_0's multi_logloss: 0.88297
[720]	valid_0's multi_logloss: 0.882871
[721]	valid_0's multi_logloss: 0.882712
[722]	valid_0's multi_logloss: 0.882858
[723]	valid_0's multi_logloss: 0.882855
[724]	valid_0's multi_logloss: 0.88303
[725]	valid_0's multi_logloss: 0.883023
[726]	valid_0's multi_logloss: 0.882605
[727]	valid_0's multi_logloss: 0.88264
[728]	valid_0's multi_logloss: 0.882654
[729]	valid_0's multi_logloss: 0.882553
[730]	valid_0's multi_logloss: 0.882618
[731]	valid_0's multi_logloss: 0.882609
[732]	valid_0's multi_logloss: 0.882536
[733]	valid_0's multi_logloss: 0.882646
[734]	valid_0's multi_logloss: 0.882677
[735]	valid_0's multi_logloss: 0.882811
[736]	valid_0's multi_logloss: 0.882785
[737]	valid_0's multi_logloss: 0.882851
[738]	valid_0's multi_logloss: 0.882941
[739]	valid_0's multi_logloss: 0.883039
[740]	valid_0's multi_logloss: 0.882997
[74

[144]	valid_0's multi_logloss: 1.12427
[145]	valid_0's multi_logloss: 1.12267
[146]	valid_0's multi_logloss: 1.12109
[147]	valid_0's multi_logloss: 1.11949
[148]	valid_0's multi_logloss: 1.11799
[149]	valid_0's multi_logloss: 1.11659
[150]	valid_0's multi_logloss: 1.11507
[151]	valid_0's multi_logloss: 1.11402
[152]	valid_0's multi_logloss: 1.11269
[153]	valid_0's multi_logloss: 1.11115
[154]	valid_0's multi_logloss: 1.10972
[155]	valid_0's multi_logloss: 1.10842
[156]	valid_0's multi_logloss: 1.10703
[157]	valid_0's multi_logloss: 1.10573
[158]	valid_0's multi_logloss: 1.10483
[159]	valid_0's multi_logloss: 1.10382
[160]	valid_0's multi_logloss: 1.10247
[161]	valid_0's multi_logloss: 1.10077
[162]	valid_0's multi_logloss: 1.0996
[163]	valid_0's multi_logloss: 1.09804
[164]	valid_0's multi_logloss: 1.09689
[165]	valid_0's multi_logloss: 1.09563
[166]	valid_0's multi_logloss: 1.0939
[167]	valid_0's multi_logloss: 1.09284
[168]	valid_0's multi_logloss: 1.09158
[169]	valid_0's multi_loglo

[353]	valid_0's multi_logloss: 0.959294
[354]	valid_0's multi_logloss: 0.95911
[355]	valid_0's multi_logloss: 0.958656
[356]	valid_0's multi_logloss: 0.958386
[357]	valid_0's multi_logloss: 0.95788
[358]	valid_0's multi_logloss: 0.957234
[359]	valid_0's multi_logloss: 0.956665
[360]	valid_0's multi_logloss: 0.956312
[361]	valid_0's multi_logloss: 0.955852
[362]	valid_0's multi_logloss: 0.955494
[363]	valid_0's multi_logloss: 0.954946
[364]	valid_0's multi_logloss: 0.954607
[365]	valid_0's multi_logloss: 0.954087
[366]	valid_0's multi_logloss: 0.95359
[367]	valid_0's multi_logloss: 0.952988
[368]	valid_0's multi_logloss: 0.952697
[369]	valid_0's multi_logloss: 0.952369
[370]	valid_0's multi_logloss: 0.95209
[371]	valid_0's multi_logloss: 0.951729
[372]	valid_0's multi_logloss: 0.951111
[373]	valid_0's multi_logloss: 0.950554
[374]	valid_0's multi_logloss: 0.950171
[375]	valid_0's multi_logloss: 0.949862
[376]	valid_0's multi_logloss: 0.949553
[377]	valid_0's multi_logloss: 0.949108
[378

[559]	valid_0's multi_logloss: 0.904341
[560]	valid_0's multi_logloss: 0.90436
[561]	valid_0's multi_logloss: 0.904399
[562]	valid_0's multi_logloss: 0.904229
[563]	valid_0's multi_logloss: 0.904149
[564]	valid_0's multi_logloss: 0.903929
[565]	valid_0's multi_logloss: 0.90398
[566]	valid_0's multi_logloss: 0.903806
[567]	valid_0's multi_logloss: 0.903752
[568]	valid_0's multi_logloss: 0.903351
[569]	valid_0's multi_logloss: 0.903127
[570]	valid_0's multi_logloss: 0.903059
[571]	valid_0's multi_logloss: 0.902847
[572]	valid_0's multi_logloss: 0.902652
[573]	valid_0's multi_logloss: 0.902573
[574]	valid_0's multi_logloss: 0.902427
[575]	valid_0's multi_logloss: 0.902158
[576]	valid_0's multi_logloss: 0.901945
[577]	valid_0's multi_logloss: 0.901647
[578]	valid_0's multi_logloss: 0.901503
[579]	valid_0's multi_logloss: 0.901412
[580]	valid_0's multi_logloss: 0.901186
[581]	valid_0's multi_logloss: 0.9011
[582]	valid_0's multi_logloss: 0.900826
[583]	valid_0's multi_logloss: 0.900927
[584

[28]	valid_0's multi_logloss: 1.52402
[29]	valid_0's multi_logloss: 1.51249
[30]	valid_0's multi_logloss: 1.50018
[31]	valid_0's multi_logloss: 1.49038
[32]	valid_0's multi_logloss: 1.48026
[33]	valid_0's multi_logloss: 1.47031
[34]	valid_0's multi_logloss: 1.46009
[35]	valid_0's multi_logloss: 1.45216
[36]	valid_0's multi_logloss: 1.44415
[37]	valid_0's multi_logloss: 1.43649
[38]	valid_0's multi_logloss: 1.42895
[39]	valid_0's multi_logloss: 1.42037
[40]	valid_0's multi_logloss: 1.41221
[41]	valid_0's multi_logloss: 1.40496
[42]	valid_0's multi_logloss: 1.39804
[43]	valid_0's multi_logloss: 1.39121
[44]	valid_0's multi_logloss: 1.38394
[45]	valid_0's multi_logloss: 1.37746
[46]	valid_0's multi_logloss: 1.37233
[47]	valid_0's multi_logloss: 1.36722
[48]	valid_0's multi_logloss: 1.36161
[49]	valid_0's multi_logloss: 1.35603
[50]	valid_0's multi_logloss: 1.35083
[51]	valid_0's multi_logloss: 1.34621
[52]	valid_0's multi_logloss: 1.3412
[53]	valid_0's multi_logloss: 1.33615
[54]	valid_0'

[241]	valid_0's multi_logloss: 1.02583
[242]	valid_0's multi_logloss: 1.02505
[243]	valid_0's multi_logloss: 1.0244
[244]	valid_0's multi_logloss: 1.02375
[245]	valid_0's multi_logloss: 1.02345
[246]	valid_0's multi_logloss: 1.02268
[247]	valid_0's multi_logloss: 1.0219
[248]	valid_0's multi_logloss: 1.0213
[249]	valid_0's multi_logloss: 1.0203
[250]	valid_0's multi_logloss: 1.0197
[251]	valid_0's multi_logloss: 1.01896
[252]	valid_0's multi_logloss: 1.01853
[253]	valid_0's multi_logloss: 1.01802
[254]	valid_0's multi_logloss: 1.01766
[255]	valid_0's multi_logloss: 1.01658
[256]	valid_0's multi_logloss: 1.01578
[257]	valid_0's multi_logloss: 1.01533
[258]	valid_0's multi_logloss: 1.01443
[259]	valid_0's multi_logloss: 1.01391
[260]	valid_0's multi_logloss: 1.01324
[261]	valid_0's multi_logloss: 1.01231
[262]	valid_0's multi_logloss: 1.01142
[263]	valid_0's multi_logloss: 1.01084
[264]	valid_0's multi_logloss: 1.0104
[265]	valid_0's multi_logloss: 1.00941
[266]	valid_0's multi_logloss: 

[448]	valid_0's multi_logloss: 0.936441
[449]	valid_0's multi_logloss: 0.936236
[450]	valid_0's multi_logloss: 0.935892
[451]	valid_0's multi_logloss: 0.935833
[452]	valid_0's multi_logloss: 0.935707
[453]	valid_0's multi_logloss: 0.935268
[454]	valid_0's multi_logloss: 0.93486
[455]	valid_0's multi_logloss: 0.934722
[456]	valid_0's multi_logloss: 0.9344
[457]	valid_0's multi_logloss: 0.934074
[458]	valid_0's multi_logloss: 0.933872
[459]	valid_0's multi_logloss: 0.93359
[460]	valid_0's multi_logloss: 0.933373
[461]	valid_0's multi_logloss: 0.933109
[462]	valid_0's multi_logloss: 0.933078
[463]	valid_0's multi_logloss: 0.932857
[464]	valid_0's multi_logloss: 0.93264
[465]	valid_0's multi_logloss: 0.932605
[466]	valid_0's multi_logloss: 0.932377
[467]	valid_0's multi_logloss: 0.932259
[468]	valid_0's multi_logloss: 0.932264
[469]	valid_0's multi_logloss: 0.932178
[470]	valid_0's multi_logloss: 0.931891
[471]	valid_0's multi_logloss: 0.931705
[472]	valid_0's multi_logloss: 0.931352
[473]

[654]	valid_0's multi_logloss: 0.908344
[655]	valid_0's multi_logloss: 0.908248
[656]	valid_0's multi_logloss: 0.907962
[657]	valid_0's multi_logloss: 0.907895
[658]	valid_0's multi_logloss: 0.907828
[659]	valid_0's multi_logloss: 0.907829
[660]	valid_0's multi_logloss: 0.90791
[661]	valid_0's multi_logloss: 0.90791
[662]	valid_0's multi_logloss: 0.907746
[663]	valid_0's multi_logloss: 0.907689
[664]	valid_0's multi_logloss: 0.907463
[665]	valid_0's multi_logloss: 0.907509
[666]	valid_0's multi_logloss: 0.907599
[667]	valid_0's multi_logloss: 0.90758
[668]	valid_0's multi_logloss: 0.907784
[669]	valid_0's multi_logloss: 0.907686
[670]	valid_0's multi_logloss: 0.907766
[671]	valid_0's multi_logloss: 0.907772
[672]	valid_0's multi_logloss: 0.907732
[673]	valid_0's multi_logloss: 0.907673
[674]	valid_0's multi_logloss: 0.907515
[675]	valid_0's multi_logloss: 0.907529
[676]	valid_0's multi_logloss: 0.907248
[677]	valid_0's multi_logloss: 0.907203
[678]	valid_0's multi_logloss: 0.907256
[67

KeyboardInterrupt: 

In [40]:
test_preds_1 = []
test_preds_2 = []
for i in range(len(models)):
    pred = models[i].predict(X_test)
    test_preds_1.append(pred)
    test_preds_2.append(pred)
    
sorted_probs = np.sort(test_preds_1, axis=0)
test_preds_1 = np.mean(sorted_probs[:],axis=0)

y_pred_avg = np.argmax(test_preds_1, axis=1)

original_labels = le_subclass.inverse_transform(y_pred_avg)

submisson = pd.read_csv("sample_submission.csv")
submisson["SUBCLASS"] = original_labels
submisson.to_csv('fffff.csv', encoding='UTF-8-sig', index=False)
submisson["SUBCLASS"].value_counts()

SUBCLASS
COAD      488
STES      263
BRCA      247
LUAD      231
PRAD      185
KIPAN     145
KIRC       97
UCEC       86
GBMLGG     84
THCA       82
SARC       75
HNSC       72
LGG        71
SKCM       69
OV         62
LIHC       43
CESC       40
PAAD       33
BLCA       32
PCPG       32
TGCT       31
LAML       31
ACC        17
THYM       13
DLBC        9
LUSC        8
Name: count, dtype: int64

In [81]:
pd.DataFrame(test_preds_1).to_csv('LGBM_FINAL_PROBA.csv', encoding='UTF-8-sig', index=False)


# UNTIL HERE