# 암환자 유전체 데이터 기반 암종 분류 AI 모델 개발


- '2024 생명연구자원 AI활용 경진대회'는 바이오 데이터를 기반으로 한 AI 기술의 문제 해결 능력을 탐구하는 것을 목표로 합니다. <br>이 대회는 바이오 분야에서 AI 활용의 저변을 확대하고, 복잡한 바이오 데이터를 효율적으로 분석 및 해석할 수 있는 AI 알고리즘 개발에 초점을 맞추고 있습니다. <br><br>
- 본 대회의 구체적인 과제는 암환자 유전체 데이터의 변이 정보를 활용하여 암종을 분류하는 AI 모델을 개발하는 것입니다. <br>참가자들은 제공된 학습 데이터셋(암환자 유전체 변이 정보)을 사용하여 특정 변이 정보를 바탕으로 암종을 정확하게 분류할 수 있는 AI 알고리즘을 개발해야 합니다. <br><br>
- 이 대회의 궁극적인 목적은 바이오 데이터의 활용도를 높이고, 바이오 분야에서 AI 기술의 적용 가능성을 극대화하며, 인공지능 기술이 실제 바이오 의료 문제 해결에 어떻게 기여할 수 있는지 탐구하는 것입니다.

# Import library

In [15]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


### -------------------------- Python & library version --------------------------
### Python version: 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
### pandas version: 2.2.2
### numpy version: 1.26.4
### matplotlib version: 3.7.1
### tqdm version: 4.66.5
### scikit-learn version: 1.5.2
### torch version: 2.4.1+cu121
------------------------------------------------------------------------------

USING Colab pro plus A100 Server(고용량 Ram)

In [16]:
import sys
import tqdm as tq
import matplotlib
import sklearn as skl
import pandas as pd
import numpy as np
import torch
print("-------------------------- Python & library version --------------------------")
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
print("numpy version: {}".format(np.__version__))
print("matplotlib version: {}".format(matplotlib.__version__))
print("tqdm version: {}".format(tq.__version__))
print("scikit-learn version: {}".format(skl.__version__))
print("torch version: {}".format(torch.__version__))

print("------------------------------------------------------------------------------")

-------------------------- Python & library version --------------------------
Python version: 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
pandas version: 2.2.2
numpy version: 1.26.4
matplotlib version: 3.7.1
tqdm version: 4.66.5
scikit-learn version: 1.5.2
torch version: 2.4.1+cu121
------------------------------------------------------------------------------


In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import xgboost as xgb
from tqdm import tqdm  # tqdm 임포트


# Load Data

In [18]:
path = '/content/drive/MyDrive/유전체(과기부)/'
import warnings

# 경고 무시
warnings.filterwarnings("ignore", category=FutureWarning)

### Starting with Transformer

In [19]:
import pandas as pd
import re
train = pd.read_csv(path+'1019_final_tr_2.csv')
test = pd.read_csv(path+'1019_final_te_2.csv')

import numpy as np

In [20]:
train.loc[train['SUBCLASS']=='READ','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='UVM','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='UCS','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='CHOL','SUBCLASS'] = np.nan
train.loc[train['SUBCLASS']=='KIRP','SUBCLASS'] = 'KIPAN'
train.loc[train['SUBCLASS']=='KICH','SUBCLASS'] = 'KIPAN'
train.loc[train['SUBCLASS']=='KIRC','SUBCLASS'] = 'KIRC'

train.loc[train['SUBCLASS']=='STAD','SUBCLASS'] = 'STES'
train.loc[train['SUBCLASS']=='ESCA','SUBCLASS'] = 'STES'

# train.loc[train['SUBCLASS']=='BLCA_2','SUBCLASS'] = 'BLCA'
# train.loc[train['SUBCLASS']=='BRCA_2','SUBCLASS'] = np.nan

# train.loc[train['SUBCLASS']=='CESC_2','SUBCLASS'] = np.nan
# train.loc[train['SUBCLASS']=='COAD_2','SUBCLASS'] = np.nan
# train.loc[train['SUBCLASS']=='COAD_3','SUBCLASS'] =np.nan

# train.loc[train['SUBCLASS']=='HNSC_2','SUBCLASS'] = np.nan
# train.loc[train['SUBCLASS']=='LAML_2','SUBCLASS'] = np.nan
# train.loc[train['SUBCLASS']=='LIHC_2','SUBCLASS'] = np.nan

# train.loc[train['SUBCLASS']=='OV_2','SUBCLASS'] = np.nan
# train.loc[train['SUBCLASS']=='PRAD_2','SUBCLASS'] = np.nan
# train.loc[train['SUBCLASS']=='SARC_2','SUBCLASS'] = np.nan

# train.loc[train['SUBCLASS']=='THCA_2','SUBCLASS'] =np.nan
# train.loc[train['SUBCLASS']=='UCEC_2','SUBCLASS'] =np.nan

train = train.dropna(subset=['SUBCLASS'])

train = train.reset_index(drop=True)

In [21]:
wt_indices = train.iloc[:, 2:].apply(lambda row: (row == 'WT').all(), axis=1)
result_indices = train[wt_indices].index
print(len(result_indices))
train = train.drop(result_indices)
train = train.reset_index(drop=True)

94


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. 아미노산 변환 사전 정의
amino_acid_dict = {
    'A': 'Alanine',
    'R': 'Arginine',
    'N': 'Asparagine',
    'D': 'Aspartic acid',
    'C': 'Cysteine',
    'E': 'Glutamic acid',
    'Q': 'Glutamine',
    'G': 'Glycine',
    'H': 'Histidine',
    'I': 'Isoleucine',
    'L': 'Leucine',
    'K': 'Lysine',
    'M': 'Methionine',
    'F': 'Phenylalanine',
    'P': 'Proline',
    'S': 'Serine',
    'T': 'Threonine',
    'W': 'Tryptophan',
    'Y': 'Tyrosine',
    'V': 'Valine',
    '*' :'Stop Codon',
    'X' : 'UnKnown'
}

def translate_amino_acids(sequence, amino_acid_dict):
    translated = [amino_acid_dict.get(aa, "Unknown") for aa in sequence]
    return ", ".join(translated)

# Function to interpret mutation notation
def interpret_mutation_with_col_name(gene, mutation_code):
    if mutation_code == "WT" or not isinstance(mutation_code, str) or len(mutation_code) < 2:
        return None  # Skip wild-type entries, non-string entries, or too-short strings

    # Split the mutation code by spaces to handle multiple mutations in one cell
    mutations = mutation_code.split(' ')
    descriptions = []

    for mutation in mutations:
        description = None  # Initialize description for each mutation
        if '-' in mutation and 'fs' in mutation:
            # Extract the position (e.g., -5026fs -> position 5026)
            position = mutation.split('fs')[0]  # Split at 'fs' and take the first part
            position = position.replace('-', '')  # Remove '-' to get the numeric position

            # Create description for the frameshift mutation
            description = f" {gene},  frameshift  {position}"

        elif 'fs' in mutation:
            # Handle frameshift mutations
            if mutation[1].isalpha():  # 숫자가 아닐 때 (알파벳일 때)
                original_aa_1 = mutation[0]
                original_aa_1 = amino_acid_dict.get(original_aa_1, "Unknown")

                original_aa_2 = mutation[1]
                original_aa_2 = amino_acid_dict.get(original_aa_2, "Unknown")

                position = mutation[2:-2]  # Exclude 'fs' from position
                description = f" {gene},  frameshift {original_aa_1}  {original_aa_2}  {position}"
            else:
                original_aa = mutation[0]
                original_aa = amino_acid_dict.get(original_aa, "Unknown")

                position = mutation[1:-2]  # Exclude 'fs' from position
                description = f" {gene},  frameshift  {original_aa}  {position}"

        elif '*' in mutation:
            if mutation[0]=='*' :
                position = mutation[1:-1]
                new_aa = mutation[-1]
                new_aa = amino_acid_dict.get(new_aa, "Unknown")
                description = f"{gene}, stop codon {position} changes to {new_aa}"

            else:
                # Handle stop codon mutations
                original_aa = mutation[0]
                original_aa = amino_acid_dict.get(original_aa, "Unknown")

                position = mutation[1:-1]  # Position before the *
                description = f"{gene}, {original_aa} {position} changes to stop codon"

        elif 'delins' in mutation:
            # Handle deletion-insertion mutations
            match = re.match(r"([A-Za-z]+)(\d+)_([A-Za-z]+)(\d+)delins([A-Za-z]+)", mutation)
            if match:
                original_aa_start = match.group(1)
                start_position = match.group(2)
                original_aa_end = match.group(3)
                end_position = match.group(4)
                inserted_seq = match.group(5)

                # Translate amino acids using dictionary
                original_aa_start = amino_acid_dict.get(original_aa_start, "Unknown")
                original_aa_end = amino_acid_dict.get(original_aa_end, "Unknown")
                inserted_seq_translated = translate_amino_acids(inserted_seq, amino_acid_dict)

                description = f"{gene} {start_position} ({original_aa_start}) to {end_position} ({original_aa_end}), a del is followed by ins of {inserted_seq_translated}"

        elif 'ins' in mutation:
            # Handle insertion mutations
            match = re.match(r"([A-Za-z]+)(\d+)ins([A-Za-z]+)", mutation)
            if match:
                original_aa = match.group(1)
                position = match.group(2)
                inserted_seq = match.group(3)

                # Translate amino acids using dictionary
                original_aa = amino_acid_dict.get(original_aa, "Unknown")
                inserted_seq_translated = translate_amino_acids(inserted_seq, amino_acid_dict)

                description = f"{gene}, {inserted_seq_translated} ins {position} {original_aa}"

        elif 'del' in mutation:
            # Handle deletion mutations (modified regex to capture both amino acids and positions)
            match = re.match(r"([A-Za-z]+)(\d+)del", mutation)
            if match:
                original_aa_start = match.group(1)
                start_position = match.group(2)
#                 original_aa_end = match.group(3)
#                 end_position = match.group(4)

                # Translate amino acids using dictionary
                original_aa_start = amino_acid_dict.get(original_aa_start, "Unknown")
#                 original_aa_end = amino_acid_dict.get(original_aa_end, "Unknown")

                description = f"{gene}, {start_position}, ({original_aa_start}) is del"

        elif 'dup' in mutation:
            # Handle duplication mutations (modified regex to capture both amino acids and positions)
            match = re.match(r"([A-Za-z]+)(\d+)dup", mutation)
            if match:
                original_aa_start = match.group(1)
                start_position = match.group(2)
#                 original_aa_end = match.group(3)
#                 end_position = match.group(4)

                # Translate amino acids using dictionary
                original_aa_start = amino_acid_dict.get(original_aa_start, "Unknown")
#                 original_aa_end = amino_acid_dict.get(original_aa_end, "Unknown")

                description = f"{gene} {start_position} ({original_aa_start}) dup"

        elif len(mutation) >= 2 and mutation[0] == mutation[-1]:
            # Handle mutations where the original and new amino acid are the same (e.g., S1866S)
            original_aa = mutation[0]
            original_aa = amino_acid_dict.get(original_aa, "Unknown")

            position = mutation[1:-1]
            description = f"{gene}, {original_aa} {position} changes {original_aa}"

        elif len(mutation) >= 2 and mutation[0] != mutation[-1]:
            # Handle mutations where the original and new amino acid are different
            original_aa = mutation[0]
            original_aa = amino_acid_dict.get(original_aa, "Unknown")

            position = mutation[1:-1]
            new_aa = mutation[-1]
            new_aa = amino_acid_dict.get(new_aa, "Unknown")

            description = f" {gene}, {original_aa}  changes  {new_aa} {position}"

        else:
            # Handle general mutations
            match = re.match(r"([A-Za-z]+)(\d+)", mutation)

            if match:
                original_aa = match.group(1)  # 'T'
                position = match.group(2)     # '218'
                original_aa = amino_acid_dict.get(original_aa, "Unknown")
                description = f"{gene}, {original_aa} is changes del or ins {position}"

        if description:
            descriptions.append(description)

    return "; ".join(descriptions)

# Train 데이터 돌연변이 설명 생성
train['mutation_description'] = ""
for index, row in train.iterrows():
    mutation_descriptions = []
    for gene, mutation in row.items():
        if gene != 'SUBCLASS' and mutation != "WT" and gene != 'ID':
            description = interpret_mutation_with_col_name(gene, mutation)
            if description:
                mutation_descriptions.append(description)
    full_description = "; ".join(mutation_descriptions)
    train.at[index, 'mutation_description'] = full_description

# Test 데이터 돌연변이 설명 생성
test['mutation_description'] = ""
for index, row in test.iterrows():
    mutation_descriptions = []
    for gene, mutation in row.items():
        if gene != 'SUBCLASS' and mutation != "WT" and gene != 'ID':
            description = interpret_mutation_with_col_name(gene, mutation)
            if description:
                mutation_descriptions.append(description)
    full_description = "; ".join(mutation_descriptions)
    test.at[index, 'mutation_description'] = full_description

df_test = test['mutation_description']
df_train = train[["SUBCLASS", "mutation_description"]]

# TF-IDF Vectorizer 초기화 및 학습
vectorizer = TfidfVectorizer(max_features=40000)

# train 데이터를 이용해 TF-IDF 모델을 학습 (fit)
vectorizer.fit(df_train['mutation_description'])

# train 데이터를 변환 (transform)
train_tfidf = vectorizer.transform(df_train['mutation_description'])

# test 데이터를 변환 (transform)
test_tfidf = vectorizer.transform(df_test)

# 결과 출력
print("Train TF-IDF shape:", train_tfidf.shape)
print("Test TF-IDF shape:", test_tfidf.shape)

# 희소 행렬을 밀집 행렬로 변환
train_data = pd.DataFrame(train_tfidf.toarray())
test_data = pd.DataFrame(test_tfidf.toarray())

# SUBCLASS 범주형 데이터를 숫자로 변환 (LabelEncoder 사용)
train_data['SUBCLASS'] = df_train['SUBCLASS']
le_subclass = LabelEncoder()
train_data['SUBCLASS'] = le_subclass.fit_transform(train_data['SUBCLASS'])

# 변환된 레이블 확인
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

# Feature와 Target 분리
X = train_data.drop(columns=['SUBCLASS'])
y = train_data['SUBCLASS']
X_test = test_data

# 데이터 준비 완료
print("Train shape:", X.shape)
print("Test shape:", X_test.shape)

Train TF-IDF shape: (39874, 28122)
Test TF-IDF shape: (2546, 28122)
원래 레이블: ACC, 변환된 숫자: 0
원래 레이블: BLCA, 변환된 숫자: 1
원래 레이블: BRCA, 변환된 숫자: 2
원래 레이블: CESC, 변환된 숫자: 3
원래 레이블: COAD, 변환된 숫자: 4
원래 레이블: DLBC, 변환된 숫자: 5
원래 레이블: GBMLGG, 변환된 숫자: 6
원래 레이블: HNSC, 변환된 숫자: 7
원래 레이블: KIPAN, 변환된 숫자: 8
원래 레이블: KIRC, 변환된 숫자: 9
원래 레이블: LAML, 변환된 숫자: 10
원래 레이블: LGG, 변환된 숫자: 11
원래 레이블: LIHC, 변환된 숫자: 12
원래 레이블: LUAD, 변환된 숫자: 13
원래 레이블: LUSC, 변환된 숫자: 14
원래 레이블: OV, 변환된 숫자: 15
원래 레이블: PAAD, 변환된 숫자: 16
원래 레이블: PCPG, 변환된 숫자: 17
원래 레이블: PRAD, 변환된 숫자: 18
원래 레이블: SARC, 변환된 숫자: 19
원래 레이블: SKCM, 변환된 숫자: 20
원래 레이블: STES, 변환된 숫자: 21
원래 레이블: TGCT, 변환된 숫자: 22
원래 레이블: THCA, 변환된 숫자: 23
원래 레이블: THYM, 변환된 숫자: 24
원래 레이블: UCEC, 변환된 숫자: 25
Train shape: (39874, 28122)
Test shape: (2546, 28122)


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import f1_score
from torch.optim.lr_scheduler import ReduceLROnPlateau

#DL 시드 고정
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # GPU 사용 시 추가
    torch.backends.cudnn.deterministic = True  # Reproducibility를 위한 설정
    torch.backends.cudnn.benchmark = False
# Convert data to PyTorch tensors

X_train = torch.tensor(X.values, dtype=torch.float32)
y_train = torch.tensor(y.values, dtype=torch.long)
X_test = torch.tensor(X_test.values, dtype=torch.float32)

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import f1_score
from torch.optim.lr_scheduler import ReduceLROnPlateau

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom Focal Loss implementation
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss(reduction='mean')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        # F_loss = BCE_loss
        if self.reduction == 'mean':
            return torch.mean(F_loss)
        elif self.reduction == 'sum':
            return torch.sum(F_loss)
        else:
            return F_loss

# X-Transformer Model definition
class XTransformerModel(nn.Module):
    def __init__(self, input_dim, num_classes, d_model=512, nhead=8, num_encoder_layers=2, dim_feedforward=2048, dropout=0.1):
        super(XTransformerModel, self).__init__()

        # Embedding for Categorical and Numerical Features
        self.input_fc = nn.Linear(input_dim, d_model)
        self.layer_norm1 = nn.LayerNorm(d_model)

        # Transformer Encoder for feature interactions
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Final output classification layer
        self.output_fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Feature embedding transformation
        x = self.input_fc(x)
        x = self.layer_norm1(x)

        # Apply transformer encoder for feature-wise attention
        x = x.unsqueeze(1)  # Add sequence dimension
        x = self.transformer_encoder(x)
        x = x.squeeze(1)  # Remove sequence dimension

        # Output normalization and classification
        x = self.layer_norm2(x)
        x = self.dropout(x)
        x = self.output_fc(x)

        return x

# Training function
def train_model(model, criterion, optimizer, scheduler, train_loader, val_loader, device, epochs=60):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        val_loss, val_f1 = evaluate_model(model, criterion, val_loader, device)
        scheduler.step(val_loss)

        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}')

# Evaluation function
def evaluate_model(model, criterion, val_loader, device):
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    val_loss /= len(val_loader)
    val_f1 = f1_score(all_labels, all_preds, average='weighted')
    return val_loss, val_f1

# Initialize model parameters
d_model = 512
nhead = 8
num_encoder_layers = 4
dim_feedforward = 2048
dropout = 0.4
learning_rate = 5e-5
weight_decay = 1e-5

# Initialize variables for k-fold cross-validation
n_splits = 10
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
input_dim = X_train.shape[1]
num_classes = len(y_train.unique())

oof_predictions = np.zeros((X_train.shape[0], num_classes))
test_predictions = np.zeros((X_test.shape[0], num_classes))

# Cross-validation loop
for state in [0,42,100]:
  skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=state)

  for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
      print(f'Fold {fold + 1}/{n_splits}')

      X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
      y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

      fold_train_dataset = TensorDataset(X_fold_train, y_fold_train)
      fold_val_dataset = TensorDataset(X_fold_val, y_fold_val)

      fold_train_loader = DataLoader(fold_train_dataset, batch_size=1024, shuffle=True)
      fold_val_loader = DataLoader(fold_val_dataset, batch_size=1024, shuffle=False)

      model = XTransformerModel(input_dim=input_dim, num_classes=num_classes, d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)
      model = model.to(device)

      optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
      scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
      criterion = FocalLoss(alpha=1, gamma=2)
      # criterion = nn.BCEWithLogitsLoss()

      train_model(model, criterion, optimizer, scheduler, fold_train_loader, fold_val_loader, device, epochs=40)

      # Make predictions for the validation set of this fold
      model.eval()
      with torch.no_grad():
          val_outputs = []
          for X_batch, _ in fold_val_loader:
              X_batch = X_batch.to(device)
              outputs = model(X_batch)
              val_outputs.append(outputs.cpu().numpy())

          oof_predictions[val_idx] = np.concatenate(val_outputs, axis=0)

      test_fold_predictions = []
      test_loader = DataLoader(TensorDataset(X_test), batch_size=1024, shuffle=False)
      with torch.no_grad():
          for X_batch in test_loader:
              X_batch = X_batch[0].to(device)
              outputs = model(X_batch)
              test_fold_predictions.append(outputs.cpu().numpy())

      test_predictions += np.concatenate(test_fold_predictions, axis=0) / n_splits

# Calculate final OOF F1 score
oof_pred_labels = np.argmax(oof_predictions, axis=1)
oof_f1 = f1_score(y_train.numpy(), oof_pred_labels, average='weighted')

print(f'Out-of-Fold F1 Score: {oof_f1:.4f}')

# Convert test predictions to class labels
test_pred_labels = np.argmax(test_predictions, axis=1)

# Assuming le_subclass is the label encoder used for encoding the SUBCLASS labels
predicted_labels = le_subclass.inverse_transform(test_pred_labels)

Fold 1/10




Epoch 1/40, Train Loss: 2.6165, Val Loss: 1.7386, Val F1: 0.3360
Epoch 2/40, Train Loss: 1.2813, Val Loss: 0.8985, Val F1: 0.5734
Epoch 3/40, Train Loss: 0.4804, Val Loss: 0.5581, Val F1: 0.6742
Epoch 4/40, Train Loss: 0.1898, Val Loss: 0.4208, Val F1: 0.7213
Epoch 5/40, Train Loss: 0.0959, Val Loss: 0.3521, Val F1: 0.7500
Epoch 6/40, Train Loss: 0.0598, Val Loss: 0.3152, Val F1: 0.7639
Epoch 7/40, Train Loss: 0.0412, Val Loss: 0.2983, Val F1: 0.7655
Epoch 8/40, Train Loss: 0.0312, Val Loss: 0.2836, Val F1: 0.7711
Epoch 9/40, Train Loss: 0.0227, Val Loss: 0.2778, Val F1: 0.7795
Epoch 10/40, Train Loss: 0.0183, Val Loss: 0.2702, Val F1: 0.7854
Epoch 11/40, Train Loss: 0.0144, Val Loss: 0.2664, Val F1: 0.7862
Epoch 12/40, Train Loss: 0.0122, Val Loss: 0.2630, Val F1: 0.7860
Epoch 13/40, Train Loss: 0.0104, Val Loss: 0.2606, Val F1: 0.7874
Epoch 14/40, Train Loss: 0.0090, Val Loss: 0.2592, Val F1: 0.7875
Epoch 15/40, Train Loss: 0.0079, Val Loss: 0.2594, Val F1: 0.7897
Epoch 16/40, Train 

KeyboardInterrupt: 

In [None]:
# Prepare the submission file

submission_dl = pd.read_csv(path+'sample_submission.csv')
submission_dl['SUBCLASS'] = predicted_labels
pd.DataFrame(test_predictions).to_csv(path + 'final_xtf_proba.csv', index=False)

Unnamed: 0_level_0,count
SUBCLASS,Unnamed: 1_level_1
COAD,540
STES,232
BRCA,229
KIRC,188
KIPAN,131
PRAD,120
LUAD,114
SARC,93
THCA,88
GBMLGG,88


In [None]:
## 여기까지

# 여기까지