In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files referenced by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [None]:
# Base folder on the mounted Drive; the train/test CSV paths are built from it.
ROOT_DIR = "/content/drive/MyDrive/2024Genome_AI/open"

In [None]:
# Locations of the two CSV files underneath the shared data root.
TRAIN_DIR = ROOT_DIR + "/train.csv"
TEST_DIR = ROOT_DIR + "/test.csv"

# Read both splits in one pass (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_DIR, TEST_DIR))

In [None]:
# Display the loaded training frame (bare last expression -> rich output).
train

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TRAIN_0001,SARC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,TRAIN_0002,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TRAIN_0003,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TRAIN_0004,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6196,TRAIN_6196,LUAD,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6197,TRAIN_6197,LGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6198,TRAIN_6198,COAD,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,T181S,WT
6199,TRAIN_6199,TGCT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [None]:
# Drop columns that carry no signal: exactly one distinct value and that
# value (checked on the first row) is 'WT'.
wt_columns = train.columns[train.apply(lambda col: col.nunique() == 1 and col.iloc[0] == 'WT')]
train_cleaned = train.drop(columns=wt_columns)

# Encode the SUBCLASS strings as integer class ids; the fitted encoder is kept
# so predictions can be mapped back to the original names later.
le_subclass = LabelEncoder()
train_cleaned['SUBCLASS'] = le_subclass.fit_transform(train_cleaned['SUBCLASS'])

# Features are the remaining per-gene columns (mutation strings treated as
# text); the target is the encoded SUBCLASS.
X = train_cleaned.drop(columns=['SUBCLASS', 'ID'])
y = train_cleaned['SUBCLASS']

# Train/validation split. Stratify on the label so every class keeps the same
# proportion in both parts: the class-weight vector computed from y_train must
# have one entry per class known to the model, and macro F1 on the validation
# part needs every class represented.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Resolve the compute device in this cell: the weights tensor below is moved
# to `device`, and on a fresh kernel no earlier cell has defined it yet.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Per-class weights using scikit-learn's 'balanced' scheme, computed from the
# labels of the training split only.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

# Convert to a float tensor on the target device so it can be handed to the
# PyTorch loss function.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

In [None]:
# Keep only the entries that are not 'WT'.
def extract_mutation_info(row):
    """Render one sample row as a space-separated string of mutations.

    Every entry of ``row`` whose value differs from ``'WT'`` contributes one
    token of the form ``"<label>_<value>"`` (label = the row's index label,
    i.e. the column name when called through ``DataFrame.apply(axis=1)``).
    Tokens appear in the row's own order; a row with no such entries yields
    an empty string.
    """
    return " ".join(
        f"{gene}_{variant}"
        for gene, variant in row.items()
        if variant != 'WT'
    )

# Turn every sample (row) of both splits into its mutation string.
X_train_sequences, X_val_sequences = (
    split.apply(extract_mutation_info, axis=1) for split in (X_train, X_val)
)

In [None]:
# Load the tokenizer that matches the pretrained BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are tokenised with the same settings: truncate to at most 128
# tokens, pad, and return PyTorch tensors.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128, return_tensors='pt')
train_encodings = tokenizer(X_train_sequences.tolist(), **tokenize_kwargs)
val_encodings = tokenizer(X_val_sequences.tolist(), **tokenize_kwargs)

# Target labels as tensors.
train_labels, val_labels = (torch.tensor(labels.values) for labels in (y_train, y_val))



In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
# Resolved first so everything below can rely on it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pretrained BERT with a fresh classification head, one output per SUBCLASS.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le_subclass.classes_))
model.to(device)

# Optimizer: PyTorch's AdamW instead of the deprecated `transformers.AdamW`.
# eps and weight_decay are pinned to the values the transformers
# implementation used by default, so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Class-weighted cross entropy; the weight tensor was created in an earlier cell.
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Move the tokenised inputs and the labels to the same device as the model.
train_encodings = {key: val.to(device) for key, val in train_encodings.items()}
val_encodings = {key: val.to(device) for key, val in val_encodings.items()}
train_labels = train_labels.to(device)
val_labels = val_labels.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wrap the pre-tokenised tensors so they can be served in mini-batches.
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Early-stopping state: stop after `patience` consecutive epochs without a new
# best validation macro F1.
patience = 5
best_val_f1 = 0
counter = 0
best_model_saved = False  # becomes True once a checkpoint has been written

epochs = 100
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed below with
        # the class-weighted criterion, so the model's own loss is not needed.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

    # ---- validation pass ----
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Macro F1 on the validation split drives checkpointing and early stopping.
    val_f1 = f1_score(all_labels, all_predictions, average='macro')
    print(f"Epoch {epoch+1}/{epochs}, Val Macro F1: {val_f1}")

    # Early-stopping bookkeeping.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        counter = 0  # improvement: reset the patience counter
        torch.save(model.state_dict(), 'best_model.pt')  # keep the best weights
        best_model_saved = True
    else:
        counter += 1

    # Stop once there has been no improvement for `patience` epochs.
    if counter >= patience:
        print("Early stopping triggered.")
        break

# After training the in-memory model holds the weights of the LAST epoch, which
# is `patience` epochs past the best one when early stopping fired. Restore the
# best checkpoint so later cells predict with the best-scoring weights.
if best_model_saved:
    model.load_state_dict(torch.load('best_model.pt', map_location=device))

Epoch 1/100, Loss: 3.1649316787719726
Epoch 2/100, Loss: 2.987399858813132
Epoch 3/100, Loss: 3.1769837848601803
Epoch 4/100, Loss: 3.2878619855450046
Epoch 5/100, Loss: 3.28356131738232
Epoch 6/100, Loss: 3.2875879133901287
Epoch 7/100, Loss: 3.280689707110005
Early stopping triggered.


In [None]:
# Build the test features from the same columns the model was trained on:
# besides 'ID', also drop the all-'WT' columns that were removed from the
# training frame. Otherwise mutations in those columns would add tokens the
# training sequences never contained and shift the 128-token truncation window.
# errors='ignore' tolerates a listed column being absent from the test frame.
test_X = test.drop(columns=['ID']).drop(columns=wt_columns, errors='ignore')
test_sequences = test_X.apply(extract_mutation_info, axis=1)

# Tokenise with the same settings as the training data and move to the device.
test_encodings = tokenizer(test_sequences.tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = {key: val.to(device) for key, val in test_encodings.items()}

In [None]:
# Run inference in mini-batches rather than pushing the entire test set
# through the model in a single forward pass, which can exhaust device memory.
INFERENCE_BATCH_SIZE = 64
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=INFERENCE_BATCH_SIZE)  # no shuffle: keep row order

model.eval()
batch_predictions = []
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        outputs = model(input_ids, attention_mask=attention_mask)
        batch_predictions.append(torch.argmax(outputs.logits, dim=-1))

# One predicted class id per test row, in the original row order.
test_predictions = torch.cat(batch_predictions)

In [None]:
# Build the paths from ROOT_DIR, like the train/test CSV paths, instead of
# repeating the absolute folder (the resulting paths are unchanged).
submission = pd.read_csv(ROOT_DIR + "/sample_submission.csv")

# Map the predicted class ids back to the original SUBCLASS names.
submission["SUBCLASS"] = le_subclass.inverse_transform(test_predictions.cpu().numpy())
submission.to_csv(ROOT_DIR + "/Esm_submission.csv", index=False)

In [None]:
# Display the submission frame that was just written to disk.
submission

Unnamed: 0,ID,SUBCLASS
0,TEST_0000,HNSC
1,TEST_0001,OV
2,TEST_0002,PRAD
3,TEST_0003,GBMLGG
4,TEST_0004,BLCA
...,...,...
2541,TEST_2541,STES
2542,TEST_2542,TGCT
2543,TEST_2543,STES
2544,TEST_2544,SARC
