In [1]:
import numpy as np

from src.utils.training_utils import set_seed

In [2]:
import os
# Restrict this process to a single GPU. This must run before the first CUDA
# call in the kernel, otherwise the setting is ignored.
# CUDA enumerates devices "fastest first" by default, which need not match the
# PCI-bus order printed by `nvidia-smi`; forcing PCI_BUS_ID makes index "1"
# refer to the same GPU that `nvidia-smi` lists as GPU 1.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
# Shell escape: print the GPU inventory and current utilisation reported by the NVIDIA driver.
!nvidia-smi

Mon Aug  4 09:30:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:65:00.0 Off |                  Off |
| 74%   33C    P8             16W /  450W |    1083MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA L40S                    On  |   00

In [4]:
from src.utils.esmfinetune_3labels import MultiTaskVarDataset, MultiTaskVarCollator
from src.varmodel_MT_AllLoRA_ESM3_proj import CYPVarAM

In [None]:
from transformers import AutoTokenizer, EsmModel
from peft import LoraConfig, get_peft_model

from transformers import AutoModelForMaskedLM
# Load the 'Synthyra/ESMplusplus_small' checkpoint with a masked-LM head.
# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository — consider pinning a `revision` so that code cannot change between runs.
esm_model = AutoModelForMaskedLM.from_pretrained('Synthyra/ESMplusplus_small', trust_remote_code=True)
# The tokenizer is taken from the attribute exposed on the loaded model.
tokenizer = esm_model.tokenizer


# tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
# esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")



In [None]:
# Display the loaded model's repr to inspect its submodule names.
esm_model

In [None]:
set_seed(42)

# Collect the LoRA target module names for blocks 25-29 only: four modules
# per block, addressed by "<block index>.<module path>".
intermed_list = []
for i in range(25, 30):
    intermed_list.extend(
        f"{i}.{suffix}"
        for suffix in ("attn.layernorm_qkv.1", "attn.out_proj", "ffn.1", "ffn.3")
    )

# Rank-16 LoRA adapters with alpha 16 and no trainable bias terms.
config = LoraConfig(
    target_modules=intermed_list,
    r=16,
    lora_alpha=16,
    bias="none",
    # use_dora=True,
)
lora_esm_model = get_peft_model(esm_model, config)

# for param in esm_model.parameters():
#     param.requires_grad = False

# Wrap the LoRA-adapted encoder in the multi-task variant model (3 tasks).
model = CYPVarAM(
    esm_model=lora_esm_model,
    input_size=960,
    hidden_size=300,
    num_heads=6,
    num_tasks=3,
    drop_att=0.1,
    drop_pff=0.1,
)


In [None]:
# Display the assembled CYPVarAM model (with the LoRA-wrapped encoder) for inspection.
model

In [None]:
# Move the model to the GPU. With CUDA_VISIBLE_DEVICES set earlier, "cuda"
# refers to the single device left visible to this process.
# NOTE(review): there is no CPU fallback — this cell needs a CUDA-capable machine.
device = "cuda"
model.to(device)

In [1]:
import pickle
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files from a trusted source.
with open("data/uniprot_cyp_variant_protvar_250709.pkl", "rb") as f:
    all_cyp_variants_rev = pickle.load(f)

# Gene -> wild-type sequence lookup, built from the rows whose variant is 'WT'.
wt_seq_dict = (
    all_cyp_variants_rev
    .loc[all_cyp_variants_rev['variant'] == 'WT', ['Gene', 'Sequence']]
    .set_index('Gene')['Sequence']
    .to_dict()
)

# Attach the wild-type sequence to every row; a gene without a WT row raises KeyError.
all_cyp_variants_rev['wt_seq'] = all_cyp_variants_rev['Gene'].map(
    lambda gene: wt_seq_dict[gene]
)
#foldx_shifted = all_cyp_variants_rev['foldx_score'] - all_cyp_variants_rev['foldx_score'].min()
#all_cyp_variants_rev['foldx_score'] = np.log1p(foldx_shifted)
#all_cyp_variants_rev = all_cyp_variants_rev[all_cyp_variants_rev['Gene'].isin(['CYP2D6'])]

In [2]:
# Number of rows in the loaded table (wild-type rows included).
len(all_cyp_variants_rev)

34662

In [None]:
def standardize_column(df, column_name):
    """
    Adds a z-score standardized copy of a column to a copy of the DataFrame.

    The new column is named ``column_name + '_std'`` and holds
    ``(x - mean) / (std + 1e-8)``, where mean and std are computed over the
    whole column with the pandas defaults.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column to standardize
    column_name (str): The name of the column to standardize

    Returns:
    pandas.DataFrame: A copy of ``df`` with the extra ``<column_name>_std``
    column; the input DataFrame and the source column are left untouched
    """
    source = df[column_name]

    # Column-wide statistics used for centering and scaling
    center = source.mean()
    spread = source.std()

    # Work on a copy so the caller's DataFrame is not modified
    result = df.copy()
    # The small epsilon keeps the division finite when the spread is zero
    result[column_name + '_std'] = (source - center) / (spread + 1e-8)
    return result


In [None]:
def min_max_normalize_column(df, column_name):
    """
    Adds a min-max normalized copy of a column to a copy of the DataFrame.

    The new column is named ``column_name + '_minmax'`` and holds
    ``(x - min) / (max - min)``. When every value in the column is identical
    the new column is filled with 0.0 instead of dividing by zero.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column to normalize
    column_name (str): The name of the column to normalize

    Returns:
    pandas.DataFrame: A copy of ``df`` with the extra ``<column_name>_minmax``
    column; the input DataFrame and the source column are left untouched
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df_normalized = df.copy()

    # Extract min and max values from the column
    min_val = df[column_name].min()
    max_val = df[column_name].max()

    # Both branches write to the same output column so callers can always rely
    # on '<column_name>_minmax' existing and on the raw column being preserved.
    target_column = column_name + '_minmax'

    # Check if min and max are the same to avoid division by zero
    if min_val == max_val:
        df_normalized[target_column] = 0.0  # If all values are the same, normalize to 0
    else:
        # Apply min-max normalization formula: (x - min) / (max - min)
        df_normalized[target_column] = (df[column_name] - min_val) / (max_val - min_val)

    return df_normalized


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for validation, with a fixed seed for reproducibility.
# NOTE(review): the split is per row, so variants of the same gene can land in
# both splits — confirm that this gene-level overlap is acceptable for evaluation.
train_df, val_df = train_test_split(
    all_cyp_variants_rev,  test_size=0.1, random_state = 42)

In [None]:
# Paired text inputs: wild-type sequence and the matching variant sequence.
train_wt_texts, train_vr_texts = train_df['wt_seq'].values, train_df['Sequence'].values
val_wt_texts, val_vr_texts = val_df['wt_seq'].values, val_df['Sequence'].values

# Regression targets: one (n_rows, 1) float32 array per task and split.
# Task 1: am_score
train_labels_1 = train_df[['am_score']].astype('float32').to_numpy()
val_labels_1 = val_df[['am_score']].astype('float32').to_numpy()
# Task 2: esm1v_score
train_labels_2 = train_df[['esm1v_score']].astype('float32').to_numpy()
val_labels_2 = val_df[['esm1v_score']].astype('float32').to_numpy()
# Task 3: foldx_score
train_labels_3 = train_df[['foldx_score']].astype('float32').to_numpy()
val_labels_3 = val_df[['foldx_score']].astype('float32').to_numpy()
# Task 4 is currently disabled:
#train_labels_4 = (train_df[['conserv_score_std']]).astype('float32').values
#val_labels_4 = (val_df[['conserv_score_std']]).astype('float32').values

In [None]:
import torch

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Tokenize the wild-type (1) and variant (2) sequences of each split.
# padding=True pads every call to the longest sequence in that call, so the
# four encodings can end up with different padded lengths.
# NOTE(review): presumably MultiTaskVarDataset/MultiTaskVarCollator cope with that — confirm.
train_encodings1 = tokenizer(list(train_wt_texts), padding=True)
train_encodings2 = tokenizer(list(train_vr_texts), padding=True)

val_encodings1 = tokenizer(list(val_wt_texts), padding=True)
val_encodings2 = tokenizer(list(val_vr_texts), padding=True)


In [None]:
# Pair the wild-type/variant encodings with the three label arrays
# (am_score, esm1v_score, foldx_score — in that order) for each split.
train_dataset = MultiTaskVarDataset(train_encodings1, train_encodings2, train_labels_1, train_labels_2, train_labels_3)
val_dataset = MultiTaskVarDataset(val_encodings1, val_encodings2, val_labels_1, val_labels_2, val_labels_3)

In [None]:
# Batch collator from src.utils.esmfinetune_3labels; handed to the Trainer as data_collator.
custom_collator = MultiTaskVarCollator()

In [None]:
from src.utils.esmfinetune_3labels import eval_multitask_reg_metrics

def compute_metrics(eval_pred):
    """
    Metrics callback for the Trainer.

    Parameters:
    eval_pred: A (predictions, labels) pair that can be unpacked into two values

    Returns:
    Whatever eval_multitask_reg_metrics returns for those predictions and labels
    """
    preds, targets = eval_pred
    return eval_multitask_reg_metrics(predictions=preds, labels=targets)

In [None]:
from prettytable import PrettyTable
def count_parameters(model):
    """
    Prints a per-parameter table and the number of trainable parameters.

    Every named parameter of the model is listed with its element count and
    its requires_grad flag; only parameters with requires_grad=True contribute
    to the total.

    Parameters:
    model: An object exposing named_parameters() (e.g. a torch.nn.Module)

    Returns:
    int: The total number of elements across all trainable parameters
    """
    summary = PrettyTable(["Modules", "Parameters", "Trainable"])
    trainable_total = 0
    for param_name, tensor in model.named_parameters():
        size = tensor.numel()
        is_trainable = tensor.requires_grad
        summary.add_row([param_name, size, is_trainable])
        # Frozen parameters are listed in the table but not counted
        trainable_total += size if is_trainable else 0
    print(summary)
    print(f"Total Trainable Params: {trainable_total}")
    return trainable_total

In [None]:
# Print the per-parameter table and return the trainable-parameter total for the model.
count_parameters(model)

In [None]:
from transformers import get_cosine_schedule_with_warmup

def create_optimizer(model, training_args, lora_lr=2e-5, other_lr=1e-4, weight_decay=0.0):
    """
    Builds an AdamW optimizer with separate learning rates for LoRA and non-LoRA weights.

    Only parameters with requires_grad=True are handed to the optimizer. A
    parameter goes into the LoRA group when its name contains 'lora_';
    every other trainable parameter goes into the second group.

    Parameters:
    model (torch.nn.Module): The model whose trainable parameters are optimized
    training_args: Kept for call-site compatibility; not used by this function
    lora_lr (float): Learning rate of the LoRA group (lower than the rest by default)
    other_lr (float): Learning rate of all remaining trainable parameters
    weight_decay (float): Weight decay applied to both groups

    Returns:
    torch.optim.AdamW: Optimizer with two parameter groups, LoRA first, others second
    """
    # Split the trainable parameters into the two groups
    lora_params = []
    other_params = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights are not optimized

        if 'lora_' in name:  # LoRA adapter parameters
            lora_params.append(param)
        else:  # Other trainable parameters
            other_params.append(param)

    # Optimizer with different learning rates per group
    return torch.optim.AdamW(
        [
            {'params': lora_params, 'lr': lora_lr},
            {'params': other_params, 'lr': other_lr},
        ],
        weight_decay=weight_decay,
    )

# TrainingArguments (almost the same as before)
training_args = TrainingArguments(
    output_dir="esm3_MT_small_raw_5layer",
    # learning_rate=2e-5,  # commented out (a custom optimizer is used instead)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # 16 x 2 = 32 samples per optimizer step per device
    num_train_epochs=20,
    eval_strategy="epoch",
    save_strategy="epoch", 
    load_best_model_at_end=True,  # best is judged by metric_for_best_model below
    logging_steps=1,  # log every optimizer step
    seed=42,
    data_seed=42,
    metric_for_best_model="eval_r2_avg",  # NOTE(review): assumes compute_metrics returns an "r2_avg" key — confirm
    label_names=['labels'],
    dataloader_drop_last=True,  # incomplete final batches are dropped
    lr_scheduler_type='cosine'
)

# Trainer with custom optimizer
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` — confirm against the installed version before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=custom_collator,
    optimizers=(create_optimizer(model, training_args), None)  # (optimizer, scheduler); None leaves the scheduler to the Trainer
)

In [None]:
# Run training; per training_args this evaluates and checkpoints once per epoch
# and reloads the best checkpoint at the end.
trainer.train()