In [6]:
from transformers import AutoTokenizer, AutoModel, EsmForTokenClassification, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
import torch
from Bio import SeqIO
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# ESM-2 (8M) tokenizer, reused for every sequence in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/esm2_t6_8M_UR50D"
)
# Map each FASTA record id to its SeqRecord for lookup by id.
seq_dict = SeqIO.to_dict(
    SeqIO.parse("sequences.fasta", "fasta")
)

In [3]:
# Per-residue training table. The raw `id` is split on '_' into three fields:
# the sequence id, the amino acid, and the residue position (cast to int).
train_path = 'train.tsv'
train_df = pd.read_csv(train_path, sep='\t')
train_df[['id', 'amino_acid', 'index']] = train_df['id'].str.split('_', expand=True)
train_df['index'] = train_df['index'].astype(int)

# Label maps (order of first appearance) and the integer target column.
# NOTE(review): unique() keeps NaN while factorize() codes it as -1, so the two
# only agree when `secondary_structure` has no missing values - confirm.
idx_to_labels = dict(enumerate(train_df['secondary_structure'].unique()))
labels_to_idx = {label: idx for idx, label in idx_to_labels.items()}
train_df['label'] = pd.factorize(train_df['secondary_structure'])[0]

# Distinct sequence ids present in the training table.
ids = train_df['id'].unique()

In [9]:
# Attach the full sequence string (looked up by id in the FASTA dict) and its
# length to every residue row.
train_df['sequence'] = train_df['id'].apply(
    lambda record_id: str(seq_dict[record_id].seq)
)
train_df['length'] = train_df['sequence'].str.len()

In [10]:
# One row per sequence id: keep the first `sequence` value seen for each id.
df_filt = (
    train_df
    .groupby('id')[['sequence']]
    .agg({'sequence': 'first'})
    .reset_index()
)

In [11]:
# Plain Python list of the unique sequences, in df_filt row order.
sequences = list(df_filt['sequence'])

In [12]:
# Tokenize all sequences in one padded batch. Special tokens are disabled here,
# as in every other tokenizer call in this notebook.
batch = tokenizer(
    sequences,
    return_tensors="pt",
    padding=True,
    add_special_tokens=False,
)

In [28]:
# Smoke test: load the SMALL_4 checkpoint on the GPU and run one sequence
# through it without tracking gradients.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/SMALL_4',
    num_labels=9,
    output_hidden_states=True,
).to(device='cuda')
seq = df_filt['sequence'].iloc[1]
inputs = tokenizer(
    seq,
    add_special_tokens=False,
    return_tensors="pt",
).to(device='cuda')
with torch.no_grad():
    outputs = model(**inputs)

In [13]:
# Store each sequence's padded token ids and attention mask as one array per row.
df_filt['input_ids'] = [row for row in batch['input_ids'].numpy()]
df_filt['attention_mask'] = [row for row in batch['attention_mask'].numpy()]

In [27]:
# Run the Python garbage collector, ask PyTorch to release its cached CUDA
# memory, then print the current GPU status via the nvidia-smi shell command.
import gc
gc.collect()
torch.cuda.empty_cache()
!nvidia-smi

Thu Apr 24 23:00:20 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:3B:00.0 Off |                  N/A |
| 27%   42C    P2              48W / 250W |    299MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [43]:
def get_embedding(seq):
    """Return the last hidden-state layer for one sequence as a NumPy array.

    Relies on the notebook-level ``tokenizer`` and on whichever ``model`` is
    currently bound globally; the model must return ``hidden_states``.

    Args:
        seq: A single sequence string, tokenized without special tokens.

    Returns:
        ``outputs.hidden_states[-1]`` with the leading batch axis squeezed out,
        as a NumPy array on the host.
    """
    # Run on whatever device the current model lives on: earlier cells put the
    # model on CUDA, and feeding it CPU inputs would raise a device mismatch.
    device = next(model.parameters()).device
    inputs = tokenizer(seq, add_special_tokens=False, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() on CUDA.
    return outputs.hidden_states[-1].squeeze(0).cpu().numpy()

In [42]:
def add_cols(df):
    """Add per-residue feature columns to ``df`` in place.

    For every row, the position ``index - 1`` is used to pick one element from
    each per-sequence column:

    * ``embedding`` from ``embeddings`` (fallback: a zero list of the same width)
    * ``input_id``  from ``input_ids`` (fallback: -1)
    * ``mask``      from ``attention_mask`` (fallback: 1)
    * ``letter``    from ``sequence`` (fallback: 'Z')

    The fallback is used whenever the position lies outside the sequence,
    including positions below zero.

    Args:
        df: DataFrame with the columns ``index``, ``embeddings``, ``input_ids``,
            ``attention_mask`` and ``sequence``.

    Returns:
        None. The four new columns are written onto ``df``.
    """
    def _pick(row, column, fallback):
        # `index` is treated as 1-based: subtract 1 to get the array position.
        pos = row['index'] - 1
        values = row[column]
        # The lower bound matters: a negative position would otherwise wrap
        # around and silently return an element from the end of the sequence.
        if 0 <= pos < len(values):
            return values[pos]
        # Callable fallbacks are evaluated lazily, only when actually needed.
        return fallback(row) if callable(fallback) else fallback

    df['embedding'] = df.apply(
        lambda x: _pick(x, 'embeddings', lambda r: [0] * len(r['embeddings'][0])),
        axis=1,
    )
    df['input_id'] = df.apply(lambda x: _pick(x, 'input_ids', -1), axis=1)
    # NOTE(review): out-of-range residues get mask=1 (i.e. "attended") - confirm
    # this is intended rather than 0.
    df['mask'] = df.apply(lambda x: _pick(x, 'attention_mask', 1), axis=1)
    df['letter'] = df.apply(lambda x: _pick(x, 'sequence', 'Z'), axis=1)


In [40]:
def _expand_embedding(frame):
    """Replace the ``embedding`` column with one numeric column per dimension.

    The expanded columns are labelled 0..d-1 and carry ``frame``'s own index,
    so the column-wise concat lines rows up even when the index is not 0..n-1.
    """
    emb = np.array(frame['embedding'].to_list())
    emb_df = pd.DataFrame(emb, index=frame.index)
    return pd.concat([frame.drop(columns=['embedding']), emb_df], axis=1)


def create_cat_df(df):
    """Build the training feature table from a merged per-residue frame.

    Drops the per-sequence helper columns and the raw ``secondary_structure``
    column, then expands ``embedding`` into one column per dimension.

    Args:
        df: Frame containing ``embeddings``, ``input_ids``, ``attention_mask``,
            ``secondary_structure`` and ``embedding``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    temp_df = df.drop(columns=['embeddings', 'input_ids', 'attention_mask', 'secondary_structure'])
    return _expand_embedding(temp_df)


def create_test_cat_df(df):
    """Same as ``create_cat_df`` for frames without ``secondary_structure``.

    Args:
        df: Frame containing ``embeddings``, ``input_ids``, ``attention_mask``
            and ``embedding``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    temp_df = df.drop(columns=['embeddings', 'input_ids', 'attention_mask'])
    return _expand_embedding(temp_df)

In [41]:
le = LabelEncoder()

def create_nocat_df(df, LabelEncoder=le):
    """Return a copy of ``df`` with its string columns integer-encoded.

    Drops ``sequence`` and replaces ``id``, ``amino_acid`` and ``letter`` with
    integer codes from the supplied encoder.

    Args:
        df: Frame containing ``sequence``, ``id``, ``amino_acid`` and ``letter``.
        LabelEncoder: Encoder instance to use (defaults to the notebook-level
            ``le``). The name shadows the sklearn class inside this function;
            it is kept so existing keyword callers still work.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    # Use the encoder that was passed in (previously the global was always used).
    encoder = LabelEncoder
    df_nocat = df.drop(columns=['sequence'])

    # NOTE(review): fit_transform refits the encoder for every column and every
    # call, so codes are not guaranteed to match between two frames (for
    # example train vs. test) - confirm that is acceptable.
    for col in ('id', 'amino_acid', 'letter'):
        df_nocat[col] = encoder.fit_transform(df_nocat[col].astype(str))

    return df_nocat

In [None]:
# SMALL_4 checkpoint: embed every unique training sequence, expand to one row
# per residue, and save the numeric feature table.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/SMALL_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_small = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_small = train_df.merge(df_filt_small, on=['id', 'sequence'], how='left')
add_cols(df_merge_small)  # adds the per-residue columns in place

df_cat_s = create_cat_df(df_merge_small)
df_nocat_s = create_nocat_df(df_cat_s)

# The expanded embedding columns have integer labels; make every label a string
# before writing the Feather file.
df_nocat_s.columns = df_nocat_s.columns.astype(str)
df_nocat_s.to_feather('/u/scratch/t/ttthach/P1/df_nocat_s.feather')

In [53]:
# MED_4 checkpoint: same feature-building steps as for the other checkpoints.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_m = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_m = train_df.merge(df_filt_m, on=['id', 'sequence'], how='left')
add_cols(df_merge_m)  # adds the per-residue columns in place

df_cat_m = create_cat_df(df_merge_m)
df_nocat_m = create_nocat_df(df_cat_m)

# String column labels before writing the Feather file.
df_nocat_m.columns = df_nocat_m.columns.astype(str)
df_nocat_m.to_feather('/u/scratch/t/ttthach/P1/df_nocat_m.feather')

In [None]:
# LARGE_4 checkpoint: same feature-building steps as for the other checkpoints.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/LARGE_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_l = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_l = train_df.merge(df_filt_l, on=['id', 'sequence'], how='left')
add_cols(df_merge_l)  # adds the per-residue columns in place

df_cat_l = create_cat_df(df_merge_l)
df_nocat_l = create_nocat_df(df_cat_l)

# String column labels before writing the Feather file.
df_nocat_l.columns = df_nocat_l.columns.astype(str)
df_nocat_l.to_feather('/u/scratch/t/ttthach/P1/df_nocat_l.feather')

In [14]:
# FULL_4 checkpoint: same feature-building steps as for the other checkpoints.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/FULL_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_f = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_f = train_df.merge(df_filt_f, on=['id', 'sequence'], how='left')
add_cols(df_merge_f)  # adds the per-residue columns in place

df_cat_f = create_cat_df(df_merge_f)
df_nocat_f = create_nocat_df(df_cat_f)

# String column labels before writing the Feather file.
df_nocat_f.columns = df_nocat_f.columns.astype(str)
df_nocat_f.to_feather('/u/scratch/t/ttthach/P1/df_nocat_f.feather')

In [33]:
# SMALL_MED_LARGE_FULL_4 checkpoint: same feature-building steps as the others.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_smlf = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_smlf = train_df.merge(df_filt_smlf, on=['id', 'sequence'], how='left')
add_cols(df_merge_smlf)  # adds the per-residue columns in place

df_cat_smlf = create_cat_df(df_merge_smlf)
df_nocat_smlf = create_nocat_df(df_cat_smlf)

# String column labels before writing the Feather file.
df_nocat_smlf.columns = df_nocat_smlf.columns.astype(str)
df_nocat_smlf.to_feather('/u/scratch/t/ttthach/P1/df_nocat_smlf.feather')

In [45]:
# MED_LARGE_4 checkpoint: same feature-building steps as for the other checkpoints.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_LARGE_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_ml = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_ml = train_df.merge(df_filt_ml, on=['id', 'sequence'], how='left')
add_cols(df_merge_ml)  # adds the per-residue columns in place

df_cat_ml = create_cat_df(df_merge_ml)
df_nocat_ml = create_nocat_df(df_cat_ml)

# String column labels before writing the Feather file.
df_nocat_ml.columns = df_nocat_ml.columns.astype(str)
df_nocat_ml.to_feather('/u/scratch/t/ttthach/P1/df_nocat_ml.feather')

In [14]:
# MED_LARGE_FULL_4 checkpoint: same feature-building steps as the others.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_LARGE_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_mlf = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_mlf = train_df.merge(df_filt_mlf, on=['id', 'sequence'], how='left')
add_cols(df_merge_mlf)  # adds the per-residue columns in place

df_cat_mlf = create_cat_df(df_merge_mlf)
df_nocat_mlf = create_nocat_df(df_cat_mlf)

# String column labels before writing the Feather file.
df_nocat_mlf.columns = df_nocat_mlf.columns.astype(str)
df_nocat_mlf.to_feather('/u/scratch/t/ttthach/P1/df_nocat_mlf.feather')

In [15]:
# SMALL_FULL_4 checkpoint: same feature-building steps as for the other checkpoints.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/SMALL_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_sf = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_sf = train_df.merge(df_filt_sf, on=['id', 'sequence'], how='left')
add_cols(df_merge_sf)  # adds the per-residue columns in place

df_cat_sf = create_cat_df(df_merge_sf)
df_nocat_sf = create_nocat_df(df_cat_sf)

# String column labels before writing the Feather file.
df_nocat_sf.columns = df_nocat_sf.columns.astype(str)
df_nocat_sf.to_feather('/u/scratch/t/ttthach/P1/df_nocat_sf.feather')

In [16]:
# MED_FULL_4 checkpoint: same feature-building steps as for the other checkpoints.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_mf = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_mf = train_df.merge(df_filt_mf, on=['id', 'sequence'], how='left')
add_cols(df_merge_mf)  # adds the per-residue columns in place

df_cat_mf = create_cat_df(df_merge_mf)
df_nocat_mf = create_nocat_df(df_cat_mf)

# String column labels before writing the Feather file.
df_nocat_mf.columns = df_nocat_mf.columns.astype(str)
df_nocat_mf.to_feather('/u/scratch/t/ttthach/P1/df_nocat_mf.feather')

In [17]:
# LARGE_FULL_4 checkpoint: same feature-building steps as for the other checkpoints.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/LARGE_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)
df_filt_lf = df_filt.assign(embeddings=df_filt['sequence'].apply(get_embedding))
df_merge_lf = train_df.merge(df_filt_lf, on=['id', 'sequence'], how='left')
add_cols(df_merge_lf)  # adds the per-residue columns in place

df_cat_lf = create_cat_df(df_merge_lf)
df_nocat_lf = create_nocat_df(df_cat_lf)

# String column labels before writing the Feather file.
df_nocat_lf.columns = df_nocat_lf.columns.astype(str)
df_nocat_lf.to_feather('/u/scratch/t/ttthach/P1/df_nocat_lf.feather')

In [16]:
# Run the garbage collector, then ask PyTorch to release cached CUDA memory.
# Depends on `gc` having been imported by an earlier cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Target is the integer label; every remaining column is a feature.
y = df_nocat_s['label']
x = df_nocat_s.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
# NOTE(review): the split is row-wise, so rows that share an `id` can land on
# both sides of it - confirm this is intended.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_s = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_s.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

In [45]:
# Score on the hold-out split; x_test / y_test are globals overwritten by every
# training cell, so this assumes the xgb_s training cell ran last.
xgb_s.score(X=x_test, y=y_test)

0.6230589295242104

In [54]:
# Target is the integer label; every remaining column is a feature.
y = df_nocat_m['label']
x = df_nocat_m.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_m = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_m.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[00:27:29] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[00:27:31] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.68278
[1]	validation_0-mlogloss:1.49021
[2]	validation_0-mlogloss:1.36869
[3]	validation_0-mlogloss:1.28568
[4]	validation_0-mlogloss:1.22642
[5]	validation_0-mlogloss:1.18323
[6]	validation_0-mlogloss:1.15125
[7]	validation_0-mlogloss:1.12733
[8]	validation_0-mlogloss:1.10909
[9]	validation_0-mlogloss:1.09503
[10]	validation_0-mlogloss:1.08401
[11]	validation_0-mlogloss:1.07528
[12]	validation_0-mlogloss:1.06824
[13]	validation_0-mlogloss:1.06246
[14]	validation_0-mlogloss:1.05772
[15]	validation_0-mlogloss:1.05372
[16]	validation_0-mlogloss:1.05018
[17]	validation_0-mlogloss:1.04715
[18]	validation_0-mlogloss:1.04444
[19]	validation_0-mlogloss:1.04221
[20]	validation_0-mlogloss:1.04017
[21]	v

In [55]:
# Score on the hold-out split; assumes the xgb_m training cell ran last, since
# x_test / y_test are shared globals.
xgb_m.score(X=x_test, y=y_test)

0.6547511571189354

In [64]:
# Target is the integer label; every remaining column is a feature.
y = df_nocat_l['label']
x = df_nocat_l.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_l = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_l.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[00:54:01] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[00:54:03] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.67015
[1]	validation_0-mlogloss:1.47576
[2]	validation_0-mlogloss:1.35354
[3]	validation_0-mlogloss:1.26986
[4]	validation_0-mlogloss:1.21034
[5]	validation_0-mlogloss:1.16714
[6]	validation_0-mlogloss:1.13509
[7]	validation_0-mlogloss:1.11124
[8]	validation_0-mlogloss:1.09307
[9]	validation_0-mlogloss:1.07896
[10]	validation_0-mlogloss:1.06796
[11]	validation_0-mlogloss:1.05923
[12]	validation_0-mlogloss:1.05215
[13]	validation_0-mlogloss:1.04638
[14]	validation_0-mlogloss:1.04160
[15]	validation_0-mlogloss:1.03763
[16]	validation_0-mlogloss:1.03419
[17]	validation_0-mlogloss:1.03123
[18]	validation_0-mlogloss:1.02859
[19]	validation_0-mlogloss:1.02629
[20]	validation_0-mlogloss:1.02421
[21]	v

In [65]:
# Score on the hold-out split; assumes the xgb_l training cell ran last, since
# x_test / y_test are shared globals.
xgb_l.score(X=x_test, y=y_test)

0.6603969005643897

In [17]:
# Target is the integer label; every remaining column is a feature.
y = df_nocat_f['label']
x = df_nocat_f.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_f = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_f.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[01:29:16] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[01:29:17] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.66649
[1]	validation_0-mlogloss:1.47158
[2]	validation_0-mlogloss:1.34909
[3]	validation_0-mlogloss:1.26601
[4]	validation_0-mlogloss:1.20685
[5]	validation_0-mlogloss:1.16419
[6]	validation_0-mlogloss:1.13279
[7]	validation_0-mlogloss:1.10929
[8]	validation_0-mlogloss:1.09145
[9]	validation_0-mlogloss:1.07772
[10]	validation_0-mlogloss:1.06701
[11]	validation_0-mlogloss:1.05844
[12]	validation_0-mlogloss:1.05144
[13]	validation_0-mlogloss:1.04584
[14]	validation_0-mlogloss:1.04111
[15]	validation_0-mlogloss:1.03709
[16]	validation_0-mlogloss:1.03365
[17]	validation_0-mlogloss:1.03055
[18]	validation_0-mlogloss:1.02776
[19]	validation_0-mlogloss:1.02528
[20]	validation_0-mlogloss:1.02299
[21]	v

In [18]:
# Score on the hold-out split; assumes the xgb_f training cell ran last, since
# x_test / y_test are shared globals.
xgb_f.score(X=x_test, y=y_test)

0.6620344984833295

In [34]:
# Target is the integer label; every remaining column is a feature.
y = df_nocat_smlf['label']
x = df_nocat_smlf.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_smlf = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_smlf.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[01:50:48] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[01:50:49] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.65646
[1]	validation_0-mlogloss:1.45603
[2]	validation_0-mlogloss:1.32920
[3]	validation_0-mlogloss:1.24240
[4]	validation_0-mlogloss:1.18036
[5]	validation_0-mlogloss:1.13462
[6]	validation_0-mlogloss:1.10096
[7]	validation_0-mlogloss:1.07560
[8]	validation_0-mlogloss:1.05616
[9]	validation_0-mlogloss:1.04116
[10]	validation_0-mlogloss:1.02952
[11]	validation_0-mlogloss:1.02032
[12]	validation_0-mlogloss:1.01282
[13]	validation_0-mlogloss:1.00678
[14]	validation_0-mlogloss:1.00186
[15]	validation_0-mlogloss:0.99771
[16]	validation_0-mlogloss:0.99416
[17]	validation_0-mlogloss:0.99117
[18]	validation_0-mlogloss:0.98850
[19]	validation_0-mlogloss:0.98615
[20]	validation_0-mlogloss:0.98408
[21]	v

In [36]:
# Score on the hold-out split; assumes the xgb_smlf training cell ran last,
# since x_test / y_test are shared globals.
xgb_smlf.score(X=x_test, y=y_test)

0.6694526139718303

In [46]:
# Target is the integer label; every remaining column is a feature.
y = df_nocat_ml['label']
x = df_nocat_ml.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_ml = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_ml.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[02:06:54] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[02:06:56] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.66364
[1]	validation_0-mlogloss:1.46570
[2]	validation_0-mlogloss:1.34115
[3]	validation_0-mlogloss:1.25591
[4]	validation_0-mlogloss:1.19506
[5]	validation_0-mlogloss:1.15052
[6]	validation_0-mlogloss:1.11760
[7]	validation_0-mlogloss:1.09317
[8]	validation_0-mlogloss:1.07447
[9]	validation_0-mlogloss:1.05995
[10]	validation_0-mlogloss:1.04872
[11]	validation_0-mlogloss:1.03989
[12]	validation_0-mlogloss:1.03266
[13]	validation_0-mlogloss:1.02687
[14]	validation_0-mlogloss:1.02198
[15]	validation_0-mlogloss:1.01793
[16]	validation_0-mlogloss:1.01432
[17]	validation_0-mlogloss:1.01135
[18]	validation_0-mlogloss:1.00876
[19]	validation_0-mlogloss:1.00640
[20]	validation_0-mlogloss:1.00443
[21]	v

In [47]:
# Score on the hold-out split; assumes the xgb_ml training cell ran last, since
# x_test / y_test are shared globals.
xgb_ml.score(X=x_test, y=y_test)

0.6652912321124275

In [29]:
# Uses the in-memory df_nocat_mlf; uncomment the next line to reload it from disk.
#df_nocat_mlf = pd.read_feather('/u/scratch/t/ttthach/P1/df_nocat_mlf.feather', dtype_backend='pyarrow')
y = df_nocat_mlf['label']
x = df_nocat_mlf.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_mlf = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_mlf.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[23:04:11] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[23:04:30] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.64623
[1]	validation_0-mlogloss:1.44343
[2]	validation_0-mlogloss:1.31615
[3]	validation_0-mlogloss:1.22877
[4]	validation_0-mlogloss:1.16626
[5]	validation_0-mlogloss:1.12084
[6]	validation_0-mlogloss:1.08706
[7]	validation_0-mlogloss:1.06171
[8]	validation_0-mlogloss:1.04244
[9]	validation_0-mlogloss:1.02751
[10]	validation_0-mlogloss:1.01599
[11]	validation_0-mlogloss:1.00683
[12]	validation_0-mlogloss:0.99955
[13]	validation_0-mlogloss:0.99360
[14]	validation_0-mlogloss:0.98870
[15]	validation_0-mlogloss:0.98462
[16]	validation_0-mlogloss:0.98115
[17]	validation_0-mlogloss:0.97814
[18]	validation_0-mlogloss:0.97550
[19]	validation_0-mlogloss:0.97323
[20]	validation_0-mlogloss:0.97114
[21]	v

In [30]:
# Score on the hold-out split; assumes the xgb_mlf training cell ran last,
# since x_test / y_test are shared globals.
xgb_mlf.score(X=x_test, y=y_test)

0.6744503338890458

In [31]:
# Reload the saved feature table (pyarrow-backed dtypes), replacing any
# in-memory df_nocat_sf.
df_nocat_sf = pd.read_feather(
    '/u/scratch/t/ttthach/P1/df_nocat_sf.feather',
    dtype_backend='pyarrow',
)
y = df_nocat_sf['label']
x = df_nocat_sf.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_sf = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_sf.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[23:40:52] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[23:41:08] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.67273
[1]	validation_0-mlogloss:1.47817
[2]	validation_0-mlogloss:1.35510
[3]	validation_0-mlogloss:1.27063
[4]	validation_0-mlogloss:1.21063
[5]	validation_0-mlogloss:1.16685
[6]	validation_0-mlogloss:1.13444
[7]	validation_0-mlogloss:1.11013
[8]	validation_0-mlogloss:1.09152
[9]	validation_0-mlogloss:1.07721
[10]	validation_0-mlogloss:1.06604
[11]	validation_0-mlogloss:1.05703
[12]	validation_0-mlogloss:1.04984
[13]	validation_0-mlogloss:1.04401
[14]	validation_0-mlogloss:1.03919
[15]	validation_0-mlogloss:1.03507
[16]	validation_0-mlogloss:1.03158
[17]	validation_0-mlogloss:1.02861
[18]	validation_0-mlogloss:1.02610
[19]	validation_0-mlogloss:1.02371
[20]	validation_0-mlogloss:1.02164
[21]	v

In [32]:
# Score on the hold-out split; assumes the xgb_sf training cell ran last, since
# x_test / y_test are shared globals.
xgb_sf.score(X=x_test, y=y_test)

0.6591599325760136

In [33]:
# Reload the saved feature table (pyarrow-backed dtypes), replacing any
# in-memory df_nocat_mf.
df_nocat_mf = pd.read_feather(
    '/u/scratch/t/ttthach/P1/df_nocat_mf.feather',
    dtype_backend='pyarrow',
)

y = df_nocat_mf['label']
x = df_nocat_mf.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_mf = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_mf.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[00:17:52] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[00:18:09] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.65342
[1]	validation_0-mlogloss:1.45282
[2]	validation_0-mlogloss:1.32666
[3]	validation_0-mlogloss:1.24020
[4]	validation_0-mlogloss:1.17860
[5]	validation_0-mlogloss:1.13368
[6]	validation_0-mlogloss:1.10055
[7]	validation_0-mlogloss:1.07572
[8]	validation_0-mlogloss:1.05663
[9]	validation_0-mlogloss:1.04208
[10]	validation_0-mlogloss:1.03076
[11]	validation_0-mlogloss:1.02178
[12]	validation_0-mlogloss:1.01448
[13]	validation_0-mlogloss:1.00857
[14]	validation_0-mlogloss:1.00372
[15]	validation_0-mlogloss:0.99962
[16]	validation_0-mlogloss:0.99618
[17]	validation_0-mlogloss:0.99324
[18]	validation_0-mlogloss:0.99060
[19]	validation_0-mlogloss:0.98831
[20]	validation_0-mlogloss:0.98624
[21]	v

In [34]:
# Score on the hold-out split; assumes the xgb_mf training cell ran last, since
# x_test / y_test are shared globals.
xgb_mf.score(X=x_test, y=y_test)

0.6702501813912013

In [35]:
# Reload the saved feature table (pyarrow-backed dtypes), replacing any
# in-memory df_nocat_lf.
df_nocat_lf = pd.read_feather(
    '/u/scratch/t/ttthach/P1/df_nocat_lf.feather',
    dtype_backend='pyarrow',
)

y = df_nocat_lf['label']
x = df_nocat_lf.drop(columns='label')

# 80/20 hold-out split, then 80/20 train/validation, both stratified on the label.
x_train_total, x_test, y_train_total, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_total, y_train_total, test_size=0.2, random_state=421, stratify=y_train_total
)

xgb_lf = XGBClassifier(
    objective='multi:softprob',
    num_class=9,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # monitored on the validation set passed to fit()
    n_jobs=-1,
    verbosity=2,
)

xgb_lf.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[00:55:01] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (1733270, 327, 566779290).
[00:55:17] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (433318, 327, 141694986).
[0]	validation_0-mlogloss:1.64832
[1]	validation_0-mlogloss:1.44717
[2]	validation_0-mlogloss:1.32023
[3]	validation_0-mlogloss:1.23319
[4]	validation_0-mlogloss:1.17112
[5]	validation_0-mlogloss:1.12592
[6]	validation_0-mlogloss:1.09265
[7]	validation_0-mlogloss:1.06754
[8]	validation_0-mlogloss:1.04845
[9]	validation_0-mlogloss:1.03371
[10]	validation_0-mlogloss:1.02225
[11]	validation_0-mlogloss:1.01335
[12]	validation_0-mlogloss:1.00617
[13]	validation_0-mlogloss:1.00023
[14]	validation_0-mlogloss:0.99541
[15]	validation_0-mlogloss:0.99131
[16]	validation_0-mlogloss:0.98786
[17]	validation_0-mlogloss:0.98488
[18]	validation_0-mlogloss:0.98228
[19]	validation_0-mlogloss:0.97992
[20]	validation_0-mlogloss:0.97780
[21]	v

In [36]:
# Score on the hold-out split; assumes the xgb_lf training cell ran last, since
# x_test / y_test are shared globals.
xgb_lf.score(X=x_test, y=y_test)

0.673643535365284

In [19]:
# Run the garbage collector, then ask PyTorch to release cached CUDA memory.
# Depends on `gc` having been imported by an earlier cell.
gc.collect()
torch.cuda.empty_cache()

In [38]:
# Per-residue test table, prepared the same way as the training table: split
# the raw id on '_', attach the full sequence and its length.
test_path = 'test.tsv'
test_df = pd.read_csv(test_path, sep='\t')
test_df[['id', 'amino_acid', 'index']] = test_df['id'].str.split('_', expand=True)
test_df['index'] = test_df['index'].astype(int)
test_df['sequence'] = test_df['id'].apply(
    lambda record_id: str(seq_dict[record_id].seq)
)
test_df['length'] = test_df['sequence'].str.len()

# One row per sequence id, keeping the first `sequence` value for each.
test_filt = (
    test_df
    .groupby('id')[['sequence']]
    .agg({'sequence': 'first'})
    .reset_index()
)

test_sequences = test_filt['sequence'].tolist()

# Tokenize all test sequences as one padded batch, without special tokens.
test_batch = tokenizer(
    test_sequences,
    return_tensors="pt",
    padding=True,
    add_special_tokens=False,
)

# One padded id array and one mask array per sequence row.
test_filt['input_ids'] = [row for row in test_batch['input_ids'].numpy()]
test_filt['attention_mask'] = [row for row in test_batch['attention_mask'].numpy()]

In [None]:
# SMALL_4 checkpoint: build the test feature table, save it, then save the
# class probabilities predicted by xgb_s.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/SMALL_4',
    num_labels=9,
    output_hidden_states=True,
)
test_filt_s = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_s = test_df.merge(test_filt_s, on=['id', 'sequence'], how='left')
add_cols(test_merge_s)  # adds the per-residue columns in place

test_cat_s = create_test_cat_df(test_merge_s)
# NOTE(review): create_nocat_df refits its label encoder on this frame, so the
# integer codes need not match the ones used for training - confirm.
test_nocat_s = create_nocat_df(test_cat_s)

test_nocat_s.columns = test_nocat_s.columns.astype(str)
test_nocat_s.to_feather('/u/scratch/t/ttthach/P1/test_nocat_s.feather')

x_final = test_nocat_s
y_prob_xgb_s = xgb_s.predict_proba(x_final)
prob_xgb_df_s = pd.DataFrame(y_prob_xgb_s)
prob_xgb_df_s.columns = prob_xgb_df_s.columns.astype(str)
prob_xgb_df_s.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_s.feather')

# Predicted class per row (despite the name, this is predict()'s raw output).
pred_df = xgb_s.predict(x_final)

In [56]:
# MED_4 checkpoint: build the test feature table, save it, then save the class
# probabilities predicted by xgb_m.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_4',
    num_labels=9,
    output_hidden_states=True,
)
test_filt_m = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_m = test_df.merge(test_filt_m, on=['id', 'sequence'], how='left')
add_cols(test_merge_m)  # adds the per-residue columns in place

test_cat_m = create_test_cat_df(test_merge_m)
test_nocat_m = create_nocat_df(test_cat_m)

test_nocat_m.columns = test_nocat_m.columns.astype(str)
test_nocat_m.to_feather('/u/scratch/t/ttthach/P1/test_nocat_m.feather')

x_final = test_nocat_m
y_prob_xgb_m = xgb_m.predict_proba(x_final)
prob_xgb_df_m = pd.DataFrame(y_prob_xgb_m)
prob_xgb_df_m.columns = prob_xgb_df_m.columns.astype(str)
prob_xgb_df_m.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_m.feather')

# Predicted class per row (despite the name, this is predict()'s raw output).
pred_df = xgb_m.predict(x_final)

In [66]:
# Test-set inference for the LARGE_4 checkpoint and its XGBoost classifier (xgb_l).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/LARGE_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_l = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_l = test_df.merge(test_filt_l, how='left', on=['id', 'sequence'])
add_cols(test_merge_l)  # return value unused; presumably mutates the frame in place

test_cat_l = create_test_cat_df(test_merge_l)
test_nocat_l = create_nocat_df(test_cat_l)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_l.columns = test_nocat_l.columns.astype(str)
test_nocat_l.to_feather('/u/scratch/t/ttthach/P1/test_nocat_l.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_l
y_prob_xgb_l = xgb_l.predict_proba(x_final)
prob_xgb_df_l = pd.DataFrame(y_prob_xgb_l).rename(columns=str)
prob_xgb_df_l.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_l.feather')


In [21]:
# Test-set inference for the FULL_4 checkpoint and its XGBoost classifier (xgb_f).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/FULL_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_f = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_f = test_df.merge(test_filt_f, how='left', on=['id', 'sequence'])
add_cols(test_merge_f)  # return value unused; presumably mutates the frame in place

test_cat_f = create_test_cat_df(test_merge_f)
test_nocat_f = create_nocat_df(test_cat_f)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_f.columns = test_nocat_f.columns.astype(str)
test_nocat_f.to_feather('/u/scratch/t/ttthach/P1/test_nocat_f.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_f
y_prob_xgb_f = xgb_f.predict_proba(x_final)
prob_xgb_df_f = pd.DataFrame(y_prob_xgb_f).rename(columns=str)
prob_xgb_df_f.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_f.feather')

In [37]:
# Test-set inference for the SMALL_MED_LARGE_FULL_4 checkpoint and its XGBoost
# classifier (xgb_smlf).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_smlf = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_smlf = test_df.merge(test_filt_smlf, how='left', on=['id', 'sequence'])
add_cols(test_merge_smlf)  # return value unused; presumably mutates the frame in place

test_cat_smlf = create_test_cat_df(test_merge_smlf)
test_nocat_smlf = create_nocat_df(test_cat_smlf)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_smlf.columns = test_nocat_smlf.columns.astype(str)
test_nocat_smlf.to_feather('/u/scratch/t/ttthach/P1/test_nocat_smlf.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_smlf
y_prob_xgb_smlf = xgb_smlf.predict_proba(x_final)
prob_xgb_df_smlf = pd.DataFrame(y_prob_xgb_smlf).rename(columns=str)
prob_xgb_df_smlf.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_smlf.feather')

In [49]:
# Test-set inference for the MED_LARGE_4 checkpoint and its XGBoost classifier (xgb_ml).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_LARGE_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_ml = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_ml = test_df.merge(test_filt_ml, how='left', on=['id', 'sequence'])
add_cols(test_merge_ml)  # return value unused; presumably mutates the frame in place

test_cat_ml = create_test_cat_df(test_merge_ml)
test_nocat_ml = create_nocat_df(test_cat_ml)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_ml.columns = test_nocat_ml.columns.astype(str)
test_nocat_ml.to_feather('/u/scratch/t/ttthach/P1/test_nocat_ml.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_ml
y_prob_xgb_ml = xgb_ml.predict_proba(x_final)
prob_xgb_df_ml = pd.DataFrame(y_prob_xgb_ml).rename(columns=str)
prob_xgb_df_ml.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_ml.feather')

In [44]:
# Test-set inference for the MED_LARGE_FULL_4 checkpoint and its XGBoost classifier
# (xgb_mlf).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_LARGE_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_mlf = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_mlf = test_df.merge(test_filt_mlf, how='left', on=['id', 'sequence'])
add_cols(test_merge_mlf)  # return value unused; presumably mutates the frame in place

test_cat_mlf = create_test_cat_df(test_merge_mlf)
test_nocat_mlf = create_nocat_df(test_cat_mlf)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_mlf.columns = test_nocat_mlf.columns.astype(str)
test_nocat_mlf.to_feather('/u/scratch/t/ttthach/P1/test_nocat_mlf.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_mlf
y_prob_xgb_mlf = xgb_mlf.predict_proba(x_final)
prob_xgb_df_mlf = pd.DataFrame(y_prob_xgb_mlf).rename(columns=str)
prob_xgb_df_mlf.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_mlf.feather')

In [45]:
# Test-set inference for the SMALL_FULL_4 checkpoint and its XGBoost classifier (xgb_sf).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/SMALL_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_sf = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_sf = test_df.merge(test_filt_sf, how='left', on=['id', 'sequence'])
add_cols(test_merge_sf)  # return value unused; presumably mutates the frame in place

test_cat_sf = create_test_cat_df(test_merge_sf)
test_nocat_sf = create_nocat_df(test_cat_sf)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_sf.columns = test_nocat_sf.columns.astype(str)
test_nocat_sf.to_feather('/u/scratch/t/ttthach/P1/test_nocat_sf.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_sf
y_prob_xgb_sf = xgb_sf.predict_proba(x_final)
prob_xgb_df_sf = pd.DataFrame(y_prob_xgb_sf).rename(columns=str)
prob_xgb_df_sf.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_sf.feather')

In [46]:
# Test-set inference for the MED_FULL_4 checkpoint and its XGBoost classifier (xgb_mf).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/MED_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_mf = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_mf = test_df.merge(test_filt_mf, how='left', on=['id', 'sequence'])
add_cols(test_merge_mf)  # return value unused; presumably mutates the frame in place

test_cat_mf = create_test_cat_df(test_merge_mf)
test_nocat_mf = create_nocat_df(test_cat_mf)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_mf.columns = test_nocat_mf.columns.astype(str)
test_nocat_mf.to_feather('/u/scratch/t/ttthach/P1/test_nocat_mf.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_mf
y_prob_xgb_mf = xgb_mf.predict_proba(x_final)
prob_xgb_df_mf = pd.DataFrame(y_prob_xgb_mf).rename(columns=str)
prob_xgb_df_mf.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_mf.feather')

In [47]:
# Test-set inference for the LARGE_FULL_4 checkpoint and its XGBoost classifier (xgb_lf).
# NOTE(review): `get_embedding`, `add_cols`, `create_test_cat_df` and `create_nocat_df`
# are defined outside this cell; `get_embedding` presumably reads the global `model`
# assigned here — confirm before reordering cells.
model = AutoModelForTokenClassification.from_pretrained(
    '/u/scratch/t/ttthach/P1/LARGE_FULL_4',
    num_labels=9,
    output_hidden_states=True,
)

# Embed each per-id sequence once, then left-merge the result back onto test_df.
test_filt_lf = test_filt.assign(embeddings=test_filt['sequence'].apply(get_embedding))
test_merge_lf = test_df.merge(test_filt_lf, how='left', on=['id', 'sequence'])
add_cols(test_merge_lf)  # return value unused; presumably mutates the frame in place

test_cat_lf = create_test_cat_df(test_merge_lf)
test_nocat_lf = create_nocat_df(test_cat_lf)

# Column labels are cast to str before the feature frame is written to Feather.
test_nocat_lf.columns = test_nocat_lf.columns.astype(str)
test_nocat_lf.to_feather('/u/scratch/t/ttthach/P1/test_nocat_lf.feather')

# Class probabilities for every test row, persisted for the ensemble step.
x_final = test_nocat_lf
y_prob_xgb_lf = xgb_lf.predict_proba(x_final)
prob_xgb_df_lf = pd.DataFrame(y_prob_xgb_lf).rename(columns=str)
prob_xgb_df_lf.to_feather('/u/scratch/t/ttthach/P1/prob_xgb_lf.feather')

In [38]:
# Hard class predictions from the LARGE_FULL classifier on its own test features.
# The feature frame is named explicitly instead of going through the shared
# `x_final` alias: every inference cell reassigns that alias, so the old form
# silently fed xgb_lf whichever model's features happened to be computed last.
# Note: `pred_df` is reassigned further down by the ensemble argmax.
pred_df = xgb_lf.predict(test_nocat_lf)

In [48]:
# Reload the per-model class-probability tables that the inference cells wrote to
# Feather, one frame per checkpoint/classifier pair, using pyarrow-backed dtypes.

# Single-dataset models.
prob_s = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_s.feather', dtype_backend='pyarrow')
# Fix: this line used to re-read prob_xgb_s.feather, making prob_m a duplicate of prob_s.
prob_m = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_m.feather', dtype_backend='pyarrow')
prob_l = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_l.feather', dtype_backend='pyarrow')
prob_f = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_f.feather', dtype_backend='pyarrow')

# Three- and four-way combinations.
prob_smlf = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_smlf.feather', dtype_backend='pyarrow')
prob_ml = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_ml.feather', dtype_backend='pyarrow')
prob_mlf = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_mlf.feather', dtype_backend='pyarrow')

# Pairwise combinations with FULL.
prob_sf = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_sf.feather', dtype_backend='pyarrow')
prob_mf = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_mf.feather', dtype_backend='pyarrow')
prob_lf = pd.read_feather('/u/scratch/t/ttthach/P1/prob_xgb_lf.feather', dtype_backend='pyarrow')

In [52]:
# Soft-voting ensemble: element-wise mean of the selected probability tables.
# NOTE(review): prob_s and prob_m are loaded but not part of this average —
# presumably a deliberate model selection; confirm.
ensemble_members = [
    prob_l, prob_f,
    prob_smlf, prob_ml, prob_mlf,
    prob_sf, prob_mf, prob_lf,
]
# Start the running sum from the first table so the additions happen in the
# same left-to-right order as a plain `a + b + c + ...` chain.
avg_prob = sum(ensemble_members[1:], ensemble_members[0]) / 8

In [53]:
# Per row, take the label of the column holding the highest averaged probability
# and convert those labels to an int32 array of class indices.
best_class_label = avg_prob.idxmax(axis=1)
pred_df = best_class_label.to_numpy(dtype=np.int32)

In [54]:
# Build the submission table: re-read the raw test file, attach the predicted
# class index to each row, and translate it back to its label via idx_to_labels.
test_submit = pd.read_csv(test_path, sep='\t')
test_submit['class_number'] = pred_df

decoded_labels = test_submit['class_number'].apply(lambda class_id: idx_to_labels[class_id])
test_submit['secondary_structure'] = decoded_labels

# The numeric class column was only a stepping stone; drop it before export.
test_submit = test_submit.drop(columns=['class_number'])

# Bare expression in the middle of the cell: evaluated, but not displayed.
idx_to_labels

# Tab-separated output, without the row index.
test_submit.to_csv('predictions.csv', sep='\t', index=False)

# Preview of the first rows as the cell's displayed value.
test_submit.head(50)

Unnamed: 0,id,secondary_structure
0,3JRN_LYS_8,.
1,3JRN_TYR_9,E
2,3JRN_ASP_10,E
3,3JRN_VAL_11,E
4,3JRN_PHE_12,E
5,3JRN_LEU_13,E
6,3JRN_SER_14,E
7,3JRN_PHE_15,E
8,3JRN_ARG_16,.
9,3JRN_GLY_17,.
