In [1]:
from transformers import AutoTokenizer, AutoModel, EsmForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
import torch
from Bio import SeqIO
import pandas as pd
import numpy as np
from datasets import Dataset

In [2]:
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = EsmForTokenClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels = 9)

Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
train_path = 'train.tsv'
train_df = pd.read_csv(train_path, sep='\t')
train_df[['id','amino_acid','index']] = train_df['id'].str.split('_', expand=True)
train_df['index'] = train_df['index'].astype(int)

# Labels:
idx_to_labels = {i: value for i, value in enumerate(train_df['secondary_structure'].unique())}
labels_to_idx = {v: k for k, v in idx_to_labels.items()}
train_df['label'] = pd.factorize(train_df['secondary_structure'])[0]

# Ids:
ids = train_df['id'].unique()

In [4]:
seq_dict = SeqIO.to_dict(SeqIO.parse("sequences.fasta", "fasta"))

In [5]:
train_df['sequence'] = train_df['id'].apply(lambda x: str(seq_dict[x].seq))
train_df['length'] = train_df['sequence'].str.len()

In [6]:
train_df = train_df[train_df['index'] < train_df['length']]

In [7]:
df_filt = train_df.groupby('id')[['sequence']].agg({'sequence': 'first'}).reset_index()

In [8]:
df_shuffled = df_filt.sample(frac=1, random_state=1).reset_index(drop=True)

In [9]:
holdout = df_shuffled.iloc[: (len(df_filt) // 20)]
df_filt = df_shuffled.iloc[(len(df_filt) // 20) + 1:]

In [10]:
id_to_length = train_df.groupby('id')['length'].first().to_dict()
id_to_indices_labels = train_df.groupby('id')[['index', 'label']].agg(list).to_dict()

id_to_labels = {}
for id in id_to_length:
    length = id_to_length[id]
    indices = id_to_indices_labels['index'][id]
    labels = id_to_indices_labels['label'][id]
    model_labels = np.full(length, -100, dtype=np.int64)

    train_idx = np.array(indices) - 1  # Adjust for zero indexing
    train_labels = np.array(labels)

    model_labels[train_idx] = train_labels
    id_to_labels[id] = model_labels

In [11]:
df_filt['labels'] = df_filt['id'].map(id_to_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filt['labels'] = df_filt['id'].map(id_to_labels)


In [12]:
def sliding_window_with_labels(sequence, labels, window_size, step_size):
    windows = []
    seq_len = len(sequence)

    for start in range(0, seq_len - window_size + 1, step_size):
        end = start + window_size
        windows.append((sequence[start:end], labels[start:end]))

    if (seq_len - window_size) % step_size != 0:
        start = max(seq_len - window_size, 0)
        end = seq_len
        windows.append((sequence[start:end], labels[start:end]))

    return windows

def create_sliding_windows(df, window_size, step_size):
    windows_data = []

    for _, row in df.iterrows():
        sequence = row['sequence']
        labels = row['labels']
        
        labels = np.array(labels)
        windows = sliding_window_with_labels(sequence, labels, window_size, step_size)
        
        for window_seq, window_labels in windows:
            windows_data.append({
                'id': row['id'],
                'sequence': window_seq,
                'labels': window_labels
            })
    
    windows_df = pd.DataFrame(windows_data)
    return windows_df


In [13]:
windows_8_4 = create_sliding_windows(df_filt, 8, 4)
windows_16_8 = create_sliding_windows(df_filt, 16, 8)
windows_32_16 = create_sliding_windows(df_filt, 32, 16)
windows_64_32 = create_sliding_windows(df_filt, 64, 32)
windows_128_64 = create_sliding_windows(df_filt, 128, 64)
windows_256_128 = create_sliding_windows(df_filt, 256, 128)

In [14]:
#windows_df_small = pd.concat([windows_8_4, windows_16_8], ignore_index=True)
windows_df_med = pd.concat([windows_32_16, windows_64_32], ignore_index=True)
windows_df_large = pd.concat([windows_64_32, windows_128_64], ignore_index=True)

In [15]:
#windows_ds_small = Dataset.from_pandas(windows_df_small).remove_columns('id')
windows_ds_med = Dataset.from_pandas(windows_df_med).remove_columns('id')
windows_ds_large = Dataset.from_pandas(windows_df_large).remove_columns('id')
ds_full = Dataset.from_pandas(df_filt).remove_columns('id')

In [16]:
def tokenize_and_label(example):
    tokenized_input = tokenizer(example['sequence'], add_special_tokens=False, return_tensors="pt")
    tokenized_input['input_ids'] = tokenized_input['input_ids'][0] 
    tokenized_input['attention_mask'] = tokenized_input['attention_mask'][0] 
    tokenized_input['labels'] = example['labels']
    return tokenized_input

In [17]:
#tokenized_ds_small = windows_ds_small.map(tokenize_and_label).remove_columns('sequence')
tokenized_ds_med = windows_ds_med.map(tokenize_and_label).remove_columns('sequence')
tokenized_ds_large = windows_ds_large.map(tokenize_and_label).remove_columns('sequence')
tokenized_ds_full = ds_full.map(tokenize_and_label).remove_columns('sequence')

Map:   0%|          | 0/134003 [00:00<?, ? examples/s]

Map:   0%|          | 0/64225 [00:00<?, ? examples/s]

Map:   0%|          | 0/5847 [00:00<?, ? examples/s]

In [18]:
#tokenized_ds_small.set_format("pt", columns=["input_ids", "attention_mask", "labels"], output_all_columns=True)
tokenized_ds_med.set_format("pt", columns=["input_ids", "attention_mask", "labels"], output_all_columns=True)
tokenized_ds_large.set_format("pt", columns=["input_ids", "attention_mask", "labels"], output_all_columns=True)
tokenized_ds_full.set_format("pt", columns=["input_ids", "attention_mask", "labels"], output_all_columns=True)

In [19]:
import random

In [20]:
def create_splits(ds, num_splits = 5):
    splits = []
    ds_shuffled = ds.shuffle(seed=len(ds))
    split_size = len(ds_shuffled) // num_splits
    
    for i in range(num_splits - 1):
        splits.append(ds_shuffled.select(range(i*split_size, (i+1)*split_size)).train_test_split(test_size=0.2, seed=123))
    
    splits.append(ds_shuffled.select(range((num_splits - 1) * split_size, len(ds_shuffled))).train_test_split(test_size=0.2, seed=123))
    return splits

In [21]:
#splits_small = create_splits(tokenized_ds_small)
splits_med = create_splits(tokenized_ds_med)
splits_large = create_splits(tokenized_ds_large)
splits_full = create_splits(tokenized_ds_full)

In [22]:
from itertools import chain

accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    flat_predictions = list(chain.from_iterable(true_predictions))
    flat_labels = list(chain.from_iterable(true_labels))
    
    results = accuracy.compute(predictions=flat_predictions, references=flat_labels)

    return {
        "accuracy": results["accuracy"],
    }

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [24]:
from transformers import AutoModelForTokenClassification, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup

In [25]:
def train_on_splits(splits, name, num_splits = 5, batch_size = 8, eval_batch_size = 8, num_epochs = 4):
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
    model = EsmForTokenClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels = 9)

    for i in range(num_splits):
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, betas=(0.9, 0.999), eps=1e-08)
        num_training_steps = len(splits[i]["train"]) // batch_size * num_epochs
        num_warmup_steps = int(0.1 * num_training_steps)  # 10% warmup
        
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
        
        training_args = TrainingArguments(
            output_dir = name + '_' + str(i),
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=eval_batch_size,
            num_train_epochs=num_epochs,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            load_best_model_at_end=True,
        )
        
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=splits[i]["train"],
            eval_dataset=splits[i]["test"],
            processing_class=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            optimizers = (optimizer, scheduler),
        )
        
        trainer.can_return_loss = True
        trainer.train()
        
        model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(name + '_' + str(i))
        model = AutoModelForTokenClassification.from_pretrained(name + '_' + str(i))
    
        gc.collect()
        torch.cuda.empty_cache()


In [26]:
def train_on_splits_sequential(splits, name, model, num_splits = 5, batch_size = 8, eval_batch_size = 8, num_epochs = 4):
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

    for i in range(num_splits):
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, betas=(0.9, 0.999), eps=1e-08)
        num_training_steps = len(splits[i]["train"]) // batch_size * num_epochs
        num_warmup_steps = int(0.1 * num_training_steps)  # 10% warmup
        
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
        
        training_args = TrainingArguments(
            output_dir = name + '_' + str(i),
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=eval_batch_size,
            num_train_epochs=num_epochs,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            load_best_model_at_end=True,
        )
        
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=splits[i]["train"],
            eval_dataset=splits[i]["test"],
            processing_class=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            optimizers = (optimizer, scheduler),
        )
        
        trainer.can_return_loss = True
        trainer.train()
        
        model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(name + '_' + str(i))
        model = AutoModelForTokenClassification.from_pretrained(name + '_' + str(i))
    
        gc.collect()
        torch.cuda.empty_cache()

In [87]:
train_on_splits(splits_small, 'SMALL', batch_size = 64, eval_batch_size = 32)

Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5945,1.432325,0.490741
2,1.419,1.407922,0.496634
3,1.3931,1.392535,0.502865
4,1.3796,1.390555,0.503113


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4036,1.398084,0.500257
2,1.3849,1.389963,0.503873
3,1.369,1.386982,0.505728
4,1.357,1.386315,0.505816


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3831,1.381695,0.505122
2,1.3676,1.378771,0.50766
3,1.3525,1.378183,0.506796
4,1.3409,1.378016,0.507616


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3759,1.373788,0.506847
2,1.3609,1.371253,0.508469
3,1.3459,1.37239,0.507693
4,1.3336,1.374565,0.507525


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3689,1.375705,0.507543
2,1.3543,1.375005,0.507534
3,1.3385,1.37472,0.507863
4,1.326,1.375407,0.507663


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [95]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits(splits_med, 'MED', batch_size = 32, eval_batch_size = 16)

Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5519,1.317934,0.552529
2,1.2787,1.270476,0.561821
3,1.2372,1.25196,0.566645
4,1.2171,1.247063,0.568557


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2492,1.233718,0.57153
2,1.216,1.235201,0.568143
3,1.1904,1.216748,0.575802
4,1.1733,1.217312,0.575493


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2091,1.21287,0.576557
2,1.1822,1.205137,0.578606
3,1.1572,1.204593,0.5796
4,1.1403,1.206364,0.579226


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1912,1.191289,0.583679
2,1.1645,1.185326,0.585035
3,1.1394,1.187752,0.585309
4,1.1225,1.188389,0.585154


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1762,1.172108,0.589473
2,1.1515,1.16981,0.590716
3,1.1268,1.170787,0.591212
4,1.1092,1.172383,0.591225


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [99]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits(splits_large, 'LARGE', batch_size = 32, eval_batch_size = 16)

Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.669,1.349055,0.553325
2,1.2927,1.278377,0.569482
3,1.2428,1.256199,0.574542
4,1.2215,1.248993,0.575931


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2344,1.212017,0.584659
2,1.1936,1.200984,0.586592
3,1.1683,1.180763,0.592496
4,1.1536,1.178746,0.593019


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1696,1.172383,0.591988
2,1.1432,1.157617,0.596845
3,1.1208,1.154438,0.598479
4,1.1077,1.153241,0.598872


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1514,1.142372,0.602592
2,1.127,1.134463,0.605879
3,1.1069,1.132644,0.607157
4,1.094,1.132817,0.60754


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1285,1.123622,0.608449
2,1.1054,1.117731,0.610568
3,1.0855,1.118349,0.610418
4,1.0726,1.116277,0.610805


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [48]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits(splits_full, 'FULL', batch_size = 4, eval_batch_size = 4)

Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.766,1.424772,0.535489
2,1.3194,1.302963,0.570147
3,1.2372,1.268079,0.579566
4,1.2026,1.259258,0.580801


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2211,1.210413,0.590954
2,1.1634,1.175169,0.599139
3,1.1253,1.163094,0.601722
4,1.1043,1.158343,0.603241


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1489,1.122527,0.614669
2,1.1112,1.106279,0.619847
3,1.0771,1.10155,0.620227
4,1.0563,1.100831,0.620299


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0981,1.058967,0.635158
2,1.0628,1.051987,0.635621
3,1.0329,1.049376,0.636566
4,1.0152,1.05052,0.636067


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0871,1.083987,0.622884
2,1.0518,1.081236,0.622811
3,1.0219,1.086322,0.623212
4,1.0046,1.083063,0.622957


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [75]:
gc.collect()
torch.cuda.empty_cache()

In [27]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_med, 'SMALL_MED',  model = AutoModelForTokenClassification.from_pretrained('SMALL_4'), 
                           batch_size = 128, eval_batch_size = 128)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2307,1.215166,0.572857
2,1.2075,1.208033,0.576526
3,1.1951,1.205444,0.577311
4,1.1873,1.203585,0.577467


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2014,1.207799,0.575028
2,1.187,1.204294,0.577039
3,1.1751,1.202103,0.57708
4,1.1677,1.202682,0.577753


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1942,1.192813,0.581335
2,1.181,1.19034,0.581868
3,1.1697,1.189495,0.582406
4,1.1625,1.189525,0.582198


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1896,1.180373,0.585177
2,1.1763,1.17679,0.587213
3,1.1646,1.175749,0.58737
4,1.1581,1.175457,0.587504


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1773,1.169686,0.590403
2,1.1637,1.166936,0.590944
3,1.1512,1.166812,0.590815
4,1.1438,1.166649,0.591127


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [36]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_large, '/u/scratch/t/ttthach/P1/SMALL_MED_LARGE',  model = AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_MED_4'), 
                           batch_size = 128, eval_batch_size = 128)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.116,1.099093,0.615941
2,1.1018,1.096191,0.616676
3,1.0934,1.094045,0.617814
4,1.0871,1.094155,0.617564


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1103,1.105067,0.612663
2,1.099,1.102254,0.614273
3,1.0879,1.100756,0.614536
4,1.0829,1.10069,0.614799


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.104,1.109064,0.61024
2,1.0908,1.106867,0.610399
3,1.0832,1.106033,0.610738
4,1.0761,1.106814,0.610738


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0966,1.106107,0.612682
2,1.0833,1.105044,0.613778
3,1.0736,1.10445,0.613992
4,1.0671,1.105006,0.613978


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.093,1.106647,0.61041
2,1.0797,1.102345,0.611182
3,1.0696,1.102968,0.611433
4,1.0639,1.10206,0.612008


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [38]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_full, '/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_FULL',  model = AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_4'), 
                           batch_size = 4, eval_batch_size = 4)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0146,1.052108,0.627503
2,0.9822,1.051482,0.628961
3,0.9527,1.056971,0.628241
4,0.9326,1.060925,0.627263


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.02,1.039116,0.630042
2,0.9908,1.033415,0.632962
3,0.9648,1.034239,0.631595
4,0.945,1.036229,0.631831


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.027,1.012668,0.642496
2,0.999,1.009022,0.643528
3,0.9674,1.0117,0.642514
4,0.9465,1.015542,0.642098


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0129,0.979284,0.65452
2,0.9834,0.977719,0.653771
3,0.9551,0.980655,0.653432
4,0.9374,0.982571,0.653592


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0161,1.021339,0.640397
2,0.9845,1.020484,0.641127
3,0.9538,1.027319,0.640689
4,0.9353,1.029371,0.640342


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [40]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_large, '/u/scratch/t/ttthach/P1/MED_LARGE',  model = AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_4'), 
                           batch_size = 128, eval_batch_size = 128)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0954,1.079072,0.62438
2,1.0804,1.07432,0.626003
3,1.0711,1.072306,0.626499
4,1.0645,1.072463,0.62645


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0886,1.084376,0.622294
2,1.0766,1.08246,0.623203
3,1.065,1.08182,0.623301
4,1.0599,1.081356,0.62387


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.084,1.088537,0.619649
2,1.0703,1.086844,0.620273
3,1.0615,1.086329,0.620641
4,1.0543,1.086958,0.620868


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0778,1.086046,0.621999
2,1.0634,1.084369,0.623368
3,1.0532,1.084196,0.623719
4,1.0462,1.084444,0.623622


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.073,1.086235,0.619143
2,1.0584,1.083289,0.61994
3,1.0478,1.084415,0.62007
4,1.0416,1.083847,0.619867


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [49]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_full, '/u/scratch/t/ttthach/P1/MED_LARGE_FULL',  model = AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_LARGE_4'), 
                           batch_size = 4, eval_batch_size = 4)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9903,1.029323,0.637505
2,0.9576,1.029736,0.636596
3,0.928,1.035997,0.637111
4,0.9073,1.038994,0.63639


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9978,1.01499,0.64054
2,0.9689,1.01324,0.640827
3,0.9429,1.013454,0.641941
4,0.9234,1.015615,0.641789


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0075,0.990164,0.652527
2,0.9778,0.986029,0.653884
3,0.945,0.989193,0.653975
4,0.9241,0.992531,0.653703


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9906,0.957836,0.662952
2,0.9603,0.95854,0.662061
3,0.9314,0.963865,0.659636
4,0.9134,0.965974,0.659565


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9934,1.001542,0.647895
2,0.9615,1.001965,0.646946
3,0.9308,1.010912,0.647493
4,0.9119,1.013103,0.646399


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [53]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_full, '/u/scratch/t/ttthach/P1/LARGE_FULL',  model = AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/LARGE_4'), 
                           batch_size = 4, eval_batch_size = 4)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.02,1.055909,0.630745
2,0.9873,1.055037,0.630059
3,0.9579,1.060589,0.630196
4,0.9377,1.062977,0.630145


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0221,1.041839,0.633013
2,0.9951,1.037011,0.633232
3,0.9687,1.037691,0.633131
4,0.9505,1.039464,0.633232


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0315,1.009978,0.645266
2,1.0022,1.00553,0.646461
3,0.971,1.008345,0.646642
4,0.9506,1.011699,0.6459


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0137,0.981804,0.656249
2,0.9842,0.980771,0.656053
3,0.9569,0.983512,0.655108
4,0.9398,0.985319,0.655268


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0172,1.017091,0.643316
2,0.9845,1.016937,0.644392
3,0.9548,1.023861,0.643754
4,0.9367,1.023006,0.64317


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [55]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_full, '/u/scratch/t/ttthach/P1/SMALL_FULL',  model = AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_4'), 
                           batch_size = 4, eval_batch_size = 4)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1673,1.16514,0.594235
2,1.1029,1.149861,0.597684
3,1.0702,1.145555,0.59952
4,1.0509,1.144315,0.600412


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1106,1.116447,0.608321
2,1.0792,1.108173,0.611156
3,1.0534,1.105051,0.612203
4,1.0352,1.103676,0.612051


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0977,1.087874,0.618109
2,1.0695,1.073339,0.623196
3,1.039,1.072366,0.624518
4,1.0202,1.073746,0.624391


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0707,1.034901,0.638563
2,1.0416,1.030194,0.63933
3,1.0149,1.030031,0.639526
4,0.9985,1.030325,0.639597


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0707,1.070051,0.625091
2,1.0383,1.068637,0.626003
3,1.0096,1.076935,0.624307
4,0.9935,1.072472,0.625164


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [61]:
gc.collect()
torch.cuda.empty_cache()
train_on_splits_sequential(splits_full, '/u/scratch/t/ttthach/P1/MED_FULL',  model = AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_4'), 
                           batch_size = 4, eval_batch_size = 4)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0301,1.064976,0.62565
2,0.995,1.064351,0.625066
3,0.9664,1.067723,0.626113
4,0.947,1.070214,0.625084


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0294,1.044552,0.631814
2,1.0012,1.040045,0.633755
3,0.9761,1.040055,0.633401
4,0.9577,1.041577,0.634076


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0358,1.017545,0.643058
2,1.0072,1.011851,0.646461
3,0.9763,1.014747,0.645176
4,0.9564,1.017306,0.645158


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0156,0.983757,0.654199
2,0.9869,0.983384,0.65427
3,0.9598,0.98696,0.653378
4,0.9429,0.98892,0.652612


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0186,1.023105,0.641254
2,0.9872,1.023357,0.640817
3,0.9576,1.030679,0.640689
4,0.9401,1.031882,0.639357


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


In [31]:
gc.collect()
torch.cuda.empty_cache()

holdout['labels'] = holdout['id'].map(id_to_labels)
holdout_ds = Dataset.from_pandas(holdout).remove_columns('id')
tokenized_ds_holdout = holdout_ds.map(tokenize_and_label).remove_columns('sequence')
tokenized_ds_holdout.set_format("pt", columns=["input_ids", "attention_mask", "labels"], output_all_columns=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holdout['labels'] = holdout['id'].map(id_to_labels)


Map:   0%|          | 0/307 [00:00<?, ? examples/s]

In [29]:
def create_predictions(ds, model, tokenizer, data_collator, compute_metrics, dir_name):
    test_args = TrainingArguments(
        output_dir = dir_name,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 4,
        dataloader_drop_last = False
    )
    
    # init trainer
    trainer = Trainer(
                  model = model,
                  args = test_args,
                  processing_class=tokenizer,
                  data_collator=data_collator,
                  compute_metrics = compute_metrics)
    
    results = trainer.predict(ds)
    return results

In [30]:
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    flat_predictions = list(chain.from_iterable(true_predictions))
    flat_labels = list(chain.from_iterable(true_labels))
    
    results = accuracy.compute(predictions=flat_predictions, references=flat_labels)

    return {
        "accuracy": results["accuracy"],
    }

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

In [34]:
small_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/SMALL_VAL')
small_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.2701047658920288,
 'test_model_preparation_time': 0.003,
 'test_accuracy': 0.5677826297913634,
 'test_runtime': 3.2317,
 'test_samples_per_second': 94.996,
 'test_steps_per_second': 23.826}

In [35]:
med_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/MED_VAL')
med_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0722001791000366,
 'test_model_preparation_time': 0.0024,
 'test_accuracy': 0.6194357801344701,
 'test_runtime': 3.1858,
 'test_samples_per_second': 96.365,
 'test_steps_per_second': 24.17}

In [55]:
large_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('LARGE_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='LARGE_VAL')
large_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0640357732772827,
 'test_model_preparation_time': 0.0032,
 'test_accuracy': 0.625133430373605,
 'test_runtime': 2.03,
 'test_samples_per_second': 151.228,
 'test_steps_per_second': 37.93}

In [56]:
full_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='FULL_VAL')
full_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0721930265426636,
 'test_model_preparation_time': 0.0031,
 'test_accuracy': 0.6230401330838012,
 'test_runtime': 2.0259,
 'test_samples_per_second': 151.537,
 'test_steps_per_second': 38.008}

In [33]:
sm_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_MED_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/SMALL_MED_VAL')
sm_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.1020615100860596,
 'test_model_preparation_time': 0.0029,
 'test_accuracy': 0.6077909475289388,
 'test_runtime': 3.3362,
 'test_samples_per_second': 92.022,
 'test_steps_per_second': 23.08}

In [37]:
sml_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_VAL')
sml_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0752309560775757,
 'test_model_preparation_time': 0.0032,
 'test_accuracy': 0.6175781520759687,
 'test_runtime': 3.3017,
 'test_samples_per_second': 92.983,
 'test_steps_per_second': 23.322}

In [39]:
smlf_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_FULL_VAL')
smlf_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.052809476852417,
 'test_model_preparation_time': 0.0032,
 'test_accuracy': 0.6277396548138906,
 'test_runtime': 3.2971,
 'test_samples_per_second': 93.112,
 'test_steps_per_second': 23.354}

In [41]:
ml_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_LARGE_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/MED_LARGE_VAL')
ml_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0496424436569214,
 'test_model_preparation_time': 0.0034,
 'test_accuracy': 0.6273237679351217,
 'test_runtime': 3.291,
 'test_samples_per_second': 93.285,
 'test_steps_per_second': 23.397}

In [52]:
mlf_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_LARGE_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/MED_LARGE_FULL_VAL')
mlf_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0312223434448242,
 'test_model_preparation_time': 0.0026,
 'test_accuracy': 0.6359187634296805,
 'test_runtime': 3.2167,
 'test_samples_per_second': 95.441,
 'test_steps_per_second': 23.938}

In [54]:
lf_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/LARGE_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/LARGE_FULL_VAL')
lf_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0366559028625488,
 'test_model_preparation_time': 0.0045,
 'test_accuracy': 0.6333541276772717,
 'test_runtime': 3.2933,
 'test_samples_per_second': 93.22,
 'test_steps_per_second': 23.381}

In [56]:
sf_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/SMALL_FULL_VAL')
sf_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0622258186340332,
 'test_model_preparation_time': 0.0033,
 'test_accuracy': 0.6229985443959243,
 'test_runtime': 3.2846,
 'test_samples_per_second': 93.468,
 'test_steps_per_second': 23.443}

In [62]:
mf_results = create_predictions(tokenized_ds_holdout, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/MED_FULL_VAL')
mf_results.metrics

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


{'test_loss': 1.0335673093795776,
 'test_model_preparation_time': 0.003,
 'test_accuracy': 0.6340195466833022,
 'test_runtime': 3.2641,
 'test_samples_per_second': 94.052,
 'test_steps_per_second': 23.59}

In [62]:
avg_result = (0.33*med_results.predictions + 0.33*full_results.predictions + 0.34*large_results.predictions)

In [63]:
predictions = avg_result.argmax(axis=2)

In [64]:
labels = holdout['labels']

In [65]:
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


In [66]:
    flat_predictions = list(chain.from_iterable(true_predictions))
    flat_labels = list(chain.from_iterable(true_labels))
    
    results = accuracy.compute(predictions=flat_predictions, references=flat_labels)
results

{'accuracy': 0.6286407430512234}

In [43]:
test_path = 'test.tsv'
test_df = pd.read_csv(test_path, sep='\t')
test_df[['id','amino_acid','index']] = test_df['id'].str.split('_', expand=True)
test_df['index'] = test_df['index'].astype(int)
test_df['sequence'] = test_df['id'].apply(lambda x: str(seq_dict[x].seq))
test_filt = test_df.groupby('id')[['sequence']].agg({'sequence': 'first'}).reset_index()

test_ds_filt = Dataset.from_pandas(test_filt).remove_columns('id')

In [44]:
def tokenize_and_convert(example):
    tokenized_input = tokenizer(example['sequence'], add_special_tokens=False, return_tensors="pt")
    tokenized_input['input_ids'] = tokenized_input['input_ids'][0] 
    tokenized_input['attention_mask'] = tokenized_input['attention_mask'][0] 
    return tokenized_input

In [45]:
tokenized_test_ds = test_ds_filt.map(tokenize_and_convert).remove_columns('sequence')
tokenized_test_ds.set_format("pt", columns=["input_ids", "attention_mask"], output_all_columns=True)

Map:   0%|          | 0/1552 [00:00<?, ? examples/s]

In [28]:
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    flat_predictions = list(chain.from_iterable(true_predictions))
    flat_labels = list(chain.from_iterable(true_labels))
    
    results = accuracy.compute(predictions=flat_predictions, references=flat_labels)

    return {
        "accuracy": results["accuracy"],
    }

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

In [31]:
small_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('SMALL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='SMALL_TEST')
#y_pred = np.argmax(small_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [32]:
med_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('MED_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='MED_TEST')
#y_pred = np.argmax(med_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [64]:
large_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/LARGE_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/LARGE_TEST')
#y_pred = np.argmax(large_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [65]:
full_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/FULL_TEST')
#y_pred = np.argmax(full_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [63]:
smlf_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/SMALL_MED_LARGE_FULL_TEST')

#y_pred = np.argmax(smlf_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [66]:
ml_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_LARGE_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/MED_LARGE_TEST')

#y_pred = np.argmax(mlf_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [67]:
mlf_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_LARGE_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/MED_LARGE_FULL_TEST')

#y_pred = np.argmax(mlf_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [68]:
lf_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/LARGE_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/LARGE_FULL_TEST')


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [69]:
sf_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/SMALL_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/SMALL_FULL_TEST')


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [73]:
mf_test = create_predictions(tokenized_test_ds, AutoModelForTokenClassification.from_pretrained('/u/scratch/t/ttthach/P1/MED_FULL_4'), 
                                   tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, 
                                   dir_name='/u/scratch/t/ttthach/P1/MED_FULL_TEST')

#y_pred = np.argmax(mf_test.predictions, axis=2)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


In [74]:
pred = (large_test.predictions + full_test.predictions + smlf_test.predictions + ml_test.predictions + mlf_test.predictions + lf_test.predictions + sf_test.predictions + mf_test.predictions) / 8

In [75]:
y_pred = np.argmax(pred, axis=2)

In [76]:
pred_df = pd.DataFrame(y_pred)

In [47]:
#test_prob = test_filt.copy() #.drop(columns=['predictions'])
#test_prob['prob'] = prob.apply(lambda x: x.tolist(), axis=1)
#prob_merge = pd.merge(test_df, test_prob, on=['id','sequence'], how='left')
#prob_merge['probability'] = prob_merge.apply(lambda x: x['prob'][x['index'] - 1], axis=1)
#prob_merge = pd.merge(test_df, test_prob, on=['id','sequence'], how='left')
#prob_merge['prob_list'] = prob_merge.apply(lambda x: x['prob'][9*(x['index'] - 1):9*(x['index'])], axis=1)
#test_prob_submit = pd.read_csv(test_path, sep='\t')
#test_prob_submit['prob'] = prob_merge['prob_list']
#test_prob_submit.to_csv('/u/scratch/t/ttthach/P1/test_prob.csv', sep='\t', index=False)
#prob_df = test_prob_submit['prob'].apply(pd.Series)

In [77]:
test_filt['predictions'] = pred_df.apply(lambda x: x.tolist(), axis=1)

test_df_merged = pd.merge(test_df, test_filt, on=['id','sequence'], how='left')

test_df_merged['class_number'] = test_df_merged.apply(lambda x: x['predictions'][x['index'] - 1], axis=1)

test_df_merged['secondary_structure'] = test_df_merged['class_number'].apply(lambda x: idx_to_labels[x])

test_df_submit = pd.read_csv(test_path, sep='\t')

test_df_submit['secondary_structure'] = test_df_merged['secondary_structure']

test_df_submit.to_csv('predictions.csv', sep='\t', index=False)

test_df_submit.head(50)

Unnamed: 0,id,secondary_structure
0,3JRN_LYS_8,.
1,3JRN_TYR_9,E
2,3JRN_ASP_10,E
3,3JRN_VAL_11,E
4,3JRN_PHE_12,E
5,3JRN_LEU_13,E
6,3JRN_SER_14,E
7,3JRN_PHE_15,E
8,3JRN_ARG_16,.
9,3JRN_GLY_17,.
