# Setup
---
---


## Google Drive Setup
---

In [None]:
##  @brief  :   Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Libraries and Installations
---

### **Set up Environment**

In [None]:
!pip install -q torchsummaryX einops "fair-esm[esmfold]" transformers sentencepiece lxml pyfaidx
# OpenFold and its remaining dependency
!pip install fair-esm  # latest release, OR:
!pip install git+https://github.com/facebookresearch/esm.git



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m101.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.3/510.3 kB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m11.9 MB/s

In [None]:
!pip install -q protobuf

### **Imports**

In [None]:
##  @brief  :   Libraries for File I/O & Parsing
import os
import sys
import glob
import pickle
import json
import csv
import argparse
from io import StringIO
import shutil
from urllib.parse import urlparse
from urllib.request import urlopen

In [None]:
##  @brief  :   Libraries for Data Analysis / Stats / Pre-processing
import random
import math
import time
import re
import itertools
from datetime import datetime, timedelta
import pickle
from numbers import Number

import numpy as np
import pandas as pd
import h5py
from scipy.stats import pearsonr

from tqdm import tqdm
from tqdm.notebook import tqdm as blue_tqdm

from IPython.display import display

In [None]:
##  @brief  :   Torch Libraries
import torch
import torch.nn as nn
from torch import nn, Tensor, einsum
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence, unpad_sequence

import einops
from einops.layers.torch import Rearrange, Reduce
from einops import rearrange, repeat

In [None]:
##  @brief  :   Keras & TF Libraries
import tensorflow as tf
from tensorflow import keras
from keras import backend as K

In [None]:
##  @brief  :   Imports for External / Pre-Trained PLMs
import esm
from transformers import T5EncoderModel, T5Tokenizer
import sentencepiece
from lxml import etree
from pyfaidx import Faidx


##  @brief  :   Import Local Module for ProteinBERT
##  @note   :   Replace all instances of log function in module with print
proteinBERT_DIR = '/content/drive/MyDrive/11785 - Deep Learning/IDL_Project/Project/ProteinBERT/proteinbert_keras/proteinbert'
sys.path.insert(0,proteinBERT_DIR)

from tokenization import ADDED_TOKENS_PER_SEQ, index_to_token, token_to_index
from model_generation import ModelGenerator, PretrainingModelGenerator, FinetuningModelGenerator, InputEncoder, load_pretrained_model_from_dump, tokenize_seqs
from existing_model_loading import load_pretrained_model
from finetuning import OutputType, OutputSpec, finetune, evaluate_by_len
from conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

## Compute Specifications
---

In [None]:
##  @brief  :   Check GPU/CPU Specs
!nvidia-smi

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", DEVICE)

Sun Dec 10 16:04:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## W&B
---

In [None]:
!pip install wandb --quiet
import wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.8/252.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
##  @brief  : W&B Login with API Key
##  @note   : The key is read from the WANDB_API_KEY environment variable so that no
##            credential is stored in the notebook. When the variable is not set,
##            key=None lets wandb fall back to its own lookup / interactive prompt.
##  @note   : The key that was previously hard-coded in this cell has been exposed
##            and should be revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# File Paths
---
---

In [None]:
##  @brief  :   Root Path to Git Folder
ROOT = '/content/drive/MyDrive/11785 - Deep Learning/IDL_Project/Project/SPOT-1D-LM/'

MODEL_DATA_ROOT = os.path.join(ROOT, "spot_1d_lm")

##  @brief  :   Path containing Model Inputs
INPUT_DIR = os.path.join(ROOT, "inputs")

##  @brief  :   Path to Generated 1-hot encoded Sequences
ONEHOT_INPUT_DIR = os.path.join(INPUT_DIR, "one_hot")

##  @brief  :   Path containing Labels
LABEL_DIR = os.path.join(MODEL_DATA_ROOT, "labels")


## Dataset File Paths
---

In [None]:
##  @brief  :   Path to Dataset Folder -> Contains Lists of Paths to Fasta Files
DATASET_DIR = os.path.join(MODEL_DATA_ROOT, "lists")

##  @brief  :   Directory containing all available fasta files
FASTA_DIR = os.path.join(MODEL_DATA_ROOT, "fasta")

##  @brief  :   Directory to store csvs with processed inputs
DF_DIR = os.path.join(INPUT_DIR, "dataframes")

##  @brief  :   Directory to store compressed embeddings
FEAT_DIR = os.path.join(INPUT_DIR, "compressed_inputs")

### **Train Sets & Dev Sets**

In [None]:
##  @brief  :   Path to Train Data
TRAIN_FILE = os.path.join(DATASET_DIR, "train.txt")
TRAIN_DF = os.path.join(DF_DIR, "train.csv")
TRAIN_INPUTS = os.path.join(FEAT_DIR, "train")

##  @brief  :   Path to Validation Data
VAL_FILE = os.path.join(DATASET_DIR, "validation.txt")
VAL_DF = os.path.join(DF_DIR, "validation.csv")

### **Test Sets**

In [None]:
##  @brief  :   Dictionary of Test Files
TEST_FILES = {
    "SPOT-2016-HQ"  : os.path.join(DATASET_DIR, "SPOT-2016-HQ.txt"),
    "SPOT-2016"  : os.path.join(DATASET_DIR, "SPOT-2016.txt"),
    "SPOT-2018-HQ"  : os.path.join(DATASET_DIR, "SPOT-2018-HQ.txt"),
    "SPOT-2018"  : os.path.join(DATASET_DIR, "SPOT-2018.txt"),
    "CASP-12"  : os.path.join(DATASET_DIR, "casp12.txt"),
    "CASP-13"  : os.path.join(DATASET_DIR, "casp13_fm_17.txt"),
    "TEST-2018"  : os.path.join(DATASET_DIR, "test2018.txt"),
}

TEST_DFS = {
    "SPOT-2016-HQ"  : os.path.join(DF_DIR, "SPOT-2016-HQ.csv"),
    "SPOT-2016"  : os.path.join(DF_DIR, "SPOT-2016.csv"),
    "SPOT-2018-HQ"  : os.path.join(DF_DIR, "SPOT-2018-HQ.csv"),
    "SPOT-2018"  : os.path.join(DF_DIR, "SPOT-2018.csv"),
    "CASP-12"  : os.path.join(DF_DIR, "casp12.csv"),
    "CASP-13"  : os.path.join(DF_DIR, "casp13_fm_17.csv"),
    "TEST-2018"  : os.path.join(DF_DIR, "test2018.csv"),
}

## Model File Paths
---

### **Model Checkpoints**

In [None]:
CHECKPOINTS_DIR = os.path.join(MODEL_DATA_ROOT, "checkpoints")

Pretrained Models

In [None]:
##  @brief  :   Path to ESM-1b Checkpoints
ESM_CHECKPOINTS_DIR = os.path.join(CHECKPOINTS_DIR, "esm-1b")

##  @brief  :   Path to ProteinBERT
PB_CHECKPOINTS_DIR = os.path.join(CHECKPOINTS_DIR, "proteinBERT")

### **Model Embeddings**
*Embeddings from Pre-Trained Model to Use as Inputs to Spot-1D-Single*

In [None]:
##  @brief  :   Path to Generated ESM-1b Embeddings
ESM1B_INPUT_DIR = os.path.join(INPUT_DIR, "esm")

##  @brief  :   Path to Generated ProtTrans Embeddings
PT_INPUT_DIR = os.path.join(INPUT_DIR, "protTrans")

##  @brief  :   Path to Generated ProteinBERT Embeddings
PB_INPUT_DIR = os.path.join(INPUT_DIR, "proteinBERT")

## Label File Paths
---

In [None]:
##  @brief  :   Directory containing all available dssp ground truth files
DSSP_DIR = os.path.join(MODEL_DATA_ROOT, "dssp")

##  @brief  :   Directory containing all available theta ground truth files
THETA_DIR = os.path.join(MODEL_DATA_ROOT, "theta")

##  @brief  :   Directory containing all available hse ground truth files
HSE_DIR = os.path.join(MODEL_DATA_ROOT, "hse")

# Configurations
---
---

In [None]:
##  @brief  :   Run configuration: dataset list files, embedding names and
##              optimisation hyper-parameters, collected in a single dictionary
config = dict(
    file_list_train = "spot_1d_lm/lists/train.txt",
    ## NOTE(review): VAL_FILE (File Paths section) points at "validation.txt",
    ## this entry at "val.txt" - confirm which file name is the right one
    file_list_val   = "spot_1d_lm/lists/val.txt",
    file_list_test  = "spot_1d_lm/lists/casp12.txt",
    embedding1      = "esm-1b",
    embedding2      = "proteinBERT",
    batch_size      = 10,
    epochs           = 100,
    loss            = torch.nn.CrossEntropyLoss(),
    ## NOTE(review): hard-coded GPU index, while the notebook derives DEVICE from
    ## torch.cuda.is_available() - confirm which of the two is actually used
    device          = "cuda:3",
    optimizer       = 'adam',
    weight_decay   = 1e-4,
    momentum        = 0.9,
    lr   = 2e-4,
    run             = 1
)

In [None]:
CURRENT_CONFIG = config

# Pre-Trained Models
---
---

## ESM-1b
---

In [None]:
ESM_EMBEDDING_DIM = 1280

In [None]:
class PreTrainedESM1b():
    """!  Pre Trained ESM Model Class

          Wraps a pre-trained ESM protein language model and caches one per-residue
          embedding array per protein as "<protein>_esm.npy" inside input_path.

          @note   :   NOTE(review): the checkpoint loaded in __init__ is
                      esm1v_t33_650M_UR90S_1 (ESM-1v) although the class is named
                      after ESM-1b. It is deliberately left unchanged so embeddings
                      already cached on disk stay consistent - confirm which
                      checkpoint is intended.
    """

    def __init__(self, model_path, input_path):
        """!  Pre Trained ESM Model Class initializer

              @param[in]  :   model_path = path to saved model checkpoints
                              (stored only; the weights are fetched by esm.pretrained)
              @param[in]  :   input_path = path to saved embeddings
        """
        self.model_path = model_path
        self.input_path = input_path

        self.embedding_dim = ESM_EMBEDDING_DIM

        ## Load Instance of Pre-Trained ESM Model
        self.model, self.alphabet = esm.pretrained.esm1v_t33_650M_UR90S_1()
        self.batch_converter = self.alphabet.get_batch_converter()
        self.model = self.model.to(DEVICE)
        ## Inference only: disable dropout so that embeddings are deterministic
        self.model.eval()


    def generate_embedding(self, seq, prot_name):
        """!  Generate Embedding for a single protein sequence

              @param[in]  :   seq - protein sequence as a string
              @param[in]  :   prot_name - name of protein sequence

              @result     :   save_arr - np sequence embedding with one row per residue
        """
        data = [(prot_name, seq)]
        _, _, batch_tokens = self.batch_converter(data)
        batch_tokens = batch_tokens.to(DEVICE)

        ## Extract per-residue representations of layer 33.
        ## Contact maps are not requested: they were computed and thrown away before.
        with torch.no_grad():
            results = self.model(batch_tokens, repr_layers=[33])
        token_representations = results["representations"][33]

        # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
        ## .cpu() is a no-op for tensors that already live on the CPU
        save_arr = token_representations[0, 1: len(seq) + 1].cpu().numpy()

        del batch_tokens
        return save_arr

    def process_dataset(self, file_list):
        """!  Generate Embeddings for all sequences in a given Dataset

              Proteins that already have a cached embedding are skipped. A failure on
              one protein is reported together with its cause and does not stop the run.
              Uses the notebook helpers read_list and read_fasta_file.

              @param[in]  :   file_list - list of protein files for a dataset

              @result     :   None
        """
        ## Without the directory np.save fails for every protein
        os.makedirs(self.input_path, exist_ok=True)

        ## Extract Files from list and iteratively process
        prot_list = read_list(file_list)
        for prot_path in tqdm(prot_list):
            prot_name = prot_path.split('/')[-1].split('.')[0]
            save_path = os.path.join(self.input_path, prot_name + "_esm.npy")
            ## Nothing to do when the embedding is already cached
            if os.path.isfile(save_path):
                continue
            try:
                ## Extract Protein Sequence as a String & Process through Model
                seq = read_fasta_file(prot_path)
                embedding = self.generate_embedding(seq, prot_name)

                ## Save np file
                np.save(save_path, embedding)
            except Exception as err:
                ## Report the real cause (missing fasta, out of memory, ...)
                print("No file available for: ",  prot_name, prot_path, "-", repr(err))
        return

    def load_embeddings(self, protein, seq=None):
        """!  Load Embeddings for a given protein
              @param[in]  :   protein - name of protein to load
              @param[in]  :   seq - optional protein sequence, used to generate and cache
                              the embedding when no readable file is available

              @result     :   np array of embedding
              @exception  :   the error raised by np.load is re-raised when the cached
                              file cannot be read and no seq was supplied
        """
        prot_path = os.path.join(self.input_path, protein + "_esm.npy")
        try:
            return np.load(prot_path)
        except (OSError, ValueError, EOFError):
            ## Missing or unreadable cache file: regenerate only if a sequence is known
            if seq is None:
                raise
            print("No file available for: ",  protein, prot_path)

        result = self.generate_embedding(seq, protein)

        ## Save np file
        np.save(prot_path, result)
        return result



In [None]:
import gc
print(FASTA_DIR)
torch.cuda.empty_cache()
gc.collect()

/content/drive/MyDrive/11785 - Deep Learning/IDL_Project/Project/SPOT-1D-LM/spot_1d_lm/fasta


1542

In [None]:
test = TEST_FILES["TEST-2018"]
print("testfile path: ", test)
esm_test = PreTrainedESM1b(ESM_CHECKPOINTS_DIR, ESM1B_INPUT_DIR)


testfile path:  /content/drive/MyDrive/11785 - Deep Learning/IDL_Project/Project/SPOT-1D-LM/spot_1d_lm/lists/test2018.txt


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_1.pt" to /root/.cache/torch/hub/checkpoints/esm1v_t33_650M_UR90S_1.pt


In [None]:
##  @brief  :   Generate and cache the ESM embedding of every training protein
##              that does not have one on disk yet
##  @note   :   Needs read_list / read_fasta_file from the "Helper Functions &
##              Utilities" section to be defined before this cell is run
prot_list = read_list(TRAIN_FILE)

for prot_path in tqdm(prot_list):
  prot_name = prot_path.split('/')[-1].split('.')[0]
  save_path =  os.path.join(ESM1B_INPUT_DIR, prot_name + "_esm.npy")
  ## Skip proteins whose embedding is already cached
  if os.path.isfile(save_path):
    continue

  print(save_path)
  seq = read_fasta_file(prot_path)
  ## generate_embedding is a method of the model wrapper, not a free function
  embedding = esm_test.generate_embedding(seq, prot_name)

  ## Save np file
  np.save(save_path, embedding)

  0%|          | 0/38913 [00:00<?, ?it/s]

/content/drive/MyDrive/11785 - Deep Learning/IDL_Project/Project/SPOT-1D-LM/inputs/esm/3TJ3_2_C_esm.npy


## ProteinBERT
---

In [None]:
PB_EMBEDDING_DIM = 1562

In [None]:
class PreTrainedProteinBERT():
    """!  Pre Trained ProteinBERT Model Class

          Used to Access Pre-Trained Keras model and generate embeddings for datasets.
          One per-residue embedding array per protein is cached as "<protein>_pb.npy"
          inside input_path.
    """

    def __init__(self, model_path, input_path, max_seq_len=1100):
        """!  Pre Trained ProteinBERT Model Class initializer

              @param[in]  :   model_path = path to saved model checkpoints
              @param[in]  :   input_path = path to saved embeddings
              @param[in]  :   max_seq_len = maximum protein sequence length
        """
        self.input_path = input_path
        self.model_path = model_path
        self.max_seq_len = max_seq_len

        ## NOTE(review): not read anywhere inside this class; presumably the id of the
        ## padding token - confirm against the ProteinBERT tokenizer
        self.pad_value = 25

        ## Embedding dim
        self.embedding_dim = PB_EMBEDDING_DIM                     ## > NP Embeddings are size = (max_seq_len, embedding_dim)

        ## Load Pretrained model and input encoder
        self.pretrained_model_generator, self.input_encoder = load_pretrained_model()
        ## Load model to obtain local representations & global representations
        self.model = get_model_with_hidden_layers_as_outputs(self.pretrained_model_generator.create_model(self.max_seq_len))


    def generate_embedding(self, seq, prot_name):
        """!  Generate Embedding for a single protein sequence

              @param[in]  :   seq - protein sequence as a string
              @param[in]  :   prot_name - name of protein sequence (not used by the model)

              @result     :   save_arr - np sequence embedding with one row per residue
        """
        ## Get raw sequence length
        seq_len = len(seq)

        ## Replace Us with Xs to normalise encoding over models
        seq = seq.replace("U", "X")

        ## Encode Input sequence
        encoded_x = self.input_encoder.encode_X([seq], self.max_seq_len)

        ## Obtain local & global embeddings
        local_representations, global_representations = self.model.predict(encoded_x)
        ## Remove padding, end and start tokens
        save_arr = local_representations[0, 1:seq_len + 1, :]

        del local_representations, global_representations

        return save_arr

    def process_dataset(self, file_list):
        """!  Generate Embeddings for all sequences in a given Dataset

              Proteins that already have a cached embedding are skipped. A failure on
              one protein is reported together with its cause and does not stop the run.
              Uses the notebook helpers read_list and read_fasta_file.

              @param[in]  :   file_list - list of protein files for a dataset

              @result     :   None
        """
        ## Without the directory np.save fails for every protein
        os.makedirs(self.input_path, exist_ok=True)

        ## Extract Files from list and iteratively process
        prot_list = read_list(file_list)
        for prot_path in tqdm(prot_list):
            prot_name = prot_path.split('/')[-1].split('.')[0]
            save_path = os.path.join(self.input_path, prot_name + "_pb.npy")
            ## Nothing to do when the embedding is already cached
            if os.path.isfile(save_path):
                continue
            try:
                ## Extract Protein Sequence as a String & Process through Model
                seq = read_fasta_file(prot_path)
                embedding = self.generate_embedding(seq, prot_name)

                ## Save np file
                np.save(save_path, embedding)
            except Exception as err:
                ## Report the real cause (missing fasta, sequence too long, ...)
                print("No file available for: ",  prot_name, prot_path, "-", repr(err))
        return

    def load_embeddings(self, protein, seq=None):
        """!  Load Embeddings for a given protein
              @param[in]  :   protein - name of protein to load
              @param[in]  :   seq - optional protein sequence, used to generate and cache
                              the embedding when no readable file is available

              @result     :   np array of embedding
              @exception  :   the error raised by np.load is re-raised when the cached
                              file cannot be read and no seq was supplied
        """
        prot_path = os.path.join(self.input_path, protein + "_pb.npy")
        try:
            return np.load(prot_path)
        except (OSError, ValueError, EOFError):
            ## Missing or unreadable cache file: regenerate only if a sequence is known
            if seq is None:
                raise
            print("No file available for: ",  protein, prot_path)

        result = self.generate_embedding(seq, protein)

        ## Save np file
        np.save(prot_path, result)
        return result



In [None]:
protein = '3S9E_1_A'
pb_test = PreTrainedProteinBERT(PB_CHECKPOINTS_DIR, PB_INPUT_DIR)
#seq = read_fasta_file(prot_path)

 Local model dump file /root/proteinbert_models/default.pkl doesn't exist. Will download ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/epoch_92400_sample_23500000.pkl into /root/proteinbert_models. Please approve or reject this (to exit and potentially call the function again with different parameters).
Do you approve downloading the file into the specified directory? Please specify "Yes" or "No":Yes


URLError: ignored

In [None]:
##  @brief  :   Path of the cached ProteinBERT embedding for the protein selected above
##  @note   :   Uses `protein` from the previous cell; `prot_name` is not defined at this point
save_path =  os.path.join(PB_INPUT_DIR, protein + "_pb.npy")


NameError: ignored

In [None]:
##  @brief  :   Check that ProteinBERT and ESM embeddings can be loaded for every
##              protein of the validation set, reporting the cause of each failure
##  @note   :   Needs read_list / read_fasta_file from the "Helper Functions &
##              Utilities" section to be defined before this cell is run
prot_list = read_list(VAL_FILE)

for prot_path in tqdm(prot_list):
  prot_name = prot_path.split('/')[-1].split('.')[0]
  try:
    pb_embedding = pb_test.load_embeddings(prot_name)
  except Exception as err:
    print("No pb file available for: ",  prot_name, prot_path, "-", repr(err))
  try:
    ## The ESM loader takes the sequence as well, to regenerate a missing embedding
    esm_embedding = esm_test.load_embeddings(prot_name, read_fasta_file(prot_path))
  except Exception as err:
    print("No esm file available for: ",  prot_name, prot_path, "-", repr(err))

  0%|          | 0/99 [00:00<?, ?it/s]

No pb file available for:  4GQZ_1_A 4GQZ_1_A


100%|██████████| 99/99 [05:47<00:00,  3.51s/it]


4V6W_57_CY 4V6W_57_CY
  3%|▎         | 1212/38913 [05:51<5:28:59,  1.91it/s]No esm file available for:  3WU2_12_M 3WU2_12_M
  4%|▎         | 1409/38913 [07:50<11:17:13,  1.08s/it]No pb file available for:  3S9E_1_A 3S9E_1_A

## ProtTrans
---

In [None]:
PT_EMBEDDING_DIM = 1024

In [None]:
class PreTrainedProtTrans():
    """!  Pre Trained ProtTrans Model Class

          Used to Access Pre-Trained model and generate embeddings for datasets.
          One per-residue embedding array per protein is cached as "<protein>_pt.npy"
          inside input_path.
    """

    def __init__(self, model_path, input_path):
        """!  Pre Trained ProtTrans Model Class initializer

              @param[in]  :   model_path = path to saved model checkpoints
                              (stored only; the weights are fetched by from_pretrained)
              @param[in]  :   input_path = path to saved embeddings
        """
        self.model_path = model_path
        self.input_path = input_path

        self.embedding_dim = PT_EMBEDDING_DIM

        ## loads tokenizer and model for prot_t5_xl_uniref50
        self.tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)
        self.model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
        self.model = self.model.to(DEVICE)
        ## Inference only: disable dropout so that embeddings are deterministic
        self.model.eval()


    def generate_embedding(self, seq, prot_name):
        """!  Generate Embedding for a single protein sequence

              @param[in]  :   seq - protein sequence as a string
              @param[in]  :   prot_name - name of protein sequence (not used by the model)

              @result     :   save_arr - np sequence embedding
        """
        ## Separate residues with single spaces, without padding the ends of the string
        spaced_seq = " ".join(seq)
        ## Replace certain characters with 'X' for normalization
        spaced_seq = re.sub(r"[UZOB]", "X", spaced_seq)
        ## Tokenize the sequence (batch of one)
        ids = self.tokenizer.batch_encode_plus([spaced_seq], add_special_tokens=True, padding=True)

        ## convert the tokenized sequences to tensors and move to device
        input_ids = torch.tensor(ids['input_ids']).to(DEVICE)
        attention_mask = torch.tensor(ids['attention_mask']).to(DEVICE)
        ## Generate embeddings with no gradient calculations (for efficiency)
        with torch.no_grad():
            output = self.model(input_ids=input_ids, attention_mask=attention_mask)

        ## .cpu() is a no-op for tensors that already live on the CPU
        embedding = output.last_hidden_state.cpu().numpy()

        ## number of attended (non-padding) positions, as a plain int for slicing
        seq_len = int((attention_mask[0] == 1).sum())
        ## drop the last attended position (presumably the end-of-sequence token
        ## appended by the tokenizer - TODO confirm)
        save_arr = embedding[0][:seq_len - 1]

        del input_ids, attention_mask
        return save_arr


    def process_dataset(self, file_list):
        """!  Generate Embeddings for all sequences in a given Dataset

              Proteins that already have a cached embedding are skipped. A failure on
              one protein is reported together with its cause and does not stop the run.
              Uses the notebook helpers read_list and read_fasta_file.

              @param[in]  :   file_list - list of protein files for a dataset

              @result     :   None
        """
        ## Set model to evaluation mode to disable dropout for deterministic results
        self.model.eval()

        ## Without the directory np.save fails for every protein
        os.makedirs(self.input_path, exist_ok=True)

        ## Extract Files from list and iteratively process
        prot_list = read_list(file_list)
        for prot_path in tqdm(prot_list):
            prot_name = prot_path.split('/')[-1].split('.')[0]
            save_path = os.path.join(self.input_path, prot_name + "_pt.npy")
            ## Nothing to do when the embedding is already cached
            if os.path.isfile(save_path):
                continue
            ## Extract Protein Sequence as a String & Process through Model
            try:
                seq = read_fasta_file(prot_path)
                embedding = self.generate_embedding(seq, prot_name)

                ## Save np file
                np.save(save_path, embedding)
            except Exception as err:
                ## Report the real cause (missing fasta, out of memory, ...)
                print("No file available for: ",  prot_name, prot_path, "-", repr(err))
        return

    def load_embeddings(self, protein, seq=None):
        """!  Load Embeddings for a given protein
              @param[in]  :   protein - name of protein to load
              @param[in]  :   seq - optional protein sequence, used to generate and cache
                              the embedding when no readable file is available

              @result     :   np array of embedding
              @exception  :   the error raised by np.load is re-raised when the cached
                              file cannot be read and no seq was supplied
        """
        prot_path = os.path.join(self.input_path, protein + "_pt.npy")
        try:
            return np.load(prot_path)
        except (OSError, ValueError, EOFError):
            ## Missing or unreadable cache file: regenerate only if a sequence is known
            if seq is None:
                raise
            print("No file available for: ",  protein, prot_path)

        result = self.generate_embedding(seq, protein)

        ## Save np file
        np.save(prot_path, result)
        return result


In [None]:
pt_test = PreTrainedProtTrans(ESM_CHECKPOINTS_DIR, PT_INPUT_DIR)

NameError: ignored

In [None]:
pt_test.process_dataset(TRAIN_FILE)

# Datasets
---
---

## Helper Functions & Utilities
---

### **File I/O Functions for Datasets**

In [None]:
def read_list(file_name):
    """!  Load a text file as a list holding one entry per line

          @param[in]  :   file_name - complete path to a file (string)
          @return     :   lines of the file, without their line endings
    """
    with open(file_name, 'r') as handle:
        return handle.read().splitlines()


def read_fasta_file(fname):
    """! Read the protein sequence stored in a fasta file of FASTA_DIR

         The first line of the file (the header) is dropped and all remaining
         lines are joined into a single string.

         @param[in]  :   fname - fasta file name without the ".fasta" extension,
                         resolved relative to FASTA_DIR (string)
         @return     :   protein sequence (string)
    """
    fasta_path = os.path.join(FASTA_DIR, fname + ".fasta")
    with open(fasta_path, 'r') as handle:
        file_lines = handle.read().splitlines()
    sequence_lines = file_lines[1:]
    return ''.join(sequence_lines)

In [None]:

def dataset_csv(file_name_list, df_file_name):
    """!  For a given dataset format all proteins into a csv containing sequence, sequence length & protein name

          Proteins whose fasta file cannot be read are reported and left out.

          @param[in]  :   file_name_list - complete path to a file (string)
          @param[in]  :   df_file_name - complete path to save output csv (string)

          @return     :   dataframe containing sequence, sequence length & protein name
    """
    protein_file_list = read_list(file_name_list)

    ## Collect one record per protein; the frame is built once at the end because
    ## appending to a DataFrame row by row copies it on every insert
    records = []
    for prot_path in tqdm(protein_file_list):
        ## extracts the protein name from the protein path
        protein = prot_path.split('/')[-1].split('.')[0]
        try:
            seq = read_fasta_file(prot_path)
        except OSError:
            ## only a missing / unreadable fasta file is expected here
            print("no file for ", protein)
            continue

        records.append({'Protein': protein, 'Sequence': seq, 'Length': len(seq)})

    ## Explicit columns keep the layout stable when no protein could be read
    df = pd.DataFrame(records, columns=['Protein', 'Sequence', 'Length'])

    ## save as csv
    df.to_csv(df_file_name, sep='\t')

    return df



In [None]:
#df_val = dataset_csv(TRAIN_FILE, TRAIN_DF)

In [None]:
#pan = pd.read_csv(TRAIN_DF, sep='\t')


In [None]:
#print(pan['Protein'].to_numpy())

### **File I/O Functions for Label files**

In [None]:
def get_dssp_info(dssp_file_name):
    """!  Read the label rows of a DSSP file into a dataframe

          Rows 2 to 6 of the file are used, in this order: amino acid codes, SS3
          states, phi angles, psi angles and ASA values. The first two are split per
          character, the remaining three on whitespace.

          @param[in]  :   dssp_file_name - includes file path and extension
          @return     :   dssp - dssp as pd dataframe with the columns
                          'AA CODE', 'SS3', 'PHI', 'PSI' and 'ASA'
    """
    with open(dssp_file_name, 'r') as handle:
        rows = handle.readlines()

    # One dataframe column per label row of the file
    columns = {
        'AA CODE': list(rows[1].strip()),
        'SS3': list(rows[2].strip()),
        'PHI': rows[3].strip().split(),
        'PSI': rows[4].strip().split(),
        'ASA': rows[5].strip().split(),
    }
    dssp = pd.DataFrame(columns)

    # Numeric labels: entries that are not numbers (e.g. 'X') become NaN
    for numeric_column in ('PHI', 'PSI', 'ASA'):
        dssp[numeric_column] = pd.to_numeric(dssp[numeric_column], errors='coerce')
    return dssp
# end def

In [None]:
def get_theta_info(theta_file_name):
    """!  Read a theta file into a dataframe

          The file is read as single-space separated values without a header row.

          @param[in]  :   theta_file_name - includes file path and extension
          @return     :   theta - theta as pd dataframe with the columns
                          'RES NUM', 'AA CODE', 'THETA', 'TAU' and 'OMEGA'
    """
    theta = pd.read_csv(
        theta_file_name,
        sep=' ',
        names=['RES NUM', 'AA CODE', 'THETA', 'TAU', 'OMEGA'],
    )

    # Angle columns: entries that are not numbers (e.g. 'X') become NaN
    for angle in ('THETA', 'TAU', 'OMEGA'):
        theta[angle] = pd.to_numeric(theta[angle], errors='coerce')

    return theta
# end def

In [None]:
def get_hse_info(hse_file_name, CASP=False):
    """!  Read an hse file into a dataframe

          The file is read as whitespace separated values without a header row.

          @param[in]  :   hse_file_name - includes file path and extension
          @param[in]  :   CASP - not used by this function
          @return     :   hse - hse as pd dataframe with the columns 'AA NAME',
                          'CHAIN ID', 'RES NUM', 'AA CODE', 'HSE TOTAL', 'HSE UP'
                          and 'HSE DOWN'
    """
    hse = pd.read_csv(
        hse_file_name,
        sep=r'\s+',
        names=['AA NAME', 'CHAIN ID', 'RES NUM', 'AA CODE',
               'HSE TOTAL', 'HSE UP', 'HSE DOWN'],
    )

    # Entries that are not numbers (e.g. 'X') become NaN in these columns.
    # 'CHAIN ID' is included, so a non-numeric chain label ends up as NaN too.
    for numeric_column in ('CHAIN ID', 'RES NUM', 'HSE TOTAL', 'HSE UP', 'HSE DOWN'):
        hse[numeric_column] = pd.to_numeric(hse[numeric_column], errors='coerce')

    return hse
# end def

## Input Data Types
---

### **Protein Sequence 1-hot Encoding**

In [None]:
def one_hot(seq):
    """!  Converts a sequence to one hot encoding

          Each residue becomes one row. A residue found in BASES (compared in upper
          case) gets a 1 in its column and 0 elsewhere; any other character gets a
          row filled with -1. An empty sequence yields an array of shape [0,20].

          @param[in]  :   seq - amino acid sequence (string)
          @return     :   one hot encoding of the amino acid (int array)[L,20]
    """
    BASES = 'ARNDCQEGHILKMFPSTWYV'
    column_of = {base: column for column, base in enumerate(BASES)}

    ## Allocated up front so that an empty sequence gives a [0,20] array
    ## instead of failing on the concatenation of an empty list
    feat = np.zeros((len(seq), len(BASES)), dtype=int)
    for row, residue in enumerate(seq):
        column = column_of.get(str(residue).upper())
        if column is None:
            ## unknown residue: whole row marked with -1
            feat[row] = -1
        else:
            feat[row, column] = 1
    return feat

In [None]:
def generate_sequence_embeddings(file_list):
    """!  Generate amino acid one hot embeddings for a given dataset

          One "<protein>_1hot.npy" file is written to ONEHOT_INPUT_DIR for every
          protein of the list that does not have one yet.

          @param[in]  :   file_list - list of protein files for a dataset
          @return     :   None
    """
    ## Without the directory np.save fails for every protein
    os.makedirs(ONEHOT_INPUT_DIR, exist_ok=True)

    ## Extract Files from list and iteratively process
    prot_list = read_list(file_list)

    for prot_path in tqdm(prot_list):
        prot_name = prot_path.split('/')[-1].split('.')[0]
        save_path = os.path.join(ONEHOT_INPUT_DIR, prot_name + "_1hot.npy")
        ## Nothing to do when the embedding is already cached
        if os.path.isfile(save_path):
            continue

        ## Extract Protein Sequence as a String
        seq = read_fasta_file(prot_path)

        ## Generate 1 hot embedding & save np file
        np.save(save_path, one_hot(seq))
    return

In [None]:
#generate_sequence_embeddings(VAL_FILE)

In [None]:
#np.load('/content/drive/MyDrive/11785 - Deep Learning/IDL_Project/Project/SPOT-1D-LM/inputs/one_hot/3TJ3_2_C_1hot.npy')

In [None]:
##  @brief  : Check for missing data
##            NOTE(review): unfinished cell - only the list path is assigned, the
##            loop over it is still commented out
prot_list = TRAIN_FILE
#for prot_path in tqdm(prot_list):


## Output Data Types
---

### **3 State Secondary Structure (SS3) - Classification**

Variable Definitions

In [None]:
##  @brief  : Class Definitions for SS3 Assignment
##            -> C = Coil Properties
##            -> E = Beta Strand Properties
##            -> H = Alpha Helix Properties
##            The position in this list is the class index used for the SS3 labels
SS3_CLASSES = ['C', 'E', 'H']

##  @brief  : Class Definitions for SS8 Assignment
##            Coil Properties
##               -> C = Irregular Coil / Other
##               -> S = High Curvature Loop
##               -> T = Beta-Turn
##            Alpha Helix Properties
##               -> H = Alpha Helix
##               -> G = 310 Helix
##               -> I = Pi Helix
##            Beta Strand Properties
##               -> E = Beta Strand
##               -> B = Beta Bridge
SS8_CLASSES = ['C', 'S', 'T', 'H', 'G', 'I', 'E', 'B']

##  @brief  : Dictionary Mapping SS8 Classes to SS3 Classes
##            -> Only SS3 is used, as it is a simplification of the SS8 information
##            -> Simplifying the features reduces the complexity and memory of the model
##            -> 'X' has no SS3 class and maps to itself
ss_conv_3_8_dict = {'X': 'X', 'C': 'C', 'S': 'C', 'T': 'C', 'H': 'H', 'G': 'H', 'I': 'H', 'E': 'E', 'B': 'E'}

### **Accessible Surface Area (ASA) - Regression**

Variable Definitions

In [None]:
##  @brief  : Standard ASA values for each amino acid (one letter code -> value)
##            -> Read by unnormalize_asa as the per residue multiplier
##            -> '-' and 'X' map to 1, so values for those residues are left unscaled
##            -> normalize_asa and get_unnorm_asa_new carry their own copies of this table
ASA_std = {"A": 115, "C": 135, "D": 150, "E": 190, "F": 210, "G": 75, "H": 195,
               "I": 175, "K": 200, "L": 170, "M": 185, "N": 160, "P": 145, "Q": 180,
               "R": 225, "S": 115, "T": 140, "V": 155, "W": 255, "Y": 230, "-": 1, "X": 1}


Helper Functions

In [None]:
def normalize_asa(asa_values, amino_acid_seq):
    """!  Scale ASA values by the standard ASA of the residue at the same position

          @param[in]  :   asa_values - asa values, one per residue
          @param[in]  :   amino_acid_seq - corresponding protein sequence (one letter codes)

          @return     :   normalized_asa - list of asa / standard asa; residues that are not
                          in the table are divided by 1
    """
    max_asa_by_residue = {
        "A": 115, "C": 135, "D": 150, "E": 190, "F": 210, "G": 75, "H": 195,
        "I": 175, "K": 200, "L": 170, "M": 185, "N": 160, "P": 145, "Q": 180,
        "R": 225, "S": 115, "T": 140, "V": 155, "W": 255, "Y": 230, "-": 1, "X": 1,
    }

    ## zip stops at the shorter of the two inputs
    return [value / max_asa_by_residue.get(residue, 1)
            for value, residue in zip(asa_values, amino_acid_seq)]


def unnormalize_asa(normalized_asa, amino_acid_seq):
    """!  Scale normalized ASA values back up by the standard ASA of each residue

          @param[in]  :   normalized_asa - normalized asa values, one per residue
          @param[in]  :   amino_acid_seq - corresponding protein sequence (one letter codes)

          @return     :   unnormalized_asa - list of normalized asa * standard asa (read from
                          the module level ASA_std table); residues that are not in the
                          table are multiplied by 1
    """
    ## zip stops at the shorter of the two inputs
    return [value * ASA_std.get(residue, 1)
            for value, residue in zip(normalized_asa, amino_acid_seq)]


def get_unnorm_asa_new(rel_asa, seq):
    """!  Calculate absolute ASA from relative ASA for a batch of proteins
          Each relative value is multiplied by the standard ASA of the residue at the
          same position of the corresponding sequence

          @param[in]  :   rel_asa - relative ASA, indexed as rel_asa[i, :] per protein
                          (torch tensor or numpy array, one padded row per protein)
          @param[in]  :   seq - sequences of the proteins (strings), in the same order as the rows

          @return     :   numpy array of absolute ASA, one row per protein

          @note       :   Sequences shorter than a row are padded with 'X' (standard ASA 1) up
                          to the row length, so the batch does not have to be sorted by length.
                          Residue letters that are not in the table also use 1, matching
                          normalize_asa / unnormalize_asa. A sequence longer than its row
                          still fails in the multiplication.
    """

    ## standard amino acid one letter codes plus gap '-' and unknown 'X'
    rnam1_std = "ACDEFGHIKLMNPQRSTVWY-X"

    ## standard ASA for each entry of rnam1_std
    ASA_std = (115, 135, 150, 190, 210, 75, 195, 175, 200, 170,
               185, 160, 145, 180, 225, 115, 140, 155, 255, 230, 1, 1)
    dict_rnam1_ASA = dict(zip(rnam1_std, ASA_std))

    array_list = [] # stores absolute ASA per protein
    for row, single_seq in enumerate(seq):
        ## relative ASA of the current protein as a numpy array
        rel_asa_current = rel_asa[row, :]
        if hasattr(rel_asa_current, "detach"):
            rel_asa_current = rel_asa_current.cpu().detach().numpy()
        else:
            rel_asa_current = np.asarray(rel_asa_current)

        ## pad with X up to the width of the prediction row (not the first sequence,
        ## which is only the longest one when the batch happens to be sorted)
        padded_len = rel_asa_current.shape[0]
        single_seq = single_seq + ("X" * (padded_len - len(single_seq)))

        ## standard ASA for every position of the padded sequence
        asa_max = np.array([dict_rnam1_ASA.get(aa, 1) for aa in single_seq], dtype=np.float32)

        array_list.append(np.multiply(rel_asa_current, asa_max))

    final_array = np.array(array_list)
    return final_array
# end def

### **Half Sphere Exposure (HSE) - Regression**

Variable Definitions

In [None]:
##  @brief  : Scale constants for the HSE labels
##            -> HSE values are divided by these to normalize and multiplied to un-normalize
max_hseu = 50                           ## > Maximum HSE-U Value
max_hsed = 65                           ## > Maximum HSE-D Value

Helper Functions

In [None]:
##  @brief  : HSE-u
def normalize_hseu(hseu_values):
    """!  Scale HSE-U values down by the maximum HSE-U value (max_hseu)

          @param[in]  :   hseu_values - HSE-U values for a protein.

          @return     :   normalized_hseu - hseu_values / max_hseu
    """
    normalized_hseu = hseu_values / max_hseu
    return normalized_hseu

def unnormalize_hseu(normalized_hseu_values):
    """!  Scale normalized HSE-U values back up by the maximum HSE-U value (max_hseu)

          @param[in]  :   normalized_hseu_values - Normalized HSE-U values.

          @return     :   hseu_ - normalized_hseu_values * max_hseu
    """
    hseu_ = normalized_hseu_values * max_hseu
    return hseu_

In [None]:
##  @brief  : HSE-d
def normalize_hsed(hsed_values):
    """!  Scale HSE-D values down by the maximum HSE-D value (max_hsed)

          @param[in]  :   hsed_values - HSE-D values for a protein.

          @return     :   normalized_hsed - hsed_values / max_hsed
    """
    normalized_hsed = hsed_values / max_hsed
    return normalized_hsed

def unnormalize_hsed(normalized_hsed_values):
    """!  Unnormalize HSE-D values by multiplying by the maximum HSE-D value (max_hsed).

          @param[in]  :   normalized_hsed_values - Normalized HSE-D values.

          @return     :   hsed_ - normalized_hsed_values * max_hsed

          @note       :   Named unnormalize_hsed so that it no longer replaces the HSE-U
                          function unnormalize_hseu defined in the previous cell.
    """
    return normalized_hsed_values * max_hsed

### **Protein Backbone Dihedral Angles - Regression**

Helper Functions

In [None]:
def normalize_circular_angles(angles):
    """!  Normalize angle measurements by converting to sine and cosine components,
          rescaled from [-1,1] to [0,1], and placing them side by side.

          @param[in]  :    angles (list or numpy array): Angles in degrees. Object arrays
                           (e.g. a column sliced from a mixed type label array) are
                           converted to float first.

          @return     :    concatenated_angles - numpy array [L,2]: column 0 is (sin + 1)/2,
                           column 1 is (cos + 1)/2.
    """
    angles = np.asarray(angles)
    if angles.dtype == object:
        # np.radians has no loop for object arrays, so cast the values to float
        angles = angles.astype(np.float64)

    angles_rad = np.radians(angles)  # Convert angles from degrees to radians

    # sine / cosine components, scaled from [-1,1] to [0,1]
    angles_sin = (np.sin(angles_rad) + 1)/2
    angles_cos = (np.cos(angles_rad) + 1)/2

    concatenated_angles = np.column_stack((angles_sin, angles_cos))
    return concatenated_angles
# end def

def unnormalize_circular_angles(normalized_angles):
    """!  Recover angles in degrees from their rescaled sine and cosine components.

          @param[in]  :    normalized_angles (numpy array): A 2D array, column 0 holds
                           (sin + 1)/2 and column 1 holds (cos + 1)/2.

          @return     :    original_angles_deg (numpy array): The angles in degrees, wrapped
                           with a modulo of 360.
    """
    # Scale both components from [0,1] back to [-1,1]
    sin_part = normalized_angles[:, 0] * 2 - 1
    cos_part = normalized_angles[:, 1] * 2 - 1

    # atan2 gives radians; convert to degrees and wrap with mod 360
    degrees = np.rad2deg(np.arctan2(sin_part, cos_part))
    return np.remainder(degrees, 360)
# end def

### **Generate / Format Label Data**

In [None]:
def convert_data_to_numpy(file_list):
    """!  Format all Label Data of each protein into a single numpy array & save

          For every protein in the list the DSSP, HSE and Theta label files are read,
          joined on their index and saved in LABEL_DIR as <protein>.npy. A protein that
          already has a saved array is reported and skipped; a protein that misses any
          of the three label files is skipped silently.

          @param[in]  :    file_list - list of protein files for a dataset

          @return     :    None - the arrays are written to disk
    """
    ## Column layout of the saved array
    column_order = ['AA NAME', 'CHAIN ID', 'RES NUM', 'AA CODE', 'SS3', 'ASA', 'HSE TOTAL', 'HSE UP', 'HSE DOWN', 'PHI', 'PSI', 'THETA', 'TAU', 'OMEGA']

    def _join_on_index(left, right):
        ## Inner join on the index; a column present on both sides keeps the left copy
        joined = pd.merge(left, right,
                          how='inner',
                          suffixes=('', '_remove'),
                          left_index=True,
                          right_index=True)
        joined.drop([name for name in joined.columns if 'remove' in name], axis=1, inplace=True)
        return joined

    for prot_path in tqdm(read_list(file_list)):
        prot_name = prot_path.split('/')[-1].split('.')[0]
        save_path = os.path.join(LABEL_DIR, prot_name + '.npy')
        if os.path.exists(save_path):
            print(f'{prot_name} already converted')
            continue

        ## Get DSSP for protein
        dssp_path = os.path.join(DSSP_DIR, prot_name + '.dssp')
        if not os.path.exists(dssp_path):
            continue
        dssp_data = get_dssp_info(dssp_path)

        ## Get HSE for protein
        hse_path = os.path.join(HSE_DIR, prot_name + '.h')
        if not os.path.exists(hse_path):
            continue
        hse_data = get_hse_info(hse_path)

        ## Get Theta for protein
        ## NOTE(review): the theta file is parsed with get_hse_info - confirm this is
        ## intended and not a copy of the HSE step above
        theta_path = os.path.join(THETA_DIR, prot_name + '.t')
        if not os.path.exists(theta_path):
            continue
        theta_data = get_hse_info(theta_path)

        ## Merge Dataframes
        protein_data = _join_on_index(_join_on_index(hse_data, dssp_data), theta_data)

        ## Check for missing data: absent columns are added and filled with NaN
        if len(column_order) != len(protein_data.columns):
            for column in column_order:
                if column not in protein_data.columns:
                    protein_data[column] = np.nan

        ## Reorder columns
        protein_data = protein_data[column_order]

        np.save(save_path, protein_data.to_numpy())
        print(f'{prot_name} file generated')
    return


## Dataset Class Definitions
---

### **Classification Inference Dataset**

In [None]:
class Proteins_Dataset_Class(Dataset):
    """!  Protein Dataset Classification Class

          Dataset of Protein Sequences with SS3 class labels

          Protein names, sequences and lengths are read from a tab separated file;
          features are built per item from the one hot encoding and two embedding models
    """
    def __init__(self, file_name_list, file_df, pre_trained_model1, pre_trained_model2):
        """!  Protein Dataset Classification class initializer

              @param[in]  :   file_name_list - List of paths to protein sequence fasta files
              @param[in]  :   file_df - Tab separated file with 'Protein', 'Sequence' and 'Length' columns
              @param[in]  :   pre_trained_model1 - Object to generate / process embeddings from pre-trained model 1
              @param[in]  :   pre_trained_model2 - Object to generate / process embeddings from pre-trained model 2
        """
        self.protein_file_list = read_list(file_name_list)       ## > list of file paths to fasta per protein
        self.pre_trained_model1 = pre_trained_model1
        self.pre_trained_model2 = pre_trained_model2
        self.features = []
        self.labels = []

        ## Store File name lists & Sequences
        self.df = pd.read_csv(file_df, sep='\t')
        self.protein_list = self.df['Protein'].to_numpy()
        self.sequence_list = self.df['Sequence'].to_numpy()
        self.protein_len_list = self.df['Length'].to_numpy()

    def __len__(self):
        """!  Determine number of Data points in dataset

              @return   :   length = length of dataset
        """
        return len(self.protein_list)

    def __getitem__(self, idx):
        """!  Return protein in dataset corresponding to given index

              @return   :   features - float tensor, per residue concatenation of:
                              * 1-hot encoding of protein sequence
                              * Embedding from pre-trained model 1
                              * Embedding from pre-trained model 2
              @return   :   labels - long tensor of SS3 Class Indices (position in SS3_CLASSES);
                              -1 for a residue whose label is not an SS3 class, and -1 for
                              every residue when the label file cannot be read
              @return   :   protein_len - length of protein sequence
              @return   :   protein - protein name
              @return   :   seq - protein sequence string
        """
        ## Fetch protein, sequence & sequence length at given index
        protein = self.protein_list[idx]
        sequence = self.sequence_list[idx]
        protein_len = self.protein_len_list[idx]

        ## Load & process label data for the protein
        try:
            ## allow_pickle is needed for the mixed type label arrays; pickle can run
            ## code on load, so only read label files generated by this project
            labels = np.load(os.path.join(LABEL_DIR, protein + ".npy"), allow_pickle=True)
            ss3_indices = np.array(
                [SS3_CLASSES.index(ss) if ss in SS3_CLASSES else -1 for ss in labels[:, 4]],
                dtype=np.int64)
        except (OSError, ValueError, IndexError):
            ## Missing / unreadable label file: keep the item usable by marking every
            ## residue with -1, the same value collate_fn uses to pad labels
            print("no label", protein)
            ss3_indices = np.full(len(sequence), -1, dtype=np.int64)

        ## One hot encoding and embeddings from both pre-trained models
        one_hot_enc = one_hot(sequence)
        embedding1 = self.pre_trained_model1.generate_embedding(sequence, protein)
        embedding2 = self.pre_trained_model2.generate_embedding(sequence, protein)

        ## Concatenate features along the feature axis
        features = np.concatenate((one_hot_enc, embedding1, embedding2), axis=1)

        return torch.FloatTensor(features), torch.LongTensor(ss3_indices), protein_len, protein, sequence


    def collate_fn(self, batch):
        """!  Collate function: sort a batch by protein length and pad it

              @param[in]  :   batch - list of items as returned by __getitem__

              @return   :   padded_features - feature tensors stacked and padded with -1
              @return   :   padded_labels - label tensors stacked and padded with -1
              @return   :   protein_len - tensor of protein sequence lengths
              @return   :   protein - list of protein names
              @return   :   seq - list of protein sequence strings
        """
        # sort data by protein length in descending order (input list is left untouched)
        ordered = sorted(batch, key=lambda item: item[2], reverse=True)

        batch_features = [item[0] for item in ordered]
        batch_labels = [item[1] for item in ordered]
        protein_lengths = [item[2] for item in ordered]
        protein_names = [item[3] for item in ordered]
        sequences = [item[4] for item in ordered]

        # Pad feature and label tensors to the longest protein of the batch;
        # -1 is used for labels as an invalid class index
        padded_features = nn.utils.rnn.pad_sequence(batch_features, batch_first=True, padding_value=-1)
        padded_labels = nn.utils.rnn.pad_sequence(batch_labels, batch_first=True, padding_value=-1)

        return padded_features, padded_labels, torch.tensor(protein_lengths), protein_names, sequences

### **Regression Inference Dataset**

In [None]:
class Protein_Dataset_Reg(Dataset):
    """!  Protein Dataset Regression Class

          Dataset of Protein Sequences with normalised regression labels

          Loads information from File List
    """
    def __init__(self, file_name_list):
        """!  Protein Dataset Regression class initializer

              @param[in]  :   file_name_list - List of paths to protein sequence fasta files
        """
        self.protein_list = read_list(file_name_list)        ## > list of file paths to fasta per protein

    def __len__(self):
        """!  Determine number of Data points in dataset

              @return   :   length = length of dataset
        """
        return len(self.protein_list)

    def __getitem__(self, idx):
        """!  Return protein in dataset corresponding to given index

              @return   :   features - float tensor, per residue concatenation of:
                              * 1-hot encoding of protein sequence
                              * Embedding loaded from inputs/<protein>_esm.npy
                              * Embedding loaded from inputs/<protein>_pt.npy
              @return   :   labels - float tensor [L,11] of normalised values:
                              * column 0     : ASA
                              * column 1     : HSE U
                              * column 2     : HSE D
                              * columns 3-4  : phi   (sin, cos components)
                              * columns 5-6  : psi   (sin, cos components)
                              * columns 7-8  : theta (sin, cos components)
                              * columns 9-10 : tau   (sin, cos components)
              @return   :   protein_len - length of protein sequence
              @return   :   protein - protein name
              @return   :   seq - protein sequence string
        """
        ## file path for the protein at index idx
        prot_path = self.protein_list[idx]
        ## protein name = file name without directory or extension
        protein = prot_path.split('/')[-1].split('.')[0]

        ## read the protein sequence and one-hot encode it
        seq = read_fasta_file(prot_path)
        one_hot_enc = one_hot(seq)

        ## load the two pre-computed embeddings from the numpy files
        embedding1 = np.load(os.path.join("inputs/", protein + "_esm.npy"))
        embedding2 = np.load(os.path.join("inputs/", protein + "_pt.npy"))

        ## load label data for the protein
        ## allow_pickle is needed for the mixed type label arrays; pickle can run
        ## code on load, so only read label files generated by this project
        labels = np.load(os.path.join("spot_1d_lm/labels", protein + ".npy"), allow_pickle=True)

        def numeric_column(col):
            ## label arrays hold strings and numbers, so numeric columns are cast to float
            return labels[:, col].astype(np.float64)

        ## normalize specific properties
        norm_labels = np.empty((labels.shape[0], 11))
        norm_labels[:, 0] = normalize_asa(numeric_column(5), labels[:, 3])   # ASA, scaled per residue
        norm_labels[:, 1] = normalize_hseu(numeric_column(7))                # HSE U
        norm_labels[:, 2] = normalize_hsed(numeric_column(8))                # HSE D (uses the HSE-D scale)

        ## dihedral angles as (sin, cos) pairs
        norm_labels[:, 3:5] = normalize_circular_angles(numeric_column(9))   # phi
        norm_labels[:, 5:7] = normalize_circular_angles(numeric_column(10))  # psi
        norm_labels[:, 7:9] = normalize_circular_angles(numeric_column(11))  # theta
        norm_labels[:, 9:]  = normalize_circular_angles(numeric_column(12))  # tau

        ## concatenate the one-hot encoded sequence with the two embeddings
        features = np.concatenate((one_hot_enc, embedding1, embedding2), axis=1)
        protein_len = len(seq)

        return torch.FloatTensor(features), torch.FloatTensor(norm_labels), protein_len, protein, seq


def text_collate_fn(data):
    """!  Collate function: sort a batch by protein length and pad it

          @param[in]  :   data - list of (features, labels, protein_len, protein, seq) items

          @return   :   padded_features - feature tensors stacked and padded with 0s
          @return   :   padded_labels - label tensors stacked and padded with 0s
          @return   :   protein_len - tuple of protein sequence lengths
          @return   :   protein - tuple of protein names
          @return   :   seq - tuple of protein sequence strings
    """

    ## sort data by protein length (item index 2) in descending order;
    ## index 1 is the label tensor and cannot be used as a sort key
    data.sort(key=lambda x: x[2], reverse=True)
    ## unpack the sorted data into one tuple per field
    features, labels, protein_len, protein, seq = zip(*data)

    ## Pad feature and label tensors to the longest protein of the batch
    padded_features = nn.utils.rnn.pad_sequence(features, batch_first=True, padding_value=0)
    padded_labels = nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=0)

    return padded_features, padded_labels, protein_len, protein, seq



### **Test Inference Dataset**

In [None]:
class Proteins_Dataset_Test(Dataset):
    """!  Protein Dataset Test Class

          Dataset of Protein Sequences without labels

          Loads information from File List
    """
    def __init__(self, file_name_list, pre_trained_model1=None, pre_trained_model2=None):
        """!  Protein Dataset Test class initializer

              @param[in]  :   file_name_list - List of paths to protein sequence fasta files
              @param[in]  :   pre_trained_model1 - Object providing load_embeddings(protein) for pre-trained model 1
              @param[in]  :   pre_trained_model2 - Object providing load_embeddings(protein) for pre-trained model 2

              @note       :   Both models are optional here so existing one argument calls
                              keep working, but they must be set before items are read
        """
        self.protein_list = read_list(file_name_list)       ## > list of file paths to fasta per protein
        self.pre_trained_model1 = pre_trained_model1
        self.pre_trained_model2 = pre_trained_model2

    def __len__(self):
        """!  Determine number of Data points in dataset

              @return   :   length = length of dataset
        """
        return len(self.protein_list)

    def __getitem__(self, idx):
        """!  Return protein in dataset corresponding to given index

              @return   :   features - float tensor, per residue concatenation of:
                              * 1-hot encoding of protein sequence
                              * Embedding from pre-trained model 1
                              * Embedding from pre-trained model 2
              @return   :   protein_len - length of protein sequence
              @return   :   protein - protein name
              @return   :   seq - protein sequence string

              @throws   :   ValueError if a pre-trained model is not set or returns no embedding
        """
        ## file path for the protein at index idx
        prot_path = self.protein_list[idx]
        ## protein name = file name without directory or extension
        protein = prot_path.split('/')[-1].split('.')[0]

        ## Determine Protein Sequence & length
        seq = read_fasta_file(prot_path)
        protein_len = len(seq)

        ## Load the one hot encoding if it was pregenerated, else generate and save it
        onehot_path =  os.path.join(ONEHOT_INPUT_DIR, protein + "_1hot.npy")
        if os.path.isfile(onehot_path):
            one_hot_enc = np.load(onehot_path)
        else:
            one_hot_enc = one_hot(seq)
            np.save(onehot_path, one_hot_enc)

        ## Load Embeddings from pre-trained models
        if self.pre_trained_model1 is None or self.pre_trained_model2 is None:
            raise ValueError("Proteins_Dataset_Test needs pre_trained_model1 and pre_trained_model2 to load embeddings")
        embedding1 = self.pre_trained_model1.load_embeddings(protein)
        embedding2 = self.pre_trained_model2.load_embeddings(protein)

        ## Verify all inputs exist before they are concatenated
        for label, array in (("embedding 1", embedding1), ("embedding 2", embedding2), ("one hot encoding", one_hot_enc)):
            if array is None:
                raise ValueError(f"{label} missing for protein {protein}")

        ## Concatenate Feature Data
        features = np.concatenate((one_hot_enc, embedding1, embedding2), axis=1)

        return torch.FloatTensor(features), protein_len, protein, seq

    @staticmethod
    def text_collate_fn(batch):
        """!  Collate function: sort a batch by protein length and pad it

              @param[in]  :   batch - list of (features, protein_len, protein, seq) items

              @return   :   padded_features - feature tensors stacked and padded with 0s
              @return   :   protein_len - tensor of protein sequence lengths
              @return   :   protein - list of protein names
              @return   :   seq - list of protein sequence strings
        """

        # sort data by protein length in descending order
        batch.sort(key=lambda x: x[1], reverse=True)

        batch_features, protein_lengths = [], []
        protein_names, sequences = [], []

        for features, protein_len, protein, seq in batch:
            batch_features.append(features)
            protein_lengths.append(protein_len)
            protein_names.append(protein)
            sequences.append(seq)
        # end for

        # Pad feature tensors to the longest protein of the batch
        padded_features = nn.utils.rnn.pad_sequence(batch_features, batch_first=True, padding_value=0)

        return padded_features, torch.tensor(protein_lengths), protein_names, sequences
    # end def

    ## Same function under the name the dataloader builders look up
    collate_fn = text_collate_fn
# end class

## Dataset Initialisation
---

In [None]:
def remove_missing_proteins(FILE_NAME):
  """!  Remove proteins with missing fasta or labels from dataset list

        @param[in]  :   FILE_NAME - file holding the list of protein paths for a dataset

        @return     :   new_list - the paths, in their original order, of the proteins that
                        have both FASTA_DIR/<protein>.fasta and LABEL_DIR/<protein>.npy

        @note   :   All Proteins Missing Fasta are missing labels
        @note   :   The list file itself is not modified; write new_list back to it
                    separately if the filtered list should be kept
  """
  new_list = []
  p_list = read_list(FILE_NAME)
  for prot_path in tqdm(p_list):
    protein = prot_path.split('/')[-1].split('.')[0]
    ## Fasta file of the protein
    fname = os.path.join(FASTA_DIR, protein+".fasta")
    ## Converted label array of the protein
    dsname = os.path.join(LABEL_DIR, protein + ".npy")
    ## Keep the protein only when neither file is missing
    if os.path.isfile(dsname) and os.path.isfile(fname):
      new_list.append(prot_path)

  return new_list

  #new_train = remove_missing_proteins(TRAIN_FILE)
  #print(new_train)


In [None]:
## NOTE(review): the result is named new_val but is computed from TRAIN_FILE -
## confirm which list is intended
new_val= remove_missing_proteins(TRAIN_FILE)
print(new_val)

In [None]:
  QM,

In [None]:
##  @brief  : Build the train / validation classification datasets
##            -> esm_test and pb_test are passed as pre-trained model 1 and 2; they are
##               not defined in this part of the notebook
train_data = Proteins_Dataset_Class(TRAIN_FILE, TRAIN_DF, esm_test, pb_test)
val_data = Proteins_Dataset_Class(VAL_FILE, VAL_DF,esm_test, pb_test)
#test_data = Proteins_Dataset_Test(test_file_list)

In [None]:
##  @brief  : Fetch a single validation item (index 2) to inspect it by hand
features, ss3_indices, protein_len, protein, sequence = val_data.__getitem__(2)

1I4Y_1_A (114, 3144)


In [None]:
##  @brief  : Split the feature matrix of the fetched item back into its parts
print(np.shape(features))
## first 20 columns: one hot encoding
enc = features[:, :20]
## next ESM_EMBEDDING_DIM columns: embedding from pre-trained model 1
emb1 = features[:, 20:20+ESM_EMBEDDING_DIM]
## remaining columns: embedding from pre-trained model 2
emb2 = features[:, 20+ESM_EMBEDDING_DIM:]
print(sequence)
print(enc)
print(ss3_indices)



In [None]:
## NOTE(review): build_train_dataset / build_val_dataset are defined further down in the
## notebook, so this cell only runs after those definition cells have been executed
train_loader = build_train_dataset(config, train_data)
val_loader = build_val_dataset(config, val_data)
#test_loader = build_test_dataset(config, test_data)

Batch size           :  10
Train batches        :  3892
Val batches          :  10


In [None]:
##  @brief  : Pull the first training batch and print its padded labels
print("\nChecking the shapes of the data...")
for batch in train_loader:
    x, y, plen_test, protein_name_test, sequence_test = batch
    #print(np.shape(x), np.shape(y), np.shape(len))
    print(y)
    ## only the first batch is inspected
    break


Checking the shapes of the data...
4Q3M_1_A (274, 2862)
4V17_1_A (159, 2862)
1Z5X_d1z5xe- (237, 2862)
3S8I_1_A (148, 2862)
1C4O_d1c4oa1 (408, 2862)
1MVF_d1mvfe- (44, 2862)
3GZA_1_A (443, 2862)
1IN0_d1in0b1 (88, 2862)
2CP9_1_A (64, 2862)
1MU2_d1mu2a2 (427, 2862)
tensor([[-1, -1, -1,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., -1, -1, -1],
        [ 0,  0,  0,  ..., -1, -1, -1],
        ...,
        [ 0,  0,  0,  ..., -1, -1, -1],
        [ 0,  0,  0,  ..., -1, -1, -1],
        [ 0,  0,  0,  ..., -1, -1, -1]])


In [None]:
##  @brief  : Load the saved label array of one protein (4FQX_2_D) for inspection
## NOTE(review): absolute Google Drive path - only valid with this Drive mounted
label = np.load('/content/drive/MyDrive/11785 - Deep Learning/IDL_Project/Project/SPOT-1D-LM/spot_1d_lm/labels/4FQX_2_D.npy', allow_pickle=True)


In [None]:
## NOTE(review): `len` is not assigned in the visible cells, so this prints the Python
## builtin - presumably the batch lengths (plen_test) were intended; confirm
print(np.shape(x), np.shape(y), len)

In [None]:
##  @brief  : Compare the SS3 indices of the fetched item with the loaded label array
print(label[:,4], label[:,5])
print(ss3_indices)
ss3 = ss3_indices
vidx = np.where(ss3 != -1)[0] # valid indices
ss3 = ss3[vidx]
## rebuild the sequence from column 3 of the label array at the valid positions
sequence = ''.join(label[vidx, 3])
## NOTE(review): `seq` is not assigned in the cells visible here - confirm where it comes from
print(seq)
print(sequence)

## Test/Check Datasets
---

# Dataloaders
---
---

## Build Dataloader Helpers
---

### **Train Dataloader**

In [None]:
def build_train_dataset(config, train_data):
  """!  Build Dataloader for Training Dataset
        @param[in]  :   config = parameter configuration dictionary ('batch_size' is read)
        @param[in]  :   train_data = training Dataset, providing its own collate_fn

        @return     :   shuffling DataLoader over train_data
  """
  ## Collect the dataloader options, then build the loader
  loader_options = dict(
    dataset     = train_data,
    collate_fn  = train_data.collate_fn,
    num_workers = 0,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = True,
  )
  train_loader = torch.utils.data.DataLoader(**loader_options)

  ## Print Summary of Training Data
  print("Batch size           : ", config['batch_size'])
  print("Train batches        : ", len(train_loader))

  return train_loader

### **Validation Dataloader**

In [None]:
def build_val_dataset(config, val_data):
  """!  Build Dataloader for Validation Dataset
        @param[in]  :   config = parameter configuration dictionary ('batch_size' is read)
        @param[in]  :   val_data = validation Dataset, providing its own collate_fn

        @return     :   DataLoader over val_data in dataset order (no shuffling)
  """
  ## Collect the dataloader options, then build the loader
  loader_options = dict(
    dataset     = val_data,
    collate_fn  = val_data.collate_fn,
    num_workers = 0,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False,
  )
  val_loader = torch.utils.data.DataLoader(**loader_options)

  ## Print Summary of Validation Data
  print("Val batches          : ", len(val_loader))

  return val_loader

### **Test Dataloader**

In [None]:
def build_test_dataset(config, test_data):
  """!  Build Dataloader for Test Dataset
        @param[in]  :   config = parameter configuration dictionary ('batch_size' is read)
        @param[in]  :   test_data = Test Dataset; its collate_fn is used when it has one,
                        otherwise the text_collate_fn defined on its class

        @return     :   DataLoader over test_data in dataset order (no shuffling)
  """
  ## Resolve the collate function. text_collate_fn is taken from the class, not the
  ## instance, so a version declared without `self` is not handed the dataset as batch
  collate = getattr(test_data, 'collate_fn', None)
  if collate is None:
    collate = type(test_data).text_collate_fn

  ## Define dataloader object
  test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    collate_fn  = collate,
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
  )
  ## Print Summary of Test Data
  print("Test batches         : ", len(test_loader))

  return test_loader

## Test/Check Dataloaders
---

In [None]:
## NOTE(review): the saved output of this cell is a NameError - a name used here was
## undefined when it was last run; config and train_data must exist before this cell
train_loader = build_train_dataset(config, train_data)
#val_loader = build_val_dataset(config, val_data)
#test_loader = build_test_dataset(config, test_data)

NameError: ignored

# Model Architecture Definitions
---
---

## Utilities and Helper Functions
---

### **Permute Block**

In [None]:
class PermuteBlock(torch.nn.Module):
    """!  Permute Block Class

          Parameter-free layer that exchanges dimensions 1 and 2 of its input, so it
          can sit between layers that expect different axis orderings
    """
    def forward(self, x):
        """!  Permute Block Forward Pass

              @param[in]    :   x - input tensor (needs at least 3 dimensions)
              @return       :   x with dimensions 1 and 2 swapped
        """
        swapped = torch.transpose(x, 1, 2)
        return swapped

## ResNet & Conv Blocks
---

### **Custom Conv1d Layers**

In [None]:
class conv3x3(torch.nn.Module):
    """!  "3x3" Conv1d Layer Class

          Bias-free 1D convolution with padding. Despite the class name, the kernel
          spans 7 positions with 3 positions of padding per side, so the sequence
          length is preserved at stride 1
    """
    def __init__(self, input_channels, output_channels, stride=1):
        """!  "3x3" Conv1d Layer Class Initializer

              @param[in]  :   input_channels - number of input channels
              @param[in]  :   output_channels - number of output channels
              @param[in]  :   stride - stride of the convolution
        """
        super().__init__()
        self.conv = torch.nn.Conv1d(
            in_channels=input_channels,
            out_channels=output_channels,
            kernel_size=7,
            stride=stride,
            padding=3,
            bias=False,
        )

    def forward(self,x):
        """!  "3x3" Conv1d Layer Forward Pass

              @param[in]    :   x - input tensor
              @return       :   convolved output tensor
        """
        convolved = self.conv(x)
        return convolved

In [None]:
class conv5x5(torch.nn.Module):
    """!  "5x5" Conv1d Layer Class

          Bias-free 1D convolution with padding. Despite the class name, the kernel
          spans 9 positions with 4 positions of padding per side, so the sequence
          length is preserved at stride 1
    """
    def __init__(self, input_channels, output_channels, stride=1):
        """!  "5x5" Conv1d Layer Class Initializer

              @param[in]  :   input_channels - number of input channels
              @param[in]  :   output_channels - number of output channels
              @param[in]  :   stride - stride of the convolution
        """
        super().__init__()
        self.conv = torch.nn.Conv1d(
            in_channels=input_channels,
            out_channels=output_channels,
            kernel_size=9,
            stride=stride,
            padding=4,
            bias=False,
        )

    def forward(self,x):
        """!  "5x5" Conv1d Layer Forward Pass

              @param[in]    :   x - input tensor
              @return       :   convolved output tensor
        """
        convolved = self.conv(x)
        return convolved

In [None]:
class conv7x7(torch.nn.Module):
    """!  "7x7" Conv1d Layer Class

          Bias-free 1D convolution with padding. Despite the class name, the kernel
          spans 15 positions with 7 positions of padding per side, so the sequence
          length is preserved at stride 1
    """
    def __init__(self, input_channels, output_channels, stride=1):
        """!  "7x7" Conv1d Layer Class Initializer

              @param[in]  :   input_channels - number of input channels
              @param[in]  :   output_channels - number of output channels
              @param[in]  :   stride - stride of the convolution
        """
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels=input_channels,
            out_channels=output_channels,
            kernel_size=15,
            stride=stride,
            padding=7,
            bias=False,
        )

    def forward(self,x):
        """!  "7x7" Conv1d Layer Forward Pass

              @param[in]    :   x - input tensor
              @return       :   convolved output tensor
        """
        convolved = self.conv(x)
        return convolved

### **Custom Residual Blocks**

In [None]:
class BasicBlock3x3(nn.Module):
    r"""!  Basic 3x3 Residual Block Class

          Input -> Conv3x3 -> BN -> ReLU -> Dropout -> Conv3x3 -> BN -> (+) -> ReLU -> Output
               \________________ [optional Downsample] ________________/ (Residual Connection)

          (Raw docstring: the diagram contains a backslash, which is an invalid
           escape sequence in a normal string literal.)
    """
    ## Output channels = planes * expansion (read by the network's layer builders)
    expansion = 1

    def __init__(self, input_channels, output_channels, stride=1, downsample=None):
        """!  Basic 3x3 Residual Block Initializer

              @param[in]  :   input_channels - number of input channels
              @param[in]  :   output_channels - number of output channels
              @param[in]  :   stride - stride of the first convolution
              @param[in]  :   downsample - optional callable/module applied to the block input
                                           to produce the shortcut; None means identity shortcut
        """
        super(BasicBlock3x3, self).__init__()
        self.conv1 = conv3x3(input_channels, output_channels, stride)
        self.bn1 = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU(inplace=True)
        self.drop1 = nn.Dropout(p=0.5)
        self.conv2 = conv3x3(output_channels, output_channels)
        self.bn2 = nn.BatchNorm1d(output_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """!  Basic 3x3 Residual Block Forward Pass

              @param[in]    :   x - input tensor
              @return       :   out - output tensor
        """
        ## Shortcut branch: identity unless a downsample transform was supplied
        residual = x if self.downsample is None else self.downsample(x)

        ## Main branch: conv -> BN -> ReLU -> dropout -> conv -> BN
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.drop1(out)

        out = self.conv2(out)
        out = self.bn2(out)

        ## Merge the branches, then apply the final activation
        out += residual
        out = self.relu(out)

        return out

In [None]:
class BasicBlock5x5(nn.Module):
    r"""!  Basic 5x5 Residual Block Class

          Input -> Conv5x5 -> BN -> ReLU -> Dropout -> Conv5x5 -> BN -> (+) -> ReLU -> Output
               \________________ [optional Downsample] ________________/ (Residual Connection)

          (Raw docstring: the diagram contains a backslash, which is an invalid
           escape sequence in a normal string literal.)
    """
    ## Output channels = planes * expansion (read by the network's layer builders)
    expansion = 1

    def __init__(self, input_channels, output_channels, stride=1, downsample=None):
        """!  Basic 5x5 Residual Block Initializer

              @param[in]  :   input_channels - number of input channels
              @param[in]  :   output_channels - number of output channels
              @param[in]  :   stride - stride of the first convolution
              @param[in]  :   downsample - optional callable/module applied to the block input
                                           to produce the shortcut; None means identity shortcut
        """
        super(BasicBlock5x5, self).__init__()
        self.conv1 = conv5x5(input_channels, output_channels, stride)
        self.bn1 = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU(inplace=True)
        self.drop1 = nn.Dropout(p=0.5)

        self.conv2 = conv5x5(output_channels, output_channels)
        self.bn2 = nn.BatchNorm1d(output_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """!  Basic 5x5 Residual Block Forward Pass

              @param[in]    :   x - input tensor
              @return       :   out1 - output tensor
        """
        ## Shortcut branch: identity unless a downsample transform was supplied
        residual = x if self.downsample is None else self.downsample(x)

        ## Main branch: conv -> BN -> ReLU -> dropout -> conv -> BN
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.drop1(out)

        out = self.conv2(out)
        out = self.bn2(out)

        ## Merge the branches, then apply the final activation
        out1 = residual + out
        out1 = self.relu(out1)

        return out1

In [None]:
class BasicBlock7x7(nn.Module):
    r"""!  Basic 7x7 Residual Block Class

          Input -> Conv7x7 -> BN -> ReLU -> Dropout -> Conv7x7 -> BN -> (+) -> ReLU -> Output
               \________________ [optional Downsample] ________________/ (Residual Connection)

          (Raw docstring: the diagram contains a backslash, which is an invalid
           escape sequence in a normal string literal.)
    """
    ## Output channels = planes * expansion (read by the network's layer builders)
    expansion = 1

    def __init__(self, input_channels, output_channels, stride=1, downsample=None):
        """!  Basic 7x7 Residual Block Initializer

              @param[in]  :   input_channels - number of input channels
              @param[in]  :   output_channels - number of output channels
              @param[in]  :   stride - stride of the first convolution
              @param[in]  :   downsample - optional callable/module applied to the block input
                                           to produce the shortcut; None means identity shortcut
        """
        super(BasicBlock7x7, self).__init__()
        self.conv1 = conv7x7(input_channels, output_channels, stride)
        self.bn1 = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU(inplace=True)
        self.drop1 = nn.Dropout(p=0.5)

        self.conv2 = conv7x7(output_channels, output_channels)
        self.bn2 = nn.BatchNorm1d(output_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """!  Basic 7x7 Residual Block Forward Pass

              @param[in]    :   x - input tensor
              @return       :   out1 - output tensor
        """
        ## Shortcut branch: identity unless a downsample transform was supplied
        residual = x if self.downsample is None else self.downsample(x)

        ## Main branch: conv -> BN -> ReLU -> dropout -> conv -> BN
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.drop1(out)

        out = self.conv2(out)
        out = self.bn2(out)

        ## Merge the branches, then apply the final activation
        out1 = residual + out
        out1 = self.relu(out1)

        return out1

## LSTMs & RNNs
---

## SPOT-1D-Single Architectures
---

### **Model 1&4  Architecture - 2 Layer BiLSTM**

In [None]:
class TwoLayerBLSTM(nn.Module):
    """!  Two Layer BiLSTM Class

          SPOT-1D-Single Model 1 Architecture (for Classification)
          SPOT-1D-Single Model 4 Architecture (for Regression)

          A 2-layer bidirectional LSTM followed by three linear layers
          (2*hidden_dim -> 1000 -> 1000 -> output_size), applied per sequence position
    """
    def __init__(self, input_size=2862, hidden_dim=1024, output_size=3, dropout=0.5, classification=True,
                 ih_weight_init_fn=None, hh_weight_init_fn=None, linear_weight_init_fn=None,
                 linear_activation_fn=torch.nn.ReLU()):
        """!  Two Layer BiLSTM Class Initializer

              @param[in]  :   input_size - input dimension
              @param[in]  :   hidden_dim - size of lstm hidden layer
              @param[in]  :   output_size - output dimension / number of classes
              @param[in]  :   dropout - dropout probability (between the LSTM layers and after the LSTM)
              @param[in]  :   classification - boolean var determining if classification or Regression
              @param[in]  :   ih_weight_init_fn - function for initialising input-hidden weights
              @param[in]  :   hh_weight_init_fn - function for initialising hidden-hidden weights
              @param[in]  :   linear_weight_init_fn - function for initialising linear layer weights
              @param[in]  :   linear_activation_fn - activation function for linear layer
                                                     (the default instance is created once and shared)
        """
        super(TwoLayerBLSTM, self).__init__()
        self.linear_weight_init_fn = linear_weight_init_fn
        self.ih_weight_init_fn = ih_weight_init_fn
        self.hh_weight_init_fn = hh_weight_init_fn
        self.linear_activation_fn = linear_activation_fn

        self.classification = classification

        self.hidden_dim = hidden_dim

        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=self.hidden_dim, num_layers=2, batch_first=True,
                             bidirectional=True, dropout=dropout)
        self.drop1 = nn.Dropout(p=dropout)

        ## The LSTM is bidirectional, so its output feature size is 2 * hidden_dim
        self.linear1 = nn.Linear(in_features=2*self.hidden_dim, out_features=1000)
        self.linear2 = nn.Linear(in_features=1000, out_features=1000)
        self.linear3 = nn.Linear(in_features=1000, out_features=output_size)

        ## If Regression Model Add Sigmoid Layer
        if not classification:
            self.sigmoid = torch.nn.Sigmoid()

        ## Initialise Weights
        self.init_weights()

    def init_weights(self):
        """!  Initialise Weights for 2 Layer BiLSTM

              The LSTM is only re-initialised when BOTH the input-hidden and the
              hidden-hidden init functions were supplied; its biases are then zeroed
        """
        ## Initialise LSTM
        if self.ih_weight_init_fn is not None and self.hh_weight_init_fn is not None:
            for name, param in self.lstm1.named_parameters():
                if 'weight_ih' in name:
                    self.ih_weight_init_fn(param.data)
                elif 'weight_hh' in name:
                    self.hh_weight_init_fn(param.data)
                elif 'bias' in name:
                    param.data.fill_(0)

        ## Initialise Linear Layers
        ## NOTE(review): the callback receives the linear1 *module* (not its weight tensor),
        ## and linear2 / linear3 keep PyTorch's default initialisation - confirm this is intended
        if self.linear_weight_init_fn is not None:
            self.linear_weight_init_fn(self.linear1)


    def forward(self, x, seq_lens):
        """!  Two Layer BiLSTM Forward Pass

              @param[in]    :   x - padded input tensor, batch first: (batch, padded length, input_size)
              @param[in]    :   seq_lens - true length of every sequence in the batch, in any order
                                           (PyTorch requires a list or a CPU tensor here)
              @return       :   x - padded output tensor (batch, padded length, output_size)
        """
        ## Remember the padded length so the output lines up with the input / labels
        padded_length = x.size(1)

        ## enforce_sorted=False: the batch does not have to be ordered by decreasing length
        packed = nn.utils.rnn.pack_padded_sequence(x, seq_lens, batch_first=True, enforce_sorted=False)
        packed, _ = self.lstm1(packed)
        ## total_length: pad back to the input's length, not just the longest sequence in the batch
        x, _ = nn.utils.rnn.pad_packed_sequence(packed, batch_first=True, padding_value=0,
                                                total_length=padded_length)
        x = self.drop1(x)

        x = self.linear_activation_fn(self.linear1(x))
        x = self.linear_activation_fn(self.linear2(x))
        x = self.linear3(x)

        ## If Regression Model Pass Through Sigmoid Layer
        if not self.classification:
            x = self.sigmoid(x)

        return x

### **Model 2&5 Architecture - MSResNet**

In [None]:
class MsResNet(nn.Module):
    """! MS ResNet Class

         SPOT-1D-Single Model 2 Architecture (for Classification)
         SPOT-1D-Single Model 5 Architecture (for Regression)

         A stem convolution feeds three parallel residual branches (BasicBlock3x3 /
         BasicBlock5x5 / BasicBlock7x7); the branch outputs are summed and projected
         per sequence position by a linear layer
    """
    def __init__(self, input_channel=2862, layers=[5, 5, 5, 1], num_classes=3, classification=True,
                 weight_init_fn=None):
        """!  MSResNet Class Initializer

              @param[in]  :   input_channel - input dimension
              @param[in]  :   layers - number of residual blocks per stage (only the first three
                                       entries are read; the list is never mutated)
              @param[in]  :   num_classes - output dimension / number of classes
              @param[in]  :   classification - boolean var determining if classification or Regression
              @param[in]  :   weight_init_fn - weight initialization function for Conv Layers & Linear
                                               Layer (stored only; no init_weights() is implemented)
        """
        ## Register the Module first so attributes and sub-modules can be assigned safely
        super(MsResNet, self).__init__()

        ## Running channel count of each branch, updated by the layer builders
        self.inplanes3 = 64
        self.inplanes5 = 64
        self.inplanes7 = 64

        self.classification = classification

        self.weight_init_fn = weight_init_fn

        ## Stem: length-preserving convolution down to 64 channels
        self.conv1 = nn.Conv1d(input_channel, 64, kernel_size=7, stride=1, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU(inplace=True)
        ## NOTE: the pooling layers below are not used by forward(); kept so the
        ## module's attributes stay unchanged
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

        self.layer3x3_1 = self._make_layer3(BasicBlock3x3, 64, layers[0], stride=1)
        self.layer3x3_2 = self._make_layer3(BasicBlock3x3, 128, layers[1], stride=1)
        self.layer3x3_3 = self._make_layer3(BasicBlock3x3, 256, layers[2], stride=1)
        self.maxpool3 = nn.AvgPool1d(kernel_size=16, stride=1, padding=0)

        self.layer5x5_1 = self._make_layer5(BasicBlock5x5, 64, layers[0], stride=1)
        self.layer5x5_2 = self._make_layer5(BasicBlock5x5, 128, layers[1], stride=1)
        self.layer5x5_3 = self._make_layer5(BasicBlock5x5, 256, layers[2], stride=1)
        self.maxpool5 = nn.AvgPool1d(kernel_size=11, stride=1, padding=0)

        self.layer7x7_1 = self._make_layer7(BasicBlock7x7, 64, layers[0], stride=1)
        self.layer7x7_2 = self._make_layer7(BasicBlock7x7, 128, layers[1], stride=1)
        self.layer7x7_3 = self._make_layer7(BasicBlock7x7, 256, layers[2], stride=1)
        self.maxpool7 = nn.AvgPool1d(kernel_size=6, stride=1, padding=0)

        self.fc = nn.Linear(256, num_classes)

        ## If Regression Model Add Sigmoid Layer
        if not classification:
            self.sigmoid = torch.nn.Sigmoid()


    def _make_layer(self, inplanes_attr, block, planes, blocks, stride):
        """!  Shared builder behind _make_layer3 / _make_layer5 / _make_layer7

              @param[in]  :   inplanes_attr - name of the attribute tracking the branch's channel count
              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - stride of the first block
              @return     :   nn.Sequential holding the stacked blocks
        """
        in_planes = getattr(self, inplanes_attr)
        out_planes = planes * block.expansion

        ## A 1x1 projection is needed on the shortcut whenever the shape changes
        downsample = None
        if stride != 1 or in_planes != out_planes:
            downsample = nn.Sequential(
                nn.Conv1d(in_planes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(out_planes),
            )

        ## Only the first block changes shape; the rest map out_planes -> planes
        stacked = [block(in_planes, planes, stride, downsample)]
        setattr(self, inplanes_attr, out_planes)
        for _ in range(1, blocks):
            stacked.append(block(out_planes, planes))

        return nn.Sequential(*stacked)


    def _make_layer3(self, block, planes, blocks, stride=2):
        """!  Construct 3x3 Conv Block Layer with given properties

              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - block stride
        """
        return self._make_layer('inplanes3', block, planes, blocks, stride)


    def _make_layer5(self, block, planes, blocks, stride=2):
        """!  Construct 5x5 Conv Block Layer with given properties

              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - block stride
        """
        return self._make_layer('inplanes5', block, planes, blocks, stride)


    def _make_layer7(self, block, planes, blocks, stride=2):
        """!  Construct 7x7 Conv Block Layer with given properties

              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - block stride
        """
        return self._make_layer('inplanes7', block, planes, blocks, stride)

    def forward(self, x0, len):
        """!  MsResNet Forward Pass

              @param[in]    :   x0 - padded input tensor, (batch, length, input_channel)
              @param[in]    :   len - length of input tensor (accepted for a uniform model
                                      interface; not used by this network)
              @return       :   out1 - padded output tensor, (batch, length, num_classes)
        """
        ## Conv1d expects channels first: (batch, channels, length)
        x0 = x0.permute(0, 2, 1)
        x0 = self.conv1(x0)
        x0 = self.bn1(x0)
        x0 = self.relu(x0)

        x = self.layer3x3_1(x0)
        x = self.layer3x3_2(x)
        x = self.layer3x3_3(x)

        y = self.layer5x5_1(x0)
        y = self.layer5x5_2(y)
        y = self.layer5x5_3(y)

        z = self.layer7x7_1(x0)
        z = self.layer7x7_2(z)
        z = self.layer7x7_3(z)

        ## Fuse the three scales, then go back to (batch, length, channels) for the linear layer
        out = x + y + z
        out = out.permute(0, 2, 1)
        out1 = self.fc(out)

        ## If Regression Model Pass Through Sigmoid Layer
        ## (applied to the value that is returned, not to a branch tensor)
        if not self.classification:
            out1 = self.sigmoid(out1)

        return out1

### **Model 3&6 Architecture - MSResLSTM**

In [None]:
class MsResNetLSTM(nn.Module):
    """! MS ResNet LSTM Class

         SPOT-1D-Single Model 3 Architecture (for Classification)
         SPOT-1D-Single Model 6 Architecture (for Regression)

         A stem convolution feeds a 4-layer bidirectional LSTM branch plus two residual
         branches (BasicBlock5x5 / BasicBlock7x7); the three 256-channel branch outputs are
         summed and projected per sequence position by a linear layer
    """
    def __init__(self, input_channel=2862, layers=[5, 5, 5, 1], num_classes=3, dropout=0.5, classification=True,
                 ih_weight_init_fn=None, hh_weight_init_fn=None,):
        """!  MSResNetLSTM Class Initializer

              @param[in]  :   input_channel - input dimension
              @param[in]  :   layers - number of residual blocks per stage (only the first three
                                       entries are read; the list is never mutated)
              @param[in]  :   num_classes - output dimension / number of classes
              @param[in]  :   dropout - dropout probability between the LSTM layers
              @param[in]  :   classification - boolean var determining if classification or Regression
              @param[in]  :   ih_weight_init_fn - function for initialising input-hidden weights
              @param[in]  :   hh_weight_init_fn - function for initialising hidden-hidden weights
        """
        ## Register the Module first so attributes and sub-modules can be assigned safely
        super(MsResNetLSTM, self).__init__()

        ## Running channel count of each branch, updated by the layer builders
        self.inplanes3 = 64
        self.inplanes5 = 64
        self.inplanes7 = 64

        self.ih_weight_init_fn = ih_weight_init_fn
        self.hh_weight_init_fn = hh_weight_init_fn

        self.classification = classification

        ## Stem: length-preserving convolution down to 64 channels
        self.conv1 = nn.Conv1d(input_channel, 64, kernel_size=7, stride=1, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU(inplace=True)
        ## NOTE: the pooling layers below are not used by forward(); kept so the
        ## module's attributes stay unchanged
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

        ## Bidirectional with hidden_size=128 -> 256 output features, matching the conv branches
        self.lstm = nn.LSTM(input_size=64, hidden_size=128, num_layers=4, batch_first=True,
                             bidirectional=True, dropout=dropout)

        self.maxpool3 = nn.AvgPool1d(kernel_size=16, stride=1, padding=0)


        self.layer5x5_1 = self._make_layer5(BasicBlock5x5, 64, layers[0], stride=1)
        self.layer5x5_2 = self._make_layer5(BasicBlock5x5, 128, layers[1], stride=1)
        self.layer5x5_3 = self._make_layer5(BasicBlock5x5, 256, layers[2], stride=1)
        self.maxpool5 = nn.AvgPool1d(kernel_size=11, stride=1, padding=0)


        self.layer7x7_1 = self._make_layer7(BasicBlock7x7, 64, layers[0], stride=1)
        self.layer7x7_2 = self._make_layer7(BasicBlock7x7, 128, layers[1], stride=1)
        self.layer7x7_3 = self._make_layer7(BasicBlock7x7, 256, layers[2], stride=1)
        self.maxpool7 = nn.AvgPool1d(kernel_size=6, stride=1, padding=0)

        self.fc = nn.Linear(256, num_classes)

        ## If Regression Model Add Sigmoid Layer
        if not classification:
            self.sigmoid = torch.nn.Sigmoid()

        ## Initialise Weights
        self.init_weights()

    def init_weights(self):
        """!  Initialise Weights for the LSTM branch

              The LSTM is only re-initialised when BOTH init functions were supplied;
              its biases are then zeroed
        """
        ## Initialise LSTM (the attribute is `self.lstm`)
        if self.ih_weight_init_fn is not None and self.hh_weight_init_fn is not None:
            for name, param in self.lstm.named_parameters():
                if 'weight_ih' in name:
                    self.ih_weight_init_fn(param.data)
                elif 'weight_hh' in name:
                    self.hh_weight_init_fn(param.data)
                elif 'bias' in name:
                    param.data.fill_(0)


    def _make_layer(self, inplanes_attr, block, planes, blocks, stride):
        """!  Shared builder behind _make_layer3 / _make_layer5 / _make_layer7

              @param[in]  :   inplanes_attr - name of the attribute tracking the branch's channel count
              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - stride of the first block
              @return     :   nn.Sequential holding the stacked blocks
        """
        in_planes = getattr(self, inplanes_attr)
        out_planes = planes * block.expansion

        ## A 1x1 projection is needed on the shortcut whenever the shape changes
        downsample = None
        if stride != 1 or in_planes != out_planes:
            downsample = nn.Sequential(
                nn.Conv1d(in_planes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(out_planes),
            )

        ## Only the first block changes shape; the rest map out_planes -> planes
        stacked = [block(in_planes, planes, stride, downsample)]
        setattr(self, inplanes_attr, out_planes)
        for _ in range(1, blocks):
            stacked.append(block(out_planes, planes))

        return nn.Sequential(*stacked)


    def _make_layer3(self, block, planes, blocks, stride=2):
        """!  Construct 3x3 Conv Block Layer with given properties
              (not called by this class's initializer; kept for interface parity)

              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - block stride
        """
        return self._make_layer('inplanes3', block, planes, blocks, stride)


    def _make_layer5(self, block, planes, blocks, stride=2):
        """!  Construct 5x5 Conv Block Layer with given properties

              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - block stride
        """
        return self._make_layer('inplanes5', block, planes, blocks, stride)


    def _make_layer7(self, block, planes, blocks, stride=2):
        """!  Construct 7x7 Conv Block Layer with given properties

              @param[in]  :   block - conv block type
              @param[in]  :   planes - output dimension
              @param[in]  :   blocks - number of conv blocks in Layer
              @param[in]  :   stride - block stride
        """
        return self._make_layer('inplanes7', block, planes, blocks, stride)

    def forward(self, x0, seq_lens):
        """!  MsResNetLSTM Forward Pass
              @param[in]    :   x0 - padded input tensor, (batch, length, input_channel)
              @param[in]    :   seq_lens - true length of every sequence in the batch, in any order
                                           (PyTorch requires a list or a CPU tensor here)
              @return       :   out1 - padded output tensor, (batch, length, num_classes)
        """
        ## Conv1d expects channels first: (batch, channels, length)
        x0 = x0.permute(0,2,1)
        x0 = self.conv1(x0)
        x0 = self.bn1(x0)
        x0 = self.relu(x0)

        ## LSTM branch works batch-first on (batch, length, channels)
        x = x0.permute(0,2,1)
        padded_length = x.size(1)
        ## enforce_sorted=False: the batch does not have to be ordered by decreasing length
        x = nn.utils.rnn.pack_padded_sequence(x, seq_lens, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        ## total_length keeps the LSTM output as long as the conv branches so they can be summed
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=0,
                                                total_length=padded_length)
        x = x.permute(0,2,1)

        y = self.layer5x5_1(x0)
        y = self.layer5x5_2(y)
        y = self.layer5x5_3(y)

        z = self.layer7x7_1(x0)
        z = self.layer7x7_2(z)
        z = self.layer7x7_3(z)

        ## Fuse the three branches, then go back to (batch, length, channels) for the linear layer
        out = x+y+z
        out = out.permute(0,2,1)
        out1 = self.fc(out)

        ## If Regression Model Pass Through Sigmoid Layer
        ## (applied to the value that is returned, not to a branch tensor)
        if not self.classification:
            out1 = self.sigmoid(out1)

        return out1

### **Ensemble Network**

In [None]:
class EnsembleNetwork(nn.Module):
    """! SPOT-1D-Single Ensemble Network Class

         Averages the outputs of a TwoLayerBLSTM, an MsResNet and an MsResNetLSTM
         that all receive the same input
    """
    def __init__(self, input_size=2862, num_classes=3, hidden_dim=1000, sequence_length=1024,classification=True):
        """!  SPOT-1D-Single Ensemble Class Initializer

              @param[in]  :   input_size - input dimension
              @param[in]  :   num_classes - output dimension / number of classes (shared by all members)
              @param[in]  :   hidden_dim - dimension of lstm hidden layer (TwoLayerBLSTM member)
              @param[in]  :   sequence_length - sequence length (stored only; not used by forward)
              @param[in]  :   classification - boolean var determining if classification or Regression
        """
        super(EnsembleNetwork, self).__init__()
        self.sequence_length = sequence_length

        self.classification = classification

        ## All members must share the output size (their outputs are averaged element-wise)
        ## and the task type
        self.model1 = TwoLayerBLSTM(input_size=input_size, hidden_dim=hidden_dim,
                                    output_size=num_classes, classification=classification)
        self.model2 = MsResNet(input_channel=input_size, layers=[5, 5, 5, 1],
                               num_classes=num_classes, classification=classification)
        self.model3 = MsResNetLSTM(input_channel=input_size, layers=[5, 5, 5, 1],
                                   num_classes=num_classes, classification=classification)

    def forward(self, x, seq_lens):
        """!  SPOT-1D-Single Ensemble Forward Pass

              @param[in]    :   x - padded input tensor, (batch, length, input_size)
              @param[in]    :   seq_lens - length of every sequence in the batch
              @return       :   classification: log-probabilities (log-softmax over the last axis)
                                of the averaged member logits;
                                regression: plain average of the member outputs
        """
        ## Pass the same input through each member model
        output1 = self.model1(x, seq_lens)
        output2 = self.model2(x, seq_lens)
        output3 = self.model3(x, seq_lens)

        ## Average the member outputs
        averaged = (output1 + output2 + output3) / 3.0

        ## A softmax over regression targets would be meaningless, so it is only
        ## applied in classification mode
        if not self.classification:
            return averaged

        return F.log_softmax(averaged, dim=-1)

In [None]:
## Release any model left over from a previous run of this cell *before* collecting,
## so its memory (including cached GPU blocks) can actually be reclaimed
model = None
gc.collect()
torch.cuda.empty_cache()

## Build a fresh ensemble with default hyper-parameters on the target device
model = EnsembleNetwork().to(DEVICE)
#summary(model, [x.to(DEVICE), plen_test.to(DEVICE)])
CHECKPOINT_PATH = '/content/drive/MyDrive/test_ab1.pth'

In [None]:
## Smoke-test: one forward pass of the ensemble.
## NOTE(review): `x` and `plen_test` are not defined in this cell - they must come from
## an earlier (dataloader check) cell; confirm this survives Restart & Run All.
model(x.to(DEVICE), plen_test)

tensor([[[-1.2878, -2.2909, -0.4733],
         [-1.5664, -1.1067, -0.7753],
         [-1.0787, -1.0664, -1.1529],
         ...,
         [-1.3340, -1.5215, -0.6574],
         [-0.8472, -1.0706, -1.4758],
         [-1.9390, -0.5682, -1.2392]],

        [[-1.0164, -1.3802, -0.9504],
         [-1.6903, -1.2941, -0.6136],
         [-1.3803, -1.1311, -0.8538],
         ...,
         [-1.7576, -0.7307, -1.0614],
         [-1.1781, -0.6812, -1.6815],
         [-1.3797, -0.6130, -1.5768]],

        [[-0.9875, -1.5670, -0.8703],
         [-0.9853, -1.1946, -1.1274],
         [-0.8875, -1.5658, -0.9691],
         ...,
         [-2.1115, -1.8622, -0.3235],
         [-1.3138, -1.0003, -1.0122],
         [-1.1532, -0.8016, -1.4450]],

        ...,

        [[-0.8275, -1.3555, -1.1873],
         [-0.7908, -1.3746, -1.2255],
         [-0.7934, -1.4555, -1.1570],
         ...,
         [-1.5901, -2.3864, -0.3508],
         [-1.1355, -1.3528, -0.8670],
         [-0.7973, -1.4119, -1.1849]],

        [[

# Model Training
___
___

## Training Configuration
---

### **Optimizer**

In [None]:
def build_optimizer(model, config):
  """!   Build Optimizer for Model
          @param[in]  :   model = model to train
          @param[in]  :   config = parameter configuration dictionary; reads 'optimizer'
                          (one of "sgd", "adam", "nadam", "rms", "adagrad", "lbfgs"), 'lr',
                          'weight_decay' (all but "lbfgs") and 'momentum' ("sgd" only)

          @return     :   optimizer

          @throws     :   ValueError if config['optimizer'] is not a supported name
  """
  name = config['optimizer']
  params = model.parameters()

  if name == "sgd":
    optimizer = torch.optim.SGD(params,
                                lr=config['lr'], momentum=config['momentum'],
                                weight_decay=config['weight_decay'])
  elif name == "adam":
    ## NOTE: "adam" deliberately maps to AdamW (decoupled weight decay), as before
    optimizer = torch.optim.AdamW(params,
                                  lr=config['lr'], weight_decay=config['weight_decay'])
  elif name == "nadam":
    optimizer = torch.optim.NAdam(params,
                                  lr=config['lr'], weight_decay=config['weight_decay'])
  elif name == "rms":
    optimizer = torch.optim.RMSprop(params,
                                    lr=config['lr'], weight_decay=config['weight_decay'])
  elif name == "adagrad":
    optimizer = torch.optim.Adagrad(params,
                                    lr=config['lr'], lr_decay=0.01, weight_decay=config['weight_decay'])
  elif name == "lbfgs":
    ## LBFGS has no weight_decay argument, so only the learning rate is forwarded
    optimizer = torch.optim.LBFGS(params, lr=config['lr'])
  else:
    ## Fail loudly instead of hitting an unbound local at the return statement
    raise ValueError("Unsupported optimizer: {!r}".format(name))

  return optimizer

### **Criterion**

In [None]:
def build_criterion(model, config):
  """!   Build Criterion for Model
          @param[in]  :   model = model to train (not used; kept for a uniform builder signature)
          @param[in]  :   config = parameter configuration dictionary; reads 'criterion'
                          ("BCE" or "cross_entropy")

          @return     :   torch criterion

          @throws     :   ValueError if config['criterion'] is not a supported name
  """
  name = config['criterion']

  if name == "BCE":
    return torch.nn.BCELoss()
  if name == "cross_entropy":
    return torch.nn.CrossEntropyLoss(label_smoothing=0.1)

  ## Fail loudly instead of hitting an unbound local at the return statement
  raise ValueError("Unsupported criterion: {!r}".format(name))

In [None]:
## Loss used for training: cross entropy in which targets equal to -1 are ignored
## (ignore_index) and contribute neither to the loss nor to the gradient.
## NOTE(review): presumably -1 is the label given to padded positions - confirm against the Dataset.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
def compute_masked_loss():
  """!   Compute Masked Loss (placeholder)

          Not implemented yet: the function takes no arguments and does no work.
          The original notes list the output tensor and the SS3 class indices as
          the intended inputs, and a loss as the intended result.

          @return     :   None
  """
  return None

### **Scheduler**

In [None]:
## Learning-rate schedule: multiply the LR by 0.8 once the monitored value (mode='min',
## i.e. lower is better) has not improved for more than 3 consecutive scheduler steps.
## NOTE(review): `optimizer` is not created in this cell - it must already exist in the
## kernel (e.g. from build_optimizer). Also, the `verbose` argument is deprecated in newer
## PyTorch releases - confirm against the installed version.
scheduler   = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         mode='min',
                                                         factor=0.8,
                                                         patience=3,
                                                         verbose=True)

### **Scaler**

In [None]:
## Gradient scaler for mixed-precision (autocast) training.
## NOTE(review): newer PyTorch releases expose this as torch.amp.GradScaler("cuda") and
## deprecate the torch.cuda.amp spelling - confirm against the installed version.
scaler      = torch.cuda.amp.GradScaler()  # Initialize the gradient scaler for mixed-precision training

## Classification Train & Validate Functions
---

### **Train Epoch**

In [None]:
def train_epoch(model, train_loader, optimizer, criterion, scaler):
    """!   Run a single Training Epoch
          @param[in]  :   model = NN network to train; called as model(x, lengths)
          @param[in]  :   train_loader = dataloader for training dataset; every batch
                          unpacks to (x, y, lengths, protein_names, sequences)
          @param[in]  :   optimizer = training optimizer
          @param[in]  :   criterion = loss function, called as criterion(outputs, y)
          @param[in]  :   scaler = scaler for mixed precision learning

          @return     :   model = trained NN network
          @return     :   ep_loss = average per-batch loss for epoch
          @return     :   ep_acc = fraction (0..1) of scored targets predicted correctly

          Uses the notebook-level names DEVICE, tqdm and wandb.
          The learning-rate scheduler is NOT stepped here; the caller steps it
          once per epoch with the validation loss.
    """
    model.train() # Set Model in Training mode
    total_loss  = 0.0
    num_correct = 0   # targets predicted correctly so far
    num_scored  = 0   # targets that count towards the accuracy so far

    ## Targets equal to the criterion's ignore_index (when it has one) carry no
    ## label, so they are left out of the accuracy denominator as well
    ignore_index = getattr(criterion, 'ignore_index', None)

    batch_bar   = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    ## Loop through batches in dataset
    for i, batch in enumerate(train_loader):
        ## Initialise Gradient
        optimizer.zero_grad()

        ## Move Data to Device (Ideally GPU)
        x, y, lengths, protein_names, sequences = batch
        x, y = x.to(DEVICE), y.to(DEVICE)

        with torch.cuda.amp.autocast():
            ## Forward Propagation & loss calculation
            outputs = model(x, lengths)
            loss    = criterion(outputs, y)

        ## Update running loss & no. of correct predictions as we iterate
        batch_loss  = float(loss.item())
        total_loss += batch_loss

        hits = torch.argmax(outputs, axis=1) == y
        if ignore_index is None:
            num_correct += int(hits.sum())
            num_scored  += hits.numel()
        else:
            scored       = y != ignore_index
            num_correct += int((hits & scored).sum())
            num_scored  += int(scored.sum())

        ## Update Batch Bar
        batch_bar.set_postfix(loss="{:.04f}".format(total_loss / (i + 1)),
                              acc="{:.04f}%".format(100.0 * num_correct / max(num_scored, 1)),
                              lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
        batch_bar.update()

        ## Log in W&B (the numeric value, not the bound `loss.item` method)
        wandb.log({"batch loss": batch_loss})

        ## Backward pass with scaled gradients
        scaler.scale(loss).backward()
        scaler.step(optimizer)  # Update model parameters
        scaler.update()  # Update the scale for next iteration

        ### Release memory
        del x, y, lengths, protein_names, sequences
        torch.cuda.empty_cache()

    ## Calculate Average Loss & Accuracy for Epoch
    batch_bar.close()
    ep_loss = float(total_loss / max(len(train_loader), 1))
    ep_acc  = float(num_correct / max(num_scored, 1))

    return model, ep_loss, ep_acc

### **Validate Epoch**

In [None]:
def eval(model, val_loader):
    """!  Evaluate Model with Validation Data Set
          @param[in]  :   model = trained NN network to evaluate; called as model(x, lengths)
          @param[in]  :   val_loader = dataloader for validation dataset; every batch
                          unpacks to (x, y, lengths, protein_names, sequences)

          @return     :   ep_loss = average per-batch loss for epoch
          @return     :   ep_acc = fraction (0..1) of scored targets predicted correctly

          Uses the notebook-level names criterion, DEVICE and tqdm.
          NOTE: the name shadows Python's builtin eval(); it is kept because
          other cells call it under this name.
    """
    model.eval() # set model in evaluation mode
    total_loss  = 0.0
    num_correct = 0   # targets predicted correctly so far
    num_scored  = 0   # targets that count towards the accuracy so far

    ## Targets equal to the criterion's ignore_index (when it has one) carry no
    ## label, so they are left out of the accuracy denominator as well
    ignore_index = getattr(criterion, 'ignore_index', None)

    batch_bar   = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    ## Loop through batches in dataset
    for i, batch in enumerate(val_loader):

        ## Move Data to Device (Ideally GPU)
        x, y, lengths, protein_names, sequences = batch
        x, y = x.to(DEVICE), y.to(DEVICE)

        # No gradients are needed for evaluation, so there is no backward pass
        # and no optimizer step in this loop
        with torch.inference_mode():
            ### Forward Propagation & Loss Calculation
            outputs = model(x, lengths)
            loss    = criterion(outputs, y)

        total_loss += float(loss.item())

        hits = torch.argmax(outputs, axis=1) == y
        if ignore_index is None:
            num_correct += int(hits.sum())
            num_scored  += hits.numel()
        else:
            scored       = y != ignore_index
            num_correct += int((hits & scored).sum())
            num_scored  += int(scored.sum())

        batch_bar.set_postfix(loss="{:.04f}".format(total_loss / (i + 1)),
                              acc="{:.04f}%".format(100.0 * num_correct / max(num_scored, 1)))
        batch_bar.update()

        ### Release memory (only names that exist in this loop)
        del x, y, outputs, loss
        torch.cuda.empty_cache()

    batch_bar.close()
    vloss = float(total_loss / max(len(val_loader), 1))
    v_acc = float(num_correct / max(num_scored, 1))

    return vloss, v_acc

### **Training & Val Full**

In [None]:
def train(model, model_config, finish= True):
    """!  Train NN
          @param[in]  :   model = NN network to train
          @param[in]  :   model_config = dictionary of model params; 'epochs' is read
                          here and the dictionary is forwarded to the dataset and
                          optimizer builders
          @param[in]  :   finish = currently unused (the wandb.finish() call at the
                          end of the function is disabled)

          @return     :   None

          Uses the notebook-level names train_data, val_data, criterion, scaler,
          CHECKPOINT_PATH, build_train_dataset, build_val_dataset, build_optimizer,
          train_epoch, eval, wandb and gc.
          A checkpoint is written to CHECKPOINT_PATH whenever the validation
          accuracy returned by eval() improves.
    """
    ## Reset cache & torch
    torch.cuda.empty_cache()
    gc.collect()
    wandb.watch(model, log="all")

    ##  @brief  :   INIT & SETUP
    ## Building Training and Validation Datasets
    train_loader = build_train_dataset(model_config, train_data)
    val_loader = build_val_dataset(model_config, val_data)

    ## Define Optimizer & Scheduler (Criterion & Scaler come from the notebook namespace)
    optimizer = build_optimizer(model, model_config)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           factor=0.8,
                                                           patience=3,
                                                           verbose=True)

    ## Best validation accuracy seen so far (eval() returns an accuracy: higher is better)
    best_val_acc = float("-inf")

    ## Loop through Epochs & Train/Evaluate Model
    for epoch in range(model_config['epochs']):

        ##  @brief  :   RUN TRAIN & EVAL FUNCTIONS
        ## Determine Current Learning Rate for Epoch
        curr_lr = float(optimizer.param_groups[0]['lr'])

        ## Train Model -> Obtain Loss & Accuracy (the returned model is the same object)
        _, train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, scaler)

        ## Test Against Validation Set --> Obtain Loss & Accuracy
        val_loss, val_acc = eval(model, val_loader)

        ## Plateau scheduler is driven by the validation loss, once per epoch
        scheduler.step(val_loss)

        ##  @brief  :   LOG RESULTS
        print("\t Epoch {}/{}:".format(epoch + 1,model_config['epochs']))
        print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
        print("\tVal Acc {:.04f}\tVal Loss {:.04f}".format(val_acc, val_loss))

        ## Log Metrics at the End of Each Epoch
        ## The 'val_dist' key is a legacy name kept for existing dashboards;
        ## the value is the validation accuracy returned by eval()
        metrics = {
            "train_loss":train_loss,
            "train_acc": train_acc*100,
            'val_dist': val_acc,
            'valid_loss': val_loss,
            'lr': curr_lr,
        }

        ## Log Metrics to W&B
        wandb.log(metrics)

        ##  @brief  :   ARTIFACT & MODEL CHECKPOINT
        ## If Improved Accuracy, Update Accuracy & Model/Optimizer States
        if val_acc > best_val_acc:
            best_val_acc = val_acc

            ## Save to Google Drive ('val_dist' key kept so existing checkpoint readers still work)
            torch.save({
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'val_dist': val_acc,
                  'epoch': epoch
                  }, (CHECKPOINT_PATH))

            ## Save Checkpoint to W&B
            #wandb.save(model_config['checkpoint_path'])

        ## Create W&B Artifact (disabled)
        #model_artifact = wandb.Artifact(run_config['model'], type='model')
        #model_artifact.add_file("Model")
        ## Save to W&B
        #run.log_artifact(model_artifact)

    #wandb.finish()




In [None]:
## Start the W&B run that the training cells log to.
## NOTE(review): the run is configured from `config`, while train() below is
## called with CURRENT_CONFIG -- confirm both refer to the same settings.
run = wandb.init(
    name = "project-submission", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitializing runs when you re-run this cell
    # run_id = ### Insert specific run id here if you want to resume a previous run
    # resume = "must" ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "project-ablations", ### Project should be created in your wandb account
    config = config ### Wandb Config for your run
)


In [None]:
## Launch training; the data loaders, optimizer and scheduler are built inside train()
train(model, CURRENT_CONFIG)

Batch size           :  10
Train batches        :  3892
Val batches          :  10


Train:   0%|          | 0/3892 [00:00<?, ?it/s]

2DCW_1_A (42, 2862)
3HMX_2_B (197, 2862)
3CU2_1_A (237, 2862)
2CFM_1_A (561, 2862)
2K1N_3_A (55, 2862)
2KSD_1_A (115, 2862)
3P4T_1_A (403, 2862)
2L3S_1_A (163, 2862)
1DGN_1_A (89, 2862)
3AAI_1_A (94, 2862)


RuntimeError: ignored