# Module Testing

Modules tested:
- Toy data generator
- MockESM
- DCCN

In [1]:
import sys
import os

root = os.path.abspath('..')
sys.path.insert(0, root)

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# models
from modules.mock_esm import MockESM
from modules.dccn import DCCN_1D

# toy data generator
from tests.generate_toy_data import generate_toy_data

# linear probe  
from tests.dccn_probe import LinearProbe

In [2]:
n_samples = 50
n_go_terms = 5
min_len = 10
max_len = 20

df, go_vocab, labels = generate_toy_data(
    n_samples=n_samples, 
    n_go_terms=n_go_terms, 
    min_len=min_len, 
    max_len=max_len, 
    seed = 1000
)

In [3]:
print(go_vocab)
for i in range(3):
    print(f'{df.iloc[i]['sequence']}; {df.iloc[i]['go_terms']}')

['GO:0000001', 'GO:0000002', 'GO:0000003', 'GO:0000004', 'GO:0000005']
EPNDRGVQFIINSHNI; ['GO:0000002', 'GO:0000001']
SFRQHAVRHVGEKGLDAMAK; ['GO:0000005', 'GO:0000001']
LPMIINYSELPRN; ['GO:0000005', 'GO:0000003']


In [4]:
# Convert sequences to indices
AA_TO_IDX = {aa: i for i, aa in enumerate('ACDEFGHIKLMNPQRSTVWY')}
train_prop = 0.8
test_val_prop = 0.5

def sequence_to_indices(sequence):
    return [AA_TO_IDX[aa] for aa in sequence]

def process_sequences(sequences, max_len):
    """
    convert sequences to indices, pad to max length
    """

    padded = []
    for seq in sequences:
        tokens = sequence_to_indices(seq)
        if len(tokens) < max_len:
            tokens.extend([20] * (max_len - len(tokens)))  # 20 is the padding index
        else:
            tokens = tokens[:max_len]  # truncate to max length
        padded.append(tokens)
    
    return torch.tensor(padded, dtype=torch.long)

max_seq_len = max(len(seq) for seq in df['sequence'])
print(f"Max sequence length: {max_seq_len}")

train_seqs, test_val_seqs = train_test_split(df['sequence'], test_size=train_prop)
val_seqs, test_seqs = train_test_split(test_val_seqs, test_size=test_val_prop)

train_lab, test_val_lab = train_test_split(labels, test_size=train_prop)
val_lab, test_lab = train_test_split(test_val_lab, test_size=test_val_prop)

train_seqs_final = process_sequences(train_seqs, max_seq_len)
val_seqs_final = process_sequences(val_seqs, max_seq_len)
test_seqs_final = process_sequences(test_seqs, max_seq_len)

train_labels_final = torch.tensor(train_lab, dtype=torch.float32)
val_labels_final = torch.tensor(val_lab, dtype=torch.float32)
test_labels_final = torch.tensor(test_lab, dtype=torch.float32)

print(f'Sequence dimensions')
print(f'    Train: {train_seqs_final.shape}')
print(f'    Val: {val_seqs_final.shape}')
print(f'    Test: {test_seqs_final.shape}')
print(f'Label dimensions')
print(f'    Train: {train_labels_final.shape}')
print(f'    Val: {val_labels_final.shape}')
print(f'    Test: {test_labels_final.shape}')


Max sequence length: 20
Sequence dimensions
    Train: torch.Size([10, 20])
    Val: torch.Size([20, 20])
    Test: torch.Size([20, 20])
Label dimensions
    Train: torch.Size([10, 5])
    Val: torch.Size([20, 5])
    Test: torch.Size([20, 5])


In [5]:
# minimal hyperparameters
embed_len = 16      # ESM embedding dimension (for each amino acid)
hidden_len = 32     # MockESM hidden layer
proj_len = 24       # ESM projection dimension
go_dim = n_go_terms # output dimension

In [6]:
# initialize models
mock_esm = MockESM(
    seq_len=max_seq_len, 
    hidden_len=hidden_len, 
    embed_len=embed_len, 
    proj_len=proj_len
)

dccn = DCCN_1D(embed_len=proj_len)

linear_probe = LinearProbe(in_dim=proj_len, go_dim=go_dim)

print(f'MockESM dimensions: {embed_len} -> {hidden_len} -> {proj_len}')
print(f'DCCN dimensions: {proj_len} channels with {dccn.dilations} dilations')
print(f'LinearProbe dimensions: {proj_len} -> {go_dim}')

def param_count(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'\n Number of parameters:')
print(f'    MockESM: {param_count(mock_esm)}')
print(f'    DCCN: {param_count(dccn)}')
print(f'    LinearProbe: {param_count(linear_probe)}')
print(f'    Total: {param_count(mock_esm) + param_count(dccn) + param_count(linear_probe)}')


MockESM dimensions: 16 -> 32 -> 24
DCCN dimensions: 24 channels with (1, 2, 4, 8) dilations
LinearProbe dimensions: 24 -> 5

 Number of parameters:
    MockESM: 1672
    DCCN: 16512
    LinearProbe: 125
    Total: 18309


In [7]:
test_batch_seqs = train_seqs_final[:10]
test_batch_labels = train_labels_final[:10]
models = {
    'mock_esm': mock_esm,
    'dccn': dccn,
    'linear_probe': linear_probe
}

In [8]:
# test step
def test_step(seqs, labels, models):
    mock_esm, dccn, probe = models['mock_esm'], models['dccn'], models['linear_probe']
    loss_fn = nn.BCEWithLogitsLoss()

    for m in (mock_esm, dccn, probe):
        m.train()                      # enable grads
        m.zero_grad(set_to_none=True)

    # forward
    print(f"initial dimensions: {seqs.shape}")
    z = mock_esm(seqs)
    print(f"dimensions after mock_esm: {z.shape}")
    z = dccn(z)
    print(f"dimensions after dccn: {z.shape}")
    z = z.mean(dim=1)
    print(f"dimensions after mean: {z.shape}")
    logits = probe(z)
    print(f"dimensions after probe: {logits.shape}")

    # backward (without optimizer step)
    loss = loss_fn(logits, labels.float())
    loss.backward()

    # check if gradients exist
    grad_ok = all(
        p.grad is not None for m in (mock_esm, dccn, probe) for p in m.parameters() if p.requires_grad
    )

    return loss, grad_ok

loss, grad_ok = test_step(test_batch_seqs, test_batch_labels, models)
print(loss, grad_ok)

initial dimensions: torch.Size([10, 20])
dimensions after mock_esm: torch.Size([10, 20, 24])
dimensions after dccn: torch.Size([10, 20, 24])
dimensions after mean: torch.Size([10, 24])
dimensions after probe: torch.Size([10, 5])
tensor(0.7973, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>) True
