In [1]:
from data_module import CodonDataModule
from calm.sequence import CodonSequence
from calm.alphabet import Alphabet
import argparse

In [35]:
# A parser is instantiated here, but no arguments are registered on it in this cell.
parser = argparse.ArgumentParser()
# Settings are supplied directly as a Namespace instead of being parsed.
args = argparse.Namespace(batch_size=32, max_positions=1024)

alphabet = Alphabet.from_architecture('CodonModel')
# Data module over the meltome CSV, with melting temperature as the target column.
datamodule = CodonDataModule(
    args,
    alphabet,
    'data/meltome/meltome_data.csv',
    args.batch_size,
    fine_tune=True,
    target_column='melting_temperature',
)

In [36]:
# Show the data module object (rendered with its default repr).
datamodule

<data_module.CodonDataModule at 0x7fd300e778e0>

In [37]:
# Run setup() before reading the splits; later cells access train_data / val_data.
# NOTE(review): presumably this is what populates those attributes - confirm in data_module.
datamodule.setup()

In [38]:
# Keep the first five training examples for inspection; the displayed result is a
# list of (CodonSequence, float) pairs.
data = datamodule.train_data[:5]
data

[(<calm.sequence.CodonSequence at 0x7fd3ac3e7d00>, 58.9084787992705),
 (<calm.sequence.CodonSequence at 0x7fd3ac4aa5c0>, 48.2402877443515),
 (<calm.sequence.CodonSequence at 0x7fd3ac2f6620>, 48.108157673752096),
 (<calm.sequence.CodonSequence at 0x7fd3ac3174c0>, 53.3678254394789),
 (<calm.sequence.CodonSequence at 0x7fd3ac379fc0>, 41.75821094851)]

In [39]:
# Draw one batch from the training dataloader for inspection.
batch = next(iter(datamodule.train_dataloader()))

# Print the shape of each entry under "input", followed by the batch labels.
print("Batch Sequence Shapes:", list(map(lambda seq: seq.shape, batch["input"])))
print("Batch Labels:", batch["labels"])

Batch Sequence Shapes: [torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024]), torch.Size([1024])]
Batch Labels: tensor([58.9085, 48.2403, 48.1082, 53.3678, 41.7582, 56.3471, 51.2968, 44.0078,
        45.6959, 50.3465, 48.8603, 54.5324, 89.0259, 53.2879, 44.9986, 52.2246,
        58.4144, 61.3045, 38.8794, 52.0154, 57.6974, 49.2634, 48.1655, 58.2843,
        38.1419, 47.2221, 55.2085, 48.0492, 51.3307, 55.1612, 52.6146, 48.3048],
 

In [None]:
# This name was never assigned in the notebook, so evaluating it on a fresh
# kernel raised NameError. Build it from the batch drawn earlier: one shape per
# entry under batch["input"].
batch_sequence_shapes = [seq.shape for seq in batch["input"]]
batch_sequence_shapes

In [7]:
# Report how many examples are in the training and validation splits.
print(f"Train size: {len(datamodule.train_data)}, Val size: {len(datamodule.val_data)}")

Train size: 11817, Val size: 2955


In [6]:
# Unzip the (sequence, label) pairs held in `data` into two parallel tuples.
sequences, labels = zip(*data)

In [7]:
# Display the label tuple extracted from the five sampled examples.
labels

(58.9084787992705,
 48.2402877443515,
 48.108157673752096,
 53.3678254394789,
 41.75821094851)

In [16]:
from collections import namedtuple

from calm.alphabet import Alphabet
from calm.sequence import CodonSequence
from calm.ft_pipeline import (
    FTPipeline,
    PipelineInput,
    FTDataCollator,
    FTDataTrimmer,
    FTDataPadder,
    FTDataPreprocessor,
)


def fake_args():
    """Return a small, fixed argument object for the pipeline smoke tests.

    Returns:
        A namedtuple of type ``args`` with the fields ``mask_proportion``,
        ``max_positions``, ``mask_percent`` and ``leave_percent`` (in that
        order), set to 0.25, 10, 0.8 and 0.1 respectively.
    """
    # Insertion order of this mapping fixes the namedtuple's field order.
    settings = {
        'mask_proportion': .25,
        'max_positions': 10,
        'mask_percent': .8,
        'leave_percent': .1,
    }
    arg_type = namedtuple('args', tuple(settings))
    return arg_type(**settings)

def test_DataCollator_codon():
    """Smoke-check FTDataCollator on one long and one short codon sequence.

    The collator's output is printed for visual inspection; nothing is asserted.
    """
    cfg = fake_args()
    codon_alphabet = Alphabet.from_architecture('CodonModel')
    collate = FTDataCollator(cfg, codon_alphabet)

    # A 13-codon unit repeated ten times, alongside a four-codon sequence.
    long_seq = CodonSequence('AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA ' * 10)
    short_seq = CodonSequence('AUG GGA CGC UAA')
    pipeline_input = PipelineInput(sequence=[long_seq, short_seq], labels=[10.6, 10.8])
    print(collate(pipeline_input))

def test_DataTrimmer_codon():
    """Smoke-check the collator -> trimmer pipeline on two codon sequences.

    The pipeline's output is printed for visual inspection; nothing is asserted.
    """
    cfg = fake_args()
    codon_alphabet = Alphabet.from_architecture('CodonModel')
    stages = [
        FTDataCollator(cfg, codon_alphabet),
        FTDataTrimmer(cfg, codon_alphabet),
    ]
    trim_pipeline = FTPipeline(stages)

    # One sequence far longer than max_positions and one well under it.
    long_seq = CodonSequence('AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA ' * 10)
    short_seq = CodonSequence('AUG GGA CGC UAA')
    samples = [(long_seq, 10.6), (short_seq, 10.8)]
    print(trim_pipeline(samples))

def test_DataPadder_codon():
    """Smoke-check the collator -> trimmer -> padder pipeline on two codon sequences.

    The pipeline's output is printed for visual inspection; nothing is asserted.
    """
    cfg = fake_args()
    codon_alphabet = Alphabet.from_architecture('CodonModel')
    stages = [
        FTDataCollator(cfg, codon_alphabet),
        FTDataTrimmer(cfg, codon_alphabet),
        FTDataPadder(cfg, codon_alphabet),
    ]
    pad_pipeline = FTPipeline(stages)

    # One sequence far longer than max_positions and one well under it.
    long_seq = CodonSequence('AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA ' * 10)
    short_seq = CodonSequence('AUG GGA CGC UAA')
    samples = [(long_seq, 10.6), (short_seq, 10.8)]
    print(pad_pipeline(samples))

def test_DataPreprocessor_codon():
    """Smoke-check the full collator -> trimmer -> padder -> preprocessor pipeline.

    The pipeline's output is printed for visual inspection; nothing is asserted.
    """
    cfg = fake_args()
    codon_alphabet = Alphabet.from_architecture('CodonModel')
    stages = [
        FTDataCollator(cfg, codon_alphabet),
        FTDataTrimmer(cfg, codon_alphabet),
        FTDataPadder(cfg, codon_alphabet),
        FTDataPreprocessor(cfg, codon_alphabet),
    ]
    full_pipeline = FTPipeline(stages)

    # One sequence far longer than max_positions and one well under it.
    long_seq = CodonSequence('AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA ' * 10)
    short_seq = CodonSequence('AUG GGA CGC UAA')
    samples = [(long_seq, 10.6), (short_seq, 10.8)]
    print(full_pipeline(samples))



In [17]:
# Run the four pipeline smoke tests in stage order (collator, trimmer, padder,
# preprocessor); each one prints its result rather than asserting on it.
test_DataCollator_codon()
test_DataTrimmer_codon()
test_DataPadder_codon()
test_DataPreprocessor_codon()

PipelineData(sequence=['<cls> AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA AUG GGA CGC UUU UAC CAA AUG GGA CGC UUU UAC CAA UAA <eos>', '<cls> AUG GGA CGC UAA <eos>'], labels=[10.6, 10.8])
{'sequence': ['<cls> AUG GGA CGC UUU UAC CAA AUG GGA CGC', '<cls> AUG GGA CGC UAA <eos>'], 'labels': [10.6, 10.8]}
{'sequence': ['<cls> AUG GGA CGC UUU UAC CAA AUG GGA CGC', '<cls> AUG GGA CGC UAA <eos> <pad> <pad> <pad> <pad>'], 'labels': [10.6, 10.8]}
{'input': tensor([[ 0, 11, 64, 50, 25, 22, 36, 11, 64, 50],
        [ 0, 11, 64, 50, 20,  2,  1,  1,  1,  1]], dtype=torch.int32), 'lab