In [1]:
# default_exp models.pretrained.lstm

In [2]:
# all_func


## fasta + BioPython

In [3]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

from peptide.basics import *
from peptide.preprocessing.data import (
    ProteinDataset,
    ACPDataset,
    AMPDataset,
    DNABindDataset,
)


In [4]:
record = SeqRecord(
    Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
    id="YP_025292.1",
    name="HokC",
    description="toxic membrane protein, small",
)
print(record.format('fasta'))

>YP_025292.1 toxic membrane protein, small
MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF



In [5]:
record[:5]

SeqRecord(seq=Seq('MKQHK'), id='YP_025292.1', name='HokC', description='toxic membrane protein, small', dbxrefs=[])

In [6]:
acp_data = ACPDataset(DATA_STORE)
amp_data = AMPDataset(DATA_STORE)
dnabind_data = DNABindDataset(DATA_STORE)

## ProSE

Sample python commands to generate embeddings from pretrained model in the ProSE codebase

**ACP**
- Remove `-d 0` from command to run on CPU
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/acp_avgpool_test.h5 ~/.peptide/datasets/fasta/ACPDataset_test.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/acp_avgpool_test.h5
# embedding with pool=avg
```

**AMP**
- For AMP - some type of pooling needs to be done on the non-truncated sequences as `train` and `test` have different max seq lengths
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/amp_avgpool_train.h5 ~/.peptide/datasets/fasta/AMPDataset_train.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/amp_avgpool_train.h5
# embedding with pool=avg
```
- Truncated
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/amp_avgpool_test_seqlen_150.h5 ~/.peptide/datasets/fasta/AMPDataset_test_seqlen_150.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/amp_avgpool_test_seqlen_150.h5
# embedding with pool=avg
```

**DNA Binding**
- Same as AMP - some pooling needed for the full non-truncated sequences
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/dnabind_avgpool_test.h5 ~/.peptide/datasets/fasta/DNABindDataset_test.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/dnabind_avgpool_test.h5
# embedding with pool=avg
```
- Truncated example
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/dnabind_avgpool_train_seqlen_300.h5 ~/.peptide/datasets/fasta/DNABindDataset_train_seqlen_300.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/dnabind_avgpool_train_seqlen_300.h5
# embedding with pool=avg
```

## H5

In [None]:
import h5py

In [None]:
filename = "data/acp_test_avgpool.h5"

In [None]:

with h5py.File(filename, "r") as f:
    # Print all root level object names (aka keys) 
    # these can be group or dataset names 
    print("Keys: %s" % f.keys())
    print("--")
    # get first object name/key; may or may NOT be a group
    a_group_key = list(f.keys())[1]

    # get the object type for a_group_key: usually group or dataset
    print(type(f[a_group_key])) 

    # If a_group_key is a group name, 
    # this gets the object names in the group and returns as a list
    data = list(f[a_group_key])

    # If a_group_key is a dataset name, 
    # this gets the dataset values and returns as a list
    data = list(f[a_group_key])
    # preferred methods to get dataset values:
    ds_obj = f[a_group_key]      # returns as a h5py dataset object
    ds_arr = f[a_group_key][()]  # returns as a numpy array

Keys: <KeysViewHDF5 ['1', '2', '3', '4']>
--
<class 'h5py._hl.dataset.Dataset'>


In [None]:
ds_arr.shape

(6165,)

In [None]:
ds_arr

array([ 0.04545455,  0.04545455,  0.11363637, ...,  0.09409127,
       -0.05029093,  0.62466204], dtype=float32)

In [None]:
f = h5py.File(filename, "r")

In [None]:
for key in f.keys():
    print(f'indx: {key}') #Names of the root level object names in HDF5 file - can be groups or datasets.
    print(type(f[key])) # get the object type: usually group or dataset
    data = f[key][()]
    print(data.shape)
f.close()


indx: 1
<class 'h5py._hl.dataset.Dataset'>
(6165,)
indx: 2
<class 'h5py._hl.dataset.Dataset'>
(6165,)
indx: 3
<class 'h5py._hl.dataset.Dataset'>
(6165,)
indx: 4
<class 'h5py._hl.dataset.Dataset'>
(6165,)


In [2]:
import torch

In [5]:
acp_train = torch.load('/home/vinod/.peptide/datasets/lstm/acp_nopool_train.h5')

UnpicklingError: invalid load key, 'H'.

y