In [1]:
# default_exp models.pretrained.lstm

In [2]:
# all_func


## fasta + BioPython

In [13]:
from pathlib import Path

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import h5py

import pandas as pd
import numpy as np

from peptide.basics import *
from peptide.preprocessing.data import (
    ProteinDataset,
    ACPDataset,
    AMPDataset,
    DNABindDataset,
)


In [4]:
# Hand-build a small SeqRecord and print it rendered as FASTA text.
record = SeqRecord(
    seq=Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
    id="YP_025292.1",
    name="HokC",
    description="toxic membrane protein, small",
)
# The built-in format() goes through SeqRecord.__format__, same as record.format(...).
print(format(record, "fasta"))

>YP_025292.1 toxic membrane protein, small
MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF



In [5]:
# Slicing a SeqRecord gives back another SeqRecord (here the first five residues)
# that keeps the id, name and description of the original.
record[:5]

SeqRecord(seq=Seq('MKQHK'), id='YP_025292.1', name='HokC', description='toxic membrane protein, small', dbxrefs=[])

In [6]:
# Instantiate the three dataset wrappers against the same DATA_STORE location.
# NOTE(review): DATA_STORE is presumably exported by `peptide.basics` (star import) — confirm.
acp_data, amp_data, dnabind_data = [
    dataset_cls(DATA_STORE)
    for dataset_cls in (ACPDataset, AMPDataset, DNABindDataset)
]

## ProSE

Sample Python commands for generating embeddings with the pretrained model from the ProSE codebase.

**ACP**
- Remove `-d 0` from the command to run on CPU
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/acp_avgpool_test.h5 ~/.peptide/datasets/fasta/ACPDataset_test.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/acp_avgpool_test.h5
# embedding with pool=avg
```

**AMP**
- For AMP, some form of pooling is required on the non-truncated sequences, because `train` and `test` have different maximum sequence lengths
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/amp_avgpool_train.h5 ~/.peptide/datasets/fasta/AMPDataset_train.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/amp_avgpool_train.h5
# embedding with pool=avg
```
- Truncated
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/amp_avgpool_test_seqlen_150.h5 ~/.peptide/datasets/fasta/AMPDataset_test_seqlen_150.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/amp_avgpool_test_seqlen_150.h5
# embedding with pool=avg
```

**DNA Binding**
- Same as AMP: some pooling is needed for the full, non-truncated sequences
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/dnabind_avgpool_test.h5 ~/.peptide/datasets/fasta/DNABindDataset_test.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/dnabind_avgpool_test.h5
# embedding with pool=avg
```
- Truncated example
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/dnabind_avgpool_train_seqlen_300.h5 ~/.peptide/datasets/fasta/DNABindDataset_train_seqlen_300.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/dnabind_avgpool_train_seqlen_300.h5
# embedding with pool=avg
```

## H5

In [4]:
# Resolve the embedding file under the current user's home directory instead of
# hard-coding one machine's absolute path. Kept as a plain str so it can be used
# anywhere the previous string literal was.
filename = str(
    Path.home() / ".peptide" / "datasets" / "lstm" / "avg" / "acp_avgpool_test.h5"
)

In [5]:

# Explore the layout of the HDF5 embedding file: list its root-level keys and
# read one entry back both as an h5py object and as a numpy array.
with h5py.File(filename, "r") as f:
    # Print all root-level object names (aka keys);
    # these can be group or dataset names.
    print("Keys: %s" % f.keys())
    print("--")
    # Pick the SECOND root-level key (index 1) as the entry to inspect;
    # it may or may NOT be a group.
    a_group_key = list(f.keys())[1]

    # Object type for a_group_key: usually a group or a dataset.
    print(type(f[a_group_key]))

    # list() over the object gives the member names if it is a group,
    # or the stored values if it is a dataset.
    data = list(f[a_group_key])

    # Preferred ways to get dataset values:
    ds_obj = f[a_group_key]      # h5py object handle; tied to the file, which closes when this block exits
    ds_arr = f[a_group_key][()]  # full contents read into memory as a numpy array

Keys: <KeysViewHDF5 ['0 |0', '1 |0', '10 |1', '100 |0', '101 |0', '102 |1', '103 |1', '104 |0', '105 |1', '106 |0', '107 |0', '108 |0', '109 |1', '11 |0', '110 |0', '111 |0', '112 |1', '113 |1', '114 |0', '115 |1', '116 |0', '117 |0', '118 |1', '119 |1', '12 |0', '120 |0', '121 |1', '122 |0', '123 |0', '124 |1', '125 |0', '126 |0', '127 |1', '128 |0', '129 |1', '13 |1', '130 |1', '131 |1', '132 |1', '133 |0', '134 |0', '135 |1', '136 |0', '137 |1', '138 |0', '139 |0', '14 |1', '140 |0', '141 |1', '142 |0', '143 |1', '144 |0', '145 |0', '146 |1', '147 |0', '148 |1', '149 |0', '15 |1', '150 |0', '151 |0', '152 |1', '153 |1', '154 |1', '155 |1', '156 |0', '157 |1', '158 |0', '159 |0', '16 |1', '160 |1', '161 |1', '162 |0', '163 |1', '164 |1', '165 |1', '166 |1', '167 |1', '168 |0', '169 |0', '17 |0', '170 |0', '171 |0', '172 |0', '173 |0', '174 |1', '175 |1', '176 |1', '177 |0', '178 |1', '179 |1', '18 |1', '180 |1', '181 |1', '182 |1', '183 |1', '184 |1', '185 |1', '186 |0', '187 |0', '1

In [6]:
# Shape of the single embedding read above (a flat 1-D vector).
ds_arr.shape

(6165,)

In [7]:
# Peek at the raw values of that one embedding.
ds_arr

array([ 0.04545455,  0.04545455,  0.11363637, ...,  0.09409132,
       -0.05029086,  0.62466204], dtype=float32)

In [18]:
# Collect one (embedding, label) pair per HDF5 entry. Every key ends in
# "|<label>", so the label is whatever follows the last "|".
# Entries are visited in the order h5py yields the keys (the listing printed
# earlier runs '0 ', '1 ', '10 ', '100 ', ... — i.e. not numeric order).
embeddings, labels = [], []
with h5py.File(filename, "r") as h5_file:
    for entry_name in h5_file.keys():
        labels.append(float(entry_name.rsplit('|', 1)[-1]))
        embeddings.append(h5_file[entry_name][()])

# Stack into arrays: one row per entry in Xs, one label per entry in ys.
Xs = np.stack(embeddings, axis=0)
ys = np.stack(labels, axis=0)

In [19]:
# Sanity check: Xs should have one row per HDF5 entry and ys one label per row.
Xs.shape, ys.shape

((344, 6165), (344,))

In [20]:
# Stacked embedding matrix, one row per HDF5 entry.
Xs

array([[ 0.05      ,  0.        ,  0.        , ...,  0.12216329,
        -0.27930424, -0.04172204],
       [ 0.04545455,  0.04545455,  0.11363637, ...,  0.09409132,
        -0.05029086,  0.62466204],
       [ 0.06666667,  0.        ,  0.        , ..., -0.06018177,
        -0.05313366,  0.00379704],
       ...,
       [ 0.2631579 ,  0.        ,  0.        , ..., -0.1341106 ,
        -0.0427261 , -0.12614213],
       [ 0.125     ,  0.15625   ,  0.0625    , ..., -0.01764085,
         0.05235878,  0.02743137],
       [ 0.06666667,  0.06666667,  0.06666667, ..., -0.03098731,
        -0.05793136, -0.06837866]], dtype=float32)

In [21]:
# Labels parsed from the HDF5 key suffixes, aligned row-for-row with Xs.
ys

array([0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1.,
       1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0.,
       1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1.,
       1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1.,
       1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1.,
       1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1.,
       1., 0., 1., 1., 1.