In [1]:
# default_exp models.pretrained.lstm

In [2]:
# all_func


In [27]:
import os

import h5py
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from peptide.basics import *
from peptide.preprocessing.data import (
    ProteinDataset,
    ACPDataset,
    AMPDataset,
    DNABindDataset,
)


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    classification_report,
)
from sklearn.model_selection import GridSearchCV

import numpy as np

from xgboost import XGBClassifier


## fasta + BioPython

In [4]:
# Build one protein record by hand and print it in FASTA format; the header
# line carries the record id followed by its description.
record = SeqRecord(
    Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
    id="YP_025292.1", name="HokC", description="toxic membrane protein, small",
)
print(record.format("fasta"))

>YP_025292.1 toxic membrane protein, small
MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF



In [5]:
# Slice out the first five residues; the result is still a SeqRecord.
record[0:5]

SeqRecord(seq=Seq('MKQHK'), id='YP_025292.1', name='HokC', description='toxic membrane protein, small', dbxrefs=[])

In [6]:
# Instantiate the three peptide datasets from the shared data store, in the
# order ACP, AMP, DNA-binding.
acp_data, amp_data, dnabind_data = (
    dataset_cls(DATA_STORE)
    for dataset_cls in (ACPDataset, AMPDataset, DNABindDataset)
)

## ProSE

### Create embeddings from fasta

Sample Python commands to generate embeddings from the pretrained model in the ProSE codebase.

**ACP**
- Remove `-d 0` from command to run on CPU
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/acp_avgpool_test.h5 ~/.peptide/datasets/fasta/ACPDataset_test.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/acp_avgpool_test.h5
# embedding with pool=avg
```

**AMP**
- For AMP, the non-truncated sequences must be pooled (e.g. `--pool avg`) because `train` and `test` have different maximum sequence lengths
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/amp_avgpool_train.h5 ~/.peptide/datasets/fasta/AMPDataset_train.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/amp_avgpool_train.h5
# embedding with pool=avg
```
- Truncated example
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/amp_avgpool_test_seqlen_150.h5 ~/.peptide/datasets/fasta/AMPDataset_test_seqlen_150.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/amp_avgpool_test_seqlen_150.h5
# embedding with pool=avg
```

**DNA Binding**
- Same as AMP: the full, non-truncated sequences need some form of pooling
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/dnabind_avgpool_test.h5 ~/.peptide/datasets/fasta/DNABindDataset_test.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/dnabind_avgpool_test.h5
# embedding with pool=avg
```
- Truncated example
```
python embed_sequences.py -d 0 --pool avg -o ~/.peptide/datasets/lstm/avg/dnabind_avgpool_train_seqlen_300.h5 ~/.peptide/datasets/fasta/DNABindDataset_train_seqlen_300.fasta
# loading the pre-trained ProSE MT model
# writing: /home/vinod/.peptide/datasets/lstm/avg/dnabind_avgpool_train_seqlen_300.h5
# embedding with pool=avg
```

### Get embeddings - read from H5

In [23]:
def get_embeddings(h5_file):
    """Read every embedding and its label from an HDF5 file.

    Each top-level key in the file is treated as one sample. The label is
    the text after the last ``|`` in the key, converted with ``int``; the
    embedding is the full content stored under that key.

    Parameters
    ----------
    h5_file
        Path to the HDF5 file; passed to ``h5py.File`` and opened read-only.

    Returns
    -------
    (Xs, ys)
        ``Xs`` stacks the embeddings along a new first axis, ``ys`` holds
        the integer labels in the same key order.

    Raises
    ------
    ValueError
        If a key does not end in an integer label, or (from ``np.stack``)
        if the file has no keys or the embeddings differ in shape.
    """
    with h5py.File(h5_file, "r") as h5:
        names = list(h5.keys())
        # Parse all labels first, then read the stored arrays in key order.
        labels = [int(name.split('|')[-1]) for name in names]
        vectors = [h5[name][()] for name in names]

    Xs = np.stack(vectors, axis=0)
    ys = np.stack(labels, axis=0)
    return Xs, ys

### AMP

In [24]:
# Average-pooled ProSE embeddings for the AMP train/test splits. The directory
# is resolved from the current user's home (matching the `~/.peptide/...`
# paths used in the embed_sequences.py commands) rather than one hard-coded
# account, so the notebook runs on any machine. Values stay plain strings.
LSTM_AVG_DIR = os.path.expanduser("~/.peptide/datasets/lstm/avg")
train_h5 = os.path.join(LSTM_AVG_DIR, "amp_avgpool_train.h5")
test_h5 = os.path.join(LSTM_AVG_DIR, "amp_avgpool_test.h5")

In [25]:
# Load the pooled embeddings (features) and integer labels for each split.
X_train, y_train = get_embeddings(train_h5)
X_test, y_test = get_embeddings(test_h5)

# Train and test embeddings must share one per-sample shape, otherwise the
# classifiers fitted on X_train cannot score X_test. Fail here with a clear
# message instead of deep inside a later predict() call.
assert X_train.shape[1:] == X_test.shape[1:], (
    f"feature shape mismatch: train {X_train.shape[1:]} vs test {X_test.shape[1:]}"
)

In [26]:
# Shapes in the order: train features, train labels, test features, test labels.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((3234, 6165), (3234,), (808, 6165), (808,))

In [30]:
# Logistic-regression baseline on the pooled embeddings, with the solver's
# iteration cap set to 10,000.
lr = LogisticRegression(max_iter=10_000)
lr.fit(X=X_train, y=y_train)

In [31]:
# Linear SVM baseline on the pooled embeddings, iteration cap 10,000.
# random_state is pinned so the fit (and the scores reported further down)
# are reproducible across kernel restarts; LinearSVC otherwise draws from the
# global NumPy random state when shuffling for its dual solver.
svc = LinearSVC(max_iter=10000, random_state=42)
svc.fit(X_train, y_train)

In [32]:
# Gradient-boosted tree baseline with the library's default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)

In [33]:
# Predict the test split with each fitted model, in the order lr, svc, xgb.
lr_preds, svc_preds, xgb_preds = (
    model.predict(X_test) for model in (lr, svc, xgb)
)

In [34]:
# One row of metrics per model; the row order must match the index labels
# passed to the DataFrame below (lr, svc, xgb).
scores = [
    [
        accuracy_score(y_test, preds),
        recall_score(y_test, preds),
        precision_score(y_test, preds),
        f1_score(y_test, preds),
    ]
    for preds in (lr_preds, svc_preds, xgb_preds)
]

pd.DataFrame(
    scores,
    index=["lr", "svc", "xgb"],
    columns=["acc", "recall", "precision", "f1"],
)

Unnamed: 0,acc,recall,precision,f1
lr,0.931931,0.919395,0.940722,0.929936
svc,0.925743,0.911839,0.935401,0.923469
xgb,0.929455,0.901763,0.952128,0.926261


In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
# print() is needed because the report is a multi-line string.
# NOTE(review): the output saved with this cell shows a support of 344 samples,
# while X_test above has 808 rows - it looks like it comes from a different
# run; re-execute the cell to refresh it.
print(classification_report(y_true=y_test, y_pred=lr_preds))

              precision    recall  f1-score   support

           0       0.74      0.70      0.72       172
           1       0.72      0.75      0.73       172

    accuracy                           0.73       344
   macro avg       0.73      0.73      0.73       344
weighted avg       0.73      0.73      0.73       344

