In [1]:
# default_exp models.pretrained.transformer

In [2]:
# all_func


In [2]:
import pandas as pd

from peptide.basics import *
from peptide.preprocessing.data import (
    ProteinDataset,
    ACPDataset,
    AMPDataset,
    DNABindDataset,
)

In [None]:
import random
from collections import Counter
from tqdm import tqdm
from pathlib import Path

import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import esm

In [51]:
import scipy
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

from xgboost import XGBClassifier

## ESM

### Create embeddings (in bulk) from fasta

**ACP**
```
python scripts/extract.py esm1b_t33_650M_UR50S ~/.peptide/datasets/fasta/ACPDataset_train.fasta ~/.peptide/datasets/transformer/mean/acp/train/ \
    --repr_layers 33 --include mean
Transferred model to GPU
Read /home/vinod/.peptide/datasets/fasta/ACPDataset_train.fasta with 1378 sequences
Processing 1 of 10 batches (292 sequences)
Processing 2 of 10 batches (215 sequences)
Processing 3 of 10 batches (178 sequences)
Processing 4 of 10 batches (157 sequences)
Processing 5 of 10 batches (132 sequences)
Processing 6 of 10 batches (117 sequences)
Processing 7 of 10 batches (105 sequences)
Processing 8 of 10 batches (91 sequences)
Processing 9 of 10 batches (80 sequences)
Processing 10 of 10 batches (11 sequences)
```

**AMP**
```
python scripts/extract.py esm1b_t33_650M_UR50S ~/.peptide/datasets/fasta/AMPDataset_test.fasta  ~/.peptide/datasets/transformer/mean/amp/test \
    --repr_layers 33 --include mean
Transferred model to GPU
Read /home/vinod/.peptide/datasets/fasta/AMPDataset_test.fasta with 808 sequences
Processing 1 of 9 batches (204 sequences)
Processing 2 of 9 batches (157 sequences)
Processing 3 of 9 batches (124 sequences)
Processing 4 of 9 batches (102 sequences)
Processing 5 of 9 batches (85 sequences)
Processing 6 of 9 batches (63 sequences)
Processing 7 of 9 batches (44 sequences)
Processing 8 of 9 batches (26 sequences)
Processing 9 of 9 batches (3 sequences)
```

### Get embeddings
- Adapted from the ESM example notebook: https://github.com/facebookresearch/esm/blob/main/examples/sup_variant_prediction.ipynb

In [37]:
def get_embeddings(fasta_path, emb_path, emb_layer):
    """Load saved mean embeddings and integer labels for every FASTA record.

    For each record yielded by ``esm.data.read_fasta`` the label is the text
    after the last ``|`` in the header, parsed as ``int``. The embedding is
    read from ``<emb_path>/<header minus its first character>.pt`` (the first
    character is presumably the ``>`` FASTA marker -- confirm against
    ``esm.data.read_fasta``).

    Args:
        fasta_path: FASTA file whose headers identify the sequences.
        emb_path: Directory containing one ``.pt`` file per sequence.
        emb_layer: Key into the ``'mean_representations'`` mapping stored in
            each ``.pt`` file.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays: the embeddings stacked along a new
        first axis (one row per record) and the matching labels.

    Raises:
        ValueError: If a header's last ``|``-separated field is not an integer,
            or if the FASTA file yields no records (nothing to stack).
    """
    ys = []
    Xs = []
    for header, _seq in esm.data.read_fasta(fasta_path):
        label = header.split('|')[-1]
        ys.append(int(label))
        emb_file = f'{emb_path}/{header[1:]}.pt'
        # Force tensors onto the CPU: a tensor saved from a CUDA device cannot
        # be converted by np.stack and cannot be loaded on a CPU-only machine.
        embs = torch.load(emb_file, map_location="cpu")
        Xs.append(embs['mean_representations'][emb_layer])
    Xs = np.stack(Xs, axis=0)
    ys = np.asarray(ys)
    return Xs, ys

#### ACP

In [38]:
# Build the dataset locations from the current user's home directory instead of
# a hard-coded /home/<user> prefix, so the notebook runs on other machines.
# The layout mirrors the ~/.peptide/datasets/... paths used by the extraction
# commands in the markdown cells.
PEPTIDE_DATASETS_DIR = Path.home() / ".peptide" / "datasets"
ACP_EMB_DIR = PEPTIDE_DATASETS_DIR / "transformer" / "mean" / "acp"

# Kept as plain strings so any later code that expects str paths still works.
train_fasta = str(PEPTIDE_DATASETS_DIR / "fasta" / "ACPDataset_train.fasta")
train_emb = str(ACP_EMB_DIR / "train")

test_fasta = str(PEPTIDE_DATASETS_DIR / "fasta" / "ACPDataset_test.fasta")
test_emb = str(ACP_EMB_DIR / "test")

# Representation layer to read; matches `--repr_layers 33` in the extraction command.
emb_layer = 33

In [39]:
# Load the train split first, then the test split, reading the same layer for both.
(X_train, y_train), (X_test, y_test) = (
    get_embeddings(fasta_file, emb_dir, emb_layer)
    for fasta_file, emb_dir in ((train_fasta, train_emb), (test_fasta, test_emb))
)

In [41]:
# Sanity check: shapes of the feature matrices and label vectors for both splits.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

((1378, 1280), (1378,), (344, 1280), (344,))

In [42]:
# Logistic-regression baseline on the embeddings, with the solver's iteration
# limit raised to 10000.
lr = LogisticRegression(max_iter=10_000)
lr.fit(X=X_train, y=y_train)

In [45]:
# Linear SVM on the same training embeddings, with the iteration limit raised
# to 10000.
svc = LinearSVC(max_iter=10_000)
svc.fit(X=X_train, y=y_train)



In [46]:
# Gradient-boosted trees with the library's default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)

In [56]:
# Predict test-set labels with each fitted classifier, in the order lr, svc, xgb.
lr_preds, svc_preds, xgb_preds = (
    clf.predict(X_test) for clf in (lr, svc, xgb)
)

In [57]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=lr_preds))

              precision    recall  f1-score   support

           0       0.74      0.70      0.72       172
           1       0.72      0.75      0.73       172

    accuracy                           0.73       344
   macro avg       0.73      0.73      0.73       344
weighted avg       0.73      0.73      0.73       344



In [58]:
# Score each classifier's test predictions with the same four metrics; every
# row of `scores` is ordered [accuracy, recall, precision, f1].
metric_fns = (accuracy_score, recall_score, precision_score, f1_score)

scores = []
for preds in (lr_preds, svc_preds, xgb_preds):
    scores.append([metric(y_test, preds) for metric in metric_fns])

# Row labels follow the order of the prediction arrays iterated above.
pd.DataFrame(
    data=scores,
    index=["lr", "svc", "xgb"],
    columns=["acc", "recall", "precision", "f1"],
)

Unnamed: 0,acc,recall,precision,f1
lr,0.726744,0.75,0.716667,0.732955
svc,0.729651,0.75,0.72067,0.735043
xgb,0.729651,0.75,0.72067,0.735043
