In [57]:
import os, re
import pandas as pd
import numpy as np

In [97]:
def get_fragment(seq,lab):
    """Extract the 'A'-labelled stretches of ``seq`` as a comma-separated string.

    ``lab`` is a per-residue label string aligned with ``seq``; every maximal
    run of consecutive 'A' labels marks one fragment.  Fragments are returned
    in sequence order, joined by ','.

    Parameters
    ----------
    seq : str
        Residue sequence.
    lab : str
        Per-residue labels.  Callers are expected to pass a string of the same
        length as ``seq``; labels beyond ``len(seq)`` are ignored.

    Returns
    -------
    str
        The fragments joined by ','.  If every label is 'A' this is ``seq``
        itself; if no label is 'A' it is the empty string.

    Notes
    -----
    The previous implementation returned the *whole* sequence when the label
    string contained no 'A' at all, because it could not distinguish "no
    boundary seen because everything is 'A'" from "no boundary seen because
    nothing is 'A'".  Scanning for runs of 'A' directly removes that ambiguity.
    """
    # Only the labels that actually have a residue underneath them matter.
    labels = lab[:len(seq)]
    return ','.join(seq[m.start():m.end()] for m in re.finditer('A+', labels))

In [129]:
def _load_eval_tables(pred_path, db_path):
    """Load one independent test set: the model predictions and the labelled database.

    Parameters
    ----------
    pred_path : str
        Tab-separated prediction table.
    db_path : str
        Comma-separated dataset table; must provide the columns ``Sequence``,
        ``token_lab`` and ``ProId``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(pred, db)``.  ``db`` keeps only rows whose label string is as long
        as the sequence and gains the columns ``slen``, ``lablen``, ``smORF``
        (1 when the label string contains no 'N', else 0), ``ProID`` (copy of
        ``ProId``) and ``kAMP`` (output of ``get_fragment``).
    """
    pred = pd.read_csv(pred_path, sep='\t')
    db = pd.read_csv(db_path)
    db['slen'] = [len(x) for x in db['Sequence']]
    db['lablen'] = [len(x) for x in db['token_lab']]
    # .copy() so the column assignments below write to an independent frame
    # rather than to a view of the unfiltered table.
    db = db[db['slen'] == db['lablen']].copy()
    db['smORF'] = [0 if 'N' in labels else 1 for labels in db['token_lab']]
    db['ProID'] = db['ProId']
    db['kAMP'] = [get_fragment(s, l) for s, l in zip(db['Sequence'], db['token_lab'])]
    return pred, db


apd_pred, apd_db = _load_eval_tables(
    '/mnt/asustor/wenhui.li/02.AMP/train/esm_token/APD_evaluation/prediction.tsv',
    '/mnt/asustor/wenhui.li/02.AMP/dataset/labels/indenpendAPD/independent_test_APD.dataset.csv',
)

lamp_pred, lamp_db = _load_eval_tables(
    '/mnt/asustor/wenhui.li/02.AMP/train/esm_token/LAMP2_evaluation/prediction.tsv',
    '/mnt/asustor/wenhui.li/02.AMP/dataset/labels/indenpendLAMP2/independent_test_LAMP2.dataset.csv',
)

In [125]:
def merge_db_and_pred(db_df,pred_df):
    """Join the labelled database with the model predictions, one row per (protein, prediction).

    Parameters
    ----------
    db_df : pandas.DataFrame
        Must provide ``ProID``, ``seq_lab``, ``smORF``, ``kAMP``, ``Sequence``
        and ``token_lab``.
    pred_df : pandas.DataFrame
        Must provide ``ProID``, ``AMPlen``, ``Position`` and ``AMP``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ProID, seq_lab, smORF, smORF_pred, AMPlen, Position,
        knownAMP, predAMP, Sequence``, sorted by ``ProID`` then ``Position``.
        Database rows without any prediction get ``AMPlen = 0`` and
        ``Position = predAMP = '-'``.  ``smORF_pred`` is 1 when the predicted
        AMP equals the full sequence, 0 when it does not, and -1 when there
        is no prediction.  Predictions whose ``ProID`` is absent from the
        database are dropped.
    """
    df = pd.merge(db_df[['ProID','seq_lab','smORF','kAMP','Sequence','token_lab']],
                  pred_df[['ProID','AMPlen','Position','AMP']],
                  on='ProID', how='outer')

    # Rows that exist only in the prediction table have no seq_lab; drop them.
    # .copy() gives an independent frame so the assignments below are not
    # writes into a slice of the merge result.
    df = df[df['seq_lab'].notna()].copy()

    # Assign the filled columns back explicitly: `df[col].fillna(..., inplace=True)`
    # operates on a temporary and is not guaranteed to update `df`.
    df['AMPlen'] = df['AMPlen'].fillna(0).astype('int')
    df['seq_lab'] = df['seq_lab'].astype('int')
    df['smORF'] = df['smORF'].astype('int')
    df['Position'] = df['Position'].fillna('-')
    df['AMP'] = df['AMP'].fillna('-')

    # 1/0 for "prediction covers the whole sequence", overridden by -1 where
    # there was no prediction at all.
    covers_whole = (df['AMP'] == df['Sequence']).astype('int')
    df['smORF_pred'] = covers_whole.where(df['AMP'] != '-', -1)

    df = df[['ProID','seq_lab','smORF','smORF_pred','AMPlen','Position','kAMP','AMP','Sequence']]
    df = df.rename(columns={'kAMP': 'knownAMP', 'AMP': 'predAMP'})

    return df.sort_values(by=['ProID','Position'])

In [222]:
# Sanity check: for every row of the smORF test table, load <ProId>.npy from
# the label directory and print the ids whose array length along axis 0
# differs from the sequence length.
lab_path = '/mnt/asustor/wenhui.li/02.AMP/dataset/labels/indenpendAPD/labels'
xx = pd.read_csv('/mnt/asustor/wenhui.li/02.AMP/dataset/labels/indenpendAPD/dataset_smORF/test.csv')
for pro_id, sequence in zip(xx['ProId'], xx['Sequence']):
    lab = np.load(os.path.join(lab_path, pro_id + '.npy'))
    if len(sequence) != lab.shape[0]:
        print(pro_id)

In [None]:
# Combine the APD database with the model predictions and persist the
# merged table as a tab-separated file (no index column).
apd_df = merge_db_and_pred(db_df=apd_db, pred_df=apd_pred)
apd_df.to_csv(
    '/mnt/asustor/wenhui.li/02.AMP/train/esm_token/APD_evaluation/APD_pred.tsv',
    sep='\t',
    index=False,
)

In [211]:
# Summary of the merged APD table; all counts are over distinct ProID values.
print('APD3 dataset:\n-----------------')
print('all known-AMPs = %d' % len(apd_df['ProID'].drop_duplicates()))

# Denominator: proteins flagged smORF == 1.
x1 = len(apd_df.loc[apd_df['smORF'] == 1, 'ProID'].drop_duplicates())
print('all smORF AMPs = %d' % x1)

# Numerator: rows where the predicted AMP equals both the full sequence
# and the known AMP.
apd_exact_hit = (apd_df['predAMP'] == apd_df['Sequence']) & (apd_df['predAMP'] == apd_df['knownAMP'])
y1 = len(apd_df.loc[apd_exact_hit, 'ProID'].drop_duplicates())
print('pred smORF AMPs = %d' % y1)

ratio = y1 / x1
print('smORF detect ratio = %.3f' % ratio)

APD3 dataset:
-----------------
all known-AMPs = 5939
all smORF AMPs = 254
pred smORF AMPs = 167
smORF detect ratio = 0.657


In [None]:
# with open('/mnt/asustor/wenhui.li/02.AMP/benchmarking/SAMP/SAMP/Independent_APD/smORF.lst', 'w') as outw:
#     for x in apd_db[apd_db.smORF==1].ProId.drop_duplicates().tolist():
#         outw.write('%s\n' % x)

In [254]:
# Recall on the APD smORF subset for each benchmarked tool.
#
# For every tool: read its prediction file, keep the rows it calls positive,
# and count how many distinct smORF ids are among them.  Recall is that count
# divided by the number of smORF ids in the database.
apd_smorf_lst = apd_db[apd_db.smORF==1].ProId.drop_duplicates().tolist()
print(100*len(apd_smorf_lst)/apd_db.shape[0])

# Denominator derived from the data rather than hard-coded, so it stays
# correct if the dataset or the length filter changes.
apd_n_smorf = len(apd_smorf_lst)

# Hit count for SAMP is hard-coded: no SAMP prediction file is read in this
# notebook.  NOTE(review): presumably obtained by running SAMP on the exported
# smORF id list outside this notebook - confirm and record provenance.
apd_samp_tp = 191

# (printed name, prediction file, read_csv kwargs, id column, positive-row selector)
# A file of None marks a tool whose hit count is supplied above instead of computed.
apd_tools = [
    ('AMP-BERT',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AMP-BERT/results/APD_pred.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['predLab'] == 1),
    ('amPEPpy',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/amPEPpy/APD.result.tsv',
     {'sep': '\t'}, 'seq_id', lambda t: t['predicted'] == 'AMP'),
    ('AmpGram',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AmpGram/independent_test_APD.dataset.AmpGram.pred.txt',
     {'sep': ' '}, 'seq_name', lambda t: t['probability'] > 0.5),
    ('Ampir',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/Ampir/ampir_predictions.APD.csv',
     {}, 'Name', lambda t: t['AMP Probability'] > 0.5),
    ('AMPlify-balanced',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AMPlify/balanced/APD.AMPlify_balanced.tsv',
     {'sep': '\t'}, 'Sequence_ID', lambda t: t['Prediction'] == 'AMP'),
    ('AMPlify-imbalanced',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AMPlify/imbalanced/APD.AMPlify_imbalanced.tsv',
     {'sep': '\t'}, 'Sequence_ID', lambda t: t['Prediction'] == 'AMP'),
    ('deepAMPNet',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/deepAMPNet/deepAMPNet/APD_dataset/APD.out.csv',
     {}, 'seq_name', lambda t: t['class'] == 1),
    ('iAMPCN',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/iAMPCN/independent_test_APD.iAMPCN.csv',
     {}, 'name', lambda t: t['AMP'] == 'Yes'),
    ('Macrel',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/Macrel/APD/macrel.out.prediction',
     {'sep': '\t'}, 'Access', lambda t: t['AMP_probability'] > 0.5),
    ('PGAT-ABPp',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/PGAT-ABPp/PGAT-ABPp/APD_predict/pred_out.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['Label'] == 1),
    ('SAMP', None, {}, None, None),
    # Ma et al. 2022
    ('Wangjun_2022',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/WangJun_2022/c_AMPs_results/APD_pos.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['Label'] == 1),
    # The two token-level models count as a hit only when the predicted AMP
    # is the entire input sequence.
    ('2-steps-ESM2',
     '/mnt/asustor/wenhui.li/02.AMP/train/2_step_train/APD_evaluation_smORF/out_pred.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['AMP'] == t['Sequence']),
    ('ESM2-token-LoRA',
     '/mnt/asustor/wenhui.li/02.AMP/train/esm_token_focalloss/APD_evaluation_smORF/out_prediction.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['AMP'] == t['Sequence']),
]

for tool, pred_file, read_kwargs, id_col, is_positive in apd_tools:
    if pred_file is None:
        tp = apd_samp_tp
    else:
        tmp = pd.read_csv(pred_file, **read_kwargs)
        tmp = tmp[is_positive(tmp)]
        tp = tmp[tmp[id_col].isin(apd_smorf_lst)][id_col].drop_duplicates().shape[0]
    # Guard against an empty smORF subset instead of raising ZeroDivisionError.
    ratio = tp/apd_n_smorf if apd_n_smorf else float('nan')
    print('-----------------\nTesting %s, n = %d' % (tool, tp))
    print('Recall of smORF = %.3f' % ratio)
    print('-----------------\n')

4.276814278498064
-----------------
Testing AMP-BERT, n = 235
Recall of smORF = 0.925
-----------------

-----------------
Testing amPEPpy, n = 194
Recall of smORF = 0.764
-----------------

-----------------
Testing AmpGram, n = 233
Recall of smORF = 0.917
-----------------

-----------------
Testing Ampir, n = 45
Recall of smORF = 0.177
-----------------

-----------------
Testing AMPlify-balanced, n = 226
Recall of smORF = 0.890
-----------------

-----------------
Testing AMPlify-imbalanced, n = 223
Recall of smORF = 0.878
-----------------

-----------------
Testing deepAMPNet, n = 248
Recall of smORF = 0.976
-----------------

-----------------
Testing iAMPCN, n = 232
Recall of smORF = 0.913
-----------------

-----------------
Testing Macrel, n = 175
Recall of smORF = 0.689
-----------------

-----------------
Testing PGAT-ABPp, n = 96
Recall of smORF = 0.378
-----------------

-----------------
Testing SAMP, n = 191
Recall of smORF = 0.752
-----------------

-----------------
T

In [None]:
# Combine the LAMP2 database with the model predictions and persist the
# merged table as a tab-separated file (no index column).
lamp_df = merge_db_and_pred(db_df=lamp_db, pred_df=lamp_pred)
lamp_df.to_csv(
    '/mnt/asustor/wenhui.li/02.AMP/train/esm_token/LAMP2_evaluation/LAMP2_pred.tsv',
    sep='\t',
    index=False,
)

In [255]:
# Summary of the merged LAMP2 table; all counts are over distinct ProID values.
print('LAMP2 dataset:\n-----------------')
print('all known-AMPs = %d' % len(lamp_df['ProID'].drop_duplicates()))

# Denominator: proteins flagged smORF == 1.
x1 = len(lamp_df.loc[lamp_df['smORF'] == 1, 'ProID'].drop_duplicates())
print('all smORF AMPs = %d' % x1)

# Numerator: rows where the predicted AMP equals both the full sequence
# and the known AMP.
lamp_exact_hit = (lamp_df['predAMP'] == lamp_df['Sequence']) & (lamp_df['predAMP'] == lamp_df['knownAMP'])
y1 = len(lamp_df.loc[lamp_exact_hit, 'ProID'].drop_duplicates())
print('pred smORF AMPs = %d' % y1)

ratio = y1 / x1
print('smORF detect ratio = %.3f' % ratio)

LAMP2 dataset:
-----------------
all known-AMPs = 5492
all smORF AMPs = 942
pred smORF AMPs = 197
smORF detect ratio = 0.209


In [206]:
# Export the distinct ProId values of the LAMP2 smORF entries, one id per line.
with open('/mnt/asustor/wenhui.li/02.AMP/benchmarking/SAMP/SAMP/Independent_LAMP2/smORF.lst', 'w') as outw:
    outw.writelines(
        '%s\n' % x
        for x in lamp_db.loc[lamp_db['smORF'] == 1, 'ProId'].drop_duplicates().tolist()
    )

In [256]:
# Recall on the LAMP2 smORF subset for each benchmarked tool.
#
# For every tool: read its prediction file, keep the rows it calls positive,
# and count how many distinct smORF ids are among them.  Recall is that count
# divided by the number of smORF ids in the database.
lamp_smorf_lst = lamp_db[lamp_db.smORF==1].ProId.drop_duplicates().tolist()
print(100*len(lamp_smorf_lst)/lamp_db.shape[0])

# Denominator derived from the data rather than hard-coded, so it stays
# correct if the dataset or the length filter changes.
lamp_n_smorf = len(lamp_smorf_lst)

# Hit count for SAMP is hard-coded: no SAMP prediction file is read in this
# notebook.  NOTE(review): presumably obtained by running SAMP on the exported
# smORF id list outside this notebook - confirm and record provenance.
lamp_samp_tp = 522

# (printed name, prediction file, read_csv kwargs, id column, positive-row selector)
# A file of None marks a tool whose hit count is supplied above instead of computed.
lamp_tools = [
    ('AMP-BERT',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AMP-BERT/results/LAMP2_pred.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['predLab'] == 1),
    ('amPEPpy',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/amPEPpy/LAMP2.result.tsv',
     {'sep': '\t'}, 'seq_id', lambda t: t['predicted'] == 'AMP'),
    ('AmpGram',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AmpGram/independent_test_LAMP2.dataset.AmpGram.pred.txt',
     {'sep': ' '}, 'seq_name', lambda t: t['probability'] > 0.5),
    ('Ampir',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/Ampir/ampir_predictions.LAMP2.csv',
     {}, 'Name', lambda t: t['AMP Probability'] > 0.5),
    ('AMPlify-balanced',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AMPlify/balanced/LAMP2.AMPlify_balanced.tsv',
     {'sep': '\t'}, 'Sequence_ID', lambda t: t['Prediction'] == 'AMP'),
    ('AMPlify-imbalanced',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/AMPlify/imbalanced/LAMP2.AMPlify_imbalanced.tsv',
     {'sep': '\t'}, 'Sequence_ID', lambda t: t['Prediction'] == 'AMP'),
    ('deepAMPNet',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/deepAMPNet/deepAMPNet/LAMP2_dataset/LAMP2.out.csv',
     {}, 'seq_name', lambda t: t['class'] == 1),
    ('iAMPCN',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/iAMPCN/independent_test_LAMP2.iAMPCN.csv',
     {}, 'name', lambda t: t['AMP'] == 'Yes'),
    ('Macrel',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/Macrel/LAMP2/macrel.out.prediction',
     {'sep': '\t'}, 'Access', lambda t: t['AMP_probability'] > 0.5),
    ('PGAT-ABPp',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/PGAT-ABPp/PGAT-ABPp/LAMP2_predict/pred_out.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['Label'] == 1),
    ('SAMP', None, {}, None, None),
    # Ma et al. 2022
    ('Wangjun_2022',
     '/mnt/asustor/wenhui.li/02.AMP/benchmarking/WangJun_2022/c_AMPs_results/LAMP2_pos.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['Label'] == 1),
    # The two token-level models count as a hit only when the predicted AMP
    # is the entire input sequence.
    ('2-steps-ESM2',
     '/mnt/asustor/wenhui.li/02.AMP/train/2_step_train/LAMP2_evaluation_smORF/out_pred.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['AMP'] == t['Sequence']),
    ('ESM2-token-LoRA',
     '/mnt/asustor/wenhui.li/02.AMP/train/esm_token_focalloss/LAMP2_evaluation_smORF/out_prediction.tsv',
     {'sep': '\t'}, 'ProID', lambda t: t['AMP'] == t['Sequence']),
]

for tool, pred_file, read_kwargs, id_col, is_positive in lamp_tools:
    if pred_file is None:
        tp = lamp_samp_tp
    else:
        tmp = pd.read_csv(pred_file, **read_kwargs)
        tmp = tmp[is_positive(tmp)]
        tp = tmp[tmp[id_col].isin(lamp_smorf_lst)][id_col].drop_duplicates().shape[0]
    # Guard against an empty smORF subset instead of raising ZeroDivisionError.
    ratio = tp/lamp_n_smorf if lamp_n_smorf else float('nan')
    print('-----------------\nTesting %s, n = %d' % (tool, tp))
    print('Recall of smORF = %.3f' % ratio)
    print('-----------------\n')

17.152221412964312
-----------------
Testing AMP-BERT, n = 849
Recall of smORF = 0.901
-----------------

-----------------
Testing amPEPpy, n = 365
Recall of smORF = 0.387
-----------------

-----------------
Testing AmpGram, n = 784
Recall of smORF = 0.832
-----------------

-----------------
Testing Ampir, n = 553
Recall of smORF = 0.587
-----------------

-----------------
Testing AMPlify-balanced, n = 359
Recall of smORF = 0.381
-----------------

-----------------
Testing AMPlify-imbalanced, n = 304
Recall of smORF = 0.323
-----------------

-----------------
Testing deepAMPNet, n = 917
Recall of smORF = 0.973
-----------------

-----------------
Testing iAMPCN, n = 690
Recall of smORF = 0.732
-----------------

-----------------
Testing Macrel, n = 195
Recall of smORF = 0.207
-----------------

-----------------
Testing PGAT-ABPp, n = 224
Recall of smORF = 0.238
-----------------

-----------------
Testing SAMP, n = 522
Recall of smORF = 0.554
-----------------

----------------