In [1]:
import pandas as pd

from sklearn.metrics import roc_auc_score, accuracy_score

In [2]:
# The PCPS prediction server output is split across 29 files because of
# server capacity; the files are named results00.txt ... results28.txt.
files = [f'{shard:02d}' for shard in range(29)]

base_path = '../../params/c_term/pcps/results'

# Read each shard without a header row (columns come back as 0, 1, 2, ...)
# and stack them in file order.
dfs = [pd.read_csv(f'{base_path}{file}.txt', header=None) for file in files]

df = pd.concat(dfs)

In [3]:
# Filter and preprocess the raw PCPS output.
#
# The cell rebuilds its input from the per-file frames in `dfs` instead of
# mutating `df` in place, so it can be re-run without restarting the kernel.
# All column updates are assigned back explicitly: calling an in-place method
# on a column selection (e.g. df["col"].replace(..., inplace=True)) is chained
# assignment and does not update the parent frame under pandas Copy-on-Write.
raw = pd.concat(dfs, ignore_index=True).rename(
    columns={0: "aa", 1: "cleavage", 2: "score"}
)

# Drop the repeated column-header rows before any whitespace normalisation
# (same order of operations as before).
raw = raw.loc[raw["aa"] != "Aminoacid"]

# Strip surrounding whitespace and remove inner spaces from every cell.
# Every column is expected to hold strings; a non-string column raises here.
cleaned = raw.apply(
    lambda col: col.str.strip().str.replace(" ", "", regex=False)
)

# Rows whose first field starts with ">" carry a sequence id: remove the marker.
is_header = cleaned["aa"].str.startswith(">")
cleaned.loc[is_header, "aa"] = cleaned.loc[is_header, "aa"].str.replace(
    ">", "", regex=False
)

# A purely numeric first field is a sequence id. Propagate it down to the
# rows that follow and use it as the index. If the data does not start with
# an id row, the NaN left at the top makes astype(int) raise.
is_id = cleaned["aa"].str.isdigit()
seq_idx = pd.to_numeric(cleaned["aa"].where(is_id)).ffill()
cleaned.index = seq_idx.astype(int).rename("idx")

# Discard the rows flagged "10BASES" and keep only the three data columns.
df = cleaned.loc[
    cleaned["cleavage"] != "10BASES", ["aa", "cleavage", "score"]
].copy()

# Encode the YES/NO call as 1/0; infer_objects() restores an integer dtype
# on pandas versions where replace() leaves the column as object.
df["cleavage"] = df["cleavage"].replace({"YES": 1, "NO": 0}).infer_objects()

# A stable sort keeps the original row order inside each sequence, which the
# positional selection in the next cell depends on (the default quicksort
# gives no such guarantee for duplicate index values).
df = df.sort_index(kind="stable")

In [4]:
# The cleavage site is located after the 6th amino acid, so for every
# sequence (rows sharing one index value) keep the row at 0-based position 5.
per_sequence = df.groupby(df.index)
df_preds = per_sequence.nth(5)[['cleavage', 'score']]
df_preds.head()

Unnamed: 0_level_0,cleavage,score
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0.2038
1,1,0.6535
2,0,0.012
3,1,0.7213
4,0,0.1655


In [5]:
# Load the test set; its `elution_cleavage` column serves as the reference
# labels in the metric cells below.
c_test_path = '../../data/c_test.csv'
c_test = pd.read_csv(c_test_path)

In [6]:
# Accuracy of the binary PCPS cleavage calls against the test-set labels.
# NOTE(review): the two columns are presumably matched by row position, not
# by index label - confirm that the row order of c_test follows `idx`.
accuracy_score(
    y_true=c_test['elution_cleavage'],
    y_pred=df_preds['cleavage'],
)

0.610475519549137

In [7]:
# ROC AUC of the PCPS scores against the test-set labels.
# NOTE(review): `score` is never cast to a numeric dtype in this notebook, so
# this presumably relies on scikit-learn coercing the values - confirm.
roc_auc_score(
    y_true=c_test['elution_cleavage'],
    y_score=df_preds['score'],
)

0.5129134073297384