In [1]:
import pandas as pd
import pepfeature
from string import ascii_lowercase as alc
from sklearn.model_selection import train_test_split

In [2]:
# Single random seed shared by every sampling / splitting step in this notebook.
seed = 10_902_128 + 8_403_014

In [3]:
# Columns of the raw CSV export that are removed immediately after loading
# (this list is passed to `df.drop(..., axis=1)` in this cell).
# The spellings (e.g. 'HEMOLITIC', 'UNIRPROT') must match the CSV headers exactly,
# since a name that is not a column makes `drop` raise a KeyError.
drop_list = ['NAME', 'SOURCE GENE - DB LINK', 'SOURCE GENE - NOTE', 'PDB NAME', 'PDB LINK', 'PDB FILE LINK', 'TARGET ACTIVITY - TARGET SPECIES',
       'TARGET ACTIVITY - ACTIVITY MEASURE GROUP',
       'TARGET ACTIVITY - ACTIVITY MEASURE VALUE',
       'TARGET ACTIVITY - CONCENTRATION', 'TARGET ACTIVITY - UNIT',
       'TARGET ACTIVITY - PH', 'TARGET ACTIVITY - IONIC STRENGTH',
       'TARGET ACTIVITY - SALT TYPE', 'TARGET ACTIVITY - MEDIUM',
       'TARGET ACTIVITY - CFU', 'TARGET ACTIVITY - CFU GROUP',
       'TARGET ACTIVITY - NOTE',
       'TARGET ACTIVITY - ACTIVITY (μg/ml) (Calculated By DBAASP)',
       'TARGET ACTIVITY - REFERENCE','INTERCHAIN BOND - NOTE','SYNERGY - ACTIVITY MEASURE GROUP', 'SYNERGY - UNIT',
       'SYNERGY - PEPTIDE ID', 'SYNERGY - ANTIBIOTIC ID',
       'SYNERGY - ANTIBIOTIC NAME', 'SYNERGY - PEPTIDE ACTIVITY',
       'SYNERGY - PEPTIDE ANTIBIOTIC ACTIVITY',
       'SYNERGY - ANTIBIOTIC ACTIVITY',
       'SYNERGY - ANTIBIOTIC PEPTIDE ACTIVITY', 'SYNERGY - FICI',
       'SYNERGY - REFERENCE', 'UNIRPROT - ID', 'UNIRPROT - DESCRIPTION',
       'UNIRPROT - PRO PEPTIDE', 'UNIRPROT - URL', 'ARTICLES - JOURNAL',
       'ARTICLES - YEAR', 'ARTICLES - VOLUME', 'ARTICLES - PAGES',
       'ARTICLES - TITLE', 'ARTICLES - ADDITIONAL', 'ARTICLES - PUBMED',
       'ARTICLES - AUTHORS', 'TARGET GROUP', 'TARGET OBJECT',
       'SOURCE GENE - KINGDOM', 'SOURCE GENE - SOURCE',
       'SOURCE GENE - SUBKINGDOM', 'SOURCE GENE - GENE',
       'SOURCE GENE - GENE IN SEQUENCE', 'INTERCHAIN BOND - CHAIN 1',
       'INTERCHAIN BOND - CHAIN 2', 'INTERCHAIN BOND - CHAIN 3',
       'INTERCHAIN BOND - CHAIN 4', 'INTERCHAIN BOND - BOND',
       'INTRACHAIN BOND - POSITION 1', 'INTRACHAIN BOND - POSITION 2',
       'INTRACHAIN BOND - BOND', 'INTRACHAIN BOND - NOTE',
       'UNUSUAL OR MODIFIED AMINO ACID - POSITION',
       'UNUSUAL OR MODIFIED AMINO ACID - MODIFICATION TYPE',
       'UNUSUAL OR MODIFIED AMINO ACID - BEFORE MODIFICATION',
       'UNUSUAL OR MODIFIED AMINO ACID - NOTE','HEMOLITIC CYTOTOXIC ACTIVITY - NOTE',
       'HEMOLITIC CYTOTOXIC ACTIVITY - REFERENCE', 'SYNERGY - TARGET SPECIE','HEMOLITIC CYTOTOXIC ACTIVITY - PH',
       'HEMOLITIC CYTOTOXIC ACTIVITY - IONIC STRENGTH',
       'HEMOLITIC CYTOTOXIC ACTIVITY - SALT TYPE', 'HEMOLITIC CYTOTOXIC ACTIVITY - ACTIVITY (μg/ml) (Calculated By DBAASP)',
       'SYNTHESIS TYPE',]

# Read the raw peptide export and discard every column named in `drop_list`
# in a single expression.
df = pd.read_csv('./peptides-complete.csv', index_col=False).drop(columns=drop_list)

  df = pd.read_csv('./peptides-complete.csv', index_col=False)


In [4]:
# Row selection: keep measurements on human erythrocytes for monomeric peptides,
# excluding rows whose lysis group is 'MHC'. The three conditions are combined
# into one mask, which selects the same rows as applying them one after another.
df = df[
    (df['HEMOLITIC CYTOTOXIC ACTIVITY - TARGET CELL'] == 'Human erythrocytes')
    & (df['COMPLEXITY'] == 'Monomer')
    & (df['HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS GROUP'] != 'MHC')
]
# Both the lysis value and the concentration must be present.
df = df.dropna(subset=['HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS VALUE', 'HEMOLITIC CYTOTOXIC ACTIVITY - CONCENTRATION'])

# Drop every sequence containing an ASCII lowercase letter using one regex pass
# instead of 26 separate filters. `na=True` marks missing sequences as matches so
# they are dropped too, exactly as the per-letter `== False` comparison did.
has_lowercase = df['SEQUENCE'].str.contains('[a-z]', na=True)
# Explicit copy so the column assignment below writes to an independent frame.
df = df[~has_lowercase].copy()

# Strip leading '<' / '>' qualifiers and keep only the text before the first '%'.
lysis_col = 'HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS VALUE'
df[lysis_col] = df[lysis_col].str.lstrip('<>').str.split('%').str[0]

import re
def float_ignore_plus_minus(mynumber):
    """Parse a free-text lysis value into a float.

    Rules, applied in order:
      * every ASCII letter is removed from the text;
      * if a '-' remains, the text is treated as a range and the part after
        the first '-' is returned;
      * otherwise surrounding '(', ')', '=' and '≤' characters are stripped and
        the '±'-separated parts are summed (so "10±2" gives 12.0).

    Args:
        mynumber: the raw cell value, normally a string. Non-string values are
            passed to ``float`` directly.

    Returns:
        The parsed float, or NaN when the value cannot be parsed.
    """
    if not isinstance(mynumber, str):
        # Numeric or missing cells: convert when possible instead of raising
        # inside `re.sub`.
        try:
            return float(mynumber)
        except (TypeError, ValueError):
            return float("NaN")

    # NOTE(review): because all letters are removed here, an 'l' typed in place
    # of '1' is dropped rather than repaired; the former `.replace('l', '1')`
    # could never match and has been removed. Confirm this is the intent.
    cleaned = re.sub(r'[A-Za-z]', '', mynumber)
    try:
        if '-' in cleaned:
            return float(cleaned.split('-')[1])
        return sum(map(float, cleaned.strip('()=≤').split("±")))
    except ValueError:
        # Only parse failures are turned into NaN; anything else propagates.
        return float("NaN")

# Convert the cleaned lysis strings to floats; unparseable entries become NaN.
lysis_col = 'HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS VALUE'
df[lysis_col] = df[lysis_col].apply(float_ignore_plus_minus)

# Remove spaces embedded in the sequences before features are computed on them.
df['SEQUENCE'] = df['SEQUENCE'].str.replace(" ", "")

# NOTE(review): the return value is not captured, yet later cells read a
# 'feat_molecular_weight' column. Presumably calc_df adds it to `df` in place;
# confirm against the pepfeature version in use.
pepfeature.aa_molecular_weight.calc_df(df, Ncores=1, aa_column='SEQUENCE')

# Discard rows where the target cell, lysis group, or parsed lysis value is missing.
required_cols = [
    'HEMOLITIC CYTOTOXIC ACTIVITY - TARGET CELL',
    'HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS GROUP',
    'HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS VALUE',
]
df = df.dropna(subset=required_cols)

In [5]:
# Inclusive bounds on the peptide length (number of characters in SEQUENCE).
min_len = 10
max_len = 50

# Record each sequence's length, then keep rows with min_len <= len <= max_len.
df['len'] = df['SEQUENCE'].str.len()
df = df[df['len'].between(min_len, max_len)]

In [6]:
def conc_str_process(concentration_str):
    """Parse a free-text concentration into a single float.

    The text is first normalised: an en dash and the sequence "->" both become
    "-", and a decimal comma becomes a decimal point. Then:

      * a leading "<", "<=", ">" or ">=" is dropped and the remainder parsed;
      * otherwise, if the text contains "-", it is treated as a range and the
        mean of its first two parts is returned;
      * otherwise the whole text is parsed.

    Each parsed part may itself be "a±b", which evaluates to a + b.

    Args:
        concentration_str: the raw cell value, normally a string. Non-string
            values are passed to ``float`` directly.

    Returns:
        The parsed float, or NaN when the value is empty or cannot be parsed.
    """
    nan = float("NaN")

    def float_ignore_plus_minus(mynumber):
        # Sum the '±'-separated parts; NaN when any part is not a number.
        try:
            return sum(map(float, mynumber.split("±")))
        except ValueError:
            return nan

    if not isinstance(concentration_str, str):
        # Numeric or missing cells would otherwise fail on `.replace`.
        try:
            return float(concentration_str)
        except (TypeError, ValueError):
            return nan

    text = (
        concentration_str
        .replace("–", "-")
        .replace("->", "-")
        .replace(",", ".")
    )
    if not text:
        # Nothing to index into; previously raised IndexError.
        return nan

    if text[0] in ("<", ">"):
        # Slicing (rather than indexing) keeps a lone "<" or ">" from raising.
        prefix_len = 2 if text[1:2] == "=" else 1
        return float_ignore_plus_minus(text[prefix_len:])

    if "-" in text:
        parts = text.split("-")
        return (float_ignore_plus_minus(parts[0]) + float_ignore_plus_minus(parts[1])) / 2

    return float_ignore_plus_minus(text)

# Parse every concentration string to a float and discard the rows that
# could not be parsed (they come back as NaN).
conc_col = 'HEMOLITIC CYTOTOXIC ACTIVITY - CONCENTRATION'
df[conc_col] = df[conc_col].apply(conc_str_process)
df = df.dropna(subset=[conc_col])

In [7]:
def calc_ugml_concentration(x, val, unit, mw, output):
    """Store a concentration under ``x[output]`` and return ``x``.

    When ``x[unit]`` is exactly 'µg/ml' the value ``x[val]`` is copied as is.
    For any other unit the value is converted as ``x[val] * x[mw] / 1000``.

    Args:
        x: a mapping-like row (e.g. a pandas Series); modified in place.
        val: key of the numeric concentration.
        unit: key of the unit string.
        mw: key of the molecular weight.
        output: key under which the result is written.

    Returns:
        The same ``x`` object, with ``x[output]`` set.
    """
    already_ugml = x[unit] == 'µg/ml'
    # NOTE(review): every unit other than 'µg/ml' takes the conversion branch;
    # presumably those rows are all micromolar. Verify against the data.
    x[output] = x[val] if already_ugml else x[val] * x[mw] / 1000
    return x

def calc_uM_concentration(x, val , unit, mw, output):
    """Store a concentration under ``x[output]`` and return ``x``.

    When ``x[unit]`` denotes micrograms per millilitre, the value is converted
    as ``x[val] / x[mw] * 1000``; for any other unit ``x[val]`` is copied as is.

    The unit is recognised in both the 'µg/ml' spelling (the one tested by
    ``calc_ugml_concentration``) and the plain-ASCII 'ug/ml' spelling that this
    function originally tested. Matching only 'ug/ml' meant rows labelled
    'µg/ml' were never converted.

    Args:
        x: a mapping-like row (e.g. a pandas Series); modified in place.
        val: key of the numeric concentration.
        unit: key of the unit string.
        mw: key of the molecular weight.
        output: key under which the result is written.

    Returns:
        The same ``x`` object, with ``x[output]`` set.
    """
    ugml_spellings = ('µg/ml', 'ug/ml')
    if x[unit] in ugml_spellings:
        x[output] = x[val] / x[mw] * 1000
    else:
        x[output] = x[val]
    return x

# Add a 'concentration' column by running calc_ugml_concentration on every row.
# The column keys are handed over through `args` rather than a wrapping lambda.
# (calc_uM_concentration takes the same arguments and could be substituted here.)
df = df.apply(
    calc_ugml_concentration,
    axis=1,
    args=(
        'HEMOLITIC CYTOTOXIC ACTIVITY - CONCENTRATION',
        'HEMOLITIC CYTOTOXIC ACTIVITY - UNIT',
        'feat_molecular_weight',
        'concentration',
    ),
)

In [8]:
# Columns that were only needed for filtering / unit conversion; remove them and
# take an independent copy of what remains.
intermediate_cols = [
    'COMPLEXITY',
    'HEMOLITIC CYTOTOXIC ACTIVITY - TARGET CELL',
    'HEMOLITIC CYTOTOXIC ACTIVITY - CONCENTRATION',
    'HEMOLITIC CYTOTOXIC ACTIVITY - UNIT',
    'feat_molecular_weight',
    'N TERMINUS',
    'C TERMINUS',
    'HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS GROUP',
]
df = df.drop(columns=intermediate_cols).copy()

In [9]:
# Give the two long columns short names, keep exactly four columns in a fixed
# order, and pin the dtypes of sequence / concentration / lysis.
short_names = {'SEQUENCE':'sequence', 'HEMOLITIC CYTOTOXIC ACTIVITY - LYSIS VALUE':'lysis'}
df = (
    df.rename(columns=short_names)
    .loc[:, ['sequence', 'concentration', 'lysis', 'len']]
    .astype({'sequence':'str', 'concentration':'float32', 'lysis':'float32'})
)

In [10]:
# Discard rows with an empty sequence, then group the remaining measurements
# by peptide sequence.
df = df.loc[df['sequence'].ne("")]
diff_prot = df.groupby('sequence', as_index=False)
# Number of distinct sequences (shown as the cell output).
diff_prot.ngroups

4640

In [11]:
# One row per sequence; every other column becomes the list of that
# sequence's values.
proteins = pd.DataFrame(diff_prot.agg(list))

In [12]:
def _pairs_by_conc_then_lysis_desc(row):
    """Return the row's (concentration, lysis) pairs as a sorted list.

    Pairs are ordered by ascending concentration; pairs sharing a
    concentration are ordered by descending lysis.
    """
    pairs = list(zip(row['concentration'], row['lysis']))
    pairs.sort(key=lambda pair: (pair[0], -pair[1]))
    return pairs


proteins['zipped_list'] = proteins.apply(_pairs_by_conc_then_lysis_desc, axis=1)

In [13]:
def remove_inverse_pairs(x: list):
    """Filter (concentration, lysis) pairs down to a non-decreasing lysis series.

    The first pair is always kept. Each later pair is kept only when its
    concentration is strictly greater than that of the last kept pair and its
    lysis is not lower than that pair's lysis. Pairs repeating the last kept
    concentration are therefore dropped.

    Args:
        x: list of (concentration, lysis) pairs, in the order to be scanned.

    Returns:
        A new list with the retained pairs; empty when ``x`` is empty.
    """
    if not x:
        # Previously `x[0]` raised IndexError on an empty list.
        return []

    ret = [x[0]]
    for pair in x[1:]:  # (conc, lysis)
        last_conc, last_lysis = ret[-1][0], ret[-1][1]
        # The strict '>' also rejects a repeated concentration, so no separate
        # equality check is needed.
        if pair[0] > last_conc and pair[1] >= last_lysis:
            ret.append(pair)
    return ret

def select_largest(x: list):
    """Return a one-element slice holding the last item of ``x``.

    The result is empty when ``x`` is empty. On a list sorted in ascending
    order the last item is the largest one.
    """
    last_only = slice(-1, None)
    return x[last_only]

def unzip(x: pd.Series):
    """Split ``x['zipped_list']`` back into separate columns.

    The list of (concentration, lysis) pairs is transposed into two tuples,
    which overwrite ``x['concentration']`` and ``x['lysis']``.

    Returns:
        The same ``x`` object, modified in place.
    """
    concentrations, lysis_values = zip(*x['zipped_list'])
    x['concentration'] = concentrations
    x['lysis'] = lysis_values
    return x

# Reduce each sequence's pair list to its final entry, write that entry back to
# the 'concentration' / 'lysis' columns, then remove the helper columns.
proteins['zipped_list'] = proteins['zipped_list'].apply(select_largest)
proteins = proteins.apply(unzip, axis=1)
proteins = proteins.drop(columns='zipped_list')
proteins = proteins.drop(columns='len')

In [17]:
# Expand the per-sequence 'concentration' / 'lysis' tuples into scalar rows and
# restore the column dtypes.
column_dtypes = {'sequence':'str', 'concentration':'float32', 'lysis':'float32'}
proteins = (
    proteins
    .explode(column=['concentration', 'lysis'])
    .astype(column_dtypes)
)
display(proteins)

Unnamed: 0,sequence,concentration,lysis,label
3073,KRRLALFRLFRLALKLVLKK,0.002221,25.0,0.0
415,FALALKALKKALKKLKKALKKAL,0.001110,50.0,0.0
4399,VLIKTRLFIKRK,0.000150,12.5,0.0
3138,KWCFRVCYRGICYRRCA,0.000003,10.0,0.0
4549,WMLKKFRGMF,0.001672,2.0,0.0
...,...,...,...,...
3585,LSVDKRPVLHPEHIYGHNHY,0.001529,2.0,0.0
4458,WAGSAAIGAALPSVVGAFQKKKKKK,0.003305,1.0,0.0
463,FFHHIFRGIVHVGKTVHRLVTG,0.001304,50.0,0.0
3694,PNDPDSPCVYRMPNARGCSI,0.001407,0.0,0.0


In [15]:
# Restrict concentration to the interval (0, max_concentration] and rescale it
# to (0, 1]. The bound and the divisor are one constant so they cannot drift
# apart.
max_concentration = 300
in_range = (proteins['concentration'] > 0) & (proteins['concentration'] <= max_concentration)
# Explicit copy so the column assignment below writes to an independent frame.
proteins = proteins[in_range].copy()
proteins['concentration'] = (proteins['concentration'] / max_concentration).astype('float32')

# Split off the test set for performance evaluation of final models. Note that
# `proteins` is rebound to the remaining 90 % of the rows.
proteins, test = train_test_split(proteins, test_size=0.1, random_state=seed)
test.to_parquet('test.parquet')

In [16]:
# For each lysis threshold (10, 20, ..., 90): label the peptides, balance the two
# classes by down-sampling the larger one, split 90/10 into train / validation
# (stratified on the label), and write both parts to parquet.
for threshold in range(10, 100, 10):
    # label = 1.0 when lysis is strictly above the threshold, else 0.0
    proteins['label'] = (proteins['lysis'] > threshold).astype('float32')
    pos = proteins[proteins['label'] == 1]
    neg = proteins[proteins['label'] == 0]
    # Counting each class directly keeps neg/pos correct even when one class is
    # absent; indexing `groupby(...).size().values[1]` raised IndexError then.
    print(f'threshold={threshold}\t\toriginal distribution:\tneg:{len(neg)}, pos:{len(pos)}')
    if pos.empty or neg.empty:
        # Nothing to balance or stratify; skip instead of failing in the split.
        print('\t\tskipped: only one class present at this threshold')
        print('')
        continue

    # Down-sample the majority class to the size of the minority class.
    if len(pos) > len(neg):
        pos = pos.sample(n=len(neg), random_state=seed)
    else:
        neg = neg.sample(n=len(pos), random_state=seed)
    labeled_proteins = pd.concat([pos,neg],axis=0)
    train, valid = train_test_split(labeled_proteins, test_size=0.1, random_state=seed, stratify=labeled_proteins['label'])

    for split_name, split in (('train', train), ('valid', valid)):
        n_neg = int((split['label'] == 0).sum())
        n_pos = int((split['label'] == 1).sum())
        print(f'\t\t{split_name} set after down sampling:\tneg:{n_neg}, pos:{n_pos}')
    print('')
    train.to_parquet(f'train{threshold}.parquet')
    valid.to_parquet(f'valid{threshold}.parquet')


threshold=10		original distribution:	neg:1275, pos:1386
		train set after down sampling:	neg:1148, pos:1147
		valid set after down sampling:	neg:127, pos:128

threshold=20		original distribution:	neg:1416, pos:1245
		train set after down sampling:	neg:1121, pos:1120
		valid set after down sampling:	neg:124, pos:125

threshold=30		original distribution:	neg:1512, pos:1149
		train set after down sampling:	neg:1034, pos:1034
		valid set after down sampling:	neg:115, pos:115

threshold=40		original distribution:	neg:1564, pos:1097
		train set after down sampling:	neg:987, pos:987
		valid set after down sampling:	neg:110, pos:110

threshold=50		original distribution:	neg:2351, pos:310
		train set after down sampling:	neg:279, pos:279
		valid set after down sampling:	neg:31, pos:31

threshold=60		original distribution:	neg:2402, pos:259
		train set after down sampling:	neg:233, pos:233
		valid set after down sampling:	neg:26, pos:26

threshold=70		original distribution:	neg:2439, pos:222
		t