In [1]:
cd ../../src

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [2]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [3]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both inputs are converted to NumPy arrays first, so the element-wise
    arithmetic is positional (any pandas index on the inputs is ignored).

    Parameters
    ----------
    y_true : array-like
        Observed target values. Each absolute error is divided by the
        corresponding entry, so entries equal to zero produce inf/nan.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

# Wrap MAPE as a scikit-learn scorer; greater_is_better=False declares it a loss
# (lower is better) for model-selection tools such as GridSearchCV.
# NOTE(review): mape_scorer is not used in any cell visible in this notebook.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [4]:
# Load the two raw CSV sources that are combined into the training table `df` below.
# Paths are relative to the src/ working directory.
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

In [5]:
# Stack both sources row-wise, keep only the first row seen for each Sequence, and
# renumber the rows from 0.
# NOTE(review): only avp_ic50 is restricted to ['Sequence','MIC']; ha_avp is concatenated
# with all of its columns — confirm it contains exactly those two columns.
df = pd.concat([avp_ic50[['Sequence','MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
# sequence_filtering comes from the star-import of `utils`; its filtering rules are not
# visible in this notebook.
df = sequence_filtering(df)

In [6]:
# Display the filtered training table (columns Sequence and MIC) for a quick visual check.
df

Unnamed: 0,Sequence,MIC
0,AAQRRGRVGRNPNQVGD,442.00000
1,HRILARIRQMMT,435.50000
2,RNPSQVGD,383.00000
3,RVGRNPNQVGD,374.00000
4,AAQRRGRIGRNPSQVGD,358.00000
...,...,...
707,NGAICWGPCPTAFRQIGNCGHFKVRCCKIR,0.11515
708,CFPYITRPGTYHDWWYTRKNRQ,0.30000
709,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,0.01131
710,GTNLSVPNPLGFFPDHQLDPAFGANSNNPDWDFNPNKDHWPEANKVG,0.00008


In [7]:
def get_physicochemical_properties(df):
    """Compute per-sequence physicochemical descriptors with Biopython's ProteinAnalysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``Sequence`` column holding the sequence strings.

    Returns
    -------
    pandas.DataFrame
        One row per input sequence, in input order, with a fresh 0..n-1 index
        and the columns ``aromaticity``, ``helix``, ``turn``, ``sheet``,
        ``gravy`` and ``net_charge_at_pH7point4``.
    """
    column_names = ['aromaticity', 'helix', 'turn', 'sheet', 'gravy', 'net_charge_at_pH7point4']

    def describe(sequence):
        # One ProteinAnalysis object per sequence; every descriptor is read from it.
        analysis = ProteinAnalysis(sequence)
        aromatic_fraction = analysis.aromaticity()
        # Entries 0, 1, 2 of the returned fractions are stored as helix, turn, sheet.
        fractions = analysis.secondary_structure_fraction()
        hydropathy = analysis.gravy()  # hydrophobicity related
        charge = analysis.charge_at_pH(7.4)
        return [aromatic_fraction, fractions[0], fractions[1], fractions[2], hydropathy, charge]

    rows = [describe(sequence) for sequence in df.Sequence]
    return pd.DataFrame(rows, columns=column_names)

In [12]:
# Feature block 1. reduce_by_kmer_frequency is a `utils` helper whose implementation is not
# shown here; presumably it returns per-sequence k-mer frequency features — TODO confirm.
# NOTE(review): this cell's execution count (In [12]) precedes the In [9] cell below, so the
# notebook was run out of order; re-verify with Restart Kernel -> Run All.
aa_freq = reduce_by_kmer_frequency(df)

In [9]:
# Feature block 2. Load the pre-trained ProtVec embedding file (file name indicates 3-mers,
# context window 10, 100-dimensional vectors).
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

# convert_sequences_to_avg_vectors is a `utils` helper; presumably it yields one averaged
# embedding vector per sequence — TODO confirm.
avg_protvec = convert_sequences_to_avg_vectors(df['Sequence'], uniprot_embedding, kmer=3)
# Discard the helper's index so rows line up with the other feature frames when they are
# concatenated column-wise (concat with axis=1 aligns on index).
avg_protvec = avg_protvec.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 712/712 [00:01<00:00, 381.20sequence/s]


In [14]:
# Feature block 3: ProteinAnalysis-based descriptors for every training sequence.
physicochemical_prop = get_physicochemical_properties(df)

In [16]:
# Assemble the feature matrix column-wise. Only helix/turn/sheet are taken from the
# physicochemical table; aromaticity, gravy and net charge are computed but not used here.
# concat(axis=1) aligns on index, so all three frames must share the same 0..n-1 index.
X = pd.concat([aa_freq, avg_protvec, physicochemical_prop[['helix','turn','sheet']]], axis=1)

In [18]:
# Regression target: the MIC column of the filtered training table.
y = df['MIC']

In [49]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
# 1-nearest-neighbour regressor with Euclidean distance: each prediction is the target value
# of the single closest training row.
knn = KNeighborsRegressor(metric='euclidean', n_neighbors=1, weights='uniform')

In [51]:
# Fit on the training split; the result is bound to `_` so the estimator repr is not echoed
# as cell output.
_ = knn.fit(X_train, y_train)

In [52]:
# Predictions for the 20% hold-out split.
y_pred = knn.predict(X_test)

In [53]:
# MAPE (in percent) on the hold-out split.
# NOTE(review): targets in this data go down to ~1e-4, and MAPE divides by the true value,
# so a few rows with tiny targets likely dominate this number — interpret with care.
mean_absolute_percentage_error(y_test, y_pred)

5067.804799150274

In [54]:
from scipy.stats import pearsonr

In [55]:
# Pearson correlation between true and predicted hold-out values; the displayed pair is
# (correlation coefficient, p-value).
pearsonr(y_test, y_pred)

(0.661191812130947, 2.53367270036445e-19)

In [60]:
# Per-row absolute percentage error on the hold-out split (the un-averaged terms of MAPE).
# y_test is a Series and y_pred an ndarray, so the arithmetic pairs values by position.
ape = 100*np.abs(y_test-y_pred)/y_test

In [61]:
# List (true value, prediction, absolute % error) triples for row-by-row inspection.
list(zip(y_test, y_pred, ape))

[(0.79, 1.7, 115.1898734177215),
 (3.52, 11.6, 229.54545454545453),
 (23.5, 14.79, 37.06382978723405),
 (13.0, 0.02, 99.84615384615384),
 (2.0, 1.0, 50.0),
 (2.02, 24.0, 1088.118811881188),
 (20.0, 15.0, 25.0),
 (21.5, 23.8, 10.697674418604654),
 (100.0, 14.79, 85.21),
 (2.0, 2.0, 0.0),
 (160.0, 25.0, 84.375),
 (54.4, 45.5, 16.360294117647058),
 (100.0, 100.0, 0.0),
 (0.006999999999999999, 0.011309999999999999, 61.57142857142857),
 (0.597, 0.067, 88.77721943048577),
 (0.026000000000000002, 0.021, 19.230769230769234),
 (118.0, 333.0, 182.20338983050848),
 (3.38, 7.82, 131.36094674556216),
 (8.0, 6.0, 25.0),
 (6.8, 1.7, 75.0),
 (1.34, 0.66, 50.74626865671642),
 (24.6, 13.0, 47.154471544715456),
 (0.255, 25.91, 10060.78431372549),
 (0.62, 0.005, 99.19354838709677),
 (100.0, 0.66, 99.34),
 (0.001, 0.016, 1500.0),
 (117.0, 200.0, 70.94017094017094),
 (0.75, 3.0, 300.0),
 (10.0, 0.67, 93.3),
 (5.47, 7.82, 42.96160877513712),
 (200.0, 33.0, 83.5),
 (41.1, 16.0, 61.070559610705594),
 (10.5, 29

In [62]:
# Load the separate AVP-IC50Pred test file for an external evaluation of the fitted model.
test_data = pd.read_csv("../data/raw/AVP-IC50Pred_test.csv")

In [63]:
# Display the external test table; its target column is 'IC50_(microM)', not 'MIC'.
# NOTE(review): the displayed preview contains a sequence (YTSLIHSL...WNWF) that also appears
# in the training table preview with a different target value — check for train/test overlap.
test_data

Unnamed: 0,AVPdb/HIPdb_ID,Sequence,Reference,IC50_(microM)
0,>HIP163,ACWWAGAKQEF,16854053,333.000
1,>HIP174,ASWWAGIKQEF,16854053,294.000
2,>AVP1481,RDVSDFTDSVRDPKTSEILD,US7491489,258.344
3,>HIP1136,QETAYFLLKLAGR,12643937,150.000
4,>HIP142,AEAMAQVTN,15113844,124.000
...,...,...,...,...
71,>HIP983,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,19949052,7.570
72,>AVP1000,MDVNPYFLFLKVPAQ,19841738,0.008
73,>HIP843,TSLIHSLIEESQNQQEKNEQELLELDKWASLWNWFN,US6861059,0.006
74,>AVP0977,MDVNPTLLFLKVPAQNAISTTFPYT,19841738,0.002


In [74]:
# Same ProtVec feature step as for the training data, applied to the external test sequences.
protvec = convert_sequences_to_avg_vectors(test_data['Sequence'], uniprot_embedding, kmer=3)
# Reset the index so rows align with the other test feature frames on column-wise concat.
protvec = protvec.reset_index(drop=True)

Creating vectors: 100%|██████████| 76/76 [00:00<00:00, 812.87sequence/s]


In [66]:
# Same `utils` k-mer frequency helper as used for training, applied to the external test table.
# NOTE(review): the helper's output columns for this table must match the training feature
# columns for knn.predict to be meaningful — not verifiable from this notebook; confirm.
aa_freq_test = reduce_by_kmer_frequency(test_data)

In [67]:
# ProteinAnalysis-based descriptors for the external test sequences (reads test_data.Sequence).
physicochemical_prop_test = get_physicochemical_properties(test_data)

In [75]:
# Feature matrix for the external test set, built with the same three blocks and column
# selection as the training matrix X.
# NOTE(review): this rebinds X_test, which previously held the hold-out split from
# train_test_split; re-running the earlier hold-out evaluation cells after this point would
# silently use the external data instead.
X_test = pd.concat([aa_freq_test, protvec, physicochemical_prop_test[['helix','turn','sheet']]], axis=1)

In [76]:
# Predict with the kNN model fitted on the training split; X_test here is whatever the
# preceding cell last assigned to that name.
y_pred_test = knn.predict(X_test)

In [78]:
# Ground-truth values for the external test set.
# NOTE(review): this rebinds y_test (previously the hold-out targets), so earlier cells that
# pair y_test with y_pred are no longer re-runnable without repeating the split.
# NOTE(review): the column is named IC50 in microM while the model was trained on 'MIC' —
# confirm both targets are on the same scale/units.
y_test = test_data['IC50_(microM)']

In [79]:
# MAPE (in percent) on the external test set.
mean_absolute_percentage_error(y_test, y_pred_test)

1710.4256898338665

In [54]:
from scipy.stats import pearsonr

In [80]:
# Pearson correlation (coefficient, p-value) between true and predicted values on the
# external test set.
pearsonr(y_test, y_pred_test)

(0.43828856623655504, 7.498261073666752e-05)