In [1]:
from tqdm import tqdm
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [2]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [39]:
from collections import Counter

AMINO_ACID_RESIDUES = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

def prot_vec_to_vecs(pv, x, k):
    return pv.to_vecs(x)

def split_n_grams(seq, n):
    """
    'AGAMQSASM' => [['AGA', 'MQS', 'ASM'], ['GAM','QSA'], ['AMQ', 'SAS']]
    In case of n = 3
    """
    grams = []
    for i in range(n):
        grams.append(zip(*[iter(seq[i:])] * n))

    str_ngrams = []
    for ngrams in grams:
        x = []
        for ngram in ngrams:
            x.append("".join(ngram))
        str_ngrams.append(x)
    return str_ngrams

def words_to_vec(pv, seq, n=5):
    ngram_patterns = split_n_grams(seq, n)

    vectors = []
    for ngrams in ngram_patterns:
        ngram_vecs = []
        for ngram in ngrams:
            try:
                ngram_vecs.append(pv[ngram])
            except:
                print(ngram)
                raise Exception("Model has never trained this n-gram: " + ngram)
        vectors.append(sum(ngram_vecs))
    return vectors

def dipeptide_encoding(seq, n):
    """
    Returns n-Gram Motif frequency
    https://www.biorxiv.org/content/10.1101/170407v1.full.pdf
    """
    aa_list = list(seq)
    return {''.join(aa_list): n for aa_list, n in Counter(zip(*[aa_list[i:] for i in range(n)])).items() if
            not aa_list[0][-1] == (',')}

def get_kmer_list(kmer):
    return ["".join(s) for s in product(AMINO_ACID_RESIDUES, repeat=kmer)]

def reduce_by_kmer_frequency(data, kmer=1):
    seq_vec = data.Sequence.apply(lambda x: dipeptide_encoding(x, kmer)).to_list()
    df = pd.DataFrame(seq_vec)
    df = df.fillna(0)
    missing_kmer = set(get_kmer_list(2)).difference(set(df.columns))
    
    return df.div(df.sum(axis=1), axis=0)

def convert_sequences_to_vectors(data, embedding, to_vec=prot_vec_to_vecs, kmer=5):
    output = pd.DataFrame()
    errors = list()
    for row in tqdm(data, desc="Creating vectors", unit="sequence"):
        try:
            output = output.append(pd.DataFrame(sum(to_vec(embedding, row, kmer))).T)
        except:
            output = output.append(pd.DataFrame(np.zeros((1, embedding.vector_size))))
            errors.append(row)
    return output, errors

OTHER_ALPHABETS = "UOXBZJ"

def convert_sequences_to_weighted_avg_vectors(data, embedding, kmer_frequency, to_vec=words_to_vec, kmer=3):
    output = pd.DataFrame()
    errors = list()
    for row in tqdm(data, desc="Creating vectors", unit="sequence"):
        try:
            ngrams = list(chain(*split_n_grams(df['Sequence'][0], 3)))
            
            
        except:
            output = output.append(pd.DataFrame(np.zeros((1, embedding.vector_size))))
            errors.append(row)
    return output, errors

def contains(other_alphabets, seq):
    for o in str(other_alphabets):
        if o in str(seq):
            return True
    return False

def trim_all(strings):
    return list(set(value.strip().strip(',').lower() for value in strings))

def sequence_filtering(data):
    sequences = data[data.apply(lambda r: not contains(OTHER_ALPHABETS, r['Sequence']), axis=1)]
    sequences = sequences[sequences.apply(lambda r: not str(r['Sequence']) == 'nan', axis=1)]
    sequences['Sequence'] = sequences['Sequence'].apply(lambda x: x.upper())
    sequences['Sequence'] = sequences['Sequence'].apply(lambda x: x.strip())
    return sequences

In [7]:
df['Sequence'][0]

'AAQRRGRVGRNPNQVGD'

In [49]:
m = set(get_kmer_list(2)).difference(set(reduce_by_kmer_frequency(df, 2).columns))

In [None]:
set.int

In [20]:
reduce_by_kmer_frequency(df, 2)

Unnamed: 0,AA,AQ,QR,RR,RG,GR,RV,VG,RN,NP,...,MD,WT,HM,MN,WH,YH,MH,HC,YP,HD
0,0.0625,0.0625,0.0625,0.0625,0.0625,0.125,0.0625,0.125000,0.062500,0.062500,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
1,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
2,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.142857,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
3,0.0000,0.0000,0.0000,0.0000,0.0000,0.100,0.1000,0.200000,0.100000,0.100000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
4,0.0625,0.0625,0.0625,0.0625,0.0625,0.125,0.0000,0.062500,0.062500,0.062500,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
708,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.047619
709,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
710,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.021739,0.000000,0.065217,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000


In [6]:
words_to_vec(uniprot_embedding, df['Sequence'][0], 3)



[array([ 0.17840016,  0.10194338, -0.47546378,  0.34923783,  0.19382586,
        -0.00841195, -0.1956464 , -0.02964587, -0.06821886, -0.19423723,
         0.42379928, -0.2701295 , -0.11932593, -0.06688377, -0.4859301 ,
        -0.09928209,  0.2533235 , -0.0704791 ,  0.05782356,  0.47145924,
        -0.38474178,  0.2661892 ,  0.10141155, -0.16269746, -0.0840092 ,
        -0.04632344, -0.23630382,  0.35703817,  0.02871538,  0.0016368 ,
        -0.19829777,  0.20658784,  0.00164619, -0.18399148, -0.0497794 ,
         0.17336111, -0.03721755, -0.23522782,  0.27826744,  0.43412933,
        -0.21858539,  0.08337232,  0.3285236 , -0.12546863,  0.01661152,
        -0.05671665,  0.4057494 ,  0.21513452, -0.24613959, -0.41372108,
         0.3279411 ,  0.02937788,  0.0502742 ,  0.08003722,  0.6157034 ,
         0.18165904,  0.28262824,  0.15321814, -0.13501489, -0.10417226,
        -0.20435144,  0.1163333 , -0.0956545 , -0.20064476,  0.4790177 ,
         0.04977179, -0.01374127, -0.14567481,  0.0

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [4]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

df = pd.concat([avp_ic50[['Sequence', 'MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

### Amino acid frequency

In [21]:
############# Amino acid frequency #############
aa_freq = reduce_by_alphabet_frequency(df).sort_index(axis=1)

In [22]:
aa_freq

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.117647,0.000000,0.058824,0.000000,0.000000,0.176471,0.000000,0.000000,0.000000,0.000000,0.000000,0.117647,0.058824,0.117647,0.235294,0.000000,0.000000,0.117647,0.000000,0.000000
1,0.083333,0.000000,0.000000,0.000000,0.000000,0.000000,0.083333,0.166667,0.000000,0.083333,0.166667,0.000000,0.000000,0.083333,0.250000,0.000000,0.083333,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.125000,0.000000,0.000000,0.125000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.125000,0.125000,0.125000,0.125000,0.000000,0.125000,0.000000,0.000000
3,0.000000,0.000000,0.090909,0.000000,0.000000,0.181818,0.000000,0.000000,0.000000,0.000000,0.000000,0.181818,0.090909,0.090909,0.181818,0.000000,0.000000,0.181818,0.000000,0.000000
4,0.117647,0.000000,0.058824,0.000000,0.000000,0.176471,0.000000,0.058824,0.000000,0.000000,0.000000,0.058824,0.058824,0.117647,0.235294,0.058824,0.000000,0.058824,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.066667,0.166667,0.000000,0.000000,0.066667,0.133333,0.033333,0.100000,0.066667,0.000000,0.000000,0.066667,0.066667,0.033333,0.100000,0.000000,0.033333,0.033333,0.033333,0.000000
708,0.000000,0.045455,0.045455,0.000000,0.045455,0.045455,0.045455,0.045455,0.045455,0.000000,0.000000,0.045455,0.090909,0.045455,0.136364,0.000000,0.136364,0.000000,0.090909,0.136364
709,0.027778,0.000000,0.027778,0.166667,0.027778,0.000000,0.027778,0.055556,0.055556,0.166667,0.000000,0.083333,0.000000,0.111111,0.000000,0.111111,0.027778,0.000000,0.083333,0.027778
710,0.063830,0.000000,0.106383,0.021277,0.085106,0.085106,0.042553,0.000000,0.042553,0.063830,0.000000,0.170213,0.148936,0.021277,0.000000,0.042553,0.021277,0.042553,0.042553,0.000000


In [23]:
X_train, X_test, y_train, y_test = train_test_split(aa_freq, df['MIC'], test_size=0.2)

In [24]:
X_train.shape, X_test.shape

((569, 20), (143, 20))

In [25]:
reg = RandomForestRegressor()

In [26]:
reg.fit(X_train, y_train)

RandomForestRegressor()

In [27]:
y_pred = reg.predict(X_test)

In [28]:
list(zip(y_test, y_pred))

[(0.085, 58.52571379999999),
 (187.0, 39.82565),
 (11.0, 66.27410833333333),
 (100.0, 54.07133000000005),
 (5.0, 63.839130238095265),
 (1.0, 12.31927171666666),
 (0.005, 1.4606103400000001),
 (4.0, 9.83449),
 (0.016, 11.039812399999995),
 (10.0, 6.9715000000000025),
 (7.17, 25.012905000000003),
 (0.011000000000000001, 0.1299199999999999),
 (0.006999999999999999, 0.5472639333333332),
 (100.0, 211.34680666666662),
 (0.003, 0.0060106666666666685),
 (5.0, 53.4983),
 (3.0, 56.425712999999995),
 (100.0, 61.878610000000045),
 (13.5, 7.185905833333331),
 (9.83, 7.162059999999999),
 (3.0, 22.537392033333337),
 (7.0, 8.982587933333331),
 (6.7, 40.64767150000001),
 (11.62, 16.220660000000006),
 (5.0, 4.644317067099565),
 (21.5, 31.212478700000016),
 (0.1, 12.387194999999995),
 (0.003, 0.07271870000000004),
 (8.0, 41.9032480952381),
 (34.135999999999996, 78.108696),
 (10.0, 13.555833700000003),
 (100.0, 65.071),
 (3.97, 11.725389999999999),
 (47.3, 15.150920300000003),
 (33.889, 77.59510299999997)

In [29]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

4228.588980979417

In [30]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

38014.50131797916

### Prot2Vec

In [5]:
############# Prot2Vec #############
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

vectors, _ = convert_sequences_to_vectors(df['Sequence'], uniprot_embedding, words_to_vec, kmer=3)
vectors = vectors.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 712/712 [00:00<00:00, 1149.17sequence/s]


In [32]:
vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.035865,0.056587,-0.912851,0.710750,0.600315,0.412987,0.214245,0.087536,-0.006778,-0.553902,...,0.234127,0.001601,0.024704,0.530668,-0.046863,-0.315385,-0.518008,0.482755,-0.339732,0.713406
1,-0.792693,-0.036186,-0.534990,-0.369818,-0.248938,0.263291,-0.287422,0.382745,-0.394377,-0.173330,...,0.962917,-0.311649,0.264908,0.946307,-0.269523,-0.548274,0.092901,-0.110641,0.260107,0.217777
2,-0.026501,-0.291550,-0.201532,0.452551,0.174593,0.538034,0.044253,-0.211522,0.271494,-0.058814,...,0.417982,-0.270066,-0.058481,-0.181214,0.178602,0.231382,-0.294070,-0.129272,-0.005186,0.004585
3,-0.143584,-0.150013,-0.288958,0.442270,0.260510,0.756453,0.238232,0.169671,0.388300,0.292154,...,0.072858,-0.408436,0.111759,-0.007251,-0.259025,-0.079575,-0.197588,0.014973,-0.254488,0.185948
4,0.033022,-0.003353,-0.648364,0.625699,0.353737,0.439275,-0.089358,-0.031649,-0.188973,-0.858152,...,0.604532,-0.018100,0.191170,0.319797,0.211116,0.092421,-0.444209,-0.073724,0.109172,0.685706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,-1.073433,-0.501622,-0.422303,1.193956,1.306920,1.063010,-1.103380,1.403239,0.657871,1.634630,...,0.178000,0.022571,0.205378,-0.446653,1.526093,1.593848,-1.880108,0.481448,1.041668,-1.731232
708,-0.361206,0.054028,-0.058891,0.335209,0.404084,0.369986,0.566835,0.704883,-0.241717,0.570128,...,0.412215,0.215851,0.387135,0.530442,0.762547,0.872716,0.294733,0.212151,1.096743,-0.768404
709,-0.143712,1.136464,-0.832907,0.998365,-2.254707,0.462950,1.306185,2.521483,1.922461,-0.804041,...,1.742563,-1.963696,0.209050,0.058067,0.555618,1.645272,-0.723226,-0.605143,2.387918,-1.566553
710,0.156833,-0.221012,1.236367,0.925153,0.096579,3.274387,2.485103,0.659247,0.876062,-0.048977,...,1.047525,-0.615316,-0.329523,-0.631238,-1.097751,2.060387,-0.853296,0.338695,2.203732,-2.769299


In [33]:
X_train, X_test, y_train, y_test = train_test_split(vectors, df['MIC'], test_size=0.2)

In [34]:
from sklearn.metrics import make_scorer

In [35]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=make_scorer(mean_absolute_percentage_error))

In [36]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.4min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error), verbose=2)

In [108]:
reg.fit(X_train, y_train)

RandomForestRegressor()

In [37]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 110,
 'max_features': 2,
 'min_samples_leaf': 5,
 'min_samples_split': 8,
 'n_estimators': 100}

In [38]:
best_grid = grid_search.best_estimator_

In [42]:
y_pred = best_grid.predict(X_test)

In [43]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

4028.083615203235

In [44]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

38.45556642992497

In [45]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

65797.7620105145

In [46]:
pwd

'/Users/in-divye.singh/Documents/Projects/MIC_predictor/notebooks'