In [43]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from fuzzywuzzy import fuzz
import Levenshtein
import random

In [41]:
import keras
from keras.layers import Dense, Dropout, LSTM, Conv2D, Conv1D, MaxPooling1D, MaxPooling2D, Flatten, ZeroPadding1D
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('Hemolysis/Cleaned_hemolytic_data.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df.columns = [c.lower() for c in df.columns]
df.sequence = df.sequence.str.upper()

In [16]:
def knn_toxicities(peptide_row, df, k, min_uncertainty):
    sequence = peptide_row.sequence
    distances = df[df.uncertainty < min_uncertainty][df.sequence != sequence].sequence.apply(lambda x: Levenshtein.distance(x, sequence))
    bestk = df.loc[distances.sort_values().iloc[:k]]
    avg_toxicity = np.mean(bestk.log10_hc50)
    return avg_toxicity


In [17]:
for i, row in df.iterrows():
    bestk = knn_toxicities(row, df, 5, 0.1)
    df.at[i, 'bestk'] = bestk

  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
df[['log10_hc50', 'bestk']].corr()

Unnamed: 0,log10_hc50,bestk
log10_hc50,1.0,0.03919
bestk,0.03919,1.0


In [21]:
df[df.uncertainty < 0.1][['log10_hc50', 'bestk']].corr()

Unnamed: 0,log10_hc50,bestk
log10_hc50,1.0,0.035214
bestk,0.035214,1.0


In [22]:
df[df.uncertainty > 0.1][['log10_hc50', 'bestk']].corr()

Unnamed: 0,log10_hc50,bestk
log10_hc50,1.0,0.062517
bestk,0.062517,1.0


# Using CNN Architecture from MIC Prediction

In [30]:
CHARACTER_DICT = set([character for sequence in df.sequence for character in sequence])
MAX_SEQUENCE_LENGTH = int(df.sequence.str.len().describe(percentiles=[0.95])['95%'])

# Each amino acid its own group
character_to_index = {
    (character): i
    for i, character in enumerate(CHARACTER_DICT)
}

# Group them together heavily
"""character_to_index = {
    ('R', 'K', 'H'): 0,
    ('D', 'E'): 1,
    ('S', 'T', 'N', 'Q', 'C'): 2,
    ('A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W', 'P', 'G'): 3,
}

# Group them together more sparingly
character_to_index = {
    ('R'): 0,
    ('H'): 1,
    ('K'): 2,
    ('D', 'E'): 3,
    ('S', 'T', 'N', 'Q', 'C'): 4,
    ('G', 'P'): 5,
    ('A', 'V', 'I', 'L', 'M'): 6,
    ('F', 'Y', 'W'): 7,
}"""

index2character = {
    value: key
    for key, value in character_to_index.items()
}

def sequence_to_vector(sequence):
    default = np.zeros([MAX_SEQUENCE_LENGTH, len(character_to_index)])
    for i, character in enumerate(sequence[:MAX_SEQUENCE_LENGTH]):
        default[i][character_to_index[character]] = 1
    return default

def row_to_vector(row, shuffle_sequence=False):
    sequence = list(row['sequence'])
    if shuffle_sequence:
        random.shuffle(sequence)
    return sequence_to_vector(sequence)

In [35]:
vectors = []
SHUFFLE_SEQUENCE = False
for row in df.iterrows():
    vectors.append(row_to_vector(row[1], shuffle_sequence=SHUFFLE_SEQUENCE))

vectors = np.array(vectors)

labels = np.array(df.log10_hc50)

In [39]:
def generate_train_test_splits(
        vectors, labels,
        extra_training_vectors=[], extra_training_labels=[], extra_sample_weights=[],
        cutoff=0.85
):
    cutoff = int(cutoff * len(labels))
    idx = range(len(vectors))
    random.shuffle(idx)
    reordered_vectors = vectors[idx]
    reordered_labels = labels[idx]
    reordered_sample_weights = sample_weights[idx]
    if len(extra_training_vectors) > 0:
        train_x = np.concatenate((reordered_vectors[:cutoff], extra_training_vectors))
        train_y = np.concatenate((reordered_labels[:cutoff], extra_training_labels))
        train_sample_weights = np.concatenate((reordered_sample_weights[:cutoff], pa_sample_weights))
    else:
        train_x = reordered_vectors[:cutoff]
        train_y = reordered_labels[:cutoff]
        train_sample_weights = reordered_sample_weights[:cutoff]
    test_x = reordered_vectors[cutoff:]
    test_y = reordered_labels[cutoff:]
    return train_x, train_y, test_x, test_y, train_sample_weights

In [45]:
# Convolutional NN
def conv_model():
    model = keras.models.Sequential()
    model.add(ZeroPadding1D(
        5, input_shape = (MAX_SEQUENCE_LENGTH, len(character_to_index))
    ))
    model.add(Conv1D(
        64,
        kernel_size = 5,
        strides = 1,
        activation = 'relu',
        #input_shape = (MAX_SEQUENCE_LENGTH, len(character_to_index) + 1)
    ))
    model.add(MaxPooling1D(pool_size=2, strides=2))
    #model.add(Dropout(0.5))
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    #model.add(Dense(100, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
convmodel = conv_model()
train_x, train_y, test_x, test_y, _ = generate_train_test_splits(vectors, labels)
convmodel.fit(train_x, train_y, batch_size=40, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100

In [86]:
print("CNN test error, MSE of log10_hc50")
print(convmodel.evaluate(test_x, test_y))

CNN test error, MSE of log10_hc50
0.4875335242725225


In [78]:
cnn_preds = convmodel.predict(test_x)
cnn_preds = cnn_preds.reshape((cnn_preds.shape[0]))
pd.DataFrame({'cnn_preds': cnn_preds, 'test_y': test_y}).corr()

Unnamed: 0,cnn_preds,test_y
cnn_preds,1.0,0.548227
test_y,0.548227,1.0
