By Zeal Jinwala  <br> 
Date: June 16, 2022  <br> 
Data Source: The dataset (courtesy of Natalia Petrova) is a subset of the data used in "Prediction of catalytic residues using Support Vector Machine with selected protein sequence and structural properties", Natalia Petrova and Cathy Wu, 2006. 
http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-7-312
Please review that publication to learn more about this dataset and the catalytic residue prediction problem.


In [137]:
from scipy.io import arff
import pandas as pd
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from tensorflow.keras import models, layers, utils, backend as K
from sklearn.model_selection import train_test_split


### Load and rearrange catasite data

In [143]:
data = arff.loadarff('NataliaPetrova.catsite.arff')
df = pd.DataFrame(data[0])
df['AAName1LetterCode'] = df['AAName1LetterCode'].apply(lambda x: x.decode("utf-8"))
df['class'] = df['class'].apply(lambda x: x.decode("utf-8"))

aaOneHot = pd.DataFrame([ProteinAnalysis(i).count_amino_acids() for i in df['AAName1LetterCode']])
df1 = df[['nearest_cleft_SA_area','nearest_cleft_distance','distance_to_3_largest_clefts','HB_main_chain_protein','ScoreConsScore','class']]
result = pd.concat([aaOneHot, df1], axis=1, join="inner")

X = result.dropna()
T = X['class'].astype(float).values
X = X.iloc[:,:25].values
X_train,X_test,y_train,y_test = train_test_split(X,T,test_size = 0.1)

### Model Design

In [144]:
# Neural network model
    # 1. Define the neural network structure ( # of input units,  # of hidden units, etc). 
    # 2. Initialize the model's parameters
    # 3. Loop:
    #     - Implement forward propagation
    #     - Compute loss
    #     - Implement backward propagation to get the gradients
    #     - Update parameters (gradient descent)

# define the keras model
model = models.Sequential(name="Perceptron", layers=[
    layers.Dense(             #a fully connected layer
          name="dense",
          input_dim=25,        #with 25 features as the input
          units=1,            #and 1 node because we want 1 output
          activation='linear' #f(x)=x
    )
])
model.summary()

Model: "Perceptron"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 26        
                                                                 
Total params: 26
Trainable params: 26
Non-trainable params: 0
_________________________________________________________________


### Train and Test

In [146]:
# compile the neural network
model.compile(optimizer='adam', loss='binary_crossentropy', 
              metrics=['accuracy'])
training = model.fit(x=X_train, y=y_train, batch_size=32, epochs=100, shuffle=True, verbose=0)

In [147]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(X_test[:3])
print("predictions shape:", predictions.shape)

Evaluate on test data
test loss, test acc: [-2.7220497131347656, 0.0]
Generate predictions for 3 samples
predictions shape: (3, 1)
