## Imports

In [None]:
%pip install Bio
from Bio.SeqUtils import IsoelectricPoint
from Bio.Seq import Seq
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import statistics
!apt-get update
!apt-get install emboss
import pandas as pd
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
from typing import Sequence
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout, Dense, ReLU, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import binary_crossentropy
from keras.activations import sigmoid
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from keras.losses import categorical_crossentropy
from keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from keras.losses import categorical_crossentropy
from keras.utils import to_categorical

## Importing Data

In [2]:
directory = '/content'
data = []

for file in os.listdir(directory):
    if file.endswith('.fa'):
        ix = 0
        for record in SeqIO.parse(os.path.join(directory, file), 'fasta'):
            sequence = str(record.seq)
            function = file.split('_')[0]
            index = f"{function}_{ix}"
            data.append({'Sequence': sequence, 'Class': function, 'Index': index})
            ix += 1

df = pd.DataFrame(data)
df.set_index('Index', inplace=True)


In [3]:
df

Unnamed: 0_level_0,Sequence,Class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
1_0,MAECMAARLSAQEEQIDLLSQEVSRLRDGLCGNPGALSALASTPEL...,1
1_1,MLCFRMGVSLLLSRSLSLCRSSLRTFGPHRRYNKASDALSERLRVF...,1
1_2,SIPATPPPHHRDSHLLLDAFLQSYNSFQFFTHLYSWGDLVRRLKEE...,1
1_3,MAMQAAEWVERLQRQEREIKFLTAEIDHLKNYGCLGASPTLEELRE...,1
1_4,ALGLFTGIGLSEAKARETLRNGALSSLLRRAVVQARSALGPALDKA...,1
...,...,...
4_995,DKILIANRGEIACRVIKTCKKMGIKTVAVHSDVDSSAVHVKMADEA...,4
4_996,QTFEKLLIANRGEIACRVIKTCKKMGIKTVAIHSDVDANAVHVKMA...,4
4_997,MVFGHRLDSMISEVFFDLIDSEWETLCSRWCLITSHKLYSSVYDPN...,4
4_998,MAGLWVGGSVLVAAGRRGSRSPRPLMRSVALWTLKHVPQYSRQRLL...,4


### Prepare Binary for binary classification

In [4]:
df['Class'] = df['Class'].str.replace('non-enzyme', '8')


In [5]:
df['Binary'] = df['Class'].apply(lambda x: 1 if x in ["1", "2", "3", "4", "5", "6", "7"] else 0)
df

Unnamed: 0_level_0,Sequence,Class,Binary
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_0,MAECMAARLSAQEEQIDLLSQEVSRLRDGLCGNPGALSALASTPEL...,1,1
1_1,MLCFRMGVSLLLSRSLSLCRSSLRTFGPHRRYNKASDALSERLRVF...,1,1
1_2,SIPATPPPHHRDSHLLLDAFLQSYNSFQFFTHLYSWGDLVRRLKEE...,1,1
1_3,MAMQAAEWVERLQRQEREIKFLTAEIDHLKNYGCLGASPTLEELRE...,1,1
1_4,ALGLFTGIGLSEAKARETLRNGALSSLLRRAVVQARSALGPALDKA...,1,1
...,...,...,...
4_995,DKILIANRGEIACRVIKTCKKMGIKTVAVHSDVDSSAVHVKMADEA...,4,1
4_996,QTFEKLLIANRGEIACRVIKTCKKMGIKTVAIHSDVDANAVHVKMA...,4,1
4_997,MVFGHRLDSMISEVFFDLIDSEWETLCSRWCLITSHKLYSSVYDPN...,4,1
4_998,MAGLWVGGSVLVAAGRRGSRSPRPLMRSVALWTLKHVPQYSRQRLL...,4,1


In [6]:
df["Binary"].value_counts()

1    5000
0    5000
Name: Binary, dtype: int64

## Creating the stats table

In [7]:
df = df.reset_index()
df["Sequence"] = df["Sequence"].str.replace("X", "")
df["Sequence"] = df["Sequence"].str.replace("U", "C")

# Iterate through the rows of the DataFrame and create new objects
statistical = []
for i, row in df.iterrows():
    obj_stats = []
    stats = ProteinAnalysis(row["Sequence"])
    count = stats.count_amino_acids()
    percent = stats.get_amino_acids_percent()
    obj_stats.append(round(stats.molecular_weight(),2))
    obj_stats.append(round(stats.aromaticity(),2))
    obj_stats.append(round(stats.instability_index(),2))
    sec_struc = stats.secondary_structure_fraction()  # [helix, turn, sheet]
    obj_stats.append(round(sec_struc[0],2))
    obj_stats.append(round(sec_struc[1],2))
    obj_stats.append(round(sec_struc[2],2))

    obj_stats.append(round(stats.molar_extinction_coefficient()[0],2))
    obj_stats.append(round(stats.charge_at_pH(10),2))
    obj_stats.append(round(stats.charge_at_pH(7),2))
    obj_stats.append(round(stats.charge_at_pH(4),2))

    obj_stats.append(round(stats.isoelectric_point(),2))
    obj_stats.append(round(stats.gravy(),2))

    flex = stats.flexibility()

    obj_stats.append(round(statistics.mean(flex),2))
    obj_stats.append(round(stats.molecular_weight() / sum(count.values()),2))

    obj_stats.append(round(percent["A"] + percent["C"] + percent["G"] \
                         + percent["S"] + percent["T"],2))
    
    obj_stats.append(round(percent["A"] + percent["C"] + percent["D"] \
                         + percent["G"] + percent["N"] + percent["P"] \
                         + percent["S"] + percent["T"] + percent["V"],2))
    
    obj_stats.append(round( percent["A"] + percent["I"] + percent["L"] \
                          + percent["V"] ,2))

    obj_stats.append(round( percent["F"] + percent["H"] + percent["W"] \
                          + percent["Y"] ,2))
    
    obj_stats.append(round( percent["A"] + percent["C"] + percent["F"] \
                          + percent["G"] + percent["I"] + percent["L"] \
                          + percent["M"] + percent["P"] + percent["V"] \
                          + percent["W"] + percent["Y"],2))
    
    
    obj_stats.append(round( percent["D"] + percent["E"] + percent["H"] \
                          + percent["K"] + percent["N"] + percent["Q"] \
                          + percent["R"] + percent["S"] + percent["T"],2))

    obj_stats.append(round( percent["D"] + percent["E"] + percent["H"] \
                          + percent["K"] + percent["R"],2))
    
    obj_stats.append(round( percent["H"] + percent["K"] + percent["R"],2))

    obj_stats.append(round( percent["D"] + percent["E"],2))

    obj_stats.append(round( percent["A"],2))
    obj_stats.append(round( percent["R"],2))
    obj_stats.append(round( percent["N"],2))
    obj_stats.append(round( percent["D"],2))
    obj_stats.append(round( percent["C"],2))
    obj_stats.append(row["Index"])
    obj_stats.append(row["Class"])
    obj_stats.append(row["Binary"])


    statistical.append(obj_stats)


df2 = pd.DataFrame(statistical, columns=['Weight', 'Aromaticity', 'Instability', \
                                         'Helix', 'Turn', 'Sheet', 'Extinction', \
                                         'Charge10', 'Charge7', 'Charge4', \
                                         'Isoelectric', 'GRAVY', 'Flexibility', \
                                         'AverageWeight', 'Tiny', 'Small', \
                                         'Aliphatic', 'Aromatic', 'NonPolar', \
                                         'Polar', 'Charged', 'Basic', 'Acidic', \
                                         'Ala', 'Arg', 'Asn', 'Asp', 'Cys', \
                                         'Index', "Class", "Binary"])

df = df.set_index("Index")
df

Unnamed: 0_level_0,Sequence,Class,Binary
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_0,MAECMAARLSAQEEQIDLLSQEVSRLRDGLCGNPGALSALASTPEL...,1,1
1_1,MLCFRMGVSLLLSRSLSLCRSSLRTFGPHRRYNKASDALSERLRVF...,1,1
1_2,SIPATPPPHHRDSHLLLDAFLQSYNSFQFFTHLYSWGDLVRRLKEE...,1,1
1_3,MAMQAAEWVERLQRQEREIKFLTAEIDHLKNYGCLGASPTLEELRE...,1,1
1_4,ALGLFTGIGLSEAKARETLRNGALSSLLRRAVVQARSALGPALDKA...,1,1
...,...,...,...
4_995,DKILIANRGEIACRVIKTCKKMGIKTVAVHSDVDSSAVHVKMADEA...,4,1
4_996,QTFEKLLIANRGEIACRVIKTCKKMGIKTVAIHSDVDANAVHVKMA...,4,1
4_997,MVFGHRLDSMISEVFFDLIDSEWETLCSRWCLITSHKLYSSVYDPN...,4,1
4_998,MAGLWVGGSVLVAAGRRGSRSPRPLMRSVALWTLKHVPQYSRQRLL...,4,1


## Implementing types of padding

In [8]:
def pad_sequence(sequence, maxlen, padding='post'):
  num_padding = maxlen - len(sequence)
  padded_sequence = ""

  if padding == 'post':
    padded_sequence = sequence + "0" * (maxlen - len(sequence))

  elif padding == 'extreme':
    half_padding = num_padding // 2
    padded_sequence = "0" * half_padding + sequence + "0" * (num_padding - half_padding)

  elif padding == 'mid':
    half_sequence = len(sequence) // 2
    padded_sequence = sequence[:half_sequence] + "0" * num_padding + sequence[half_sequence:]
            
  elif padding == 'uniform':
    for i, c in enumerate(sequence[:-1]):
        padded_sequence += c + "0"
        # If there are no more padding characters left, stop interleaving
        if i + 1 == num_padding:
            padded_sequence += sequence[i+1:]
            break
    # If there are still some padding characters left, add them to the end of the string
    padded_sequence += "0" * (num_padding - len(padded_sequence) + len(sequence))

  return padded_sequence

# First stage - perform task 1 and task 2 on the first Dataset with different types of paddings 

In [9]:
df['post'] = df['Sequence'].apply(lambda x: pad_sequence(x, 1000,"post"))

In [10]:
df['extr'] = df['Sequence'].apply(lambda x: pad_sequence(x, 1000,"extreme"))


In [11]:
df['mid']  = df['Sequence'].apply(lambda x: pad_sequence(x, 1000,"mid"))


In [12]:
df

Unnamed: 0_level_0,Sequence,Class,Binary,post,extr,mid
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_0,MAECMAARLSAQEEQIDLLSQEVSRLRDGLCGNPGALSALASTPEL...,1,1,MAECMAARLSAQEEQIDLLSQEVSRLRDGLCGNPGALSALASTPEL...,0000000000000000000000000000000000000000000000...,MAECMAARLSAQEEQIDLLSQEVSRLRDGLCGNPGALSALASTPEL...
1_1,MLCFRMGVSLLLSRSLSLCRSSLRTFGPHRRYNKASDALSERLRVF...,1,1,MLCFRMGVSLLLSRSLSLCRSSLRTFGPHRRYNKASDALSERLRVF...,0000000000000000000000000000000000000000000000...,MLCFRMGVSLLLSRSLSLCRSSLRTFGPHRRYNKASDALSERLRVF...
1_2,SIPATPPPHHRDSHLLLDAFLQSYNSFQFFTHLYSWGDLVRRLKEE...,1,1,SIPATPPPHHRDSHLLLDAFLQSYNSFQFFTHLYSWGDLVRRLKEE...,0000000000000000000000000000000000000000000000...,SIPATPPPHHRDSHLLLDAFLQSYNSFQFFTHLYSWGDLVRRLKEE...
1_3,MAMQAAEWVERLQRQEREIKFLTAEIDHLKNYGCLGASPTLEELRE...,1,1,MAMQAAEWVERLQRQEREIKFLTAEIDHLKNYGCLGASPTLEELRE...,0000000000000000000000000000000000000000000000...,MAMQAAEWVERLQRQEREIKFLTAEIDHLKNYGCLGASPTLEELRE...
1_4,ALGLFTGIGLSEAKARETLRNGALSSLLRRAVVQARSALGPALDKA...,1,1,ALGLFTGIGLSEAKARETLRNGALSSLLRRAVVQARSALGPALDKA...,0000000000000000000000000000000000000000000000...,ALGLFTGIGLSEAKARETLRNGALSSLLRRAVVQARSALGPALDKA...
...,...,...,...,...,...,...
4_995,DKILIANRGEIACRVIKTCKKMGIKTVAVHSDVDSSAVHVKMADEA...,4,1,DKILIANRGEIACRVIKTCKKMGIKTVAVHSDVDSSAVHVKMADEA...,0000000000000000000000000000000000000000000000...,DKILIANRGEIACRVIKTCKKMGIKTVAVHSDVDSSAVHVKMADEA...
4_996,QTFEKLLIANRGEIACRVIKTCKKMGIKTVAIHSDVDANAVHVKMA...,4,1,QTFEKLLIANRGEIACRVIKTCKKMGIKTVAIHSDVDANAVHVKMA...,0000000000000000000000000000000000000000000000...,QTFEKLLIANRGEIACRVIKTCKKMGIKTVAIHSDVDANAVHVKMA...
4_997,MVFGHRLDSMISEVFFDLIDSEWETLCSRWCLITSHKLYSSVYDPN...,4,1,MVFGHRLDSMISEVFFDLIDSEWETLCSRWCLITSHKLYSSVYDPN...,0000000000000000000000000000000000000000000000...,MVFGHRLDSMISEVFFDLIDSEWETLCSRWCLITSHKLYSSVYDPN...
4_998,MAGLWVGGSVLVAAGRRGSRSPRPLMRSVALWTLKHVPQYSRQRLL...,4,1,MAGLWVGGSVLVAAGRRGSRSPRPLMRSVALWTLKHVPQYSRQRLL...,0000000000000000000000000000000000000000000000...,MAGLWVGGSVLVAAGRRGSRSPRPLMRSVALWTLKHVPQYSRQRLL...


## Implementing pI encoding and scaling physico-chemical data

The project takes two approaches for binary and multiclass classification. 

The first approach is to use sequence data and represent it as an array of pIs corresponding to each aminoacid, for example:

QGHEAA = [-1.35, -1.03, 0.59, -3.78, -0.98, -0.98]

In the second approach, ProtParam is run on an input sequence and 28 physico-chemial parameters are extracted

In [13]:
def isoelectric_encoding(sequence):
  alphabet_dict = {'0': 0, 'A': -0.98, 'C': -1.98, 'D': -4.02, 'E': -3.78, \
                   'F': -1.52, 'G': -1.03, 'H': 0.59, 'I': -1.02 ,'K':2.87,\
                   'L': -1.02, 'M': -1.25 , 'N':-1.59 , 'P':-0.70, 'Q':-1.35, \
                   'R': 3.76, 'S':-1.32, 'T':-1.40, 'V':-1.03, 'W':-1.06, 'Y':-1.34}

  vector = np.array([alphabet_dict[char] for char in sequence])
  return vector

In [14]:
df['post'] = df['post'].apply(lambda x: isoelectric_encoding(x))

In [15]:
df['extr'] = df['extr'].apply(lambda x: isoelectric_encoding(x))


In [16]:
df['mid']  = df['mid'].apply(lambda x: isoelectric_encoding(x))


In [17]:
df

Unnamed: 0_level_0,Sequence,Class,Binary,post,extr,mid
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_0,MAECMAARLSAQEEQIDLLSQEVSRLRDGLCGNPGALSALASTPEL...,1,1,"[-1.25, -0.98, -3.78, -1.98, -1.25, -0.98, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.25, -0.98, -3.78, -1.98, -1.25, -0.98, -0...."
1_1,MLCFRMGVSLLLSRSLSLCRSSLRTFGPHRRYNKASDALSERLRVF...,1,1,"[-1.25, -1.02, -1.98, -1.52, 3.76, -1.25, -1.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.25, -1.02, -1.98, -1.52, 3.76, -1.25, -1.0..."
1_2,SIPATPPPHHRDSHLLLDAFLQSYNSFQFFTHLYSWGDLVRRLKEE...,1,1,"[-1.32, -1.02, -0.7, -0.98, -1.4, -0.7, -0.7, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.32, -1.02, -0.7, -0.98, -1.4, -0.7, -0.7, ..."
1_3,MAMQAAEWVERLQRQEREIKFLTAEIDHLKNYGCLGASPTLEELRE...,1,1,"[-1.25, -0.98, -1.25, -1.35, -0.98, -0.98, -3....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.25, -0.98, -1.25, -1.35, -0.98, -0.98, -3...."
1_4,ALGLFTGIGLSEAKARETLRNGALSSLLRRAVVQARSALGPALDKA...,1,1,"[-0.98, -1.02, -1.03, -1.02, -1.52, -1.4, -1.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.98, -1.02, -1.03, -1.02, -1.52, -1.4, -1.0..."
...,...,...,...,...,...,...
4_995,DKILIANRGEIACRVIKTCKKMGIKTVAVHSDVDSSAVHVKMADEA...,4,1,"[-4.02, 2.87, -1.02, -1.02, -1.02, -0.98, -1.5...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-4.02, 2.87, -1.02, -1.02, -1.02, -0.98, -1.5..."
4_996,QTFEKLLIANRGEIACRVIKTCKKMGIKTVAIHSDVDANAVHVKMA...,4,1,"[-1.35, -1.4, -1.52, -3.78, 2.87, -1.02, -1.02...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.35, -1.4, -1.52, -3.78, 2.87, -1.02, -1.02..."
4_997,MVFGHRLDSMISEVFFDLIDSEWETLCSRWCLITSHKLYSSVYDPN...,4,1,"[-1.25, -1.03, -1.52, -1.03, 0.59, 3.76, -1.02...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.25, -1.03, -1.52, -1.03, 0.59, 3.76, -1.02..."
4_998,MAGLWVGGSVLVAAGRRGSRSPRPLMRSVALWTLKHVPQYSRQRLL...,4,1,"[-1.25, -0.98, -1.03, -1.02, -1.06, -1.03, -1....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.25, -0.98, -1.03, -1.02, -1.06, -1.03, -1...."


In [18]:
df2

Unnamed: 0,Weight,Aromaticity,Instability,Helix,Turn,Sheet,Extinction,Charge10,Charge7,Charge4,...,Basic,Acidic,Ala,Arg,Asn,Asp,Cys,Index,Class,Binary
0,70250.86,0.09,49.92,0.29,0.21,0.30,92820,-45.78,-6.83,68.05,...,0.16,0.14,0.08,0.08,0.04,0.05,0.02,1_0,1,1
1,78644.85,0.09,48.71,0.31,0.22,0.28,82850,-41.76,-1.07,71.86,...,0.14,0.12,0.06,0.08,0.02,0.05,0.03,1_1,1,1
2,78866.83,0.10,44.61,0.30,0.21,0.25,63260,-59.75,-9.96,68.57,...,0.15,0.14,0.07,0.05,0.04,0.06,0.02,1_2,1,1
3,75942.96,0.09,48.22,0.32,0.16,0.26,65780,-57.81,-0.08,76.57,...,0.17,0.14,0.07,0.05,0.03,0.07,0.03,1_3,1,1
4,86873.53,0.09,41.13,0.31,0.19,0.29,89730,-56.23,-2.92,89.27,...,0.16,0.13,0.08,0.07,0.03,0.05,0.02,1_4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,66814.89,0.06,41.19,0.29,0.22,0.28,52830,-45.39,-5.09,59.01,...,0.14,0.13,0.09,0.05,0.04,0.05,0.02,4_995,4,1
9996,71172.81,0.06,40.77,0.30,0.23,0.26,52830,-42.52,-2.26,60.61,...,0.14,0.13,0.09,0.05,0.03,0.05,0.01,4_996,4,1
9997,79269.90,0.07,41.64,0.30,0.22,0.27,68300,-50.99,-6.15,70.65,...,0.14,0.13,0.08,0.06,0.03,0.05,0.01,4_997,4,1
9998,76667.06,0.06,40.13,0.30,0.23,0.27,58330,-41.91,-0.00,71.66,...,0.14,0.12,0.09,0.06,0.04,0.05,0.02,4_998,4,1


In [19]:
scaler = StandardScaler()
data_subset = df2.iloc[:,0:28]
scaled_subset = scaler.fit_transform(data_subset)
scaled_df = pd.DataFrame(scaled_subset,columns=data_subset.columns)
scaled_df

Unnamed: 0,Weight,Aromaticity,Instability,Helix,Turn,Sheet,Extinction,Charge10,Charge7,Charge4,...,NonPolar,Polar,Charged,Basic,Acidic,Ala,Arg,Asn,Asp,Cys
0,-1.198274,0.586970,0.595991,-0.039385,-0.699757,1.417425,0.304468,0.528205,-0.197352,-0.373794,...,-0.536345,0.536390,0.999847,0.919589,0.789417,0.433285,1.936355,0.299335,-0.244319,-0.159798
1,-0.535873,0.586970,0.456330,0.533911,-0.400996,0.714668,-0.004791,0.737796,0.148971,-0.175709,...,-0.061094,0.061162,0.180614,-0.041570,-0.104448,-0.673729,1.936355,-1.421969,-0.244319,0.716766
2,-0.518356,1.115725,-0.016902,0.247263,-0.699757,-0.339467,-0.612453,-0.200150,-0.385545,-0.346759,...,-0.298719,0.298776,0.453691,0.439009,0.789417,-0.120222,-0.365724,0.299335,0.644760,-0.159798
3,-0.749089,0.586970,0.399773,0.820559,-2.193559,0.011912,-0.534285,-0.099004,0.208495,0.069167,...,-0.298719,0.298776,1.272925,1.400168,0.789417,-0.120222,-0.365724,-0.561317,1.533838,0.716766
4,0.113482,0.586970,-0.418572,0.533911,-1.297278,1.066047,0.208619,-0.016628,0.037739,0.729449,...,0.414158,-0.414066,0.999847,0.919589,0.342485,0.433285,1.168995,-0.561317,-0.244319,-0.159798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-1.469419,-0.999293,-0.411647,-0.039385,-0.400996,0.714668,-0.935981,0.548539,-0.092734,-0.843790,...,0.176532,-0.176452,0.180614,-0.041570,0.342485,0.986793,-0.365724,0.299335,-0.244319,-0.159798
9996,-1.125520,-0.999293,-0.460124,0.247263,-0.102236,0.011912,-0.935981,0.698172,0.077421,-0.760605,...,-0.061094,0.061162,-0.092464,-0.041570,0.342485,0.986793,-0.365724,-0.561317,-0.244319,-1.036361
9997,-0.486548,-0.470539,-0.359706,0.247263,-0.400996,0.363290,-0.456117,0.256571,-0.156467,-0.238618,...,-0.061094,0.061162,0.453691,-0.041570,0.342485,0.433285,0.401636,-0.561317,-0.244319,-1.036361
9998,-0.691948,-0.999293,-0.533994,0.247263,-0.102236,0.363290,-0.765376,0.729976,0.213305,-0.186108,...,0.414158,-0.414066,0.180614,-0.041570,-0.104448,0.986793,0.401636,0.299335,-0.244319,-0.159798


### The project experiments on three model architectures, with no convolution, with one convolutional layer and with three convolutional layers (implemented from scratch in Keras)

## Architecture #1 - no convolution

In [20]:
def noConv(input_size, hidden_size, num_classes):
    inputs = Input(shape=(input_size,))
    fc1 = Dense(hidden_size, activation='relu')(inputs)
    dropout1 = Dropout(0.5)(fc1)
    fc2 = Dense(hidden_size, activation='relu')(dropout1)
    dropout2 = Dropout(0.25)(fc2)
    fc3 = Dense(num_classes, activation='sigmoid')(dropout2)
    model = Model(inputs, fc3)
    return model




## Architecture #2 - 1 convolutional layer

In [21]:
def oneCNN(input_size, num_classes):

    inputs = Input(shape=(input_size, 1))
    conv_layer1 = Conv1D(32, 3, activation='relu', input_shape=(input_size, 1))(inputs)
    pooling_layer1 = MaxPooling1D(pool_size=2)(conv_layer1)
    dropout1 = Dropout(0.5)(pooling_layer1)
    flatten = Flatten()(conv_layer1)
    fc1 = Dense(16, activation='relu')(flatten)
    fc2 = Dense(8, activation='relu')(fc1)
    fc3 = Dense(num_classes, activation='sigmoid')(fc2)
    
    model = Model(inputs, fc3)
    return model


## Architecture #3 - stack of 5 convolutional layers

In [22]:
def stackedCNN(input_size, num_classes):
    inputs = Input(shape=(input_size, 1))
    conv_layer1 = Conv1D(32, 2, activation='relu', input_shape=(input_size, 1))(inputs)
    pooling_layer1 = MaxPooling1D(pool_size=2)(conv_layer1)
    conv_layer2 = Conv1D(256, 2, activation='relu')(pooling_layer1)
    dropout1 = Dropout(0.5)(conv_layer2)
    conv_layer3 = Conv1D(128, 2, activation='relu')(dropout1)
    pooling_layer2 = MaxPooling1D(pool_size=2)(conv_layer3)
    conv_layer4 = Conv1D(64, 2, activation='relu')(pooling_layer2)
    dropout2 = Dropout(0.25)(conv_layer4)
    conv_layer5 = Conv1D(32, 2, activation='relu')(dropout2)
    flatten = Flatten()(conv_layer5)
    fc1 = Dense(16, activation='relu')(flatten)
    fc2 = Dense(8, activation='relu')(fc1)
    fc3 = Dense(num_classes, activation='sigmoid')(fc2)
    
    model = Model(inputs, fc3)
    return model


# Dataset 2 tasks

In [23]:
batch_size = 64
epochs = 45

In [24]:
df3 = df[df["Binary"] != 0]
df4 = df2[df2["Binary"] != 0]
data_subset = df4.iloc[:,0:28]
scaled_subset = scaler.fit_transform(data_subset)
scaled_df2 = pd.DataFrame(scaled_subset,columns=data_subset.columns)
scaled_df2

Unnamed: 0,Weight,Aromaticity,Instability,Helix,Turn,Sheet,Extinction,Charge10,Charge7,Charge4,...,NonPolar,Polar,Charged,Basic,Acidic,Ala,Arg,Asn,Asp,Cys
0,-1.121913,0.681241,1.420085,-0.571078,-0.651642,2.123379,0.367321,0.620084,-0.617403,-0.506896,...,-1.179873,1.179663,1.184727,0.978823,1.140794,0.278267,2.427625,0.743253,-0.46089,-0.128901
1,-0.473737,0.681241,1.223561,0.431165,-0.056426,1.009446,0.031797,0.946102,-0.092338,-0.301655,...,-0.483741,0.483738,0.107768,-0.254884,-0.165060,-1.084451,2.427625,-1.548616,-0.46089,1.309729
2,-0.456596,1.299989,0.557655,-0.069957,-0.651642,-0.661453,-0.627471,-0.512868,-0.902725,-0.478884,...,-0.831807,0.831700,0.466754,0.361970,1.140794,-0.403092,-0.391698,0.743253,0.72881,-0.128901
3,-0.682375,0.681241,1.143977,0.932287,-3.627720,-0.104487,-0.542664,-0.355536,-0.002093,-0.047933,...,-0.831807,0.831700,1.543714,1.595676,1.140794,-0.403092,-0.391698,-0.402681,1.91851,1.309729
4,0.161673,0.681241,-0.007554,0.431165,-1.842073,1.566412,0.263332,-0.227400,-0.260979,0.636202,...,0.212390,-0.212188,1.184727,0.978823,0.487867,0.278267,1.487850,-0.402681,-0.46089,-0.128901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,-1.387236,-1.175002,0.002191,-0.571078,-0.056426,1.009446,-0.978475,0.651713,-0.458790,-0.993870,...,-0.135676,0.135775,0.107768,-0.254884,0.487867,0.959626,-0.391698,0.743253,-0.46089,-0.128901
4996,-1.050721,-1.175002,-0.066024,-0.069957,0.538789,-0.104487,-0.978475,0.884467,-0.200815,-0.907680,...,-0.483741,0.483738,-0.251219,-0.254884,0.487867,0.959626,-0.391698,-0.402681,-0.46089,-1.567532
4997,-0.425472,-0.556254,0.075278,-0.069957,-0.056426,0.452480,-0.457858,0.197559,-0.555416,-0.366837,...,-0.483741,0.483738,0.466754,-0.254884,0.487867,0.278267,0.548076,-0.402681,-0.46089,-1.567532
4998,-0.626461,-1.175002,-0.169971,-0.069957,0.538789,0.452480,-0.793382,0.933937,0.005200,-0.312429,...,0.212390,-0.212188,0.107768,-0.254884,-0.165060,0.959626,0.548076,0.743253,-0.46089,-0.128901


In [25]:
df3 = pd.get_dummies(df3, columns=['Class'])
df4 = pd.get_dummies(df4, columns=['Class'])


Prepare Data

In [26]:
def split_data(X,y):

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
  y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
  X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
  y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)

  return X_train, X_test, y_train, y_test

In [27]:
# task1_sequence
X = df["mid"].values
X = X.tolist()
X = np.stack(X)
y = df['Binary'].values

X_train, X_test, y_train, y_test = split_data(X,y)

#task1_tabular
X2 = scaled_df.values
y2 = df2['Binary'].values

X_train2, X_test2, y_train2, y_test2 = split_data(X2,y2)

#task2_sequence
X3 = df3["mid"].values
X3 = X3.tolist()
X3 = np.stack(X3)
y3 = df3.loc[:,'Class_1':'Class_5'].values


X_train3, X_test3, y_train3, y_test3 = split_data(X3,y3)


#task2_tabular
X4 = scaled_df2.values
y4 = df4.loc[:,'Class_1':'Class_5'].values

X_train4, X_test4, y_train4, y_test4 = split_data(X4,y4)



In [28]:
input_tabular = 28
input_sequence = 1000
hidden_size = 64

In [29]:
def run_model(architecture, tabular, out_classes, X_train, X_test, y_train, y_test):

    if architecture == "no_conv":
      if tabular == True:
        model = noConv(input_tabular, hidden_size, out_classes)
      elif tabular == False:
        model = noConv(input_sequence, hidden_size, out_classes)

    if architecture == "one_conv":
      if tabular == True:
        model = oneCNN(input_tabular, out_classes)
      elif tabular == False:
        model = oneCNN(input_sequence, out_classes)

    if architecture == "five_conv":
      if tabular == True:
        model = stackedCNN(input_tabular, out_classes)
      elif tabular == False:
        model = stackedCNN(input_sequence, out_classes)

    if out_classes == 1:
      loss = binary_crossentropy
    else:  
      loss = categorical_crossentropy

    model.compile(optimizer=Adam(), loss = loss, metrics=["accuracy"])
    model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, verbose=1)
    _, test_acc = model.evaluate(X_test, y_test, verbose=0)
    print(str(out_classes) + " " + str(tabular) + " " + architecture)
    print("Test accuracy:", test_acc)
    model.summary()

    return model

Rewriting this as a loop takes the same amount of space and is less readable:

In [30]:
model = run_model("no_conv", False, 1, X_train, X_test, y_train, y_test)
model = run_model("no_conv", True,  1, X_train2, X_test2, y_train2, y_test2)
model = run_model("no_conv", False, 5, X_train3, X_test3, y_train3, y_test3)
model = run_model("no_conv", True,  5, X_train4, X_test4, y_train4, y_test4)

model = run_model("one_conv", False, 1, X_train, X_test, y_train, y_test)
model = run_model("one_conv", True,  1, X_train2, X_test2, y_train2, y_test2)
model = run_model("one_conv", False, 5, X_train3, X_test3, y_train3, y_test3)
model = run_model("one_conv", True,  5, X_train4, X_test4, y_train4, y_test4)

model = run_model("five_conv", False, 1, X_train, X_test, y_train, y_test)
model = run_model("five_conv", True,  1, X_train2, X_test2, y_train2, y_test2)
model = run_model("five_conv", False, 5, X_train3, X_test3, y_train3, y_test3)
model = run_model("five_conv", True,  5, X_train4, X_test4, y_train4, y_test4)



Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45
1 False no_conv
Test accuracy: 0.8880000114440918
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1000)]            0         
                                                                 
 dense (Dense)               (None, 64)                64064     
                                                                 
 dropout