# Variational autoencoder (VAE) embeddings fed into supervised learning algorithms

*   Generate a 100-dimensional embedding of each DNA sequence with a VAE.
*   Feeding the embeddings into a feed-forward neural network gave 95.56% accuracy.
*   Feeding the embeddings into an SVM gave 94.84% accuracy.
*   Final score on Kaggle: 94.317%



In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import re

In [2]:
import pandas as pd
# Load the training feature table; its `dna` column is the only column this
# notebook reads from it.
df = pd.read_csv (r'/content/train_features.csv')


In [3]:
def equal_length(seq, length=658):
    """Pad or truncate a DNA string to a fixed length and mask ambiguity codes.

    Args:
        seq: Raw nucleotide string.
        length: Number of characters in the result. Defaults to 658, the
            sequence width used throughout this notebook.

    Returns:
        A string of exactly ``length`` characters: ``seq`` cut to its first
        ``length`` characters or right-padded with ``'-'``, with every
        ``K``, ``M``, ``N``, ``R``, ``S``, ``W`` and ``Y`` replaced by ``'-'``.
    """
    # A single translation pass replaces the seven chained str.replace calls.
    mask_ambiguous = str.maketrans("KMNRSWY", "-------")
    # Slice first, then pad straight to the target width; padding further and
    # cutting back afterwards gives the same result with extra work.
    return seq[:length].ljust(length, "-").translate(mask_ambiguous)
# Bring every sequence in the `dna` column to the same length and mask the
# ambiguity codes, so the result only needs the 5-symbol alphabet downstream.
dna = df.dna.apply(equal_length)


In [4]:

from numpy import argmax
def one_hot_encode(seq):
    """One-hot encode a nucleotide string over the alphabet ``ACGT-``.

    Ambiguity codes (``K``, ``M``, ``N``, ...) are not part of the alphabet
    and must be masked before calling this function.

    Args:
        seq: String made up only of ``A``, ``C``, ``G``, ``T`` and ``-``.

    Returns:
        Integer array of shape ``(len(seq), 5)``. Row ``i`` holds a single 1
        in the column of ``seq[i]`` (column order ``A, C, G, T, -``). An
        empty string yields an array of shape ``(0, 5)``.

    Raises:
        KeyError: If ``seq`` contains a character outside the alphabet.
    """
    alphabet = 'ACGT-'
    char_to_int = {char: index for index, char in enumerate(alphabet)}
    # An explicit integer dtype keeps an empty sequence usable as an index
    # array, so the result is always two-dimensional.
    integer_encoded = np.array([char_to_int[char] for char in seq], dtype=int)
    # Row k of the identity matrix is the one-hot vector for symbol k.
    return np.eye(len(alphabet), dtype=int)[integer_encoded]
# One one-hot matrix per sequence (rows = positions, columns = alphabet symbols).
dna_code = dna.apply(one_hot_encode)

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class Sampling(layers.Layer):
    """Sample a latent vector z from (z_mean, z_log_var).

    Uses the reparameterisation z = z_mean + exp(0.5 * z_log_var) * epsilon,
    where epsilon is standard-normal noise with the same (batch, dim) shape
    as z_mean.
    """

    def call(self, inputs):
        mean, log_var = inputs
        # Read the shape dynamically so the layer works for any batch size.
        mean_shape = tf.shape(mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise

# Size of the latent vector produced for each sequence.
latent_dim = 100

# Encoder: one-hot sequence "image" (658 positions x 5 symbols x 1 channel)
# -> (z_mean, z_log_var, sampled z). Strides act on the position axis only.
encoder_inputs = keras.Input(shape=(658, 5, 1))
x = layers.Conv2D(
    filters=32, kernel_size=3, strides=(7, 1), padding="same", activation="relu"
)(encoder_inputs)  # 658 -> 94 positions
x = layers.Conv2D(
    filters=64, kernel_size=3, strides=(2, 1), padding="same", activation="relu"
)(x)  # 94 -> 47 positions
x = layers.Flatten()(x)
z_mean = layers.Dense(units=latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(units=latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(
    inputs=encoder_inputs, outputs=[z_mean, z_log_var, z], name="encoder"
)

encoder.summary()

# Decoder: mirrors the encoder, expanding a latent vector back to the
# (658, 5, 1) input shape with per-cell sigmoid outputs.
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(units=47 * 5 * 64, activation="relu")(latent_inputs)
x = layers.Reshape(target_shape=(47, 5, 64))(x)
x = layers.Conv2DTranspose(
    filters=64, kernel_size=3, strides=(2, 1), padding="same", activation="relu"
)(x)  # 47 -> 94 positions
x = layers.Conv2DTranspose(
    filters=32, kernel_size=3, strides=(7, 1), padding="same", activation="relu"
)(x)  # 94 -> 658 positions
decoder_outputs = layers.Conv2DTranspose(
    filters=1, kernel_size=3, padding="same", activation="sigmoid"
)(x)
decoder = keras.Model(inputs=latent_inputs, outputs=decoder_outputs, name="decoder")
decoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 658, 5, 1)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 94, 5, 32)    320         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 47, 5, 64)    18496       conv2d[0][0]                     
__________________________________________________________________________________________________
flatten (Flatten)               (None, 15040)        0           conv2d_1[0][0]                   
____________________________________________________________________________________________

In [6]:

class VAE(keras.Model):
    """Train an encoder/decoder pair as a variational autoencoder.

    The loss minimised in ``train_step`` is the reconstruction loss plus the
    KL term computed from the encoder's ``z_mean`` and ``z_log_var``. Running
    means of the total loss and of both parts are exposed as metrics.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means of each loss component.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Return the three loss trackers: total, reconstruction, KL."""
        trackers = (
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        )
        return list(trackers)

    def train_step(self, data):
        """Run one optimisation step on a batch and return the running losses."""
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # Cross-entropy summed over axes 1 and 2, then averaged over the batch.
            cross_entropy = keras.losses.binary_crossentropy(data, reconstruction)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(cross_entropy, axis=(1, 2))
            )

            # KL term summed over the latent dimensions, then averaged over the batch.
            kl_per_dim = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_per_dim, axis=1))

            total_loss = reconstruction_loss + kl_loss

        weights = self.trainable_weights
        gradients = tape.gradient(total_loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        updates = (
            (self.total_loss_tracker, total_loss),
            (self.reconstruction_loss_tracker, reconstruction_loss),
            (self.kl_loss_tracker, kl_loss),
        )
        for tracker, value in updates:
            tracker.update_state(value)

        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [7]:
# Stack the per-sequence one-hot matrices into one array and show its shape.
x_dna = np.array(dna_code.to_list())
x_dna.shape


(12906, 658, 5)

In [8]:

# Add the trailing channel axis required by the encoder's (658, 5, 1) input.
# np.expand_dims returns a view, so no second copy of the dataset is made;
# concatenating a single array with itself only duplicated it in memory.
xtrain = np.expand_dims(x_dna, -1)

# Train the VAE unsupervised on the sequences only; labels are not used here.
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
vae.fit(xtrain, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5730e2fe50>

In [13]:
# Integer class label for each training sequence, read from the `labels` column.
y_train = pd.read_csv (r'/content/train_labels.csv')
ytrain = np.array(y_train['labels'], dtype = "uint32")

# Use the encoder's z_mean output (not the randomly sampled z) as the
# embedding, so each sequence maps to a deterministic feature vector.
z_mean, _, _ = vae.encoder.predict(xtrain)
x_input = z_mean

# Feed-forward classifier on top of the embeddings. The input shape is given
# as a tuple derived from the embeddings themselves, so it follows the latent
# size instead of repeating it as a bare integer.
model = tf.keras.models.Sequential([
  keras.Input(shape=(x_input.shape[1],)),
  layers.Flatten(),
  layers.Dense(500, activation='relu'),
  layers.Dropout(0.2, seed=3),
  layers.Dense(1024, activation='relu'),
  # NOTE(review): 1220 is presumably the number of distinct labels — confirm.
  layers.Dense(1220, activation='softmax')
])

# Sparse cross-entropy because the targets are integer ids, not one-hot rows.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 30% of the samples are held out for validation.
model.fit(x_input, ytrain, epochs=40, validation_split=0.3, verbose=2)

Epoch 1/40
283/283 - 2s - loss: 3.1555 - accuracy: 0.5552 - val_loss: 0.5902 - val_accuracy: 0.8915
Epoch 2/40
283/283 - 1s - loss: 0.3026 - accuracy: 0.9338 - val_loss: 0.3971 - val_accuracy: 0.9323
Epoch 3/40
283/283 - 1s - loss: 0.1513 - accuracy: 0.9611 - val_loss: 0.3680 - val_accuracy: 0.9437
Epoch 4/40
283/283 - 1s - loss: 0.1095 - accuracy: 0.9729 - val_loss: 0.3434 - val_accuracy: 0.9478
Epoch 5/40
283/283 - 1s - loss: 0.0757 - accuracy: 0.9803 - val_loss: 0.3437 - val_accuracy: 0.9489
Epoch 6/40
283/283 - 1s - loss: 0.0676 - accuracy: 0.9835 - val_loss: 0.3358 - val_accuracy: 0.9483
Epoch 7/40
283/283 - 1s - loss: 0.0481 - accuracy: 0.9858 - val_loss: 0.3121 - val_accuracy: 0.9548
Epoch 8/40
283/283 - 1s - loss: 0.0468 - accuracy: 0.9870 - val_loss: 0.3597 - val_accuracy: 0.9507
Epoch 9/40
283/283 - 1s - loss: 0.0488 - accuracy: 0.9884 - val_loss: 0.3327 - val_accuracy: 0.9545
Epoch 10/40
283/283 - 1s - loss: 0.0448 - accuracy: 0.9885 - val_loss: 0.4004 - val_accuracy: 0.9421

<keras.callbacks.History at 0x7f566e967a10>

In [14]:
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
# Hold out 40% of the embeddings, fit a linear-kernel SVM on the remainder,
# and score it on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    x_input, ytrain, test_size=0.4, random_state=0
)
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9484795661437149