# 3 ETAPA: Autoencoder usando el conjunto original

En este notebook se entrena un autoencoder basado en redes neuronales usando el conjunto de datos original.

Inicialmente se entrenará un `autoencoder`, que luego se usará para entrenar una red neuronal de arquitectura fija. En la segunda etapa se reutilizará este mismo autoencoder, pero mediante el wrapper de Keras para scikit-learn, lo que permite buscar los mejores hiperparámetros de la red con `GridSearchCV`. Los hiperparámetros que se varían son la cantidad de neuronas en cada capa de la red neuronal.

Para el uso de este notebook es necesario instalar los siguientes módulos:
* pip install scikit-multilearn
* pip install tensorflow
* pip install tensorflow-addons

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from skmultilearn.model_selection import iterative_train_test_split

from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from skmultilearn.model_selection import IterativeStratification

In [None]:
def loga_loss(y_test, y_pred, eps=1e-15):
    """Element-wise binary log loss, following the Kaggle competition metric.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels (0/1). The notebook passes a 2-D array of shape
        (n_samples, n_targets); any shape that broadcasts with ``y_pred`` works.
    y_pred : array-like
        Predicted activation probabilities.
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` so the logarithms stay finite.

    Returns
    -------
    numpy.ndarray
        float64 array holding the loss of every individual prediction. Apply
        ``np.mean`` to it to obtain the overall score.
    """
    # Work in float64: in float32, 1 - 1e-15 rounds to exactly 1.0 and the upper
    # clip would not protect against log(0).
    y_true = np.asarray(y_test, dtype=np.float64)
    prob = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1 - eps)
    # The probabilities are clipped, not the labels; the whole array is handled at once
    # instead of one element at a time.
    return -(y_true * np.log(prob) + (1 - y_true) * np.log1p(-prob))

In [None]:
# Cross-validation splitters available to the rest of the notebook.
nfolds = 10
cv_i = IterativeStratification(n_splits=nfolds)
cv = StratifiedKFold(n_splits=nfolds, shuffle=False)
# The shuffling splitters are seeded (same value as the QuantileTransformer seed used when
# loading the data) so that the folds are identical on every Restart & Run All.
cv_k = KFold(n_splits=nfolds, shuffle=True, random_state=0)
cv_s = StratifiedShuffleSplit(n_splits=nfolds, random_state=0)

In [None]:
# Callbacks driven by the validation loss.
# Reduce the learning rate once the validation loss has stalled for 3 epochs ...
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=3,
    verbose=0,
)
# ... and stop training after 6 stalled epochs, restoring the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=6,
    restore_best_weights=True,
    verbose=0,
)

In [None]:
from sklearn.preprocessing import QuantileTransformer


def _encode_categoricals(features):
    """Return `features` with cp_time / cp_dose mapped to integer codes and sig_id dropped."""
    return features.assign(
        cp_time=features['cp_time'].map({24: 1, 48: 2, 72: 3}),
        cp_dose=features['cp_dose'].map({'D1': 0, 'D2': 1}),
    ).drop(columns="sig_id")


# --- Load -------------------------------------------------------------------------------
trainFeatures = _encode_categoricals(pd.read_csv('train_features.csv'))
trainTargetScored = pd.read_csv('train_targets_scored.csv').drop(columns="sig_id")
testFeatures = _encode_categoricals(pd.read_csv('test_features.csv'))

# --- Scale gene-expression and cell-viability columns --------------------------------------
# Separate the gene ('g-') and cell ('c-') columns.
gene_cols = [c for c in trainFeatures.columns if c.startswith('g-')]
cell_cols = [c for c in trainFeatures.columns if c.startswith('c-')]
expr_cols = gene_cols + cell_cols

# The transformer is fitted on the training features only and then applied to both sets.
qt = QuantileTransformer(n_quantiles=100, random_state=0)
qt.fit(trainFeatures[expr_cols])
trainFeatures[expr_cols] = qt.transform(trainFeatures[expr_cols])
testFeatures[expr_cols] = qt.transform(testFeatures[expr_cols])

# --- Keep treated samples only ---------------------------------------------------------------
DataTrain = pd.concat([trainFeatures, trainTargetScored], axis=1)
Train = DataTrain[DataTrain['cp_type'] == 'trt_cp']
Evaluar = testFeatures[testFeatures['cp_type'] == 'trt_cp']

# Select features and targets by column NAME rather than by position. The previous positional
# slices (`iloc[:, 1:875]` / `iloc[:, 876:]`) depended on the exact column count and left out
# the column at position 875, i.e. the first target column whenever the feature frame has
# 875 columns. NOTE(review): assumes cp_type is the only non-numeric feature column.
feature_cols = [c for c in trainFeatures.columns if c != 'cp_type']
trainFeature_X = Train[feature_cols].reset_index(drop=True)
targetsCols_y = Train[list(trainTargetScored.columns)].reset_index(drop=True)

# Keep only the targets with more than 100 positive samples.
higher = [col for col in targetsCols_y if (targetsCols_y[col].sum() > 100)]
Targety_H = targetsCols_y[higher]

featuresCount = trainFeature_X.shape[1]
print("Features count = %d" % featuresCount)

targetsCols  = Targety_H.columns
targetsCount = len(targetsCols)
print("Targets count = %d" % targetsCount)

## Split iterativo

In [None]:
# Multi-label stratified hold-out split (30 % test).
# Mind the return order: X_train, y_train, X_test, y_test.
X_train, y_train, X_test, y_test = iterative_train_test_split(
    np.array(trainFeature_X),
    np.array(Targety_H),
    test_size=0.3,
)

# Modelamiento

## 1. Autoencoder

In [None]:
# AutoEncoder Model Preparation

def autoencoder(n_inputs=None, n_bottleneck=50):
    """Build and compile a fully connected autoencoder.

    Architecture: input -> Dense 512/128/64/32 (ReLU) -> linear code layer
    -> Dense 32/64/128/512 (ReLU) -> linear reconstruction of the input.

    Parameters
    ----------
    n_inputs : int, optional
        Number of input features. Defaults to ``trainFeature_X.shape[1]``.
    n_bottleneck : int, default 50
        Width of the linear code layer. Note that the default (50) is wider than the
        32-unit layer that precedes it, so the narrowest layer is actually that one.

    Returns
    -------
    tf.keras.Model
        Model compiled with the Adam optimizer and mean squared error loss.
    """
    if n_inputs is None:
        n_inputs = trainFeature_X.shape[1]
    hidden_units = (512, 128, 64, 32)

    inputs = Input(shape=(n_inputs,))

    # Encoder
    x = inputs
    for units in hidden_units:
        x = Dense(units, activation='relu')(x)

    # Code layer (no activation). With four hidden encoder layers this is `model.layers[5]`,
    # the index the classifier builders in this notebook read their features from.
    x = Dense(n_bottleneck)(x)

    # Decoder mirrors the encoder.
    for units in reversed(hidden_units):
        x = Dense(units, activation='relu')(x)

    outputs = Dense(n_inputs, activation='linear')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse')

    return model

In [None]:
# `autoencoder` is the builder function until this cell runs and the model afterwards.
# Only call it while it is still the builder, so re-running this cell does not fail by
# calling the model object; re-run the definition cell first to build a fresh model.
if not isinstance(autoencoder, tf.keras.Model):
    autoencoder = autoencoder()
autoencoder.summary()

In [None]:
# Train the autoencoder to reconstruct its own input; 20 % of X_train is held out for the
# validation loss that drives the callbacks.
autoencoder.fit(
    X_train,
    X_train,
    epochs=250,
    batch_size=128,
    shuffle=True,
    validation_split=0.2,
    callbacks=[reduce_lr, early_stop],
)


## 2. Red neuronal incluyendo el autoencoder

In [None]:
def model_NN(autoencoder, units=(874, 500, 250, 100), n_outputs=None):
    """Build a multi-label classifier on top of a frozen, pre-trained autoencoder.

    The classifier reads the output of ``autoencoder.layers[5]`` (the code layer of the
    autoencoder built in this notebook) and stacks one
    WeightNormalization(Dense) -> ReLU -> BatchNormalization -> Dropout(0.2) block per
    entry of ``units``, followed by a sigmoid output layer.

    Parameters
    ----------
    autoencoder : tf.keras.Model
        Trained autoencoder. Side effect: all of its layers are set to non-trainable.
    units : sequence of int, default (874, 500, 250, 100)
        Width of each hidden block.
    n_outputs : int, optional
        Number of target columns. Defaults to the notebook-level ``targetsCount``
        instead of a hard-coded value, so the output always matches the label matrix.

    Returns
    -------
    tf.keras.Model
        Model compiled with AdamW and binary cross-entropy.
    """
    if n_outputs is None:
        n_outputs = targetsCount

    # Freeze the autoencoder so only the classifier head is trained.
    for layer in autoencoder.layers:
        layer.trainable = False

    # The former BatchNormalization applied to the raw input was never connected to the
    # output and therefore not part of the model; it has been removed.
    x = autoencoder.layers[5].output
    for n_units in units:
        x = tfa.layers.WeightNormalization(L.Dense(n_units))(x)
        x = L.ReLU()(x)
        x = L.BatchNormalization()(x)
        x = L.Dropout(0.2)(x)

    classifier = tfa.layers.WeightNormalization(L.Dense(n_outputs, activation="sigmoid"))(x)

    model = tf.keras.Model(autoencoder.input, classifier)

    # `learning_rate` replaces the deprecated `lr` alias.
    model.compile(
        optimizer=tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-5, clipvalue=900),
        loss='binary_crossentropy',
    )

    return model

In [None]:
# Train one classifier per shuffle split and record its loss on the hold-out test set.
modelo_nn, log_nn = [], []
for fn, (train_ind, val_ind) in enumerate(cv_s.split(X_train, y_train)):
    # Index arrays produce copies, so editing y_tr below never touches y_train.
    X_tr, y_tr = np.array(X_train[train_ind]), np.array(y_train[train_ind])
    X_val, y_val = np.array(X_train[val_ind]), np.array(y_train[val_ind])
    print(f'fold {fn+1}')

    # Targets without any positive sample in this training fold get a single positive
    # label written into the first row.
    check_for_empty_cols = np.flatnonzero(y_tr.sum(axis=0) == 0)
    if check_for_empty_cols.size:
        y_tr[0, check_for_empty_cols] = 1

    modelo = model_NN(autoencoder)

    # Training
    modelo.fit(
        X_tr,
        y_tr,
        validation_data=(X_val, y_val),
        callbacks=[reduce_lr, early_stop],
        epochs=100,
        batch_size=128,
    )
    # Evaluated on the hold-out test split, not on the validation fold.
    score = modelo.evaluate(X_test, y_test, verbose=0)

    print(f'fold {fn+1} --> log_loss: {round(score,5)}')
    print('--------------------------')
    modelo_nn.append(modelo)
    log_nn.append(score)

In [None]:
def model_NN(n_inputs=None, units=(874, 500, 250, 100), n_outputs=None):
    """Build a baseline multi-label classifier that works directly on the input features.

    Input -> BatchNormalization, then one
    WeightNormalization(Dense) -> ReLU -> BatchNormalization -> Dropout(0.2) block per
    entry of ``units``, followed by a sigmoid output layer. No autoencoder is involved.

    This definition replaces the earlier ``model_NN(autoencoder)`` for the cells that follow.

    Parameters
    ----------
    n_inputs : int, optional
        Number of input features. Defaults to ``X_train.shape[1]``.
        NOTE(review): the previous code read ``train_features.shape[1]``, but no
        ``train_features`` is defined anywhere in the visible notebook, so the call
        failed on a fresh kernel. ``X_train`` is assumed to be the intended matrix; confirm.
    units : sequence of int, default (874, 500, 250, 100)
        Width of each hidden block.
    n_outputs : int, optional
        Number of target columns. Defaults to the notebook-level ``targetsCount``.

    Returns
    -------
    tf.keras.Model
        Sequential model compiled with AdamW and binary cross-entropy.
    """
    if n_inputs is None:
        n_inputs = X_train.shape[1]
    if n_outputs is None:
        n_outputs = targetsCount

    model = tf.keras.models.Sequential()
    model.add(L.Input(shape=(n_inputs,)))
    model.add(L.BatchNormalization())

    for n_units in units:
        model.add(tfa.layers.WeightNormalization(L.Dense(n_units)))
        model.add(L.ReLU())
        model.add(L.BatchNormalization())
        model.add(L.Dropout(0.2))

    model.add(tfa.layers.WeightNormalization(L.Dense(n_outputs, activation="sigmoid")))

    # `learning_rate` replaces the deprecated `lr` alias.
    model.compile(
        optimizer=tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-5, clipvalue=900),
        loss='binary_crossentropy',
    )

    return model

In [None]:
# Cross-validated training of the baseline network (no autoencoder).
# NOTE(review): this cell used to index `train_features` / `test_features`, names that are
# never assigned in the visible notebook (NameError on a fresh kernel). They are replaced by
# X_train / X_test, the matrices that are row-aligned with y_train / y_test; confirm that
# these are the intended inputs.
modelo_nn, log_nn = [], []
for fn, (train_ind, val_ind) in enumerate(cv_s.split(X_train, y_train)):
    X_tr, X_val = np.array(X_train[train_ind]), np.array(X_train[val_ind])
    y_tr, y_val = np.array(y_train[train_ind]), np.array(y_train[val_ind])
    print(f'fold {fn+1}')

    # Targets without any positive sample in this training fold get a single positive
    # label written into the first row.
    check_for_empty_cols = np.where(y_tr.sum(axis=0) == 0)[0]
    if len(check_for_empty_cols):
        y_tr[0, check_for_empty_cols] = 1

    modelo = model_NN()

    # Training
    modelo.fit(X_tr, y_tr, validation_data=(X_val, y_val), callbacks=[reduce_lr, early_stop],
               epochs=100, batch_size=128)
    # Evaluated on the hold-out test split, not on the validation fold.
    score = modelo.evaluate(X_test, y_test, verbose=0)

    print(f'fold {fn+1} --> log_loss: {round(score,5)}')
    print('--------------------------')
    modelo_nn.append(modelo)
    log_nn.append(score)

In [None]:
# Mean element-wise log loss of every stored fold model on the hold-out test split.
for i, fold_model in enumerate(modelo_nn):
    pred = fold_model.predict(X_test)
    l_loss = loga_loss(y_test, pred, eps=1e-15)
    print(f'Cross-entropy score for model {i+1}')
    print(round(np.mean(l_loss),5))

## 3. Arquitectura de red neuronal usando GridSearchCV

In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Callbacks for the grid search. They monitor the training loss ('loss') instead of
# 'val_loss', because the grid-search fit call passes no validation data.
reduce_lr_2 = ReduceLROnPlateau(
    monitor='loss',
    patience=3,
    verbose=0,
)
early_stop_2 = EarlyStopping(
    monitor='loss',
    patience=6,
    restore_best_weights=True,
    verbose=0,
)

In [None]:
def MODEL_NN(l_1, l_2, l_3, l_4, n_outputs=None):
    """Model factory for the grid search: classifier on top of the frozen autoencoder.

    Uses the notebook-level ``autoencoder`` model, reads the output of its layer 5 and
    stacks four WeightNormalization(Dense) -> ReLU -> BatchNormalization -> Dropout(0.2)
    blocks followed by a sigmoid output layer.

    Parameters
    ----------
    l_1, l_2, l_3, l_4 : int
        Number of units of the four hidden blocks (the hyper-parameters being searched).
    n_outputs : int, optional
        Number of target columns. Defaults to the notebook-level ``targetsCount``
        instead of a hard-coded value.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam, binary cross-entropy and the accuracy metric.
    """
    if n_outputs is None:
        n_outputs = targetsCount

    # Freeze the autoencoder so only the classifier head is trained.
    for layer in autoencoder.layers:
        layer.trainable = False

    # The former BatchNormalization applied to the raw input was never connected to the
    # output and therefore not part of the model; it has been removed.
    x = autoencoder.layers[5].output
    for n_units in (l_1, l_2, l_3, l_4):
        x = tfa.layers.WeightNormalization(L.Dense(n_units))(x)
        x = L.ReLU()(x)
        x = L.BatchNormalization()(x)
        x = L.Dropout(0.2)(x)

    classifier = tfa.layers.WeightNormalization(L.Dense(n_outputs, activation="sigmoid"))(x)

    model = tf.keras.Model(autoencoder.input, classifier)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Candidate widths for each of the four hidden blocks of MODEL_NN.
# NOTE(review): `optimizer` is defined here but is not part of `param_grid`.
optimizer = ['adam', 'rmsprop']
l_1 = [700, 874, 1000]
l_2 = [350, 450, 500]
l_3 = [150, 250, 300]
l_4 = [50, 75, 100]
param_grid = {
    'l_1': l_1,
    'l_2': l_2,
    'l_3': l_3,
    'l_4': l_4,
}

In [None]:
# scikit-learn compatible wrapper around the MODEL_NN factory, so that GridSearchCV can
# build one network per parameter combination.
modelo = KerasClassifier(
    build_fn=MODEL_NN,
    epochs=50,
    batch_size=128,
)

In [None]:
# From here on `nfolds` and `cv_s` are redefined with fewer splits; every later cell that
# uses `cv_s` therefore runs 4 splits instead of the 10 configured at the top.
nfolds = 4
# Seeded so that every parameter combination is compared on reproducible splits.
cv_s = StratifiedShuffleSplit(n_splits=nfolds, random_state=0)

In [None]:
# Exhaustive search over the 3 x 3 x 3 x 3 = 81 layer-width combinations in `param_grid`.
grid = GridSearchCV(
    estimator=modelo,
    param_grid=param_grid,
    cv=cv_s,
)
# Extra keyword arguments of fit() are forwarded to the Keras training call.
grid.fit(
    X_train,
    y_train,
    callbacks=[reduce_lr_2, early_stop_2],
)

In [None]:
# Layer widths (l_1..l_4) of the best combination found by the grid search.
grid.best_params_

In [None]:
def model_NN(autoencoder, units=(874, 500, 200, 75), n_outputs=None):
    """Build the classifier on top of the frozen autoencoder (hidden widths 874/500/200/75).

    This definition replaces the earlier ``model_NN`` definitions of the notebook. The
    classifier reads the output of ``autoencoder.layers[5]`` and stacks one
    WeightNormalization(Dense) -> ReLU -> BatchNormalization -> Dropout(0.2) block per
    entry of ``units``, followed by a sigmoid output layer.

    Parameters
    ----------
    autoencoder : tf.keras.Model
        Trained autoencoder. Side effect: all of its layers are set to non-trainable.
    units : sequence of int, default (874, 500, 200, 75)
        Width of each hidden block.
    n_outputs : int, optional
        Number of target columns. Defaults to the notebook-level ``targetsCount``
        instead of a hard-coded value.

    Returns
    -------
    tf.keras.Model
        Model compiled with AdamW and binary cross-entropy.
    """
    if n_outputs is None:
        n_outputs = targetsCount

    # Freeze the autoencoder so only the classifier head is trained.
    for layer in autoencoder.layers:
        layer.trainable = False

    # The former BatchNormalization applied to the raw input was never connected to the
    # output and therefore not part of the model; it has been removed.
    x = autoencoder.layers[5].output
    for n_units in units:
        x = tfa.layers.WeightNormalization(L.Dense(n_units))(x)
        x = L.ReLU()(x)
        x = L.BatchNormalization()(x)
        x = L.Dropout(0.2)(x)

    classifier = tfa.layers.WeightNormalization(L.Dense(n_outputs, activation="sigmoid"))(x)

    model = tf.keras.Model(autoencoder.input, classifier)

    # `learning_rate` replaces the deprecated `lr` alias.
    model.compile(
        optimizer=tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-5, clipvalue=900),
        loss='binary_crossentropy',
    )

    return model

In [None]:
# Cross-validated training with the most recent model_NN definition.
modelo_nn, log_nn = [], []
for fn, (train_ind, val_ind) in enumerate(cv_s.split(X_train, y_train)):
    # Index arrays produce copies, so editing y_tr below never touches y_train.
    X_tr, y_tr = np.array(X_train[train_ind]), np.array(y_train[train_ind])
    X_val, y_val = np.array(X_train[val_ind]), np.array(y_train[val_ind])
    print(f'fold {fn+1}')

    # Targets without any positive sample in this training fold get a single positive
    # label written into the first row.
    check_for_empty_cols = np.flatnonzero(y_tr.sum(axis=0) == 0)
    if check_for_empty_cols.size:
        y_tr[0, check_for_empty_cols] = 1

    modelo = model_NN(autoencoder)

    # Training
    modelo.fit(
        X_tr,
        y_tr,
        validation_data=(X_val, y_val),
        callbacks=[reduce_lr, early_stop],
        epochs=100,
        batch_size=128,
    )
    # Evaluated on the hold-out test split, not on the validation fold.
    score = modelo.evaluate(X_test, y_test, verbose=0)

    print(f'fold {fn+1} --> log_loss: {round(score,5)}')
    print('--------------------------')
    modelo_nn.append(modelo)
    log_nn.append(score)

In [None]:
modelo_nn, log_nn = [],[]
for (fn, (train_ind, val_ind)) in enumerate(cv_s.split(X_train, y_train)):
    X_tr, X_val = np.array(X_train[train_ind]), np.array(X_train[val_ind])
    y_tr, y_val = np.array(y_train[train_ind]), np.array(y_train[val_ind])
    print(f'fold {fn+1}')        
    #Cheking empty columns
    check_for_empty_cols = np.where(y_tr.sum(axis = 0) == 0)[0]
    if len(check_for_empty_cols):
                y_tr[0,check_for_empty_cols] = 1
    
    modelo = model_NN(autoencoder)

    #Training
    modelo.fit(X_tr, y_tr, validation_data=(X_val, y_val), callbacks=[reduce_lr, early_stop], epochs=100, batch_size=128)
    score = modelo.evaluate(X_test, y_test, verbose=0)

    # y_pred = modelo.predict(X_val)
    
    # log_v = loga_loss(y_val, y_pred, eps=1e-15)
    print(f'fold {fn+1} --> log_loss: {round(score,5)}')
    print('--------------------------')
    modelo_nn.append(modelo)
    log_nn.append(score)

In [None]:
# `modelo` is the model left bound by the last training loop that was executed, i.e. the
# model of its final fold; this cell depends on that loop having been run.
ypred =  modelo.predict(X_test)

In [None]:
# Element-wise log loss of the predictions on the hold-out test split, reported as its mean.
NN = loga_loss(y_test, ypred, eps=1e-15)
print(f'log_loss: {round(np.mean(NN),3)}')