In [1]:
#######################
###  Global paths   ###
#######################
custom_module_path = f'/content/drive/MyDrive/MoA/utilites'
dataset_path = f'/content/drive/MyDrive/MoA/dataset'

In [2]:
#######################
### Library imports ###
#######################
# standard library
import os
import sys

# data packages
import numpy as np
import pandas as pd

# tensorflow
import tensorflow as tf

# sklearn 
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

#custom tooling
sys.path.append(custom_module_path)
import preprocess

In [3]:
###################
###    Models   ###
###################

def cnn(input_shape, num_classes, num_filters, size_kernel, drop):
    
    model = tf.keras.models.Sequential()
    
    # Convolucional layer
    model.add(tf.keras.layers.Conv1D(filters = num_filters, kernel_size = size_kernel, activation='relu', input_shape= input_shape[1:]))
    model.add(tf.keras.layers.MaxPooling1D())
    #model.add(tf.keras.layers.Conv1D(filters = num_filters, kernel_size = size_kernel, activation='relu', input_shape= input_shape[1:])   
    #model.add(tf.keras.layers.MaxPooling1D())

    # Dense layer
    model.add(tf.keras.layers.Flatten())    
    model.add(tf.keras.layers.Dense(128, activation='relu'))
    model.add(tf.keras.layers.Dropout(drop))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dropout(drop))
    model.add(tf.keras.layers.Dense(num_classes, activation='sigmoid'))

    # Reshape output
    model.add(tf.keras.layers.Reshape((num_classes,1)))
    
    return model

In [4]:
###################
###  Utilities  ###
###################

# Implementation BCEWithLogitsLoss of pytorch with keras
# https://stackoverflow.com/questions/59669860/implementing-bcewithlogitsloss-from-pytorch-in-keras

def split_data(X,y, size_test=0.1):
    X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=size_test, random_state=19)
    return X_tr,X_test,y_tr,y_test 

def predict_proba(preds):
    preds_proba = 1 / (1 + np.exp(-preds))
    return preds_proba.astype("float32")

def multi_log_loss(y_pred, y_true):
    losses = -y_true * np.log(y_pred + 1e-15) - (1 - y_true) * np.log(1 - y_pred + 1e-15)
    return np.mean(losses)

def preprocess_data(X,y):
    transformer = preprocess.Preprocessor() 
    transformer.fit(X)
    X = transformer.transform(X)
    y = y.drop(["sig_id"], axis = 1).values.astype("float32") 
    return pd.DataFrame(X),pd.DataFrame(y)

def reshape_data(data):
    nrows, nclos = data.shape
    return data.reshape(nrows, nclos,1)

def get_f1_score(model, X_train, X_val, y_train, y_val):
    optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimiser,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=30, verbose=0)
    
    y_predict = np.argmax(predict_proba(X_val), axis=1) 
    y_val = np.argmax(y_val,axis=1)
    mf1 = f1_score(y_val, y_predict,average='weighted')
    return mf1 

def cross_validation(X,Y,models):
    kf = KFold(n_splits = 10, shuffle= True)

    for m, values in models.items():
        print(f'Cross Validation for model {values[0]}\n')

        for train_index, val_index in kf.split(X):
          X_train, X_val = X.iloc[train_index,], X.iloc[val_index,]
          y_train, y_val = Y.iloc[train_index], Y.iloc[val_index]
          
          X_train, X_val = np.array(X_train), np.array(X_val)
          y_train, y_val = y_train.values.astype("float32"), y_val.values.astype("float32")
          
          X_train, X_val = reshape_data(X_train), reshape_data(X_val)
          y_train, y_val = reshape_data(y_train), reshape_data(y_val)
 
          values.append(get_f1_score(m, X_train, X_val, y_train, y_val))

        print(f'Done model {values[0]}\n')
    print(f'Done')

In [5]:
drugs = pd.read_csv(f'{dataset_path}/train_drug.csv')
train_drug = pd.read_csv(f'{dataset_path}/train_drug.csv')
X = pd.read_csv(f'{dataset_path}/train_features.csv')
y = pd.read_csv(f'{dataset_path}/train_targets_scored.csv')

X,y = preprocess_data(X,y)
X, X_test, y, y_test = split_data(X,y)

# cnn(input_shape, num_classes, num_filters, size_kernel, drop)
models = {cnn((21432, 877, 1), 206, 1, 3, 0.2):['cnn_0'],
          cnn((21432, 877, 1), 206, 1, 3, 0.4):['cnn_1'],
          cnn((21432, 877, 1), 206, 1, 5, 0.2):['cnn_2'],
          cnn((21432, 877, 1), 206, 1, 5, 0.4):['cnn_3'],
          cnn((21432, 877, 1), 206, 5, 3, 0.2):['cnn_4'],
          cnn((21432, 877, 1), 206, 5, 3, 0.4):['cnn_5'],             
          cnn((21432, 877, 1), 206, 5, 5, 0.2):['cnn_6'],
          cnn((21432, 877, 1), 206, 5, 5, 0.4):['cnn_7'],
          }

In [6]:
cross_validation(X,y,models)

Cross Validation for model cnn_0

Done model cnn_0

Cross Validation for model cnn_1

Done model cnn_1

Cross Validation for model cnn_2

Done model cnn_2

Cross Validation for model cnn_3

Done model cnn_3

Cross Validation for model cnn_4

Done model cnn_4

Cross Validation for model cnn_5

Done model cnn_5

Cross Validation for model cnn_6

Done model cnn_6

Cross Validation for model cnn_7

Done model cnn_7

Done


In [7]:
for m,values in models.items():
    print(f' model {values[0]} weighted f1-score mean is {np.mean(values[1:])}')

 model cnn_0 weighted f1-score mean is 0.0009557834926042432
 model cnn_1 weighted f1-score mean is 0.0009676126484348086
 model cnn_2 weighted f1-score mean is 0.0008884552932962788
 model cnn_3 weighted f1-score mean is 0.0010302318412807898
 model cnn_4 weighted f1-score mean is 0.0009833984086535527
 model cnn_5 weighted f1-score mean is 0.0010113169825709226
 model cnn_6 weighted f1-score mean is 0.0009071672995420532
 model cnn_7 weighted f1-score mean is 0.000985270811751334
