# Baseline
Work done by Fronsdal, T. et al (2021).

Fronsdal, T. (2021). Deep Learning Approaches for Predicting Drug
Mechanisms of Action (Healthcare). [URL paper](http://cs230.stanford.edu/projects_winter_2021/reports/70500795.pdf)

In [1]:
#######################
###  Global paths   ###
#######################
custom_module_path = f'/content/drive/MyDrive/MoA/utilites'
dataset_path = f'/content/drive/MyDrive/MoA/dataset'

In [2]:
#######################
### Library imports ###
#######################

# standard library
import os
import sys
import pickle
import copy

# data packages
import numpy as np
import pandas as pd

# pytorch
import torch
import torch.nn as nn

# sklearn 
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

#custom tooling
sys.path.append(custom_module_path)
import models
import preprocess

In [3]:
########################
### Global variables ###
########################
label_smoothing = True
smoothing = 0.001
device = ("cuda" if torch.cuda.is_available() else "cpu")
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
###################
### Import Data ###
###################
drugs = pd.read_csv(f'{dataset_path}/train_drug.csv')
train_drug = pd.read_csv(f'{dataset_path}/train_drug.csv')

X = pd.read_csv(f'{dataset_path}/train_features.csv')
y = pd.read_csv(f'{dataset_path}/train_targets_scored.csv')

X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
##########################
### Data Preprocessing ###
##########################
target_names = y_test.drop(["sig_id"], axis = 1).columns

transformer = preprocess.Preprocessor() 
transformer.fit(X)
X = transformer.transform(X)
X_test = transformer.transform(X_test)
y = y.drop(["sig_id"], axis = 1).values.astype("float32")
y_test = y_test.drop(["sig_id"], axis = 1).values.astype("float32")

In [6]:
# Define network architecture 
n_input = X.shape[1]
n_output = y.shape[1]
hidden_units = 256
dropout = 0.4

net_obj = models.Sequential(
    nn.Linear(n_input, hidden_units),
    nn.BatchNorm1d(hidden_units),
    nn.LeakyReLU(),
    nn.Dropout(dropout),
    nn.Linear(hidden_units, hidden_units),
    nn.BatchNorm1d(hidden_units),
    nn.LeakyReLU(),
    nn.Dropout(dropout),
    nn.Linear(hidden_units, n_output)
)

log_loss = nn.BCEWithLogitsLoss()
smoothed_log_loss = models.SmoothCrossEntropyLoss(smoothing=smoothing, device=device)

if label_smoothing:
    loss = smoothed_log_loss
else:
    loss = log_loss

# Initialize network
net = models.Network(
    net_obj=net_obj, 
    max_epochs=20,
    batch_size=128, 
    device=device,
    loss_fn=loss, 
    lr=0.01,
    weight_decay=0.00001,
    lr_scheduler="ReduceLROnPlateau",
    seed=2021
)

net.fit(
    X,
    y,
    eval_set=[(X_test, y_test)],
    eval_names=['test'],
    eval_metric=[log_loss],
    verbose=1
)

net.metric_history_df_.tail(4)

{'epoch': 0, 'data': 'test', 'metric': 'metric_0', 'value': 0.018977610394358635}
{'epoch': 0, 'data': 'test', 'metric': 'objective', 'value': 0.022002551704645157}
{'epoch': 0, 'data': 'train', 'metric': 'metric_0', 'value': 0.018207622691988945}
{'epoch': 0, 'data': 'train', 'metric': 'objective', 'value': 0.021240711212158203}
{'epoch': 1, 'data': 'test', 'metric': 'metric_0', 'value': 0.01802409254014492}
{'epoch': 1, 'data': 'test', 'metric': 'objective', 'value': 0.021006641909480095}
{'epoch': 1, 'data': 'train', 'metric': 'metric_0', 'value': 0.017155269160866737}
{'epoch': 1, 'data': 'train', 'metric': 'objective', 'value': 0.020144015550613403}
{'epoch': 2, 'data': 'test', 'metric': 'metric_0', 'value': 0.017913755029439926}
{'epoch': 2, 'data': 'test', 'metric': 'objective', 'value': 0.020932117477059364}
{'epoch': 2, 'data': 'train', 'metric': 'metric_0', 'value': 0.017077457159757614}
{'epoch': 2, 'data': 'train', 'metric': 'objective', 'value': 0.020105473697185516}
{'epo

Unnamed: 0,epoch,data,metric,value
76,19,test,metric_0,0.016605
77,19,test,objective,0.019729
78,19,train,metric_0,0.015322
79,19,train,objective,0.018456


In [7]:
########################
###    Evaluation    ###
########################

def multi_log_loss(y_pred, y_true):
    losses = -y_true * np.log(y_pred + 1e-15) - (1 - y_true) * np.log(1 - y_pred + 1e-15)
    return np.mean(losses)

preds = net.predict_proba(X)
test_probs = net.predict_proba(X_test)

print('Train loss: ', multi_log_loss(preds, y))
print('Test loss: ', multi_log_loss(test_probs, y_test))

test_probs = np.argmax(net.predict_proba(X_test), axis=1) 
y_test = np.argmax(y_test,axis=1)

print('F1 score :', f1_score(test_probs, y_test,average='weighted'))

Train loss:  0.015322392
Test loss:  0.016604517
F1 score : 0.14401070894185086
