In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.mongoexp import MongoTrials
from types import SimpleNamespace

import os
from datetime import datetime
import sys
import time
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold  # trainML
except:
    sys.path.append("../input/iterative-stratification")  # kaggle
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
BASE_PATH = os.environ.get('TRAINML_DATA_PATH') if os.environ.get('TRAINML_DATA_PATH') else '../input/lish-moa'
BASE_PATH

'/opt/trainml/input'

In [3]:
train_features = pd.read_csv(f'{BASE_PATH}/train_features.csv')
train_targets = pd.read_csv(f'{BASE_PATH}/train_targets_scored.csv')
test_features = pd.read_csv(f'{BASE_PATH}/test_features.csv')

sample_submission = pd.read_csv(f'{BASE_PATH}/sample_submission.csv')

### Data preprocessing

In [4]:
def preprocess(df):
    df = df.copy()
    df.loc[:, 'cp_type'] = df.loc[:, 'cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

train_data = preprocess(train_features)
test_data = preprocess(test_features)

del train_targets['sig_id']

train_targets = train_targets.loc[train_data['cp_type']==0].reset_index(drop=True)
train_data = train_data.loc[train_data['cp_type']==0].reset_index(drop=True)

In [5]:
X_original = train_data.values
Y_original = train_targets.values

X_test = test_data.values

### Data augmentation helper functions

Currently only doing data oversampling with MLSMOTE algorithm.

In [6]:
def get_tail_labels(df: pd.DataFrame, ql=[0.03, 1.]) -> list:
    " Find the underepresented targets a.k.a. minority labels. "
    irlbl = df.sum(axis=0)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & ((irlbl < irlbl.quantile(ql[1])))]  # Filtering
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    tail_labels = irlbl[irlbl > threshold_irlbl].index.tolist()
    return tail_labels

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.03, 1.]):
    " Find minority samples associated with minority labels. "
    tail_labels = get_tail_labels(y, ql=ql)
    index = y[y[tail_labels].apply(lambda x: (x == 1).any(), axis=1)].index.tolist()
    
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    " Find nearest neighbors for each sample in X dataframe. "
    nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree').fit(X)
    euclidean, indices = nbs.kneighbors(X)
    return indices

def MLSMOTE(X, y, n_samples, n_neighbors=5):
    " Generate new samples using MLSMOTE algorithm. "
    indices2 = nearest_neighbour(X, neigh=n_neighbors)
    n = len(indices2)
    new_X = np.zeros((n_samples, X.shape[1]))
    target = np.zeros((n_samples, y.shape[1]))
    for i in range(n_samples):
        reference = random.randint(0, n-1)
        neighbor = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val > 0 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbor,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target

def augment_data(X, y, oversample_args: tuple):
    " Augment feature/targets data (just doing oversampling for now)"
    n_samples, n_neighbors = oversample_args

    X_sub, y_sub = get_minority_samples(X, y)
    X_res, y_res = MLSMOTE(X_sub, y_sub, n_samples, n_neighbors)
    X_augmented = pd.concat([X, X_res])
    y_augmented = pd.concat([y, y_res])
    return X_augmented, y_augmented

### Define model, dataset, gradient descent

In [7]:
class MoaModel(nn.Module):
    def __init__(    
        self,
        n_columns,
        n_targets,
        layer1_outputs,
        layer1_dropout,
        layer2_outputs,
        layer2_dropout,
        layer3_enable,
        layer3_outputs,
        layer3_dropout,
        final_layer_dropout,
    ):
        super(MoaModel, self).__init__()   
        self.batch_norm1 = nn.BatchNorm1d(n_columns)
        self.dropout1 = nn.Dropout(layer1_dropout)
        self.dense1 = nn.utils.weight_norm(nn.Linear(n_columns, layer1_outputs))
        
        self.batch_norm2 = nn.BatchNorm1d(layer1_outputs)
        self.dropout2 = nn.Dropout(layer2_dropout)
        self.dense2 = nn.utils.weight_norm(nn.Linear(layer1_outputs, layer2_outputs))
        
        self.layer3 = layer3_enable
        if self.layer3:
            self.batch_norm3 = nn.BatchNorm1d(layer2_outputs)
            self.dropout3 = nn.Dropout(layer3_dropout)
            self.dense3 = nn.utils.weight_norm(
                nn.Linear(layer2_outputs, layer3_outputs)
            )
            
        final_layer_inputs = layer3_outputs if self.layer3 else layer2_outputs
        self.batch_norm_final = nn.BatchNorm1d(final_layer_inputs)
        self.dropout_final = nn.Dropout(final_layer_dropout)
        self.dense_final = nn.utils.weight_norm(nn.Linear(final_layer_inputs, n_targets))
        
    def forward(self, X):
        X = self.batch_norm1(X)
        X = self.dropout1(X)
        X = F.relu(self.dense1(X))
        
        X = self.batch_norm2(X)
        X = self.dropout2(X)
        X = F.relu(self.dense2(X))
        
        if self.layer3:
            X = self.batch_norm3(X)
            X = self.dropout3(X)
            X = F.relu(self.dense3(X))
            
        X = self.batch_norm_final(X)
        X = self.dropout_final(X)
        X = F.sigmoid(self.dense_final(X))
        
        return X
    
    def _load_from_file(self, file):
        self.load_state_dict(torch.load(file))
        
    def save(self, file):
        torch.save(self.state_dict(), file)
        
def batch_gd(model, device, criterion, optimizer, train_loader, val_loader, epochs):
    train_losses = np.zeros(epochs)
    val_losses = np.zeros(epochs)
    for it in range(epochs):
        t0 = datetime.now()

        model.train()
        train_loss = []
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item() / len(train_loader))

        train_loss = np.mean(train_loss)

        model.eval()
        val_loss = []
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss.append(loss.item() / len(val_loader))
        val_loss = np.mean(val_loss)

        train_losses[it] = train_loss
        val_losses[it] = val_loss

        dt = datetime.now() - t0
#         print(
#             f"Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Duration {dt}"
#         )

    return train_losses, val_losses

In [8]:
class MoaDataset(Dataset):
    def __init__(self, features, targets, mode="train"):
        self.mode = mode
        self.data = features
        if mode == "train":
            self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.mode == "train":
            return torch.FloatTensor(self.data[idx]), torch.FloatTensor(
                self.targets[idx]
            )
        elif self.mode == "eval":
            return torch.FloatTensor(self.data[idx]), 0

### Helper functions for hyperopt's `objective`

In [9]:
def augment_data(X, y, n_samples, n_neighbors):
    """
    Augment feature/targets data with oversampling (using MLSMOTE algorithm)
    """
    X_sub, y_sub = get_minority_samples(X, y)
    X_res, y_res = MLSMOTE(X_sub, y_sub, n_samples, n_neighbors)
    X_augmented = pd.concat([X, X_res])
    y_augmented = pd.concat([y, y_res])
    return X_augmented, y_augmented


def make_model(
        n_columns,
        n_targets,
        layer1_outputs,
        layer1_dropout,
        layer2_outputs,
        layer2_dropout,
        layer3_enable,
        layer3_outputs,
        layer3_dropout,
        final_layer_dropout,
        device
    ):
    """
    Make a 2 or 3 layer neural network with specified outputs and dropout per layer.
    """
    model = MoaModel(n_columns, n_targets, layer1_outputs, layer1_dropout, layer2_outputs,
                   layer2_dropout, layer3_enable, layer3_outputs, layer3_dropout, final_layer_dropout)
    model.to(device)
    return model


def train(model, device, optimizer, X, Y, n_splits, batch_size, epochs):
    """
    Run model training with a stratified k-fold cross validation split according to specified
    parameters. Returns tuple (train_losses, val_losses).
    """
    kfold = MultilabelStratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    criterion = nn.BCELoss()

    train_losses = np.array([])
    val_losses = np.array([])

    for n, (tr, te) in enumerate(kfold.split(X, Y)):
        X_train, X_val = X[tr], X[te]
        y_train, y_val = Y[tr], Y[te]

        train_dataset = MoaDataset(X_train, y_train)
        val_dataset = MoaDataset(X_val, y_val)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset, batch_size=batch_size, shuffle=True
        )
        val_loader = torch.utils.data.DataLoader(
            dataset=val_dataset, batch_size=batch_size, shuffle=False
        )
        split_train_losses, split_val_losses = batch_gd(
            model, device, criterion, optimizer, train_loader, val_loader, epochs
        )
#         print(
#             f"Fold {n+1}, final train loss: {split_train_losses[epochs-1]:5.5f}, final train loss: {split_val_losses[epochs-1]:5.5f}"
#         )
        train_losses = np.concatenate((train_losses, split_train_losses))
        val_losses = np.concatenate((val_losses, split_val_losses))

    model.save("latest_model")
    return train_losses, val_losses


def predict(model, device, data_loader):
    """
    Generate label predictions given trained model and feature data.
    """
    model.eval()
    preds = []

    for inputs, _ in data_loader:
        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(inputs)

        preds.append(outputs.detach().cpu().numpy())

    preds = np.concatenate(preds)

    return preds


def calculate_loss(model, device, X_eval, Y_eval, batch_size):
    """
    Feed original data set into trained model and generate log-loss score.
    """
    eval_dataset = MoaDataset(X_eval, None, mode='eval')
    eval_loader = torch.utils.data.DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
    preds = predict(model, device, eval_loader)
    
    loss_fn = nn.BCELoss()
    Y_pred = torch.from_numpy(preds.astype(float))
    Y_true = torch.from_numpy(Y_eval.astype(float))
    loss = loss_fn(Y_pred, Y_true)
    return loss.item()

### Hyperopt config - objective function and search space

In [10]:
def objective(space):
    args = SimpleNamespace(**space)
    
    # Augment data
    X_original = args.train_data.values
    Y_original = args.train_targets.values
    train_data, train_targets = augment_data(args.train_data, args.train_targets, args.n_samples, args.n_neighbors)
    X = train_data.values
    Y = train_targets.values
    
    # Build model architecture
    device = args.device
    with torch.cuda.device('cuda:0'):
        torch.cuda.empty_cache()
    model = make_model(
                args.n_columns,
                args.n_targets,
                args.layer1_outputs,
                args.layer1_dropout,
                args.layer2_outputs,
                args.layer2_dropout,
                args.layer3_enable,
                args.layer3_outputs,
                args.layer3_dropout,
                args.final_layer_dropout,
                device
    )
    
    # Train model
    optimizer = None
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters())
    elif args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters())
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    elif args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(model.parameters())
    train_losses, val_losses = train(model, device, optimizer, X, Y, args.n_splits, args.batch_size, args.epochs)
    
    # Calculate loss
    loss = calculate_loss(model, device, X_original, Y_original, args.batch_size)

    return {
        'loss': loss,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'model_result': model,
        'status': STATUS_OK,
        'eval_time': time.time()
    }
    

space = {
    # general
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'train_data': train_data,
    'train_targets': train_targets,
    
    # data augmentation
    'n_samples': hp.randint('n_samples', 5000),
    'n_neighbors': 3 + hp.randint('n_neighbors', 10),
    
    # model architecture
    'n_columns': train_data.shape[1],
    'n_targets': train_targets.shape[1],
    'layer1_outputs': 256 + hp.randint('layer1_outputs', 4096),
    'layer1_dropout': hp.uniform('layer1_dropout', 0, 1),
    'layer2_outputs': 256 + hp.randint('layer2_outputs', 4096),
    'layer2_dropout': hp.uniform('layer2_dropout', 0, 1),
    'layer3_enable': hp.choice('layer3_enable', [True, False]),
    'layer3_outputs': 256 + hp.randint('layer3_outputs', 4096),
    'layer3_dropout': hp.uniform('layer3_dropout', 0, 1),
    'final_layer_dropout': hp.uniform('final_layer_dropout', 0, 1),
    
    # cross-validation
    'optimizer': hp.choice('optimizer', ['adam', 'adagrad', 'sgd', 'rmsprop']),
    'n_splits': 5 + hp.randint('n_splits', 10),
    'batch_size': 64 + hp.randint('batch_size', 4096),
    'epochs': 10 + hp.randint('epochs', 90)
}

### Run model training via hyperopt

In [11]:
trials = Trials() # TODO: Make it MongoTrials
best = fmin(
    objective,
    space=space,
    algo=tpe.suggest,
    max_evals=75,
    trials=trials
    # max_queue_len=4 <-- what again is this used for? multiple workers?
)
print(best)

  0%|          | 0/75 [00:00<?, ?trial/s, best loss=?]





  1%|▏         | 1/75 [01:43<2:07:09, 103.10s/trial, best loss: 0.010824652442552133]





  3%|▎         | 2/75 [05:29<2:50:17, 139.97s/trial, best loss: 0.010824652442552133]





  4%|▍         | 3/75 [09:56<3:33:48, 178.17s/trial, best loss: 0.010824652442552133]





  5%|▌         | 4/75 [22:08<6:47:21, 344.24s/trial, best loss: 0.010824652442552133]





  7%|▋         | 5/75 [42:10<11:41:48, 601.55s/trial, best loss: 0.010824652442552133]





  8%|▊         | 6/75 [52:39<11:41:23, 609.91s/trial, best loss: 0.010824652442552133]





  9%|▉         | 7/75 [53:47<8:27:07, 447.46s/trial, best loss: 0.010824652442552133] 





 11%|█         | 8/75 [58:02<7:15:03, 389.61s/trial, best loss: 0.0023574808074626795]





 12%|█▏        | 9/75 [1:08:44<8:31:52, 465.34s/trial, best loss: 0.0023574808074626795]





 13%|█▎        | 10/75 [1:12:48<7:12:07, 398.88s/trial, best loss: 0.0023574808074626795]





 15%|█▍        | 11/75 [1:20:01<7:16:33, 409.27s/trial, best loss: 0.0023574808074626795]





 16%|█▌        | 12/75 [1:25:29<6:44:03, 384.82s/trial, best loss: 0.0023574808074626795]





 17%|█▋        | 13/75 [1:29:24<5:51:01, 339.71s/trial, best loss: 0.0023574808074626795]





 19%|█▊        | 14/75 [1:38:18<6:44:41, 398.05s/trial, best loss: 0.0023574808074626795]





 20%|██        | 15/75 [1:51:43<8:40:07, 520.13s/trial, best loss: 0.0011169380490011344]





 21%|██▏       | 16/75 [1:59:40<8:18:53, 507.35s/trial, best loss: 0.0011169380490011344]





 23%|██▎       | 17/75 [2:05:48<7:29:54, 465.42s/trial, best loss: 0.0011169380490011344]





 24%|██▍       | 18/75 [2:09:07<6:06:07, 385.39s/trial, best loss: 0.0011169380490011344]





 25%|██▌       | 19/75 [2:14:19<5:39:14, 363.46s/trial, best loss: 0.0011169380490011344]





 27%|██▋       | 20/75 [2:18:01<4:54:11, 320.94s/trial, best loss: 0.0011169380490011344]





 28%|██▊       | 21/75 [2:31:50<7:06:10, 473.53s/trial, best loss: 0.0011169380490011344]





 29%|██▉       | 22/75 [2:42:22<7:40:20, 521.14s/trial, best loss: 0.0011169380490011344]





 31%|███       | 23/75 [2:47:16<6:32:30, 452.88s/trial, best loss: 0.0011169380490011344]





 32%|███▏      | 24/75 [2:52:22<5:47:36, 408.96s/trial, best loss: 0.0011169380490011344]





 33%|███▎      | 25/75 [3:01:45<6:19:12, 455.05s/trial, best loss: 0.0011169380490011344]





 35%|███▍      | 26/75 [3:08:16<5:55:58, 435.88s/trial, best loss: 0.0002038472425409562]





 36%|███▌      | 27/75 [3:17:15<6:13:17, 466.62s/trial, best loss: 0.0002038472425409562]





 37%|███▋      | 28/75 [3:23:02<5:37:24, 430.74s/trial, best loss: 0.0002038472425409562]





 39%|███▊      | 29/75 [3:27:04<4:46:52, 374.19s/trial, best loss: 0.0002038472425409562]





 40%|████      | 30/75 [3:41:21<6:29:15, 519.01s/trial, best loss: 0.0002038472425409562]





 41%|████▏     | 31/75 [3:55:10<7:28:48, 612.02s/trial, best loss: 0.0002038472425409562]





 43%|████▎     | 32/75 [4:03:08<6:49:47, 571.80s/trial, best loss: 0.0002038472425409562]





 44%|████▍     | 33/75 [4:15:15<7:13:00, 618.57s/trial, best loss: 0.0002038472425409562]





 45%|████▌     | 34/75 [4:20:24<5:59:14, 525.72s/trial, best loss: 0.0002038472425409562]





 47%|████▋     | 35/75 [4:26:56<5:23:39, 485.48s/trial, best loss: 0.0002038472425409562]





 48%|████▊     | 36/75 [4:39:06<6:03:08, 558.67s/trial, best loss: 0.0002038472425409562]





 51%|█████     | 38/75 [4:56:00<5:29:30, 534.35s/trial, best loss: 0.0002038472425409562]





 52%|█████▏    | 39/75 [5:08:30<5:59:20, 598.90s/trial, best loss: 0.0002038472425409562]





 53%|█████▎    | 40/75 [5:14:05<5:03:12, 519.79s/trial, best loss: 0.0002038472425409562]





 55%|█████▍    | 41/75 [5:20:01<4:26:40, 470.59s/trial, best loss: 0.0002038472425409562]





 56%|█████▌    | 42/75 [5:26:42<4:07:29, 449.97s/trial, best loss: 4.2018466896342e-05]  





 57%|█████▋    | 43/75 [5:43:01<5:24:33, 608.56s/trial, best loss: 4.2018466896342e-05]





 59%|█████▊    | 44/75 [5:46:47<4:15:07, 493.80s/trial, best loss: 4.2018466896342e-05]





 60%|██████    | 45/75 [5:52:40<3:45:41, 451.38s/trial, best loss: 4.2018466896342e-05]





 61%|██████▏   | 46/75 [5:58:30<3:23:34, 421.20s/trial, best loss: 4.2018466896342e-05]





 63%|██████▎   | 47/75 [6:05:15<3:14:16, 416.30s/trial, best loss: 4.2018466896342e-05]





 64%|██████▍   | 48/75 [6:10:44<2:55:31, 390.07s/trial, best loss: 4.2018466896342e-05]





 65%|██████▌   | 49/75 [6:17:34<2:51:34, 395.95s/trial, best loss: 4.2018466896342e-05]





 67%|██████▋   | 50/75 [6:21:34<2:25:27, 349.12s/trial, best loss: 4.2018466896342e-05]





 68%|██████▊   | 51/75 [6:38:03<3:36:30, 541.29s/trial, best loss: 4.2018466896342e-05]





 69%|██████▉   | 52/75 [6:47:52<3:32:55, 555.44s/trial, best loss: 4.2018466896342e-05]





 71%|███████   | 53/75 [6:57:54<3:28:49, 569.51s/trial, best loss: 4.2018466896342e-05]





 72%|███████▏  | 54/75 [7:06:13<3:11:52, 548.20s/trial, best loss: 4.2018466896342e-05]





 73%|███████▎  | 55/75 [7:13:54<2:54:01, 522.07s/trial, best loss: 4.2018466896342e-05]





 75%|███████▍  | 56/75 [7:20:38<2:34:05, 486.63s/trial, best loss: 4.2018466896342e-05]





 76%|███████▌  | 57/75 [7:27:40<2:20:14, 467.47s/trial, best loss: 4.2018466896342e-05]



job exception: CUDA out of memory. Tried to allocate 60.00 MiB (GPU 0; 7.80 GiB total capacity; 6.72 GiB already allocated; 10.31 MiB free; 6.96 GiB reserved in total by PyTorch)



 76%|███████▌  | 57/75 [7:27:59<2:21:28, 471.58s/trial, best loss: 4.2018466896342e-05]


RuntimeError: CUDA out of memory. Tried to allocate 60.00 MiB (GPU 0; 7.80 GiB total capacity; 6.72 GiB already allocated; 10.31 MiB free; 6.96 GiB reserved in total by PyTorch)

---

#### **NOTE:** GPU ran out of memory after training the 57th model! After each trial's loss is calculated, we need to get model out of GPU memory. See my workaround in 

---
