In [19]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from autoPyTorch import AutoNetMultilabel

import os
from datetime import datetime
import sys
import json
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold  # trainML
except ImportError:
    # Only a missing package should trigger the Kaggle fallback; any other
    # failure (e.g. KeyboardInterrupt or an error raised inside the package)
    # must surface instead of being silently swallowed.
    sys.path.append("../input/iterative-stratification")  # kaggle
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
# Data directory: taken from TRAINML_DATA_PATH when that variable is set to a
# non-empty value, otherwise the Kaggle input folder.
BASE_PATH = os.environ.get('TRAINML_DATA_PATH') or '../input/lish-moa'
BASE_PATH

'/opt/data'

In [3]:
# Training features, scored training targets and test features, all read
# from BASE_PATH.
train_features, train_targets, test_features = (
    pd.read_csv(f'{BASE_PATH}/train_features.csv'),
    pd.read_csv(f'{BASE_PATH}/train_targets_scored.csv'),
    pd.read_csv(f'{BASE_PATH}/test_features.csv'),
)

# Submission template; its target columns are overwritten with predictions
# further down.
sample_submission = pd.read_csv(f'{BASE_PATH}/sample_submission.csv')

In [6]:
def preprocess(df):
    """Return an encoded copy of a features frame; ``df`` itself is not modified.

    Encoding applied:
      * ``cp_type``: ``'trt_cp'`` -> 0, ``'ctl_vehicle'`` -> 1
      * ``cp_dose``: ``'D1'`` -> 0, ``'D2'`` -> 1
      * ``sig_id`` is dropped.

    Values that are not keys of the mappings become NaN (``Series.map``).

    Args:
        df: DataFrame containing at least ``sig_id``, ``cp_type`` and ``cp_dose``.

    Returns:
        A new DataFrame with the two columns encoded and ``sig_id`` removed.
    """
    df = df.copy()
    # Assign the whole column rather than ``df.loc[:, col] = ...``: since
    # pandas 2.0 the ``.loc`` form writes into the existing (object) column,
    # so the encoded values would stay object-typed and ``df.values`` would
    # become an object array instead of a numeric matrix.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

# Encode the raw feature tables.
train_data = preprocess(train_features)
test_data = preprocess(test_features)

# The targets table is used as a pure label matrix from here on.
del train_targets['sig_id']

# Keep only rows whose encoded cp_type is 0 ('trt_cp'). The mask is computed
# once, before train_data itself is filtered, and applied to both tables so
# that features and targets stay row-aligned.
is_treated = train_data['cp_type'] == 0
train_targets = train_targets.loc[is_treated].reset_index(drop=True)
train_data = train_data.loc[is_treated].reset_index(drop=True)

In [7]:
# Data augmentation
def get_tail_labels(df: pd.DataFrame, ql=(0.03, 1.)) -> list:
    """Find the underrepresented targets, a.k.a. minority (tail) labels.

    Steps:
      1. Sum every column of ``df`` to get a per-label count.
      2. Keep labels whose count lies strictly between the ``ql[0]`` and
         ``ql[1]`` quantiles of those counts.
      3. Compute the imbalance ratio ``max_count / count`` for the kept labels.
      4. Return the labels whose ratio exceeds the median ratio.

    Args:
        df: label DataFrame (one column per label).
        ql: pair ``(lower_quantile, upper_quantile)``; any indexable of
            length >= 2 (list or tuple) is accepted.

    Returns:
        List of column names considered tail labels (empty if none qualify).
    """
    label_counts = df.sum(axis=0)
    lower_bound = label_counts.quantile(ql[0])
    upper_bound = label_counts.quantile(ql[1])
    # Both comparisons are strict, so labels sitting exactly on a quantile
    # (e.g. the most frequent label when ql[1] == 1) are filtered out.
    kept_counts = label_counts[(label_counts > lower_bound) & (label_counts < upper_bound)]
    imbalance_ratio = kept_counts.max() / kept_counts
    threshold = imbalance_ratio.median()
    return imbalance_ratio[imbalance_ratio > threshold].index.tolist()

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=(0.03, 1.)):
    """Select the samples that carry at least one minority (tail) label.

    Args:
        X: feature DataFrame.
        y: label DataFrame; rows are matched to ``X`` through index labels.
        ql: quantile pair forwarded to ``get_tail_labels``.

    Returns:
        ``(X_sub, y_sub)``: the selected rows of ``X`` and ``y``, each with a
        fresh 0..k-1 index. If no tail label is found the mask is all False
        and both frames are empty.
    """
    tail_labels = get_tail_labels(y, ql=ql)
    # Vectorised equivalent of a row-wise ``apply(lambda r: (r == 1).any())``;
    # avoids one Python call per sample.
    has_tail_label = (y[tail_labels] == 1).any(axis=1)
    index = y.index[has_tail_label.to_numpy()].tolist()

    X_sub = X[X.index.isin(index)].reset_index(drop=True)
    y_sub = y[y.index.isin(index)].reset_index(drop=True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    """Return, for every row of ``X``, the indices of its ``neigh`` nearest rows.

    A euclidean kd-tree ``NearestNeighbors`` model is fitted on ``X`` and then
    queried with ``X`` itself; only the index part of the ``kneighbors``
    result is returned (the distances are discarded).

    NOTE(review): the return annotation says ``list``, but the value is
    whatever ``kneighbors`` yields as indices (presumably a 2-D array, since
    callers index it as ``indices[row, 1:]``) - confirm.
    """
    knn = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree')
    knn.fit(X)
    _distances, neighbour_indices = knn.kneighbors(X)
    return neighbour_indices

def MLSMOTE(X, y, n_samples, n_neighbors=5):
    """Generate ``n_samples`` synthetic rows using the MLSMOTE algorithm.

    For every synthetic row:
      * a reference sample is drawn uniformly at random;
      * one of its nearest neighbours is drawn at random;
      * the new feature vector is ``reference + ratio * (neighbour - reference)``
        with ``ratio`` uniform in [0, 1), i.e. a point on the segment between
        the two samples;
      * the new label vector has a 1 wherever the reference or any of its
        neighbours has a positive label sum, else 0.

    Every feature column is interpolated as a continuous value.

    Args:
        X: numeric feature DataFrame.
        y: label DataFrame whose rows correspond positionally to ``X``.
        n_samples: number of synthetic rows to generate.
        n_neighbors: neighbourhood size passed to ``nearest_neighbour``.

    Returns:
        ``(new_X, target)``: DataFrames with the columns of ``X`` and ``y``
        and ``n_samples`` rows each.
    """
    neighbour_idx = nearest_neighbour(X, neigh=n_neighbors)
    n = len(neighbour_idx)

    # The neighbour indices are positional, so work on plain arrays instead
    # of label-based ``.loc`` lookups (which only agree with positions when
    # the index happens to be 0..n-1). This also avoids a pandas lookup per
    # generated sample.
    X_values = X.to_numpy(dtype=float)
    y_values = y.to_numpy(dtype=float)

    new_X = np.zeros((n_samples, X.shape[1]))
    target = np.zeros((n_samples, y.shape[1]))
    for i in range(n_samples):
        reference = random.randint(0, n - 1)
        # Column 0 is assumed to be the reference itself, hence the ``1:``.
        # ``tolist()`` because ``random.choice`` on a NumPy array raises on
        # Python 3.11 (it evaluates the truth value of the sequence).
        neighbor = random.choice(neighbour_idx[reference, 1:].tolist())
        all_point = neighbour_idx[reference]
        # Union of the labels found in the neighbourhood (NaNs ignored).
        target[i] = np.nansum(y_values[all_point], axis=0) > 0
        ratio = random.random()
        # Interpolate TOWARDS the neighbour. The previous
        # ``reference - neighbour`` gap pushed the synthetic point away from
        # the neighbour, outside the segment joining the two samples.
        gap = X_values[neighbor] - X_values[reference]
        new_X[i] = X_values[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target

def augment_data(X, y, oversample_args: tuple):
    """Augment feature/target data (currently: MLSMOTE oversampling only).

    Args:
        X: feature DataFrame.
        y: label DataFrame aligned with ``X``.
        oversample_args: ``(n_samples, n_neighbors)`` forwarded to ``MLSMOTE``.

    Returns:
        ``(X_augmented, y_augmented)``: the original rows followed by the
        synthetic rows, each with a fresh 0..N-1 index.
    """
    n_samples, n_neighbors = oversample_args

    X_sub, y_sub = get_minority_samples(X, y)
    X_res, y_res = MLSMOTE(X_sub, y_sub, n_samples, n_neighbors)
    # ignore_index: the synthetic frames are numbered from 0 as well, so a
    # plain concat would yield duplicated index labels.
    X_augmented = pd.concat([X, X_res], ignore_index=True)
    y_augmented = pd.concat([y, y_res], ignore_index=True)
    return X_augmented, y_augmented

In [8]:
# Seed for the stdlib ``random`` module, which MLSMOTE uses to pick reference
# samples, neighbours and interpolation ratios. Without it every run produces
# a different augmented training set.
SEED = 42
# (number of synthetic rows, neighbours per reference sample)
data_oversample_args = (1000, 5)

random.seed(SEED)
train_data_augmented, train_targets_augmented = augment_data(train_data, train_targets, data_oversample_args)

# Arrays used for fitting (original + synthetic rows) and for test prediction.
X = train_data_augmented.values
Y = train_targets_augmented.values
X_test = test_data.values

# Non-augmented training arrays, kept for scoring.
X_original = train_data.values
Y_original = train_targets.values

In [9]:
# Multilabel AutoNet built from the "medium_cs" preset; result logs are
# written under logs/.
autonet = AutoNetMultilabel(
    config_preset="medium_cs",
    result_logger_dir="logs/",
)
# Show the effective configuration as the cell output.
autonet.get_current_autonet_config()



{'embeddings': ['none'],
 'lr_scheduler': ['cosine_annealing', 'plateau'],
 'networks': ['shapedresnet'],
 'preprocessors': ['none', 'truncated_svd', 'power_transformer'],
 'result_logger_dir': 'logs/',
 'hyperparameter_search_space_updates': None,
 'categorical_features': None,
 'dataset_name': None,
 'run_id': '0',
 'task_id': -1,
 'algorithm': 'bohb',
 'portfolio_type': 'greedy',
 'budget_type': 'time',
 'eta': 3,
 'min_workers': 1,
 'working_dir': '.',
 'network_interface_name': 'eth0',
 'memory_limit_mb': 1000000,
 'use_tensorboard_logger': False,
 'run_worker_on_master_node': True,
 'use_pynisher': True,
 'validation_split': 0.3,
 'refit_validation_split': 0.0,
 'cross_validator': 'none',
 'cross_validator_args': {},
 'min_budget_for_cv': 0,
 'shuffle': True,
 'imputation_strategies': ['mean', 'median', 'most_frequent'],
 'normalization_strategies': ['none', 'minmax', 'standardize', 'maxabs'],
 'over_sampling_methods': ['none'],
 'under_sampling_methods': ['none'],
 'target_size_

In [10]:
# Search settings forwarded unchanged to autonet.fit. The configuration shown
# above reports budget_type 'time'; the exact units of the budget values are
# defined by Auto-PyTorch.
# NOTE(review): X contains the MLSMOTE rows, so the validation split is
# presumably drawn from augmented data - confirm this is intended.
fit_settings = {
    "validation_split": 0.3,
    "max_runtime": 300,
    "min_budget": 60,
    "max_budget": 100,
    "refit": True,
}
results_fit = autonet.fit(X_train=X, Y_train=Y, **fit_settings)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())


In [11]:
# Persist the value returned by autonet.fit as JSON next to the run logs.
with open("logs/results_fit.json", mode="w") as results_file:
    json.dump(results_fit, results_file)

In [13]:
# Score on the original (non-augmented) training rows.
# NOTE(review): these rows are also part of X passed to autonet.fit, so this
# is not a held-out estimate.
score = autonet.score(
    X_test=X_original,
    Y_test=Y_original,
)
print("Model accuracy score: ", score)
# TODO: what does this score represent?

Model accuracy score:  0.02341899034080554


In [24]:
# Predict on the encoded test features (X_test = test_data.values); control
# rows are still included here and are zeroed when the submission is built.
preds = autonet.predict(X=X_test)

In [25]:
# Display the array returned by autonet.predict.
preds

array([[0.0000000e+00, 5.2453688e-36, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.1269496e-03, 5.0480483e-04, 9.7554994e-13, ..., 3.5363615e-10,
        8.7857707e-12, 8.7715181e-13],
       ...,
       [1.1358051e-16, 8.5846139e-14, 1.2323135e-14, ..., 5.0025088e-15,
        4.7446711e-15, 1.1774521e-15],
       [3.7844887e-13, 9.3131940e-11, 1.3905371e-09, ..., 2.5447855e-10,
        2.3934350e-09, 1.2638107e-10],
       [1.5752066e-25, 9.4785180e-21, 5.2640633e-24, ..., 4.6327245e-24,
        2.2442925e-25, 8.3291474e-25]], dtype=float32)

In [26]:
# Target column names; sig_id was already removed from train_targets.
targets = list(train_targets.columns)

# Fill the template with the predictions, then force every target to 0 for
# rows whose raw cp_type is 'ctl_vehicle'.
sample_submission[targets] = preds
is_control = test_features['cp_type'] == 'ctl_vehicle'
sample_submission.loc[is_control, targets] = 0

sample_submission.to_csv('submission.csv', index=False)

In [27]:
class MoaDataset(Dataset):
    """Indexable dataset over a feature matrix and, optionally, its targets.

    Modes:
      * ``"train"``: items are ``(features, targets)`` float tensors.
      * ``"eval"``: items are ``(features, 0)``; ``targets`` is ignored and
        the ``0`` is only a placeholder so both modes yield pairs.

    Args:
        features: indexable collection of numeric feature rows.
        targets: indexable collection of target rows (used in train mode only).
        mode: ``"train"`` or ``"eval"``.

    Raises:
        ValueError: if ``mode`` is neither ``"train"`` nor ``"eval"``.
    """

    def __init__(self, features, targets, mode="train"):
        # Fail early: with an unknown mode __getitem__ would fall through
        # both branches and silently return None.
        if mode not in ("train", "eval"):
            raise ValueError(f"mode must be 'train' or 'eval', got {mode!r}")
        self.mode = mode
        self.data = features
        if mode == "train":
            self.targets = targets

    def __len__(self):
        """Number of samples, i.e. the length of the feature collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th item as described in the class docstring."""
        features = torch.FloatTensor(self.data[idx])
        if self.mode == "train":
            return features, torch.FloatTensor(self.targets[idx])
        return features, 0

In [29]:
def predict(model, device, data_loader):
    """Run ``model`` over every batch of ``data_loader`` and stack the outputs.

    The model is switched to eval mode, each batch's inputs are moved to
    ``device``, and the forward pass runs with gradients disabled. The second
    element of each batch is ignored.

    Returns:
        A NumPy array: the per-batch outputs concatenated along axis 0, in
        loader order.
    """
    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for inputs, _ in data_loader:
            outputs = model(inputs.to(device))
            batch_outputs.append(outputs.detach().cpu().numpy())

    return np.concatenate(batch_outputs)

In [30]:
# NOTE(review): `model` and `device` are not defined in any cell visible in
# this notebook, and the recorded run of this cell ended in
# "NameError: name 'model' is not defined". It would also overwrite the
# `preds` produced by autonet.predict. Define both names first or drop the cell.
batch_size = 4096
test_dataset = MoaDataset(X_test, None, mode='eval')
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
preds = predict(model, device, test_loader)

NameError: name 'model' is not defined