In [30]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from autoPyTorch import AutoNetMultilabel

import os
from datetime import datetime
import sys
import json

In [31]:
BASE_PATH = os.environ.get('TRAINML_DATA_PATH') if os.environ.get('TRAINML_DATA_PATH') else '../input/lish-moa'
BASE_PATH

'/opt/input'

In [32]:
train_features = pd.read_csv(f'{BASE_PATH}/train_features.csv')
train_targets = pd.read_csv(f'{BASE_PATH}/train_targets_scored.csv')
test_features = pd.read_csv(f'{BASE_PATH}/test_features.csv')

sample_submission = pd.read_csv(f'{BASE_PATH}/sample_submission.csv')

In [33]:
def preprocess(df):
    df = df.copy()
    df.loc[:, 'cp_type'] = df.loc[:, 'cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

train_data = preprocess(train_features)
test_data = preprocess(test_features)

del train_targets['sig_id']

train_targets = train_targets.loc[train_data['cp_type']==0].reset_index(drop=True)
train_data = train_data.loc[train_data['cp_type']==0].reset_index(drop=True)

In [34]:
# Data augmentation
def get_tail_labels(df: pd.DataFrame, ql=[0.03, 1.]) -> list:
    " Find the underepresented targets a.k.a. minority labels. "
    irlbl = df.sum(axis=0)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & ((irlbl < irlbl.quantile(ql[1])))]  # Filtering
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    tail_labels = irlbl[irlbl > threshold_irlbl].index.tolist()
    return tail_labels

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.03, 1.]):
    " Find minority samples associated with minority labels. "
    tail_labels = get_tail_labels(y, ql=ql)
    index = y[y[tail_labels].apply(lambda x: (x == 1).any(), axis=1)].index.tolist()
    
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    " Find nearest neighbors for each sample in X dataframe. "
    nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree').fit(X)
    euclidean, indices = nbs.kneighbors(X)
    return indices

def MLSMOTE(X, y, n_samples, n_neighbors=5):
    " Generate new samples using MLSMOTE algorithm. "
    indices2 = nearest_neighbour(X, neigh=n_neighbors)
    n = len(indices2)
    new_X = np.zeros((n_samples, X.shape[1]))
    target = np.zeros((n_samples, y.shape[1]))
    for i in range(n_samples):
        reference = random.randint(0, n-1)
        neighbor = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val > 0 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbor,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target

def augment_data(X, y, oversample_args: tuple):
    " Augment feature/targets data (just doing oversampling for now)"
    n_samples, n_neighbors = oversample_args

    X_sub, y_sub = get_minority_samples(X, y)
    X_res, y_res = MLSMOTE(X_sub, y_sub, n_samples, n_neighbors)
    X_augmented = pd.concat([X, X_res])
    y_augmented = pd.concat([y, y_res])
    return X_augmented, y_augmented

In [35]:
data_oversample_args = (1000, 5)
train_data_augmented, train_targets_augmented = augment_data(train_data, train_targets, data_oversample_args)

X = train_data_augmented.values
Y = train_targets_augmented.values
X_test = test_data.values

X_original = train_data.values
Y_original = train_targets.values

In [36]:
autonet_config = {
    "result_logger_dir" : "logs/",
    "budget_type" : "epochs",
    "log_level" : "info", 
    "use_tensorboard_logger" : True,
    "validation_split" : 0.3,
    'normalization_strategies': ['none'],
    "max_runtime" : 48000,
    "min_budget" : 300,
    "max_budget" : 24000,
    "final_activation" : 'sigmoid',
    'networks': ['mlpnet', 'shapedmlpnet', 'resnet', 'shapedresnet'],
    'loss_modules': ['bce_with_logits', 'bce_with_logits_weighted']
    }
autonet = AutoNetMultilabel(**autonet_config)
autonet.get_current_autonet_config()

{'embeddings': ['none'],
 'lr_scheduler': ['cosine_annealing', 'plateau'],
 'networks': ['mlpnet', 'shapedmlpnet', 'resnet', 'shapedresnet'],
 'preprocessors': ['none', 'truncated_svd', 'power_transformer'],
 'result_logger_dir': 'logs/',
 'budget_type': 'epochs',
 'log_level': 'info',
 'use_tensorboard_logger': True,
 'validation_split': 0.3,
 'normalization_strategies': ['none'],
 'max_runtime': 48000,
 'min_budget': 300,
 'max_budget': 24000,
 'final_activation': 'sigmoid',
 'loss_modules': ['bce_with_logits', 'bce_with_logits_weighted'],
 'hyperparameter_search_space_updates': None,
 'categorical_features': None,
 'dataset_name': None,
 'run_id': '0',
 'task_id': -1,
 'algorithm': 'bohb',
 'portfolio_type': 'greedy',
 'eta': 3,
 'min_workers': 1,
 'working_dir': '.',
 'network_interface_name': 'eth0',
 'memory_limit_mb': 1000000,
 'run_worker_on_master_node': True,
 'use_pynisher': True,
 'refit_validation_split': 0.0,
 'cross_validator': 'none',
 'cross_validator_args': {},
 'min_

In [37]:
# Sample a random hyperparameter configuration to begin the "refit" process
hyperparameter_config = autonet.get_hyperparameter_search_space().sample_configuration().get_dictionary()
hyperparameter_config

{'CreateDataLoader:batch_size': 361,
 'Imputation:strategy': 'mean',
 'InitializationSelector:initialization_method': 'default',
 'InitializationSelector:initializer:initialize_bias': 'Yes',
 'LearningrateSchedulerSelector:lr_scheduler': 'cosine_annealing',
 'LossModuleSelector:loss_module': 'bce_with_logits',
 'NetworkSelector:network': 'mlpnet',
 'NormalizationStrategySelector:normalization_strategy': 'none',
 'OptimizerSelector:optimizer': 'adamw',
 'PreprocessorSelector:preprocessor': 'truncated_svd',
 'ResamplingStrategySelector:over_sampling_method': 'none',
 'ResamplingStrategySelector:target_size_strategy': 'none',
 'ResamplingStrategySelector:under_sampling_method': 'none',
 'TrainNode:batch_loss_computation_technique': 'standard',
 'LearningrateSchedulerSelector:cosine_annealing:T_max': 289,
 'LearningrateSchedulerSelector:cosine_annealing:eta_min': 1e-08,
 'NetworkSelector:mlpnet:activation': 'sigmoid',
 'NetworkSelector:mlpnet:num_layers': 8,
 'NetworkSelector:mlpnet:num_un

In [40]:
import time
start = time.time()

results_fit = autonet.refit(X_train=X,
                        Y_train=Y,
                        hyperparameter_config=hyperparameter_config,
                        autonet_config=autonet.get_current_autonet_config(),
                        budget=24000)


end = time.time()
print("fit executed in %d seconds" % int(end - start))



fit executed in 22830 seconds


In [41]:
with open("logs/results_fit.json", "w") as file:
    json.dump(results_fit, file)

In [53]:
score = autonet.score(X_test=X_original, Y_test=Y_original)
print("Model accuracy score: ", score)

Model accuracy score:  0.0008201202843083652


In [43]:
preds = autonet.predict(X=X_test)

In [44]:
preds

array([[0.00413671, 0.00318396, 0.00299999, ..., 0.00219377, 0.0033501 ,
        0.00308354],
       [0.00413671, 0.00318396, 0.00299999, ..., 0.00219377, 0.0033501 ,
        0.00308354],
       [0.00413671, 0.00318396, 0.00299999, ..., 0.00219377, 0.0033501 ,
        0.00308354],
       ...,
       [0.00413671, 0.00318396, 0.00299999, ..., 0.00219377, 0.0033501 ,
        0.00308354],
       [0.00413671, 0.00318396, 0.00299999, ..., 0.00219377, 0.0033501 ,
        0.00308354],
       [0.00413671, 0.00318396, 0.00299999, ..., 0.00219377, 0.0033501 ,
        0.00308354]], dtype=float32)

In [45]:
targets = [col for col in train_targets.columns]
sample_submission[targets] = preds
sample_submission.loc[test_features['cp_type']=='ctl_vehicle', targets] = 0
sample_submission.to_csv('logs/preds.csv', index=False)

In [46]:
max = 0
for col in targets:
    col_max = sample_submission[col].max()
    max = max if max > col_max else col_max
    
max

0.041240330785512924

In [52]:
test_df = pd.read_csv("logs/best_curr_kaggle_score.csv")
max = 0
for col in targets:
    col_max = test_df[col].max()
    max = max if max > col_max else col_max
    
max

1.0