In [5]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from autoPyTorch import AutoNetMultilabel

import os
from datetime import datetime
import sys
import json
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold  # trainML
except:
    sys.path.append("../input/iterative-stratification")  # kaggle
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [1]:
BASE_PATH = os.environ.get('TRAINML_DATA_PATH') if os.environ.get('TRAINML_DATA_PATH') else '../input/lish-moa'
BASE_PATH

NameError: name 'os' is not defined

In [7]:
train_features = pd.read_csv(f'{BASE_PATH}/train_features.csv')
train_targets = pd.read_csv(f'{BASE_PATH}/train_targets_scored.csv')
test_features = pd.read_csv(f'{BASE_PATH}/test_features.csv')

sample_submission = pd.read_csv(f'{BASE_PATH}/sample_submission.csv')

FileNotFoundError: [Errno 2] File b'/opt/data/train_features.csv' does not exist: b'/opt/data/train_features.csv'

In [None]:
def preprocess(df):
    df = df.copy()
    df.loc[:, 'cp_type'] = df.loc[:, 'cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

train_data = preprocess(train_features)
test_data = preprocess(test_features)

del train_targets['sig_id']

train_targets = train_targets.loc[train_data['cp_type']==0].reset_index(drop=True)
train_data = train_data.loc[train_data['cp_type']==0].reset_index(drop=True)

In [None]:
# Data augmentation
def get_tail_labels(df: pd.DataFrame, ql=[0.03, 1.]) -> list:
    " Find the underepresented targets a.k.a. minority labels. "
    irlbl = df.sum(axis=0)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & ((irlbl < irlbl.quantile(ql[1])))]  # Filtering
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    tail_labels = irlbl[irlbl > threshold_irlbl].index.tolist()
    return tail_labels

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.03, 1.]):
    " Find minority samples associated with minority labels. "
    tail_labels = get_tail_labels(y, ql=ql)
    index = y[y[tail_labels].apply(lambda x: (x == 1).any(), axis=1)].index.tolist()
    
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    " Find nearest neighbors for each sample in X dataframe. "
    nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree').fit(X)
    euclidean, indices = nbs.kneighbors(X)
    return indices

def MLSMOTE(X, y, n_samples, n_neighbors=5):
    " Generate new samples using MLSMOTE algorithm. "
    indices2 = nearest_neighbour(X, neigh=n_neighbors)
    n = len(indices2)
    new_X = np.zeros((n_samples, X.shape[1]))
    target = np.zeros((n_samples, y.shape[1]))
    for i in range(n_samples):
        reference = random.randint(0, n-1)
        neighbor = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val > 0 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbor,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target

def augment_data(X, y, oversample_args: tuple):
    " Augment feature/targets data (just doing oversampling for now)"
    n_samples, n_neighbors = oversample_args

    X_sub, y_sub = get_minority_samples(X, y)
    X_res, y_res = MLSMOTE(X_sub, y_sub, n_samples, n_neighbors)
    X_augmented = pd.concat([X, X_res])
    y_augmented = pd.concat([y, y_res])
    return X_augmented, y_augmented

In [4]:
data_oversample_args = (1000, 5)
train_data_augmented, train_targets_augmented = augment_data(train_data, train_targets, data_oversample_args)

X = train_data_augmented.values
Y = train_targets_augmented.values
X_test = test_data.values

X_original = train_data.values
Y_original = train_targets.values

NameError: name 'augment_data' is not defined

In [10]:
autonet_config = {
    "result_logger_dir" : "logs/",
    "budget_type" : "epochs",
    "log_level" : "info", 
    "use_tensorboard_logger" : True,
    "validation_split" : 0.3,
    'normalization_strategies': ['none'],
    "max_runtime" : 600,
    "min_budget" : 100,
    "max_budget" : 300,
    "final_activation" : 'sigmoid',
    'networks': ["xyz"]
    'loss_modules': ['xyz']
    }
autonet = AutoNetMultilabel(**autonet_config)
autonet.get_current_autonet_config()

Config option loss_modules contains following invalid values {'xyz'}, chose a subset of ['bce_with_logits', 'bce_with_logits_weighted']


ValueError: Config option loss_modules contains following invalid values {'xyz'}, chose a subset of ['bce_with_logits', 'bce_with_logits_weighted']

In [29]:
# Sample a random hyperparameter configuration to begin the "refit" process
hyperparameter_config = autonet.get_hyperparameter_search_space().sample_configuration().get_dictionary()
hyperparameter_config

{'CreateDataLoader:batch_size': 37,
 'Imputation:strategy': 'median',
 'InitializationSelector:initialization_method': 'sparse',
 'InitializationSelector:initializer:initialize_bias': 'Yes',
 'LearningrateSchedulerSelector:lr_scheduler': 'cosine_annealing',
 'LossModuleSelector:loss_module': 'bce_with_logits_weighted',
 'NetworkSelector:network': 'shapedresnet',
 'NormalizationStrategySelector:normalization_strategy': 'none',
 'OptimizerSelector:optimizer': 'rmsprop',
 'PreprocessorSelector:preprocessor': 'truncated_svd',
 'ResamplingStrategySelector:over_sampling_method': 'none',
 'ResamplingStrategySelector:target_size_strategy': 'none',
 'ResamplingStrategySelector:under_sampling_method': 'none',
 'TrainNode:batch_loss_computation_technique': 'mixup',
 'InitializationSelector:sparse:sparsity': 0.9,
 'LearningrateSchedulerSelector:cosine_annealing:T_max': 24,
 'LearningrateSchedulerSelector:cosine_annealing:eta_min': 1e-08,
 'NetworkSelector:shapedresnet:activation': 'tanh',
 'Networ

In [31]:
import time
start = time.time()

results_fit = autonet.refit(X_train=X,
                        Y_train=Y,
                        hyperparameter_config=hyperparameter_config,
                        autonet_config=autonet.get_current_autonet_config(),
                        budget=50)


end = time.time()
print("fit executed in %d seconds" % int(end - start))



fit executed in 2068 seconds


In [35]:
with open("logs_v2/results_fit.json", "w") as file:
    json.dump(results_fit, file)

In [36]:
score = autonet.score(X_test=X_original, Y_test=Y_original)
print("Model accuracy score: ", score)

Model accuracy score:  0.0008201202843083652


In [37]:
preds = autonet.predict(X=X_test)

In [38]:
preds

array([[1.2108620e-10, 1.8945179e-10, 2.0351393e-10, ..., 1.8167555e-10,
        1.4206153e-10, 1.5627633e-10],
       [1.3769365e-10, 2.1489410e-10, 2.3075142e-10, ..., 2.0612116e-10,
        1.6140077e-10, 1.7745419e-10],
       [2.0164960e-10, 3.1236391e-10, 3.3501210e-10, ..., 2.9982158e-10,
        2.3573832e-10, 2.5876920e-10],
       ...,
       [2.5131383e-10, 3.8761847e-10, 4.1543538e-10, ..., 3.7220638e-10,
        2.9334637e-10, 3.2170638e-10],
       [1.5962054e-10, 2.4839397e-10, 2.6660021e-10, ..., 2.3831931e-10,
        1.8690956e-10, 2.0537240e-10],
       [2.0233461e-10, 3.1340469e-10, 3.3612318e-10, ..., 3.0082287e-10,
        2.3653418e-10, 2.5963881e-10]], dtype=float32)

In [39]:
targets = [col for col in train_targets.columns]
sample_submission[targets] = preds
sample_submission.loc[test_features['cp_type']=='ctl_vehicle', targets] = 0
sample_submission.to_csv('autonet_v2.csv', index=False)

In [49]:
max = 0
for col in targets:
    col_max = sample_submission[col].max()
    max = max if max > col_max else col_max
    
max

0.13893255591392517