In [1]:
import argparse
import os, sys
from utils.data_utils import *
from glob import glob
import pandas as pd
import numpy as np
import shutil
from tqdm import tqdm
%matplotlib inline  
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [12,9]

In [2]:
def is_windows():
    """Return True when ``sys.platform`` is exactly 'win32'."""
    platform_name = sys.platform
    return platform_name == 'win32'

# Root folder that holds the per-split prediction results.
res_dir = os.path.join('predictions')
# Run sub-folders to read; a different set is used on Windows than elsewhere.
l_dirs = ['0.21', '0.22', '0.23', '0.24', '0.25'] if is_windows() else ['0.09']
    
def get_data_dir(a, t):
    """Pick the results sub-folder name for a data split.

    a -- truthy selects the '_ex' variant (callers pass their ``augmented`` flag)
    t -- truthy selects the training split, falsy the validation split

    Returns one of 'train', 'train_ex', 'valid', 'valid_ex'.
    """
    # Keyed by (is_train, is_ex); bool() mirrors the truthiness tests of an if-chain.
    folder_by_flags = {
        (True, True): 'train_ex',
        (True, False): 'train',
        (False, True): 'valid_ex',
        (False, False): 'valid',
    }
    return folder_by_flags[(bool(t), bool(a))]

def load_loss_df(l_dirs, augmented=False, train=False):
    """Read every run's ``models.csv`` for one split and stack them into one table.

    l_dirs    -- iterable of run sub-folder names under ``res_dir/<split>/``
    augmented -- forwarded to ``get_data_dir`` to choose the '_ex' variant
    train     -- forwarded to ``get_data_dir`` to choose train vs. valid

    Returns a single DataFrame with a fresh 0..n-1 index (``ignore_index=True``),
    rows ordered as the runs appear in ``l_dirs``.
    """
    split_dir = get_data_dir(augmented, train)
    csv_paths = [os.path.join(res_dir, split_dir, run, 'models.csv') for run in l_dirs]
    return pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], ignore_index=True)

def get_gt_name(a, t):
    """Return the ground-truth ``.npz`` file name for a data split.

    a -- truthy selects the '_augmented' file
    t -- truthy selects the training labels, falsy the validation labels
    """
    # Keyed by (is_train, is_augmented); bool() mirrors the truthiness tests of an if-chain.
    file_by_flags = {
        (True, True): 'Y_train_augmented.npz',
        (True, False): 'Y_train.npz',
        (False, True): 'Y_valid_augmented.npz',
        (False, False): 'Y_valid.npz',
    }
    return file_by_flags[(bool(t), bool(a))]

def load_ground_truth(data_dir, augmented=False, train=False):
    """Load the label file for one split from ``augmented/<data_dir>/``.

    data_dir  -- sub-folder of 'augmented' that contains the label files
    augmented -- forwarded to ``get_gt_name`` to choose the '_augmented' file
    train     -- forwarded to ``get_gt_name`` to choose train vs. valid

    Returns whatever ``npz_to_ndarray`` produces for the loaded archive
    (helper comes from the ``utils.data_utils`` star import -- presumably an ndarray).
    """
    npz_file = os.path.join('augmented', data_dir, get_gt_name(augmented, train))
    return npz_to_ndarray(np.load(npz_file))

def load_preds(df_loss):
    """Load the saved predictions of every model listed in ``df_loss``.

    df_loss -- table with a ``res_path`` column of ``.npz`` file paths

    Returns ``np.array`` over the per-model results, in table order, so the
    first axis indexes models.
    """
    return np.array([npz_to_ndarray(np.load(path)) for path in df_loss.res_path])

In [4]:
# Both splits are evaluated on their augmented variants.
train_augmented, valid_augmented = True, True

In [14]:
# Per-model result tables, validation split first; each print is the row (model) count.
df_loss_valid = load_loss_df(l_dirs, train=False, augmented=valid_augmented)
print(df_loss_valid.shape[0])
df_loss_train = load_loss_df(l_dirs, train=True, augmented=train_augmented)
print(df_loss_train.shape[0])

76
76


In [7]:
# Stack every model's saved predictions; the first axis follows the row order of the table.
preds_valid = load_preds(df_loss=df_loss_valid)
print(preds_valid.shape)
preds_train = load_preds(df_loss=df_loss_train)
print(preds_train.shape)

(76, 64768, 17)
(76, 259064, 17)


In [8]:
# Labels for both splits live in the same augmented-data folder.
y_true_valid = load_ground_truth(
    'train_augmented_size_128_mult_8_seed_0', train=False, augmented=valid_augmented)
print(y_true_valid.shape)
y_true_train = load_ground_truth(
    'train_augmented_size_128_mult_8_seed_0', train=True, augmented=train_augmented)
print(y_true_train.shape)

(64768, 17)
(259064, 17)


In [75]:
def find_ratios(y_true, y_pred, default=0.2, step=0.05):
    """Greedy per-class threshold search on the samples-averaged F-beta (beta=2) score.

    Every class starts at ``default``. Classes are visited in column order; for
    each one the candidates ``step, 2*step, ..., int(1/step)*step`` are tried
    while all other classes keep their current thresholds. A candidate is kept
    only when it strictly improves the best score seen so far.

    y_true  -- label matrix accepted by ``fbeta_score`` (one column per class)
    y_pred  -- 2-D ndarray of predicted scores, one column per class
    default -- starting threshold for every class
    step    -- spacing of the candidate grid

    Returns a list with one threshold per column of ``y_pred``.

    ``fbeta_score`` and ``tqdm`` come from the notebook's imports
    (``fbeta_score`` presumably from scikit-learn via the ``utils.data_utils``
    star import -- confirm).
    """
    # Take the class count from the data instead of assuming 17 columns.
    n_classes = y_pred.shape[1]
    # Honour the caller-supplied starting value (it used to be fixed at 0.2).
    threshold = [default] * n_classes
    best_score = fbeta_score(y_true, (y_pred > threshold).astype(int), beta=2, average='samples')
    # Build the grid by multiplication so rounding error does not accumulate
    # the way repeated ``r += step`` does.
    candidates = [step * k for k in range(1, int(1 / step) + 1)]
    for j in tqdm(range(n_classes)):
        # Work on a copy so only accepted candidates reach ``threshold``.
        temp_threshold = threshold[:]
        for r in candidates:
            temp_threshold[j] = r
            score = fbeta_score(y_true, (y_pred > temp_threshold).astype(int), beta=2, average='samples')
            if score > best_score:
                best_score = score
                threshold[j] = r

    return threshold

def get_ensemble_avg_score(ensemble, all_preds, y_true, opt_th=False, opt_step=0.05, th=0.2):
    """Score the plain average of the selected models' predictions.

    ensemble -- indices into ``all_preds`` of the models to blend
    all_preds -- per-model predictions, indexable by model
    y_true   -- ground-truth labels passed to ``fbeta_score``
    opt_th   -- when truthy, thresholds are searched with ``find_ratios`` on
                this same data instead of using ``th``
    opt_step -- grid step handed to ``find_ratios`` (only used when ``opt_th``)
    th       -- fixed threshold; anything that broadcasts against the averaged
                predictions (a scalar or a per-class array)

    Returns the samples-averaged F-beta score with beta=2.
    """
    member_preds = [all_preds[idx] for idx in ensemble]
    blended = np.mean(member_preds, axis=0)
    cutoffs = find_ratios(y_true, blended, step=opt_step) if opt_th else th
    labels = (blended > cutoffs).astype(int)
    return fbeta_score(y_true, labels, beta=2, average='samples')

def opt_thresholds(ensemble, all_preds, y_true, opt_step=0.05):
    """Search per-class thresholds for the averaged ensemble and score them.

    ensemble  -- indices into ``all_preds`` of the models to blend
    all_preds -- per-model predictions, indexable by model
    y_true    -- ground-truth labels passed to ``find_ratios`` / ``fbeta_score``
    opt_step  -- grid step handed to ``find_ratios``

    Returns ``(thresholds, score)``: the thresholds as an ndarray and the
    samples-averaged F-beta (beta=2) score obtained with them on the same data.
    """
    blended = np.mean([all_preds[idx] for idx in ensemble], axis=0)
    cutoffs = find_ratios(y_true, blended, step=opt_step)
    labels = (blended > cutoffs).astype(int)
    score = fbeta_score(y_true, labels, beta=2, average='samples')
    return np.array(cutoffs), score

In [90]:
# Indices of the models to blend (first axis of the prediction stacks).
# The commented-out lines are alternative subsets kept for reference.
# ensemble = [20, 62]
ensemble = [20, 62, 13, 46, 8]
# ensemble = [20, 62, 13, 46, 8, 22, 12, 51, 25]
print("Score %.5f of models %s" % (
    get_ensemble_avg_score(ensemble, preds_valid, y_true_valid),
    ensemble,
))

Score 0.92870 of models [20, 62, 13, 46, 8]


In [91]:
# Threshold search on the training split with a finer grid than the 0.05 default.
train_th, train_score = opt_thresholds(
    ensemble, preds_train, y_true_train, opt_step=0.025)
print("Training score with opt thresholds %.5f" % (train_score,))
print(train_th)

100%|██████████| 17/17 [03:42<00:00, 12.85s/it]


Training score with opt thresholds 0.94531
[ 0.15   0.275  0.175  0.2    0.15   0.075  0.225  0.2    0.225  0.275
  0.325  0.175  0.125  0.1    0.25   0.25   0.15 ]


In [92]:
# Same search on the validation split, using the same 0.025 grid.
val_th, val_score = opt_thresholds(
    ensemble, preds_valid, y_true_valid, opt_step=0.025)
print("Validation score with opt thresholds %.5f" % (val_score,))
print(val_th)

100%|██████████| 17/17 [00:46<00:00,  2.72s/it]

Validation score with opt thresholds 0.92943
[ 0.275  0.2    0.225  0.275  0.2    0.075  0.2    0.225  0.2    0.2    0.25
  0.175  0.1    0.4    0.25   0.175  0.125]





In [93]:
# Show both threshold vectors, then average them element-wise per class.
print(np.array(train_th))
print(np.array(val_th))
mean_thresholds = np.mean((train_th, val_th), axis=0)
print(mean_thresholds)

[ 0.15   0.275  0.175  0.2    0.15   0.075  0.225  0.2    0.225  0.275
  0.325  0.175  0.125  0.1    0.25   0.25   0.15 ]
[ 0.275  0.2    0.225  0.275  0.2    0.075  0.2    0.225  0.2    0.2    0.25
  0.175  0.1    0.4    0.25   0.175  0.125]
[ 0.2125  0.2375  0.2     0.2375  0.175   0.075   0.2125  0.2125  0.2125
  0.2375  0.2875  0.175   0.1125  0.25    0.25    0.2125  0.1375]


In [94]:
# Training split: default 0.2 threshold first, then the averaged train/valid thresholds
# (the second label says "opt", but the values passed are ``mean_thresholds``).
print("Train score %.5f" % (
    get_ensemble_avg_score(ensemble, preds_train, y_true_train),))
print("Train score with opt thresolds %.5f" % (
    get_ensemble_avg_score(ensemble, preds_train, y_true_train, th=mean_thresholds),))

Train score 0.94422
Train score with opt thresolds 0.94482


In [95]:
# Validation split: default 0.2 threshold first, then the averaged train/valid thresholds
# (the second label says "opt", but the values passed are ``mean_thresholds``).
print("Valid score %.5f" % (
    get_ensemble_avg_score(ensemble, preds_valid, y_true_valid),))
print("Valid score with opt thresolds %.5f" % (
    get_ensemble_avg_score(ensemble, preds_valid, y_true_valid, th=mean_thresholds),))

Valid score 0.92870
Valid score with opt thresolds 0.92907


In [96]:
# Gather the selected models' files and the averaged thresholds in one folder.
# ``shutil`` is already imported in the notebook's first cell.
copy_to = os.path.join('other_models', 'ensemble16')
if not os.path.exists(copy_to):
    os.makedirs(copy_to)
for i in ensemble:
    # Label lookup; labels equal positions because load_loss_df concatenates
    # with ignore_index=True, so ``i`` matches the index used for the predictions.
    row = df_loss_valid.loc[i]
    shutil.copyfile(row.path, os.path.join(copy_to, row.model + '.h5'))

np.save(os.path.join(copy_to, 'avg_model_thresholds.npy'), mean_thresholds)
print("Done")

Done
