In [49]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [50]:
## Load the OpenMIC-2018 label arrays and the split01 train/test partition lists.
DATA_ROOT = '/beegfs/qx244/ds/openmic-2018/'
# NOTE(review): if any stored array has object dtype, recent NumPy versions need
# allow_pickle=True here — confirm against the .npz contents.
OPENMIC = np.load(os.path.join(DATA_ROOT, 'openmic-2018.npz'))
Y_true, Y_mask, sample_key = OPENMIC['Y_true'], OPENMIC['Y_mask'], OPENMIC['sample_key']

# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the column axis afterwards gives the same result: a Series when the
# file holds a single column, otherwise the DataFrame unchanged.
split_train = pd.read_csv(os.path.join(DATA_ROOT, 'partitions/split01_train.csv'),
                          header=None).squeeze("columns")
split_test = pd.read_csv(os.path.join(DATA_ROOT, 'partitions/split01_test.csv'),
                         header=None).squeeze("columns")

In [51]:
# Gather every key listed in the training partition into a set for fast membership tests.
train_set = {key for key in split_train}

In [52]:
# Spot-check that one specific sample key is present in train_set.
"117652_46080" in train_set

True

In [53]:
# Positions within sample_key of every sample whose key is in train_set.
original_idx_train = [pos for pos, key in enumerate(sample_key) if key in train_set]

In [54]:
## Collapse each training sample's sparse labels into a single multi-class label (20 + 1 classes):
## the class with the highest masked score becomes the sample's label. Samples where no class
## reaches 0.5 are reassigned to the extra negative class (20) in a later cell.
Y_masked = Y_true * Y_mask  # element-wise product of the label scores and the mask
train_sample_key, train_Y_masked = sample_key[original_idx_train], Y_masked[original_idx_train]
train_single_label_Y = np.argmax(train_Y_masked, axis=1)

In [55]:
# Flag the rows whose strongest masked score is below 0.5, then count how many there are.
negative_Y_idx = np.max(train_Y_masked, axis=1) < 0.5
np.sum(negative_Y_idx)

4603

In [56]:
# Overwrite the flagged rows in place with 20, the label reserved for _negative.
np.putmask(train_single_label_Y, negative_Y_idx, 20)

In [57]:
# Peek at the first 50 single-label targets (20 marks the _negative class).
train_single_label_Y[:50]

array([20, 19,  0,  0, 20,  0, 20, 19, 20, 12,  9, 20, 20, 10, 20, 12, 10,
        0, 13,  0, 20, 17, 16, 20, 19, 11,  6,  6, 19, 20, 13,  6,  6,  9,
       19, 19, 20, 20,  6, 20, 20, 20, 20,  5, 12,  2, 10,  9, 16, 20])

In [58]:
## Carve a stratified 10% validation set out of the training samples using
## sklearn.model_selection.StratifiedShuffleSplit (one split, fixed seed for reproducibility).
seed = 20181105
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=seed)
# n_splits=1, so the generator yields exactly one (train, validation) index pair.
train_idx, val_idx = list(splitter.split(train_sample_key, train_single_label_Y))[0]

In [59]:
# Number of samples that landed in the validation split.
val_idx.shape

(1492,)

In [60]:
## Save the new train/validation partitions, plus the unchanged test split, to disk.
# to_csv does not create missing parent directories, so make sure the target exists first.
os.makedirs("tom_partition", exist_ok=True)

# train_idx / val_idx were produced by splitting train_sample_key, so index that same
# array here instead of re-slicing sample_key for every frame.
df_train = pd.DataFrame(train_sample_key[train_idx])
df_train.to_csv("tom_partition/split_train.csv", header=False, index=False)
df_val = pd.DataFrame(train_sample_key[val_idx])
df_val.to_csv("tom_partition/split_val.csv", header=False, index=False)
# The test partition is written out exactly as it was loaded.
df_test = split_test
df_test.to_csv("tom_partition/split_test.csv", header=False, index=False)

In [61]:
# Build key sets for the new train and validation splits.
# NOTE: this rebinds train_set, which earlier held every key from the training partition;
# re-running the cell that builds original_idx_train after this one would use the smaller set.
train_set, val_set = (
    set(sample_key[original_idx_train][rows]) for rows in (train_idx, val_idx)
)

In [62]:
# Leakage check: the intersection is empty when no key appears in both train and validation.
train_set & val_set

set()