# fuel

In [1]:
import bz2
import csv
import numpy as np
import sys

In [2]:
import os

import fuel

# First entry of Fuel's configured data path; displayed as the cell output.
fuel_path = fuel.config.data_path[0]
fuel_path

Using gpu device 0: GeForce GTX 980 (CNMeM is disabled)


'/Users/udi/Downloads/lisa'

In [3]:
# Directory prefix (note the trailing slash) that the i-vector TSV paths are appended to.
base = 'data/r146_1_1/ivec15-lre/'

In [4]:
def load_ivectors(filename):
    """Load ivectors from a tab-separated file that starts with one header row.

    Every data row is read as: ivector id, duration, language label, then the
    ivector components.  The id of each row is echoed to stdout (rewriting
    the same console line) as a progress indicator.

    Parameters
    ----------
    filename : string
        Path to the ivector file (e.g. ivec15_lre_dev_ivectors.tsv)

    Returns
    -------
    ids : list
        List of ivectorids
    durations : array, shaped('n_ivectors')
        float32 array of durations for each ivectorid
    languages : array, shaped('n_ivectors')
        Array of langs for each ivectorid (only applies to train)
    ivectors : array, shaped('n_ivectors', 'n_dims')
        float32 array of ivectors, one row per ivectorid; n_dims is the
        number of columns that follow the language column

    Raises
    ------
    ValueError
        If the file contains no data rows (empty file or header only).
    """
    ids = []
    durations = []
    languages = []
    ivectors = []

    # The csv module needs a binary-mode file on Python 2 but a text-mode file
    # opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        infile = open(filename, 'r', newline='')
    else:
        infile = open(filename, 'rb')

    with infile:
        reader = csv.reader(infile, delimiter='\t')
        # Skip the header row; the same reader is then used for the data rows.
        next(reader, None)

        for row in reader:
            ids.append(row[0])
            durations.append(float(row[1]))
            languages.append(row[2])
            ivectors.append(np.asarray(row[3:], dtype=np.float32))

            # '\r' keeps the progress on a single console line.
            sys.stdout.write("\r     %s  " % row[0])
            sys.stdout.flush()

    if not ivectors:
        raise ValueError("no ivector rows found in %s" % filename)

    print("\n   I-    Adding Transformed ivectors ")

    return ids, np.array(durations, dtype=np.float32), np.array(languages), np.vstack(ivectors)

In [5]:
# `base` already ends with a slash, so the join yields exactly base + relative path.
train_tsv = os.path.join(base, 'data/ivec15_lre_train_ivectors.tsv')
train_ids, train_durations, train_languages, train_ivec = load_ivectors(train_tsv)

     ivec15-lre_zzzzabb  
   I-    Adding Transformed ivectors 


In [6]:
# Number of training i-vectors (rows of train_ivec); displayed as the cell output.
Nt = train_ivec.shape[0]
Nt

15000

In [7]:
# `base` already ends with a slash, so the join yields exactly base + relative path.
dev_tsv = os.path.join(base, 'data/ivec15_lre_dev_ivectors.tsv')
dev_ids, dev_durations, dev_languages, dev_ivec = load_ivectors(dev_tsv)
len(dev_ids)

     ivec15-lre_zzyykqa  
   I-    Adding Transformed ivectors 


6431

In [8]:
# `base` already ends with a slash, so the join yields exactly base + relative path.
test_tsv = os.path.join(base, 'data/ivec15_lre_test_ivectors.tsv')
test_ids, test_durations, test_languages, test_ivec = load_ivectors(test_tsv)
len(test_ids)

     ivec15-lre_zzshxfc  
   I-    Adding Transformed ivectors 


6500

Compute the mean and the whitening transformation over the dev set only: the test set may not be used for this, and the train set does not contain all languages.

In [10]:
# Centering vector: per-dimension mean of the dev i-vectors.
m = np.mean(dev_ivec, axis=0)
# Covariance between feature dimensions (rowvar=0: each row is one observation).
S = np.cov(dev_ivec, rowvar=0)
# S is a symmetric covariance matrix, so use eigh: it always returns real
# eigenvalues/eigenvectors, whereas the general-purpose eig can return complex
# output caused by round-off.  eigh returns the eigenvalues in ascending order.
D, V = np.linalg.eigh(S)
# Whitening matrix: row j is eigenvector j scaled by 1/sqrt(eigenvalue j), so
# np.dot(x - m, W.transpose()) has (approximately) identity covariance on dev.
W = (1 / np.sqrt(D) * V).transpose().astype('float32')

center and whiten

In [11]:
# Stack the splits in train, dev, test order so that row i of all_data
# corresponds to all_durations[i].
splits_durations = (train_durations, dev_durations, test_durations)
splits_ivectors = (train_ivec, dev_ivec, test_ivec)
all_durations = np.hstack(splits_durations)
all_data = np.vstack(splits_ivectors)

In [12]:
# Center with the dev mean and apply the dev whitening matrix.  The raw
# i-vectors are re-stacked here instead of reading the current `all_data`, so
# re-running this cell can never whiten already-whitened data.
all_data = np.dot(np.vstack((train_ivec, dev_ivec, test_ivec)) - m, W.transpose())

Convert the language labels to integer indices; index 0 is reserved for 'out_of_set'.

In [15]:
idx2lang = dict(enumerate(['out_of_set']+sorted(np.unique(train_languages))))
lang2idx = dict((l,i) for i,l in idx2lang.iteritems())
import cPickle as pickle
with open('data/160111-fuel.idx2lang.pkl','wb') as fp:
    pickle.dump(idx2lang,fp)
!aws s3 cp data/160111-fuel.idx2lang.pkl s3://udikaggle/nist/

upload: data/160111-fuel.idx2lang.pkl to s3://udikaggle/nist/160111-fuel.idx2lang.pkl


In [16]:
# Feature matrix: the stacked (train, dev, test) i-vectors.
X = all_data
# Integer targets for the training rows only, looked up through lang2idx.
y = np.array(map(lang2idx.__getitem__, train_languages))

Mark every sample that does not come from the training set (i.e. all dev and test rows) as out-of-set.

In [17]:
# Rows of X that have no target yet get the 'out_of_set' index.
n_missing = len(X) - len(y)
oos_padding = lang2idx['out_of_set'] * np.ones(n_missing, dtype=int)
y = np.hstack((y, oos_padding))

In [18]:
import fuel
# Display Fuel's configured data path; its first entry is used below as the dataset root.
fuel.config.data_path

['/Users/udi/Downloads/lisa']

In [19]:
import os
from fuel.datasets.hdf5 import H5PYDataset

# The dataset file is placed at <fuel data path>/<name>/<name>.hdf5
datasource = '160111-fuel.test'
datasource_dir = os.path.join(fuel.config.data_path[0], datasource)
datasource_fname = os.path.join(datasource_dir, '%s.hdf5' % datasource)
datasource_fname

'/Users/udi/Downloads/lisa/160111-fuel.test/160111-fuel.test.hdf5'

In [20]:
# Create the dataset directory (and any missing parents) from Python instead of
# an unquoted shell interpolation, which would mis-handle a path containing
# spaces or shell metacharacters.  Like `mkdir -p`, an existing directory is fine.
if not os.path.isdir(datasource_dir):
    os.makedirs(datasource_dir)

In [21]:
import h5py
N, NF = X.shape
with h5py.File(datasource_fname, mode='w') as fp:
    features = fp.create_dataset('features', (N, NF), dtype=np.float32)
    targets = fp.create_dataset('targets', (N,), dtype='int')
    features[...] = X.astype(np.float32)
    targets[...] = y
    from fuel.datasets.hdf5 import H5PYDataset
    split_dict = {
        'train': {'features': (0, N), 'targets': (0, N)},
        'test': {'features': (0, N), 'targets': (0, N)}
    }
    fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
!ls -l {datasource_fname}

-rw-r--r--  1 udi  staff  44915848 Jan 11 17:30 /Users/udi/Downloads/lisa/160111-fuel.test/160111-fuel.test.hdf5


Note: the samples in this file are not shuffled; they are stored in train, dev, test order.

## simulate training

In [24]:
import random

def cv_modify(X, y, Q, seed=None, oos_labels=None, unlabel_label_ratio=0.5):
    """Simulate an open-set, partially labeled training set from labeled data.

    The 50 original labels are split into ``Q`` in-set labels and ``50 - Q``
    out-of-set (oos) labels and renumbered through ``label_map``.  A fraction
    of the in-set samples and of the oos samples is kept as "unlabeled" data;
    the remaining oos samples are dropped.

    Parameters
    ----------
    X : array, shaped (n_samples, n_features)
        Features; rows are selected with a list of integer indices.
    y : array, shaped (n_samples,)
        Original labels.  Every label must be > 0 and in 1..50.
    Q : int
        Number of in-set labels; must be smaller than 50.
    seed : int or None
        When given, seeds both ``random`` and ``np.random`` before sampling.
    oos_labels : sequence of int or None
        Original labels forced to be out of set: distinct, each in 1..50 and
        at most ``50 - Q`` of them.  If fewer are given, the rest is drawn at
        random; if None, all are drawn at random.  The sequence passed in is
        not modified.
    unlabel_label_ratio : float
        Target ratio of unlabeled to labeled samples (see the derivation in
        the body).

    Returns
    -------
    X_train : array
        The kept rows of ``X`` in shuffled order.
    y_train : array of int
        New labels aligned with ``X_train``: 1..Q for labeled in-set samples,
        Q+1..50 for oos samples, and 51..50+Q for in-set samples that are to
        be treated as unlabeled (subtract 50 to recover their new label).
    label_map : list of int
        51 entries with ``label_map[new_label] == original_label``: 0 first,
        then the in-set labels sorted, then the oos labels sorted.
    """
    assert Q < 50, "Q has to be smaller than 50, try 38"
    assert np.all(y>0), "unlabeled data"

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    oos_size = 50 - Q
    if oos_labels is None:
        oos_labels = random.sample(range(1,51), oos_size)
    else:
        # Work on a copy so the caller's sequence is never mutated (this also
        # accepts a tuple or a Python 3 range).
        oos_labels = list(oos_labels)
        n = len(oos_labels)
        assert n <= oos_size
        assert len(set(oos_labels)) == n
        assert all(0 < s <= 50 for s in oos_labels)
        if n < oos_size:
            # Sample from a sorted list, not a set: the draw then does not
            # depend on set iteration order, and newer Python versions reject
            # sets in random.sample.
            candidates = sorted(set(range(1,51)) - set(oos_labels))
            oos_labels += random.sample(candidates, oos_size - n)

    # build a map such that the known labels are at the start followed by the unknown labels
    label_map = [0] + sorted(set(range(1,51)) - set(oos_labels)) + sorted(oos_labels)
    # original label -> new label (position in label_map)
    new_label = dict((orig, new) for new, orig in enumerate(label_map))
    y_train = np.array([new_label[yy] for yy in y])

    # index of all samples that are out-of-set
    oos = [i for i, yy in enumerate(y_train) if yy > Q or yy == 0]
    # index of all samples that are in-set
    in_set = [i for i, yy in enumerate(y_train) if 0 < yy <= Q]

    # take a part, r, of the samples that are in-set to be unlabeled, and leave 1-r labeled.
    # eventually the unlabeled set will be made from Q/50 in-set samples and 1-Q/50 oos samples,
    # so the unlabeled size will be r*50/Q (relative to the in-set size) and we want
    # (1-r)*unlabel_label_ratio = r*50/Q
    # unlabel_label_ratio = r(50/Q + unlabel_label_ratio)
    # r = unlabel_label_ratio/(50./Q + unlabel_label_ratio)
    # r = Q*unlabel_label_ratio/(50. + Q*unlabel_label_ratio)
    Qu = Q*unlabel_label_ratio
    r = Qu/(50. + Qu)
    in_set_unlabeled = random.sample(in_set, int(len(in_set)*r))
    # the remaining in-set samples will be used as labeled
    in_set_labeled = list(set(in_set) - set(in_set_unlabeled))
    # give the unlabeled in-set samples a high label (so the training will consider them
    # to be unlabeled) but keep their original identity for error measurement
    y_train[in_set_unlabeled] += 50

    # add out-of-set samples to the unlabeled set, sampled with the same fraction r
    oos_unlabeled = random.sample(oos,int(len(oos)*r))

    unlabeled = oos_unlabeled+in_set_unlabeled

    # all other (oos) samples are dropped (too bad but we want to keep the original ratios)
    keep = in_set_labeled + unlabeled
    random.shuffle(keep)

    y_train = y_train[keep]
    X_train = X[keep]
    return X_train, y_train, label_map

In [25]:
# Number of in-set languages passed to cv_modify; the other 50 - Q labels become out-of-set.
Q=38
# NOTE(review): poos is not referenced anywhere else in the visible notebook;
# presumably an out-of-set proportion. Confirm or remove.
poos=0.23

In [26]:
# Accumulates the out-of-set labels of every generated training file, so that
# label coverage can be printed while the files are written.
all_oos = []

In [27]:
for seed in range(5):
    # the first 5 seeds are used to cover all labels at least once
    oos_labels = range(1+12*seed,min(1+12*seed + 12,51))
        
    X_train, y_train, labels = cv_modify(X[:Nt], y[:Nt], Q, seed=seed, oos_labels=oos_labels)
    datasource = '160111-fuel.train.%d'%seed
    datasource_dir = os.path.join(fuel.config.data_path[0], datasource)
    datasource_fname = os.path.join(datasource_dir , datasource + '.hdf5')
    !mkdir -p {datasource_dir}
    N0 = len(X_train)
    print seed, N0, datasource_fname

    with h5py.File(datasource_fname, mode='w') as fp:
        features = fp.create_dataset('features', (N0, NF), dtype=np.float32)
        targets = fp.create_dataset('targets', (N0,), dtype='int')
        features[...] = X_train.astype(np.float32)
        targets[...] = y_train
        
        split_dict = {
            'train': {'features': (0, N0), 'targets': (0, N0)},
            'test': {'features': (0, N0), 'targets': (0, N0)}
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        fp.attrs['labels'] = labels
    !ls -l {datasource_fname}
    oos = labels[-12:]
    all_oos += oos
    print len(set(all_oos))

0 12391 /Users/udi/Downloads/lisa/160111-fuel.train.0/160111-fuel.train.0.hdf5
-rw-r--r--  1 udi  staff  19928000 Jan 11 18:02 /Users/udi/Downloads/lisa/160111-fuel.train.0/160111-fuel.train.0.hdf5
12
1 12391 /Users/udi/Downloads/lisa/160111-fuel.train.1/160111-fuel.train.1.hdf5
-rw-r--r--  1 udi  staff  19928000 Jan 11 18:02 /Users/udi/Downloads/lisa/160111-fuel.train.1/160111-fuel.train.1.hdf5
24
2 12391 /Users/udi/Downloads/lisa/160111-fuel.train.2/160111-fuel.train.2.hdf5
-rw-r--r--  1 udi  staff  19928000 Jan 11 18:02 /Users/udi/Downloads/lisa/160111-fuel.train.2/160111-fuel.train.2.hdf5
36
3 12391 /Users/udi/Downloads/lisa/160111-fuel.train.3/160111-fuel.train.3.hdf5
-rw-r--r--  1 udi  staff  19928000 Jan 11 18:02 /Users/udi/Downloads/lisa/160111-fuel.train.3/160111-fuel.train.3.hdf5
48
4 12391 /Users/udi/Downloads/lisa/160111-fuel.train.4/160111-fuel.train.4.hdf5
-rw-r--r--  1 udi  staff  19928000 Jan 11 18:02 /Users/udi/Downloads/lisa/160111-fuel.train.4/160111-fuel.train.4.hdf

In [28]:
!(cd /Users/udi/Downloads/lisa/ ; tar cfz 160111-fuel.tgz 160111-fuel.train.* )

In [29]:
!aws s3 mv /Users/udi/Downloads/lisa/160111-fuel.tgz s3://udikaggle/nist/

move: ../../../../../Downloads/lisa/160111-fuel.tgz to s3://udikaggle/nist/160111-fuel.tgz
