In [139]:
import pickle
import numpy as np
import sys
import os
import pandas as pd
import csv
import random
from sklearn.model_selection import train_test_split

In [19]:
# Directory that holds the input pickles read by this notebook and receives every pickle it writes.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_path = '/scratch/yw3004/projects/ICASSP2019-AL/data'

# 1. Load data

In [20]:
# Load the negative and positive example arrays.
# NOTE(security): pickle.load can execute arbitrary code -- only load files you trust.
# `with` guarantees the file handles are closed (the bare open() calls leaked them).
with open(os.path.join(data_path, 'negative_xy_allInfo.pickle'), 'rb') as f:
    negative_xy = pickle.load(f)
with open(os.path.join(data_path, 'positive_xy_allInfo.pickle'), 'rb') as f:
    positive_xy = pickle.load(f)

# Sensor id of every negative example: third column from the end of each row.
sens = list(negative_xy[:, -3])

# Sorted array of the distinct sensor ids; displayed as the cell output.
unique_sensors = np.unique(sens)
unique_sensors

array(['b827eb0d8af7', 'b827eb0fedda', 'b827eb122f0f', 'b827eb1685c7',
       'b827eb2a1bce', 'b827eb429cd4', 'b827eb42bd4a', 'b827eb44506f',
       'b827eb4e7821', 'b827eb5895e9', 'b827eb815321', 'b827eb86d458',
       'b827eb8e2420', 'b827eb9bed23', 'b827ebad073b'], dtype='<U12')

# 2. Split train/val/test (3/1/1) for 5-fold CV

In [92]:
# Fold layout: each fold holds out a consecutive run of sensors for testing and
# splits the remaining sensors 75% / 25% into train / validation.
N_FOLDS = 5
TEST_SENSORS_PER_FOLD = 3
# Fixed seed so that re-running this cell regenerates exactly the same splits
# instead of silently overwriting the saved ones with different sensors.
SPLIT_SEED = 0

for i in range(N_FOLDS):
    lo = TEST_SENSORS_PER_FOLD * i
    hi = TEST_SENSORS_PER_FOLD * (i + 1)
    test_sensors = unique_sensors[lo:hi]
    training_sensors = np.concatenate((unique_sensors[:lo], unique_sensors[hi:]))

    # Random train/val split of the non-test sensors (seeded per fold).
    train_sensors, val_sensors = train_test_split(
        training_sensors, test_size=0.25, random_state=SPLIT_SEED + i)

    # Pick one train sensor; np.array() of a single id yields a 0-d array.
    train_sensor = np.array(random.Random(SPLIT_SEED + i).choice(train_sensors))

    # File-name prefix -> sensors saved under '<prefix><fold number>.pickle' (folds are 1-based).
    fold_outputs = {
        'test_sensors_': test_sensors,
        'val_sensors_': val_sensors,
        'train_sensors_all_': train_sensors,
        'train_sensor_': train_sensor,
    }
    for prefix, fold_sensors in fold_outputs.items():
        with open(os.path.join(data_path, prefix + str(i + 1) + '.pickle'), 'wb') as f:
            pickle.dump(fold_sensors, f, protocol=pickle.HIGHEST_PROTOCOL)

# 3. Build Xs, ys

In [140]:
def build_xy(data_path, category, cv_round, positives_all=None, negatives_all=None):
    """Build and pickle the X / y / info arrays for one split of one CV round.

    The sensor ids belonging to the split are read from ``data_path``; every
    row of the positive and negative example arrays whose sensor id (column
    -3) is among them is selected.  For ``category == 'train'`` one randomly
    chosen positive row and one randomly chosen negative row are kept (in that
    order); for every other category all selected rows are concatenated and
    shuffled.

    Row layout expected in the example arrays: every column except the last
    four goes to X, column -4 goes to y, and the last three columns go to
    info (info[:, 0] is the sensor id, info[:, 1] is cast to float and
    info[:, 2] to int).

    Parameters
    ----------
    data_path : str
        Directory containing the sensor-list pickles; outputs are written there too.
    category : str
        'train_all' reads 'train_sensors_all_<cv_round>.pickle', 'train' reads
        'train_sensor_<cv_round>.pickle', anything else reads
        '<category>_sensors_<cv_round>.pickle'.
    cv_round : int
        Round number used in the input and output file names.
    positives_all, negatives_all : 2-D array, optional
        Example arrays to select from.  When omitted, the notebook-level
        ``positive_xy`` / ``negative_xy`` arrays are used.

    Raises
    ------
    ValueError
        If no positive or no negative row matches the split's sensors.

    Side effects
    ------------
    Writes 'X_<category>_<cv_round>.pickle', 'y_<category>_<cv_round>.pickle'
    and 'info_<category>_<cv_round>.pickle' into ``data_path``.  Returns None.
    """
    # Default to the arrays loaded at notebook level so three-argument calls keep working.
    if positives_all is None:
        positives_all = positive_xy
    if negatives_all is None:
        negatives_all = negative_xy

    suffix = str(cv_round) + '.pickle'
    if category == 'train_all':
        sensors_file = 'train_sensors_all_' + suffix
    elif category == 'train':
        sensors_file = 'train_sensor_' + suffix
    else:
        sensors_file = category + '_sensors_' + suffix

    # NOTE(security): pickle.load can execute arbitrary code -- only load files you trust.
    with open(os.path.join(data_path, sensors_file), 'rb') as f:
        sensors = pickle.load(f)
    # The single 'train' sensor may be stored as a 0-d array; atleast_1d makes it
    # iterable so both the one-sensor and many-sensor cases take the same path.
    sensor_ids = set(np.atleast_1d(sensors).tolist())

    def _rows_for_sensors(rows):
        """Return an object-dtype copy of the rows whose sensor id (column -3) is selected."""
        keep = np.array([sensor_id in sensor_ids for sensor_id in rows[:, -3]], dtype=bool)
        return rows[keep].astype(object)

    # Select by mask rather than filling a pre-sized buffer: a hard-coded row
    # count overflows when more rows match and leaves None rows when fewer do.
    positives = _rows_for_sensors(positives_all)
    negatives = _rows_for_sensors(negatives_all)
    if len(positives) == 0 or len(negatives) == 0:
        raise ValueError(
            'no {} examples found for sensors in {}'.format(
                'positive' if len(positives) == 0 else 'negative', sensors_file))

    if category == 'train':
        # Keep one random positive and one random negative example.
        np.random.shuffle(positives)
        np.random.shuffle(negatives)
        comb = np.vstack((positives[0, :], negatives[0, :]))
    else:
        # Keep everything, with positives and negatives interleaved at random.
        comb = np.concatenate((positives, negatives), axis=0)
        np.random.shuffle(comb)

    # Separate features, labels and info columns.
    X = comb[:, :-4].astype(int)  # fixed: the original read an undefined name `x` here
    y = comb[:, -4].astype(int)
    info = comb[:, -3:]
    info[:, 1] = info[:, 1].astype(float)
    info[:, 2] = info[:, 2].astype(int)

    for prefix, array in (('X_', X), ('y_', y), ('info_', info)):
        out_path = os.path.join(data_path, prefix + category + '_' + str(cv_round) + '.pickle')
        with open(out_path, 'wb') as f:
            pickle.dump(array, f, protocol=pickle.HIGHEST_PROTOCOL)

In [141]:
# CV rounds are numbered 1..5; the round number appears in the names of the pickles read and written per round.
cv_rounds = range(1,6)
# Split names handed to build_xy as its `category` argument.
categories = ['train_all', 'train', 'val', 'test']

In [95]:
# Build and save X / y / info for every (split, round) pair; all rounds of one
# split are processed before moving on to the next split.
for category, cv_round in ((split, rnd) for split in categories for rnd in cv_rounds):
    build_xy(data_path, category, cv_round)

# 4. Build per-fold X_pool (base pool + train_all rows)

In [137]:
# Base pool shared by every fold.
# NOTE(security): pickle.load can execute arbitrary code -- only load files you trust.
# `with` closes the file handle, which the bare open() call leaked.
with open(os.path.join(data_path, 'X_pool_100000_random.pickle'), 'rb') as f:
    X_pool = pickle.load(f)

In [138]:
def _load_pickle(path):
    """Unpickle `path`, closing the file handle afterwards (only use on trusted files)."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# For each fold, append that fold's train_all examples to the base pool and save the result.
for i in range(5):
    fold = str(i + 1)
    X_train_all = _load_pickle(os.path.join(data_path, 'X_train_all_' + fold + '.pickle'))
    y_train_all = _load_pickle(os.path.join(data_path, 'y_train_all_' + fold + '.pickle'))
    info_train_all = _load_pickle(os.path.join(data_path, 'info_train_all_' + fold + '.pickle'))

    # Labels become a column vector so they can be stacked beside the 2-D arrays.
    y_train_all = y_train_all.reshape(-1, 1)
    # Column order here is (X, info, y).
    # NOTE(review): presumably this matches the column layout of X_pool -- confirm,
    # since build_xy slices its rows as (X, y, info).
    comb = np.hstack((X_train_all, info_train_all, y_train_all))

    X_pool_cv = np.vstack((X_pool, comb))
    with open(os.path.join(data_path, 'X_pool_' + fold + '.pickle'), 'wb') as f:
        pickle.dump(X_pool_cv, f, protocol=pickle.HIGHEST_PROTOCOL)