In [1]:
#-*- coding:utf-8 -*-

import os
import sys
import time
import datetime
import pickle
import random

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import cv2

import torch

from tqdm import tqdm
from scipy.interpolate import interp1d

In [2]:
# Seed every random number generator used in this notebook with one value
SEED = 12345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# cuDNN: disable the benchmark autotuner and request deterministic algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Load the preprocessed sensor label data and response data

In [3]:
# Indices used later to select the training samples
with open('train_index', 'rb') as train_fh:
    train_index = pickle.load(train_fh)

# Indices used later to select the test samples
with open('test_index', 'rb') as test_fh:
    test_index = pickle.load(test_fh)

In [4]:
# Label table for all samples; its 'Class_Label' column is read further down
with open('label_list.pkl', 'rb') as label_fh:
    label_list = pickle.load(label_fh)

# with open ('splits/label_train_list.pkl', 'rb') as f:
#     label_train_list = pickle.load(f)
    
# with open ('splits/label_test_list.pkl', 'rb') as f:
#     label_test_list = pickle.load(f)

In [5]:
# Per-sample response arrays, indexed with the same indices as the labels
with open('data_list.pkl', 'rb') as data_fh:
    data_list = pickle.load(data_fh)
    
# with open ('splits/data_train_list.pkl', 'rb') as f:
#     data_train_list = pickle.load(f)
    
# with open ('splits/data_test_list.pkl', 'rb') as f:
#     data_test_list = pickle.load(f)

## Using train data, estimate the distribution for each sensor
Load the best KDE model for each sensor

In [6]:
# Stack the rows of every training sample, then keep only columns 3 onward
# (the columns that sample_transformation treats as sensor responses)
sensor_data_all = np.concatenate(data_list[train_index])[:, 3:]

In [7]:
np.shape(sensor_data_all)

(424710, 8)

In [8]:
# Column-wise maximum over the pooled training rows
true_max = sensor_data_all.max(axis=0)
true_max

array([518.0, 427.0, 795.0, 970.0, 979.0, 1204.0, 1110.0, 1191.0],
      dtype=object)

In [9]:
# Largest response over all columns; used as the upper limit of the image y-axis
RESPONSE_MAX = true_max.max()
RESPONSE_MAX

1204.0

In [10]:
# Distinct class labels in ascending order
class_labels = np.unique(label_list['Class_Label'].to_numpy())
class_labels

array([0, 1, 2, 3, 4])

# Transform the raw data into images

In [11]:
# Number of histogram bins along the time axis (X) and the response axis (Y)
IMAGE_X, IMAGE_Y = 256, 256

In [12]:
class sensor_data_labeled:
    """One labelled sensor recording together with the split it belongs to.

    Attributes (stored exactly as passed to the constructor):
        idx:   identifier of the sample.
        label: label record of the sample.
        data:  2-D array. Column 0 is only used for its length, columns 1 and 2
               are read as ``temp`` / ``humi`` (presumably temperature and
               humidity -- confirm against the preprocessing step), and every
               column from 3 onward is treated as one sensor response channel.
        split: name of the split the sample belongs to.
    """

    # Moving-average window; the smoothed curve is also decimated by this factor.
    _SMOOTH_WINDOW = 10
    # Number of points temp / humi are resampled to.
    _ENV_POINTS = 2**7
    # Number of points each sensor curve is resampled to before binning.
    _CURVE_POINTS = 2**14

    def __init__(self, idx, label, data, split):
        self.idx = idx
        self.label = label
        self.data = data
        self.split = split

    def convert_datetime(self, date_time_str):
        """Parse a ``'%Y-%m-%d %H:%M:%S'`` string and return its POSIX timestamp.

        The parsed datetime is naive, so ``timestamp()`` interprets it in the
        local time zone.
        """
        return datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S').timestamp()

    @staticmethod
    def _resample_to_tensor(x, values, x_new):
        """Linearly interpolate ``values`` (sampled at ``x``) onto ``x_new`` as a float64 tensor."""
        # Cast first: the raw arrays may be object-dtype.
        values = np.asarray(values, dtype=np.float64)
        resampled = interp1d(x, values, kind='linear')(x_new)
        # np.float was removed in NumPy 1.24; np.float64 is the type it aliased.
        return torch.tensor(resampled.astype(np.float64))

    def sample_transformation(self, IMAGE_X, IMAGE_Y, RESPONSE_MAX, NOISE):
        """Turn the recording into two 1-D tensors and a stack of 2-D histogram images.

        Args:
            IMAGE_X: number of histogram bins along the time axis (image width).
            IMAGE_Y: number of histogram bins along the response axis (image height).
            RESPONSE_MAX: upper limit of the response axis; larger values are
                clipped to it.
            NOISE: standard deviation of the multiplicative gain drawn from
                ``np.random.normal(1, NOISE)``. One gain is drawn per sensor
                channel and applied to the whole curve.

        Returns:
            ``[temp_tensor, humi_tensor, res_tensor]`` where the first two are
            float64 tensors of length 128 and ``res_tensor`` is a uint8 tensor
            of shape ``(n_channels, IMAGE_Y, IMAGE_X)`` holding per-bin point
            counts (row 0 is the highest response bin).
        """
        data = self.data
        window = self._SMOOTH_WINDOW

        # Time axis: sample index divided by 10 (column 0 itself is not used).
        x = np.arange(len(data)) / 10

        x_env = np.linspace(0, x.max(), self._ENV_POINTS, endpoint=True)
        temp_tensor = self._resample_to_tensor(x, data[:, 1], x_env)
        humi_tensor = self._resample_to_tensor(x, data[:, 2], x_env)

        # Every `window`-th sample is kept after smoothing; these grids are the
        # same for all channels, so build them once.
        keep = np.arange(len(x) // window) * window
        x_coarse = x[keep]
        x_fine = np.linspace(0, x_coarse.max(), self._CURVE_POINTS, endpoint=True)
        x_edge = np.linspace(0, x_fine.max(), IMAGE_X + 1, endpoint=True)
        y_edge = np.linspace(0, RESPONSE_MAX, IMAGE_Y + 1, endpoint=True)

        res_list = []
        for channel in range(3, data.shape[1]):
            raw = np.asarray(data[:, channel], dtype=np.float64)

            # Denoise using a moving average; the leading NaNs of the rolling
            # window are replaced by the first valid mean.
            smooth = pd.Series(raw).rolling(window).mean()
            smooth = smooth.fillna(smooth.dropna().iloc[0]).to_numpy()
            y_coarse = smooth[keep]

            y_fine = interp1d(x_coarse, y_coarse, kind='linear')(x_fine)
            # One random gain for the whole curve (a single RNG draw per channel).
            y_fine = y_fine * np.random.normal(1, NOISE)
            # Clip from above; anything not strictly below the limit becomes the limit.
            y_fine = np.where(y_fine < RESPONSE_MAX, y_fine, RESPONSE_MAX)

            # 2-D histogram of the curve; transpose + flip so that rows are the
            # response axis with the largest values at the top.
            hist, _, _ = np.histogram2d(x_fine, y_fine, bins=(x_edge, y_edge))
            image = np.flipud(hist.T)

            # The histogram already has shape (IMAGE_Y, IMAGE_X), so no resize
            # step is needed before the cast.
            res_list.append(image.astype(np.uint8))

        # Build the tensor from one contiguous array instead of a list of arrays.
        res_tensor = torch.tensor(np.array(res_list, dtype=np.uint8))

        return [temp_tensor, humi_tensor, res_tensor]

In [13]:
def get_valid_index(label_list, class_labels, train_index, num_ratio = 0.25):
    """Draw a class-stratified validation subset from the training indices.

    For each class in ``class_labels`` (in that order), ``num_ratio`` of the
    class's training samples -- rounded to an integer count -- is drawn without
    replacement using ``np.random.choice``.

    Args:
        label_list: DataFrame with a ``'Class_Label'`` column.
        class_labels: iterable of class values to stratify over.
        train_index: index labels of ``label_list`` that form the training pool.
        num_ratio: fraction of each class to pick.

    Returns:
        1-D array of the selected index labels, grouped by class.
    """
    train_classes = label_list.loc[train_index]['Class_Label']
    picks_per_class = []
    for class_id in class_labels:
        candidates = train_classes[train_classes == class_id].index.to_numpy()
        n_pick = int(round(len(candidates) * num_ratio))
        picks_per_class.append(np.random.choice(candidates, n_pick, replace=False))
    return np.concatenate(picks_per_class)

In [14]:
def resample_train_index(label_list, class_labels, train_index, num_total=1000):
    """Oversample the training indices so that every class gets the same count.

    Each class in ``class_labels`` receives ``int(num_total / len(class_labels))``
    draws, taken with replacement from that class's training samples via
    ``np.random.choice``.

    Args:
        label_list: DataFrame with a ``'Class_Label'`` column.
        class_labels: sequence of class values to balance over.
        train_index: index labels of ``label_list`` that form the training pool.
        num_total: target size of the resampled set (before truncation per class).

    Returns:
        1-D array of resampled index labels, grouped by class.
    """
    per_class = int(num_total / len(class_labels))
    train_classes = label_list.loc[train_index]['Class_Label']
    draws = [
        np.random.choice(
            train_classes[train_classes == class_id].index.to_numpy(),
            per_class,
            replace=True,
        )
        for class_id in class_labels
    ]
    return np.concatenate(draws)

In [15]:
# Pick 25% of each class as validation; everything not picked stays in training
valid_index = get_valid_index(label_list, class_labels, train_index, num_ratio=0.25)
train_index_selected = list(filter(lambda ii: ii not in valid_index, train_index))

In [16]:
# Per-class sample counts for the combined data and for each split.
# The "unique" variant de-duplicates the indices first, so equal counts mean
# that no sample appears in more than one split.
all_index = np.concatenate([train_index_selected, valid_index, test_index])

all_data_counts = (
    label_list.loc[all_index, 'Class_Label'].value_counts().sort_index().to_numpy()
)
all_data_counts_unique = (
    label_list.loc[np.unique(all_index), 'Class_Label'].value_counts().sort_index().to_numpy()
)
train_data_counts = (
    label_list.loc[train_index_selected, 'Class_Label'].value_counts().sort_index().to_numpy()
)
valid_data_counts = (
    label_list.loc[valid_index, 'Class_Label'].value_counts().sort_index().to_numpy()
)
test_data_counts = (
    label_list.loc[test_index, 'Class_Label'].value_counts().sort_index().to_numpy()
)

print('all data counts for class:\t\t{}'.format(all_data_counts))
print('all data counts for class(unique):\t{}'.format(all_data_counts_unique))

print('train data counts for class:\t\t{}'.format(train_data_counts))
print('valid data counts for class:\t\t{}'.format(valid_data_counts))
print('test data counts for class:\t\t{}'.format(test_data_counts))

all data counts for class:		[54 36 18 54 18]
all data counts for class(unique):	[54 36 18 54 18]
train data counts for class:		[32 22 10 32 10]
valid data counts for class:		[11  7  4 11  4]
test data counts for class:		[11  7  4 11  4]


In [17]:
# Balance the training split by drawing the same number of samples per class
train_index_resample = resample_train_index(
    label_list, class_labels, train_index_selected, num_total=1000
)
train_resample_data_counts = (
    label_list.loc[train_index_resample, 'Class_Label'].value_counts().sort_index().to_numpy()
)
print('train resample data counts for class:\t{}'.format(train_resample_data_counts))

train resample data counts for class:	[200 200 200 200 200]


In [18]:
#!rm -rf tensor_resamples

In [19]:
# Export the image tensors for every cross-validation fold and noise level.
# Output layout:
#   tensor_resamples/CV_<fold>/<noise>/<split>/<class>/tensor_<class>_<idx>_<cnt>
OUTPUT_ROOT = 'tensor_resamples'
N_CV_FOLDS = 5
NOISE_LEVELS = [0.00, 0.01, 0.03, 0.05]
SPLIT_NAMES = ['train', 'valid', 'test']

os.makedirs(OUTPUT_ROOT, exist_ok=True)

for i in range(N_CV_FOLDS):
    cv_path = '{}/CV_{}'.format(OUTPUT_ROOT, i)
    os.makedirs(cv_path, exist_ok=True)

    # In-place shuffle: the order of train_index affects which samples the
    # two samplers below draw, so each fold gets a different split.
    np.random.shuffle(train_index)

    valid_index_selected = get_valid_index(label_list, class_labels, train_index, num_ratio = 0.25)
    valid_lookup = set(np.asarray(valid_index_selected).tolist())
    train_index_selected = [ii for ii in train_index if ii not in valid_lookup]
    test_index_selected = test_index

    train_index_resample = resample_train_index(label_list, class_labels, train_index_selected, num_total=1000)
    resample_index = np.concatenate([train_index_resample, valid_index_selected, test_index_selected])

    # Set lookups replace the repeated linear scans of the index lists.
    train_lookup = set(np.asarray(train_index_selected).tolist())
    test_lookup = set(np.asarray(test_index_selected).tolist())

    sensor_data_label_list = []
    for idx in resample_index:
        # NOTE(review): labels are fetched positionally (.iloc) here while the
        # samplers above use index labels (.loc); this assumes label_list has a
        # default 0..n-1 index -- confirm.
        label = label_list.iloc[idx] # pandas dataframe
        data = data_list[idx] # numpy array

        if idx in train_lookup:
            split = 'train'
        elif idx in valid_lookup:
            split = 'valid'
        elif idx in test_lookup:
            split = 'test'
        else:
            # Without this branch the sample would silently inherit the split
            # of the previous one.
            raise ValueError('index {} does not belong to any split'.format(idx))
        sensor_data_label_list.append(sensor_data_labeled(idx, label, data, split))

    for noise in NOISE_LEVELS:
        noise_dir = os.path.join(cv_path, '{:0.2f}'.format(noise))
        os.makedirs(noise_dir, exist_ok=True)
        for split_name in SPLIT_NAMES:
            os.makedirs(os.path.join(noise_dir, split_name), exist_ok=True)
            for class_id in class_labels:
                os.makedirs(os.path.join(noise_dir, split_name, str(class_id)), exist_ok=True)

        # Running counter keeps file names unique when a sample is drawn more
        # than once; it restarts for every noise level.
        cnt = 0
        for sensor_data in tqdm(sensor_data_label_list):
            split = sensor_data.split
            # Read from the label record (its 'idx' field), not from sensor_data.idx.
            idx = sensor_data.label.idx
            class_label = sensor_data.label.Class_Label
            tensor = sensor_data.sample_transformation(IMAGE_X, IMAGE_Y, RESPONSE_MAX, noise)

            file_name = 'tensor_{}_{:03d}_{:03d}'.format(class_label, int(idx), cnt)
            file_path = os.path.join(noise_dir, str(split), str(class_label), file_name)

            torch.save(tensor, file_path)
            cnt += 1
    print('\t{} completed'.format(cv_path))

100%|██████████| 1074/1074 [02:04<00:00,  8.64it/s]
100%|██████████| 1074/1074 [02:06<00:00,  8.51it/s]
100%|██████████| 1074/1074 [02:05<00:00,  8.57it/s]
100%|██████████| 1074/1074 [01:58<00:00,  9.07it/s]
  0%|          | 0/1074 [00:00<?, ?it/s]

	tensor_resamples/CV_0 completed


100%|██████████| 1074/1074 [02:03<00:00,  8.71it/s]
100%|██████████| 1074/1074 [02:06<00:00,  8.50it/s]
100%|██████████| 1074/1074 [02:00<00:00,  8.94it/s]
100%|██████████| 1074/1074 [02:04<00:00,  8.64it/s]


	tensor_resamples/CV_1 completed


100%|██████████| 1074/1074 [01:52<00:00,  9.58it/s]
100%|██████████| 1074/1074 [02:04<00:00,  8.63it/s]
100%|██████████| 1074/1074 [02:33<00:00,  7.01it/s]
100%|██████████| 1074/1074 [02:27<00:00,  7.30it/s]
  0%|          | 0/1074 [00:00<?, ?it/s]

	tensor_resamples/CV_2 completed


100%|██████████| 1074/1074 [02:43<00:00,  6.56it/s]
100%|██████████| 1074/1074 [02:30<00:00,  7.12it/s]
100%|██████████| 1074/1074 [02:31<00:00,  7.09it/s]
100%|██████████| 1074/1074 [02:35<00:00,  6.91it/s]


	tensor_resamples/CV_3 completed


100%|██████████| 1074/1074 [02:26<00:00,  7.31it/s]
100%|██████████| 1074/1074 [02:32<00:00,  7.05it/s]
100%|██████████| 1074/1074 [02:35<00:00,  6.90it/s]
100%|██████████| 1074/1074 [02:36<00:00,  6.87it/s]

	tensor_resamples/CV_4 completed



