In [1]:
#-*- coding:utf-8 -*-

import os
import sys
import time
import datetime
import pickle
import random

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import cv2

import torch

from tqdm import tqdm
from scipy.interpolate import interp1d

In [2]:
# Fix one seed for every random number generator used in this notebook so
# that the train/test split drawn below can be reproduced run to run.
SEED = 12345

random.seed(SEED)                  # Python's built-in RNG
np.random.seed(SEED)               # NumPy global RNG (used by np.random.choice)
torch.manual_seed(SEED)            # PyTorch default generator
torch.cuda.manual_seed_all(SEED)   # PyTorch generators on every CUDA device

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Load and Preprocess the sensor label data

In [3]:
# List the raw recordings in sorted name order; a sample's position in this
# list is what the rest of the notebook uses as its row index.
file_list = sorted(os.listdir('dataset_twosources_raw'))

In [4]:
# Parse the sample description out of each file name. The first five
# '_'-separated fields are read as:
#   idx, element_1, concentrate_1, element_2, concentrate_2
sample_info_list = []

for file_name in file_list:
    fields = file_name.split('_')  # split once, then pick the fields by position
    sample_info_list.append([fields[0], fields[1], fields[2], fields[3], fields[4]])

sample_info_array = np.array(sample_info_list)

In [5]:
# One row per recording, one column per field parsed from the file name.
sample_info_df = pd.DataFrame(
    data=sample_info_array,
    columns=['idx', 'element_1', 'concentrate_1', 'element_2', 'concentrate_2'],
)
sample_info_df.head()

Unnamed: 0,idx,element_1,concentrate_1,element_2,concentrate_2
0,0,Et,H,CO,n
1,1,Et,L,Me,H
2,2,Et,H,CO,H
3,3,Et,H,Me,n
4,4,Et,L,CO,H


In [6]:
# Build a per-sample table of concentration levels for the three gases.
# A gas that is not named in the file name keeps the value None.
label_list = []
for idx in range(len(sample_info_df)):
    sample = sample_info_df.iloc[idx]
    # Key order (Et, CO, Me) fixes the column order of the resulting table.
    concentrate_dict = dict.fromkeys(('Et', 'CO', 'Me'))
    for element_col, concentrate_col in (('element_1', 'concentrate_1'),
                                         ('element_2', 'concentrate_2')):
        concentrate_dict[sample[element_col]] = sample[concentrate_col]
    values = list(concentrate_dict.values())
    # Row layout: idx, Et, CO, Me, and the three levels again as one list.
    label_list.append([sample['idx'], *values, values])
label_list = pd.DataFrame(label_list, columns=['idx', 'Et', 'CO', 'Me', 'Combination'])
label_list.head()

Unnamed: 0,idx,Et,CO,Me,Combination
0,0,H,n,,"[H, n, None]"
1,1,L,,H,"[L, None, H]"
2,2,H,H,,"[H, H, None]"
3,3,H,,n,"[H, None, n]"
4,4,L,H,,"[L, H, None]"


In [7]:
# Lists are unhashable, so convert every combination to a tuple before
# de-duplicating with a set.
Combination_list = list(map(tuple, label_list['Combination']))
Unique_Combination_list = list(set(Combination_list))
len(Unique_Combination_list)  # 30 combinations out of 180 samples

30

In [8]:
# Rebuild the label table as multi-hot presence flags: a gas gets 1 when the
# file name lists it with a concentration level other than 'n', else 0.
# ('n' presumably stands for "none" -- confirm against the data description.)
label_list = []
for idx in range(len(sample_info_df)):
    sample = sample_info_df.iloc[idx]
    # Key order (Et, CO, Me) fixes the column order of the resulting table.
    concentrate_dict = {'Et': 0, 'CO': 0, 'Me': 0}
    for element_col, concentrate_col in (('element_1', 'concentrate_1'),
                                         ('element_2', 'concentrate_2')):
        level = sample[concentrate_col]
        # Identity test for None (PEP 8) instead of `!= None`.
        if level is not None and level != 'n':
            concentrate_dict[sample[element_col]] = 1
    values = list(concentrate_dict.values())
    # Row layout: idx, Et, CO, Me, and the three flags again as one list.
    label_list.append([sample['idx'], *values, values])
label_list = pd.DataFrame(label_list, columns=['idx', 'Et', 'CO', 'Me', 'Combination'])
label_list.head()

Unnamed: 0,idx,Et,CO,Me,Combination
0,0,1,0,0,"[1, 0, 0]"
1,1,1,0,1,"[1, 0, 1]"
2,2,1,1,0,"[1, 1, 0]"
3,3,1,0,0,"[1, 0, 0]"
4,4,1,1,0,"[1, 1, 0]"


In [9]:
# Same de-duplication as before, now on the multi-hot presence flags.
Combination_list = list(map(tuple, label_list['Combination']))
Unique_Combination_list = list(set(Combination_list))
len(Unique_Combination_list)  # distinct multi-hot combinations (5 in the recorded run)

5

In [10]:
# data_df is a second name for the label_list frame (no copy is made), so the
# in-place sort -- and any column added to data_df later -- is visible through
# both names.
label_list.sort_index(inplace=True)
data_df = label_list

In [11]:
# Give every distinct combination an integer class id.
# NOTE(review): the ids follow the order of Unique_Combination_list, which was
# produced by iterating a set, so the numbering is not tied to any sorted order.
combination_to_label_dict = {
    combination: class_label
    for class_label, combination in enumerate(Unique_Combination_list)
}
data_df['Class_Label'] = data_df['Combination'].map(
    lambda combination: combination_to_label_dict[tuple(combination)]
)
data_df

Unnamed: 0,idx,Et,CO,Me,Combination,Class_Label
0,000,1,0,0,"[1, 0, 0]",1
1,001,1,0,1,"[1, 0, 1]",3
2,002,1,1,0,"[1, 1, 0]",0
3,003,1,0,0,"[1, 0, 0]",1
4,004,1,1,0,"[1, 1, 0]",0
...,...,...,...,...,...,...
175,175,1,0,0,"[1, 0, 0]",1
176,176,1,0,0,"[1, 0, 0]",1
177,177,0,0,1,"[0, 0, 1]",2
178,178,0,1,0,"[0, 1, 0]",4


In [12]:
# Number of samples per multi-hot combination, shown as (combination, count) rows.
labels, counts = np.unique(data_df['Combination'], return_counts=True)
np.transpose(np.array([labels, counts]))

array([[list([0, 0, 1]), 18],
       [list([0, 1, 0]), 18],
       [list([1, 0, 0]), 36],
       [list([1, 0, 1]), 54],
       [list([1, 1, 0]), 54]], dtype=object)

In [13]:
# Number of samples per integer class id, shown as (class, count) rows.
labels, counts = np.unique(data_df['Class_Label'], return_counts=True)
np.column_stack((labels, counts))

array([[ 0, 54],
       [ 1, 36],
       [ 2, 18],
       [ 3, 54],
       [ 4, 18]])

In [14]:
# Stratified train/test split: within every class, draw 80% of the rows
# (rounded) at random without replacement for training; the remaining rows of
# that class form the test set.
train_df = []   # per-class training frames, concatenated after the loop
test_df = []    # per-class test frames, concatenated after the loop
splits = []     # [class id, train index, test index] for each class

for idx in range(len(Unique_Combination_list)):
    class_rows = data_df[data_df['Class_Label'] == idx]
    index = list(class_rows.index)

    train_index = np.random.choice(np.array(index), size=round(0.8 * len(index)), replace=False)
    chosen = set(train_index.tolist())
    test_index = [row for row in index if row not in chosen]
    print(len(index), len(train_index), len(test_index))

    train_df.append(data_df.loc[train_index])
    test_df.append(data_df.loc[test_index])
    splits.append([idx, train_index, test_index])

train_df = pd.concat(train_df)
test_df = pd.concat(test_df)

54 43 11
36 29 7
18 14 4
54 43 11
18 14 4


In [15]:
# Size of the training split, followed by a preview of its first five rows.
print(train_df.shape)
train_df.head(5)

(143, 6)


Unnamed: 0,idx,Et,CO,Me,Combination,Class_Label
94,94,1,1,0,"[1, 1, 0]",0
136,136,1,1,0,"[1, 1, 0]",0
134,134,1,1,0,"[1, 1, 0]",0
102,102,1,1,0,"[1, 1, 0]",0
22,22,1,1,0,"[1, 1, 0]",0


In [16]:
# Size of the test split, followed by a preview of its first five rows.
print(test_df.shape)
test_df.head(5)

(37, 6)


Unnamed: 0,idx,Et,CO,Me,Combination,Class_Label
4,4,1,1,0,"[1, 1, 0]",0
52,52,1,1,0,"[1, 1, 0]",0
100,100,1,1,0,"[1, 1, 0]",0
116,116,1,1,0,"[1, 1, 0]",0
120,120,1,1,0,"[1, 1, 0]",0


In [17]:
# Persist the full label table and its train/test parts.
# open(..., 'wb') does not create missing parent directories, so make sure
# 'splits/' exists first; otherwise a fresh checkout fails with
# FileNotFoundError on the second dump.
os.makedirs('splits', exist_ok=True)

with open('label_list.pkl', 'wb') as f:
    pickle.dump(data_df, f)

with open('splits/label_train_list.pkl', 'wb') as f:
    pickle.dump(train_df, f)

with open('splits/label_test_list.pkl', 'wb') as f:
    pickle.dump(test_df, f)

In [18]:
# Store the row indices that make up each split as plain Python lists.
for out_name, split_df in (('train_index', train_df), ('test_index', test_df)):
    with open(out_name, 'wb') as f:
        pickle.dump(list(split_df.index), f)

## Load and Preprocess the sensor response data

In [19]:
# Re-list the raw directory in sorted name order so sensor recordings line up
# position-for-position with the label rows built from the same listing.
file_list = sorted(os.listdir('dataset_twosources_raw'))

In [20]:
# Read one header-less CSV per sample.
# NOTE(review): names come from the listing of 'dataset_twosources_raw' but
# are read from 'dataset_twosources_downsampled' -- this assumes both
# directories contain identically named files; confirm.
sensor_data_list = [
    pd.read_csv(os.path.join('dataset_twosources_downsampled', file_name), header=None)
    for file_name in file_list
]

In [21]:
# Pack the per-sample frames into an object-dtype array so they can be
# selected with a list of positions.
# NOTE(review): if every file parses to the same shape, NumPy may stack the
# frames into one 3-D object array instead of a 1-D array of DataFrames --
# confirm which layout downstream code expects.
sensor_data_np = np.array(sensor_data_list, dtype=object)

In [22]:
# Pick the recordings of each split. The index values of train_df / test_df
# are used as positions into sensor_data_np, which assumes label rows and
# sensor files share the same ordering.
sensor_data_train_np = sensor_data_np[train_df.index.to_list()]
sensor_data_test_np = sensor_data_np[test_df.index.to_list()]

In [23]:
# NOTE(review): this cell recomputes exactly the selection made in the
# preceding cell; it is idempotent and could be removed.
sensor_data_train_np = sensor_data_np[train_df.index.to_numpy()]
sensor_data_test_np = sensor_data_np[test_df.index.to_numpy()]

In [24]:
# Persist the full sensor data array and its train/test parts.
# open(..., 'wb') does not create missing parent directories, so make sure
# 'splits/' exists first; otherwise a fresh checkout fails with
# FileNotFoundError on the second dump.
os.makedirs('splits', exist_ok=True)

with open('data_list.pkl', 'wb') as f:
    pickle.dump(sensor_data_np, f)

with open('splits/data_train_list.pkl', 'wb') as f:
    pickle.dump(sensor_data_train_np, f)

with open('splits/data_test_list.pkl', 'wb') as f:
    pickle.dump(sensor_data_test_np, f)

In [25]:
# Stack all rows of the training recordings only (not the full data set) into
# one 2-D array, then drop the first three columns.
# NOTE(review): the remaining columns are presumably the sensor channels --
# confirm what columns 0-2 contain.
sensor_data_all = np.concatenate(sensor_data_train_np)[:, 3:]

In [26]:
# (total rows across the training recordings, number of kept columns)
np.shape(sensor_data_all)

(424710, 8)

In [27]:
# One entry per kept column: all training values recorded for that column.
sensor_response_distribution_list = list(sensor_data_all.T)

In [28]:
# Largest training value observed in each kept column.
true_max = sensor_data_all.max(axis=0)
true_max

array([518.0, 427.0, 795.0, 970.0, 979.0, 1204.0, 1110.0, 1191.0],
      dtype=object)

In [29]:
# Single global maximum over all kept columns of the training data.
RESPONSE_MAX = true_max.max()
RESPONSE_MAX

1204.0

In [30]:
# Save the global training maximum to the file 'response_max'.
with open('response_max', mode='wb') as response_max_file:
    pickle.dump(RESPONSE_MAX, response_max_file)