In [1]:
import torch
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [2]:
# Number of mark classes: stored as 'num_marks' in every saved dataset and used
# to keep only the last NUM_MARKS bin boundaries per aggregation.
NUM_MARKS = 10
# Aggregation settings. The first four entries select calendar grouping (peak
# reading per month/week/day/hour); the remaining entries are quantile levels
# passed to np.quantile. The last entry is also the zero-fraction threshold
# used when discarding files.
AGG_TYPE = ['month', 'week', 'day', 'hour', 0.9999, 0.9998, 0.9997, 0.9996, 0.9995, 0.999] 
# Number of equal-sized chunks the sorted labels are split into when deriving
# bin boundaries; one entry per AGG_TYPE element, in the same order.
PRE_NUM_MARKS = [10, 11, 11, 11, 10, 10, 10, 10, 10, 10]

In [3]:
# Unzip the data files
# ! unzip 94102.zip -d energy_94102/
# ! unzip 94103.zip -d energy_94103/
# ! unzip 94104.zip -d energy_94104/
# ! unzip 94105.zip -d energy_94105/

In [3]:
# One extracted folder per numeric suffix 94102..94105.
DATA_FOLDERS = [f"./energy_{suffix}/" for suffix in range(94102, 94106)]

In [4]:
# Display the resolved folder list as a sanity check.
DATA_FOLDERS

['./energy_94102/', './energy_94103/', './energy_94104/', './energy_94105/']

In [5]:
import glob

# Read every CSV found directly inside each data folder into its own DataFrame.
# glob returns matches in arbitrary (filesystem-dependent) order, so the paths
# are sorted to make the order of `data` -- and of everything derived from it
# by position -- reproducible across machines and runs.
data = [
    pd.read_csv(filepath)
    for directory in DATA_FOLDERS
    for filepath in sorted(glob.glob(f'{directory}*.csv'))
]

In [6]:
def to_float(x):
    """Convert a raw cell value to float, mapping unusable values to -100.

    Parameters
    ----------
    x : object
        Raw value read from the CSV (number, numeric string, or anything else).

    Returns
    -------
    float
        ``float(x)`` when the conversion succeeds and is not NaN; otherwise the
        sentinel ``-100.`` that the rest of the notebook treats as "missing".
    """
    try:
        value = float(x)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects; ValueError
        # covers strings that do not parse as a number.
        return -100.
    # NaN is the only float that differs from itself. Empty CSV cells arrive
    # as NaN and must be flagged with the same sentinel as unparsable text.
    if value != value:
        return -100.
    return value

In [7]:
# Normalise every frame: name the unnamed first column 'time', parse it as
# datetimes, coerce the readings to float (sentinel for bad values) and keep
# only the two columns used downstream.
for position, frame in enumerate(data):
    frame = frame.rename(columns={'Unnamed: 0': 'time'})
    frame['time'] = pd.to_datetime(frame['time'])
    frame['energykWh'] = frame['energykWh'].apply(to_float)
    data[position] = frame.loc[:, ['time', 'energykWh']]

In [8]:
# Drop files whose share of exactly-zero readings reaches the smallest quantile
# level in AGG_TYPE.
# The share is computed with a vectorised mean of the boolean mask rather than
# the builtin sum(), which walks the Series element by element in Python. An
# empty file yields NaN, which fails the comparison and is discarded instead of
# raising ZeroDivisionError.
print("Before discard: ", len(data))
data = [d for d in data if (d['energykWh'] == 0).mean() < AGG_TYPE[-1]]
print("After discard: ", len(data))

Before discard:  201
After discard:  191


In [9]:
# Per-file statistics on the -100 sentinel written by to_float.
num_na = [(frame['energykWh'] == -100).sum() for frame in data]
na_ratio = [count / len(frame['energykWh']) for count, frame in zip(num_na, data)]
contains_na = [count > 0 for count in num_na]

In [10]:
# Summarise how widespread the sentinel values are across files.
files_with_na = sum(contains_na)
na_count_mean, na_count_std = np.mean(num_na), np.std(num_na)
na_ratio_mean, na_ratio_std = np.mean(na_ratio), np.std(na_ratio)
print(f"{files_with_na} out of {len(data)} files contain NA values)")
print(f"Mean number of NA {na_count_mean:.1f} ({na_count_std:.1f})")
print(f"Mean number of NA Ratio {na_ratio_mean:.4f} ({na_ratio_std:.4f})")

182 out of 191 files contain NA values)
Mean number of NA 9.8 (18.5)
Mean number of NA Ratio 0.0001 (0.0002)


In [11]:
# Check whether files are sorted by time
# A file counts as sorted when every timestamp is >= the one before it.
data_sorted = []
for frame in data:
    timestamps = frame['time'].values
    data_sorted.append(np.all(timestamps[1:] >= timestamps[:-1]))
print(all(data_sorted))

True


In [12]:
data_agg = {}

# Grouping keys per calendar aggregation. Each builder receives the 'time'
# column and returns the list of key Series handed to groupby.
# Note: the 'week' key includes the month, so a week that spans two months
# forms two separate groups.
calendar_keys = {
    'month': lambda ts: [ts.dt.year, ts.dt.month],
    'week': lambda ts: [ts.dt.year, ts.dt.month, ts.dt.strftime('%W')],
    'day': lambda ts: [ts.dt.year, ts.dt.month, ts.dt.day],
    'hour': lambda ts: [ts.dt.year, ts.dt.month, ts.dt.day, ts.dt.hour],
}

# For each calendar aggregation keep, per file, the row holding the largest
# reading within every group.
for agg in AGG_TYPE[:4]:
    build_keys = calendar_keys[agg]
    peak_frames = data_agg[agg] = []
    for frame in tqdm(data):
        peak_rows = frame.groupby(build_keys(frame['time']))['energykWh'].idxmax()
        peak_frames.append(frame.loc[peak_rows])

100%|██████████| 191/191 [00:02<00:00, 92.83it/s]
100%|██████████| 191/191 [00:51<00:00,  3.72it/s]
100%|██████████| 191/191 [00:08<00:00, 22.27it/s]
100%|██████████| 191/191 [02:42<00:00,  1.17it/s]


In [13]:
def get_peaks_by_quantile(arr, q):
    """Return the positions of values at or above the q-th quantile of ``arr``.

    Parameters
    ----------
    arr : numpy.ndarray
        Values to threshold.
    q : float
        Quantile level forwarded to ``np.quantile``.

    Returns
    -------
    numpy.ndarray
        Indices (along the first axis) where ``arr >= np.quantile(arr, q)``.

    Raises
    ------
    ValueError
        If the computed quantile is exactly 0.
    """
    threshold = np.quantile(arr, q)
    if threshold == 0:
        raise ValueError("Quantile value is 0")
    # np.nonzero(cond) is what the single-argument np.where(cond) resolves to.
    return np.nonzero(arr >= threshold)[0]

In [14]:
# For each quantile level keep, per file, only the rows at or above that
# file's own quantile threshold.
# NOTE(review): a file with no peak rows is skipped, which would make
# data_agg[agg] shorter than `data`; the dataset-building cell pairs
# data_agg[agg][i] with data[i] by position, so a skip would misalign them.
for agg in AGG_TYPE[4:]:
    peak_frames = data_agg[agg] = []
    for frame in data:
        peak_idx = get_peaks_by_quantile(frame['energykWh'].values, agg)
        if len(peak_idx) != 0:
            peak_frames.append(frame.iloc[peak_idx])
        

In [17]:
# Average number of peak events per file at the 0.9996 quantile level.
np.mean([len(frame) for frame in data_agg[0.9996]])

356.717277486911

In [18]:
# Create labels
# Create labels: the raw energy readings of every aggregated sequence.
raw_labels = {
    agg: [frame['energykWh'].values for frame in data_agg[agg]]
    for agg in AGG_TYPE
}

In [19]:
boundry_values = {}

# Derive the bin edges used to turn readings into marks. For each aggregation:
# pool all labels, drop the -100 sentinel, sort, split the sorted positions
# into PRE_NUM_MARKS[idx] near-equal chunks and take the value at the start of
# each chunk as an edge.
for idx, agg in enumerate(AGG_TYPE):
    pooled = np.concatenate(raw_labels[agg])
    observed = np.sort(pooled[pooled != -100.])
    print(agg)
    chunks = np.array_split(np.arange(len(observed)), PRE_NUM_MARKS[idx])
    chunk_starts = [chunk[0] for chunk in chunks]
    # Keep only the last NUM_MARKS edges, then close the top bin with +inf.
    edges = np.append(observed[chunk_starts][-NUM_MARKS:], np.inf)
    if agg == 'hour':
        # Extra lowest edge below the -100 sentinel, for the hourly series only.
        edges = np.concatenate([[-101.], edges])
    boundry_values[agg] = edges
    

month
week
day
hour
0.9999
0.9998
0.9997
0.9996
0.9995
0.999


In [20]:
# Bin every sequence's readings into integer marks using the per-aggregation
# edges. The hourly edges carry one extra leading edge, hence the larger shift.
marks = {}
for agg in AGG_TYPE:
    shift = 2 if agg == 'hour' else 1
    marks[agg] = [
        np.digitize(raw_labels[agg][i], boundry_values[agg]) - shift
        for i in range(len(data_agg[agg]))
    ]
        

In [21]:
# Impute missing
rng = np.random.default_rng(seed=123)

for m in marks['hour']:
    for i in range(len(m)):
        if m[i] == -1:
            neighbors = []
            for j in range(i-1, -1, -1):
                if m[j] != -1:
                    neighbors.append(m[j])
                    break
            for j in range(i+1, len(m)):
                if m[j] != -1:
                    neighbors.append(m[j])
                    break
            m[i] = rng.choice(neighbors)

In [27]:
rng = np.random.default_rng(seed=1234)

# Assemble, per aggregation, a dict holding the list of sequences and the
# number of mark classes.
dataset = {}

for agg in AGG_TYPE:
    sequences = []
    dataset[agg] = {'sequences': sequences, 'num_marks': NUM_MARKS}

    for i, events in enumerate(tqdm(data_agg[agg])):
        # Event timestamps as epoch seconds. time.mktime interprets the naive
        # timestamps in the machine's local time zone.
        event_times = events['time'].apply(lambda ts: time.mktime(ts.timetuple())).values
        event_marks = marks[agg][i]
        event_energy = events['energykWh'].values
        assert len(event_times) == len(event_marks)

        # Observation window: first and last reading of the file at the same
        # position in `data`, padded by 900 seconds on each side.
        file_times = data[i]['time']
        t_start = time.mktime(file_times.iloc[0].timetuple()) - 900
        t_end = time.mktime(file_times.iloc[-1].timetuple()) + 900

        # Redraw Gaussian jitter (std 60 s) until the first and last jittered
        # times fall inside the window.
        # NOTE(review): the jittered `arrival_times` is never written into the
        # sequence; the stored 'arrival_times' are the un-jittered values.
        # Confirm whether the jitter was meant to be saved.
        while True:
            arrival_times = event_times + rng.normal(0, 60, len(event_times))
            if t_start < arrival_times[0] <= arrival_times[-1] < t_end:
                break

        sequence = {
            'arrival_times': event_times,
            'marks': event_marks,
            'energykWh': event_energy,
            't_start': t_start,
            't_end': t_end,
        }
        sequences.append(sequence)

100%|██████████| 191/191 [00:00<00:00, 1465.07it/s]
100%|██████████| 191/191 [00:00<00:00, 1054.16it/s]
100%|██████████| 191/191 [00:00<00:00, 241.90it/s]
100%|██████████| 191/191 [00:19<00:00,  9.77it/s]
100%|██████████| 191/191 [00:00<00:00, 3346.67it/s]
100%|██████████| 191/191 [00:00<00:00, 2957.68it/s]
100%|██████████| 191/191 [00:00<00:00, 2580.40it/s]
100%|██████████| 191/191 [00:00<00:00, 361.95it/s]
100%|██████████| 191/191 [00:00<00:00, 498.16it/s]
100%|██████████| 191/191 [00:00<00:00, 315.84it/s]


In [28]:
# Write one file per aggregation setting into the working directory.
for agg in AGG_TYPE:
    out_path = f'./energy_{agg}.pkl'
    torch.save(dataset[agg], out_path)

In [29]:
# Reload the file written above for the 0.9996 quantile level as a round-trip check.
# torch.load unpickles its input, so only use it on files you produced yourself.
dd = torch.load("energy_0.9996.pkl")

In [31]:
# Inspect the second stored sequence of the reloaded dataset.
dd['sequences'][1]

{'arrival_times': array([1.4284161e+09, 1.4328207e+09, 1.4471703e+09, 1.4477778e+09,
        1.4489811e+09, 1.4489820e+09, 1.4489856e+09, 1.4489865e+09,
        1.4489874e+09, 1.4489883e+09, 1.4489892e+09, 1.4489901e+09,
        1.4489910e+09, 1.4520132e+09, 1.4521806e+09, 1.4521815e+09,
        1.4527863e+09, 1.4528016e+09, 1.4528025e+09, 1.4533065e+09,
        1.4538339e+09, 1.4538348e+09, 1.4544198e+09, 1.4544297e+09,
        1.4544306e+09, 1.4544333e+09, 1.4574528e+09, 1.4574537e+09,
        1.4579712e+09, 1.4679054e+09, 1.4704020e+09, 1.4704029e+09,
        1.4704038e+09]),
 'marks': array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]),
 'energykWh': array([0.693, 0.717, 0.712, 0.786, 1.008, 0.8  , 0.803, 0.778, 0.771,
        0.763, 0.772, 0.8  , 0.732, 0.731, 0.855, 0.831, 0.829, 0.799,
        0.692, 0.721, 0.743, 0.78 , 0.684, 0.874, 0.817, 0.764, 0.708,
        0.788, 0.689, 0.671, 0.827, 0.818, 0.799]),
 't_start'