The baseline period is defined as a maximum number of days before a "start_time". This start time will typically mark the start of a treatment (or a similar intervention) and thus the end of the baseline period. It has to be provided externally and is loaded by the notebook to limit the estimation period. The time available for the baseline computation can vary between homes, and the dates used for each home are written to the output file. In the IDEAL data, some homes had already been collecting data for quite some time before they were allocated to treatment or control (which marks the end of the baseline period). To avoid including seasonal effects etc. in the baseline, the number of days used for the baseline estimation can be limited in the config file.

In [1]:
# NOTE(review): %pylab populates the interactive namespace from numpy and matplotlib
# (see the message printed below), which hides where names come from. Other cells or
# scripts executed with `%run -i` may rely on those names - confirm before replacing
# this with `%matplotlib inline` plus explicit imports.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import sys
sys.path.insert(0, '../src')

import re
import pickle
import datetime

import numpy as np
import pandas as pd

import seaborn as sns

from datetime import timedelta
from pathlib import Path
from multiprocessing import Pool
from functools import partial

from IdealDataInterface import IdealDataInterface

from config import TIME_FORMAT, SAMPLING_MASK
from config import SENSOR_DATA_FOLDER, CACHE_FOLDER, CPU_HIGH_MEMMORY, CPU_LOW_MEMMORY
from config import EVALUATION_PERIOD, FFILL_LIMIT, SAMPLING_WINDOW_BASELINE, BASELINE_NR_DAYS, N_SAMPLES_BASELINE

from utils import treatment_control, load_mains
from sampling import data_to_sample_array, sample_energy, compute_sample_sizes

In [3]:
# Run plotting styles
# `%run -i` executes the script in this notebook's own namespace, so the script can
# use names already defined here and anything it defines stays available afterwards.
%run -i '../src/sns_styles.py'

# Keep a handle on the current seaborn colour palette
cmap = sns.color_palette()

In [4]:
# Load the table that records when each home was assigned to treatment or control.
# The start_date column marks the end of the baseline period of a home. The loader is
# currently hard-coded into the analysis pipeline: if the estimator should be used for
# other data, treatment_control() must be modified to return a similar table (this
# notebook reads the columns homeid, group and start_date).
df_group = treatment_control()

# Show the last few rows as a quick sanity check
df_group.tail()

Unnamed: 0,homeid,group,start_date,end_date
263,331,treatment,2018-05-15,2018-06-30
264,332,treatment,2018-05-15,2018-06-30
265,334,control,2018-05-15,2018-06-30
266,335,treatment,2018-05-15,2018-06-30
267,333,control,2018-05-15,2018-06-30


In [5]:
# Boolean masks selecting the homes of each experimental group
is_control = df_group['group'] == 'control'
is_treatment = df_group['group'] == 'treatment'
is_enhanced = df_group['group'] == 'enhanced'

# Home ids per group (control, treatment, and enhanced)
homeid_control = df_group.loc[is_control, 'homeid']
homeid_treatment = df_group.loc[is_treatment, 'homeid']
homeid_enhanced = df_group.loc[is_enhanced, 'homeid']

# Report the size of each group
n_control = len(homeid_control)
n_treatment = len(homeid_treatment)
n_enhanced = len(homeid_enhanced)
print('Found {} homes in the control group'.format(n_control))
print('Found {} homes in the treatment group'.format(n_treatment))
print('Found {} homes in the enhanced group'.format(n_enhanced))

Found 107 homes in the control group
Found 106 homes in the treatment group
Found 39 homes in the enhanced group


In [6]:
# Load the electricity readings, create the array to sample from, and compute the estimate
def compute_baseline(homeid, seconds, N):
    """Compute the baseline electricity estimate for a single home.

    The mains readings of the home are restricted to the baseline period: from
    the start of the first full day of data up to (but excluding) the home's
    ``start_date`` in the table returned by ``treatment_control()``. Of that
    period, at most the last ``BASELINE_NR_DAYS`` days are used.

    Parameters
    ----------
    homeid
        Id of the home; looked up in the ``homeid`` column of the
        treatment/control table.
    seconds
        Window widths for which the sample sizes are computed (passed on to
        ``compute_sample_sizes``).
    N
        Number of samples, passed on to ``sample_energy``.

    Returns
    -------
    dict
        ``estimate_mean`` / ``estimate_std``: mean and standard deviation of
        the sampled estimates (nan if no estimate could be computed),
        ``sample_sizes``: smallest sample size per window width,
        ``missing_data``: fraction of missing readings in the data used,
        ``nr_days``: number of days available before limiting the period,
        ``baseline_start`` / ``baseline_end``: first and last timestamp of the
        data actually used.

    Raises
    ------
    IndexError
        If ``homeid`` has no entry in the treatment/control table.
    AssertionError
        If the home has no readings within the baseline period.
    """
    # Load the mains electricity readings
    ts = load_mains(homeid)

    # Look up when this home was allocated to control or treatment. The table is
    # loaded inside the function so that every call is self-contained.
    df_group = treatment_control()
    start_dates = df_group.loc[df_group['homeid'] == homeid, 'start_date'].values
    if len(start_dates) == 0:
        # Same exception type as a bare ``.values[0]`` lookup, but naming the home
        # so that a failure inside a worker pool can be traced back.
        raise IndexError('No start_date found for homeid {}'.format(homeid))
    start_date = start_dates[0]

    # Limit to the baseline period (before being allocated to control or treatment).
    # The lower bound drops a partial first day of readings.
    ts = ts[(ts.index >= ts.index.min().ceil('D')) & (ts.index < start_date)]

    assert len(ts) > 0, 'No mains readings in the baseline period for homeid {}'.format(homeid)

    # Record the number of days available for the baseline
    # The +1 is there as the difference will be e.g. 13 days 23:59:59 seconds
    nr_days = (ts.index.max() - ts.index.min()).days + 1

    # Limit the data so that the baseline is computed on a maximum number of days
    ts = ts[ts.index > (ts.index.max() - timedelta(days=BASELINE_NR_DAYS))]

    # Compute the sampling sizes depending on the window_width as specified in seconds
    # and keep only the smallest value found for each window width
    sample_sizes = compute_sample_sizes(ts, seconds)
    sample_sizes = {k: min(v) for k, v in sample_sizes.items()}

    # Compute the baseline estimate
    arr, idx = data_to_sample_array(ts, window_width=None)

    # Estimates will be nan if they can't be computed
    # NOTE(review): presumably a 0 in idx marks a time point without usable data - confirm
    estimates = np.array([np.nan, ])
    if not (idx == 0).any():
        estimates = sample_energy(N, arr, idx, SAMPLING_MASK, window_width=SAMPLING_WINDOW_BASELINE)

    return {'estimate_mean': np.mean(estimates),
            'estimate_std': np.std(estimates),
            'sample_sizes': sample_sizes,
            'missing_data': ts.isna().mean(),
            'nr_days': nr_days,
            'baseline_start': ts.index.min(),
            'baseline_end': ts.index.max(),
           }

In [7]:
# Window widths for which the sample size is computed. Looking at a range of values
# should give a feel for what an appropriate value for the window_width might be.
seconds = [1,2,3,4,5,10,15,30,60,90,120]

# Make sure the window width configured for the baseline is part of the list
if SAMPLING_WINDOW_BASELINE not in seconds:
    seconds = sorted(seconds + [SAMPLING_WINDOW_BASELINE])

# Homes to process: the control homes followed by the treatment homes
homeids = [*homeid_control, *homeid_treatment]

# Bind the fixed arguments so that the pool only has to map over the home ids
func = partial(compute_baseline, seconds=seconds, N=N_SAMPLES_BASELINE)

n_workers = 2*CPU_HIGH_MEMMORY

print('Computing the baseline estimates per home. This may take a while..')
with Pool(processes=n_workers) as pool:
    baseline_results = pool.map(func, homeids)
print('Done.')

Computing the baseline estimates per home. This may take a while..
Done.


In [8]:
# Report how many per-home results were returned by the pool
n_results = len(baseline_results)
print('Computed estimates for {} homes'.format(n_results))

Computed estimates for 213 homes


## Check the minimum sampling size

This is the worst case that occurred for at least one time point of the day.

In [9]:
# Collect the minimum number of samples of every home:
# one row per home, one column per window width
min_sample_sizes = [item['sample_sizes'] for item in baseline_results]
df = pd.DataFrame(min_sample_sizes, index=homeids, dtype=int)

df.head()

Unnamed: 0,0,1,2,3,4,5,10,15,30,60,90,120
59,39,117,195,273,351,429,819,1209,2379,4749,7149,9549
74,39,117,195,273,351,429,819,1211,2411,4811,7211,9611
78,34,102,170,238,306,374,721,1078,2147,4299,6517,8730
71,38,116,194,272,350,428,818,1208,2403,4807,7217,9617
76,25,77,130,183,235,288,558,836,1663,3388,5218,7066


In [10]:
# The absolute minimum observed across all homes for each window_width parameter (the index).
# A value of 0 indicates that it is impossible to compute a baseline for at least one home if
# the window_width is chosen as indicated by the index.
df.min(axis=0)

0        0
1        0
2        0
3        0
4        0
5        0
10       0
15       3
30      11
60      74
90     147
120    244
dtype: int64

## Write the result to disk

In [11]:
# Construct the final DataFrame
# Output column -> key of the per-home result dict it is copied from
field_map = {
    'estimate_mean': 'estimate_mean',
    'estimate_std': 'estimate_std',
    'missing_data': 'missing_data',
    'nr_days_available': 'nr_days',
    'baseline_start': 'baseline_start',
    'baseline_end': 'baseline_end',
}

data = []
for result in baseline_results:
    row = {column: result[key] for column, key in field_map.items()}
    # Sample size for the window width that is used for the baseline estimate
    row['min_sample_size'] = result['sample_sizes'][SAMPLING_WINDOW_BASELINE][0]
    data.append(row)

df_result = pd.DataFrame(data)

# The results are in the same order as the home ids they were computed for
df_result['homeid'] = homeids

# Fix the column order of the output table
column_order = ['homeid', 'estimate_mean', 'estimate_std', 'min_sample_size',
                'missing_data', 'baseline_start', 'baseline_end', 'nr_days_available']
df_result = df_result[column_order]

df_result.head()

Unnamed: 0,homeid,estimate_mean,estimate_std,min_sample_size,missing_data,baseline_start,baseline_end,nr_days_available
0,59,6.934674,0.032738,4749,0.022207,2017-02-02,2017-03-15 23:59:59,160
1,74,14.350139,0.036927,4811,0.018192,2017-02-02,2017-03-15 23:59:59,69
2,78,6.802723,0.037634,4299,0.06408,2017-03-04,2017-04-14 23:59:59,85
3,71,5.59171,0.032068,4807,0.008122,2017-04-04,2017-05-15 23:59:59,124
4,76,12.003025,0.043057,3388,0.191335,2017-02-02,2017-03-15 23:59:59,56


In [12]:
# Save the DataFrame to disk (tab-separated, despite the .csv extension)
fname = '../data/baseline_estimates_per_home.csv'

# reset_index() turns the row index into a regular column, so it ends up as the first
# column of the file; index=False keeps the fresh index from being written as well.
df_out = df_result.reset_index()
df_out.to_csv(fname, sep='\t', float_format='%.3f', date_format=TIME_FORMAT, index=False)