In [1]:
# Render figures inline. As the message printed below shows, this magic also
# populates the interactive namespace from numpy and matplotlib.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import sys
sys.path.insert(0, '../src')

import pickle
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.ticker as plticker

from datetime import timedelta
from pathlib import Path
from multiprocessing import Pool
from functools import partial

from IdealDataInterface import IdealDataInterface

from config import TIME_FORMAT, SAMPLING_MASK
from config import SENSOR_DATA_FOLDER, CACHE_FOLDER, CPU_HIGH_MEMMORY, CPU_LOW_MEMMORY
from config import EVALUATION_PERIOD, FFILL_LIMIT, N_SAMPLES, SAMPLING_WINDOW
from utils import treatment_control, load_cached_data
from sampling import data_to_sample_array, sample_energy, compute_sample_sizes, compute_estimate

In [3]:
# Run the shared plotting-style script.
# NOTE(review): `-i` executes the script in this notebook's interactive namespace,
# so sns_styles.py presumably relies on names imported above (e.g. `sns`) -- confirm
# before reordering or removing earlier cells.
%run -i '../src/sns_styles.py'

# Colour palette returned by seaborn once the style script has run.
# NOTE(review): `cmap` is not referenced in the cells shown here.
cmap = sns.color_palette()

In [4]:
def estimate_period(period, N, SAMPLING_MASK, SAMPLING_WINDOW):
    """Compute the energy estimate of every home for a single period.

    Parameters
    ----------
    period
        Name of the period. It is passed to ``load_cached_data`` and written
        to the ``period`` column of the returned frame.
    N, SAMPLING_MASK, SAMPLING_WINDOW
        Forwarded unchanged, as keyword arguments, to ``compute_estimate``.

    Returns
    -------
    pandas.DataFrame
        Built from the values ``compute_estimate`` returns (one per column of
        the cached data), with two extra columns: ``homeid`` (the column labels
        of the cached data) and ``period``.
    """
    # Load the cached data for this period, excluding homes with bad data
    # availability. See notebook 02.1-... for the generation of the cached files.
    homes = load_cached_data(period, full=False)
    home_ids = homes.columns

    # Bind the sampling settings once so the pool only has to ship one
    # time series (one home) per task.
    estimate_home = partial(
        compute_estimate,
        N=N,
        SAMPLING_MASK=SAMPLING_MASK,
        SAMPLING_WINDOW=SAMPLING_WINDOW,
    )
    per_home_series = [homes[home_id] for home_id in home_ids]

    # Spread the homes over a pool of CPU_LOW_MEMMORY worker processes.
    with Pool(processes=CPU_LOW_MEMMORY) as workers:
        per_home_estimates = workers.map(estimate_home, per_home_series)

    # Collect everything into a DataFrame and label each row with its home and period.
    return pd.DataFrame(per_home_estimates).assign(homeid=home_ids, period=period)

## Compute the estimates per period

In [5]:
# One DataFrame of per-home estimates is collected for each evaluation period.
result = []

for period in EVALUATION_PERIOD.keys():
    print('Computing period {}..'.format(period))
    # Estimate every home in this period and keep the resulting frame.
    result.append(estimate_period(period, N_SAMPLES, SAMPLING_MASK, SAMPLING_WINDOW))

print('Done.')

Computing period P1_1..
Computing period P1_2..
Computing period P2_1..
Computing period P2_2..
Computing period P3..
Done.


In [None]:
# Description of the columns of df_result, including the less obvious min_sample_size and samples_missing.
# The columns are also described in the documentation on Overleaf. They are as follows:
#
# homeid          | The homeid
# period          | The name of the period as defined in the config file
# estimate_mean   | The mean energy estimate, i.e. the mean of the sums. Will be NaN if it cannot be computed.
# estimate_std    | The standard deviation of the estimates, i.e. the std of the sums. Will be NaN if 
#                 | it cannot be computed.
# min_sample_size | The absolute minimum sample size to sample one time point (worst case)
# samples_missing | The number of time points for which no samples could be drawn, i.e. time points with
#                 | min_sample_size equal to 0. This is helpful to know in case the number of available
#                 | samples should be increased, for example by increasing the window width.
# missing_data    | The percentage of missing data in the input time series

In [6]:
# Stack the per-period frames into a single table. Every per-period frame is
# built with its own 0..n-1 index, so a plain concat would repeat index labels
# once per period; renumber the rows so each label identifies exactly one row.
df_result = pd.concat(result, ignore_index=True)

df_result.head()

Unnamed: 0,estimate_mean,estimate_std,min_sample_size,missing_data,samples_missing,homeid,period
0,6.18467,0.028219,7569,0.026611,0,59,P1_1
1,12.604855,0.038171,8181,0.003662,0,74,P1_1
2,5.475857,0.033081,7853,0.022078,0,78,P1_1
3,4.932472,0.026409,8159,0.005062,0,71,P1_1
4,11.193172,0.051557,7942,0.016887,0,79,P1_1


In [7]:
# Destination of the exported estimates (written tab-separated, see below).
fname = '../data/estimated_average_electricity_use.csv'

# Keep only the documented columns, identifiers first, in a fixed order.
df_result = df_result.loc[:, [
    'homeid',
    'period',
    'estimate_mean',
    'estimate_std',
    'min_sample_size',
    'samples_missing',
    'missing_data',
]]

# Write without the row index; floats are rounded to three decimals.
df_result.to_csv(fname, index=False, sep='\t', float_format='%.3f')