In [1]:
import numpy as np
import pandas as pd
import glob
import os
import sys
import warnings
warnings.filterwarnings("ignore")

In [2]:
FILE_PATH = os.path.expanduser("../metadata/electricity_egauge_hours/")
METADATA_PATH = os.path.expanduser("../metadata/metadata.csv")
HDF_PATH = os.path.expanduser("../2018_all.h5")


In [3]:
# Dropping days when DST changed

dst_times = []
dst_times.append(('2012-03-11 02:00:00', '2012-03-11 03:00:00'))
dst_times.append(('2012-11-04 01:00:00', '2012-11-04 02:00:00'))
dst_times.append(('2013-03-10 02:00:00', '2013-03-10 03:00:00'))
dst_times.append(('2013-11-03 01:00:00', '2013-11-03 02:00:00'))
dst_times.append(('2014-03-09 02:00:00', '2014-03-09 03:00:00'))
dst_times.append(('2014-11-02 01:00:00', '2014-11-02 02:00:00'))
dst_times.append(('2015-03-08 02:00:00', '2015-03-08 03:00:00'))
dst_times.append(('2015-11-01 01:00:00', '2015-11-01 02:00:00'))
dst_times.append(('2016-03-13 02:00:00', '2016-03-13 03:00:00'))
dst_times.append(('2016-11-06 01:00:00', '2016-11-06 02:00:00'))
dst_times.append(('2017-03-12 02:00:00', '2017-03-12 03:00:00'))
dst_times.append(('2017-11-05 01:00:00', '2017-11-05 02:00:00'))

In [4]:
files = os.listdir(FILE_PATH)
file_size= {x:os.path.getsize(FILE_PATH+x) for x in  files if '.csv' in x}
file_series = pd.Series(file_size)

In [5]:
file_series.head()

100055.csv    2369044
100056.csv    2339187
100059.csv    2679192
100062.csv     385611
100063.csv    3093331
dtype: int64

In [6]:
fs = file_series[file_series>1000]

In [7]:
store = pd.HDFStore(HDF_PATH, mode='a', complevel=9, complib='blosc')
feed_ignore = ['gen', 'grid']

In [8]:
count = 0
for building_number_csv in fs.index:
    print("Done %d of %d" %(count, len(fs)))
    try:
        building_path = os.path.join(FILE_PATH, building_number_csv)
        building_number = int(building_number_csv[:-4])
        if building_number in store.keys():
            continue
        df = pd.read_csv(building_path)
        df.index = pd.to_datetime(df["localhour"])
        df = df.drop("localhour", 1)
        
        # Dropping feeds
        for feed in feed_ignore:
            if feed in df.columns:
                df = df.drop(feed, 1)

        df = df.mul(1000)

        # Dropping feeds with 0 sum
        cols_to_keep = df.sum()[df.sum()>0].index
        df = df[cols_to_keep]

        # Dropping dataid
        if "dataid" in df.columns:
            df = df.drop('dataid', 1)
        df = df.drop(['Unnamed: 0'], axis=1)


        # Fixing DST issues
        for start, end in dst_times:
            ix_drop = df[start:end].index
            df = df.drop(ix_drop)


        # Assigning local timezone
        df = df.tz_localize('US/Central')

        # Making data float32
        df = df.astype('float32')
        
        # Write in temp HDF5 store
        store.put(str(building_number), df, format='table')
        count = count + 1
    except Exception:
        print(building_number, "Unexpected error:", sys.exc_info()[0])
store.close()

Done 0 of 834
Done 1 of 834
Done 2 of 834
Done 3 of 834
Done 4 of 834
Done 5 of 834
Done 6 of 834
Done 7 of 834
Done 8 of 834
Done 9 of 834
Done 10 of 834
Done 11 of 834
Done 12 of 834
Done 13 of 834
Done 14 of 834
Done 15 of 834
Done 16 of 834
Done 17 of 834
Done 18 of 834
Done 19 of 834
Done 20 of 834
Done 21 of 834
Done 22 of 834
Done 23 of 834
Done 24 of 834
Done 25 of 834
Done 26 of 834
Done 27 of 834
Done 28 of 834
Done 29 of 834
Done 30 of 834
Done 31 of 834
Done 32 of 834
Done 33 of 834
Done 34 of 834
Done 35 of 834
Done 36 of 834
Done 37 of 834
Done 38 of 834
Done 39 of 834
Done 40 of 834
Done 41 of 834
Done 42 of 834
Done 43 of 834
Done 44 of 834
Done 45 of 834
Done 46 of 834
Done 47 of 834
Done 48 of 834
Done 49 of 834
Done 50 of 834
Done 51 of 834
Done 52 of 834
Done 53 of 834
Done 54 of 834
Done 55 of 834
Done 56 of 834
Done 57 of 834
Done 58 of 834
Done 59 of 834
Done 60 of 834
Done 61 of 834
Done 62 of 834
Done 63 of 834
Done 64 of 834
Done 65 of 834
Done 66 of 834
Done 

Done 519 of 834
Done 520 of 834
Done 521 of 834
Done 522 of 834
Done 523 of 834
Done 524 of 834
Done 525 of 834
Done 526 of 834
Done 527 of 834
Done 528 of 834
Done 529 of 834
Done 530 of 834
Done 531 of 834
Done 532 of 834
Done 533 of 834
Done 534 of 834
Done 535 of 834
Done 536 of 834
Done 537 of 834
Done 538 of 834
Done 539 of 834
Done 540 of 834
Done 541 of 834
Done 542 of 834
Done 543 of 834
Done 544 of 834
Done 545 of 834
Done 546 of 834
Done 547 of 834
Done 548 of 834
Done 549 of 834
Done 550 of 834
Done 551 of 834
Done 552 of 834
Done 553 of 834
Done 554 of 834
Done 555 of 834
Done 556 of 834
Done 557 of 834
Done 558 of 834
Done 559 of 834
Done 560 of 834
Done 561 of 834
Done 562 of 834
Done 563 of 834
Done 564 of 834
Done 565 of 834
Done 566 of 834
Done 567 of 834
Done 568 of 834
Done 569 of 834
Done 570 of 834
Done 571 of 834
Done 572 of 834
Done 573 of 834
Done 574 of 834
Done 575 of 834
Done 576 of 834
Done 577 of 834
Done 578 of 834
Done 579 of 834
Done 580 of 834
Done 581

In [11]:
store = pd.HDFStore(os.path.expanduser("~/2018_all.h5"), 'r')

OSError: HDF5 error back trace

  File "H5F.c", line 586, in H5Fopen
    unable to open file
  File "H5Fint.c", line 1305, in H5F_open
    unable to lock the file
  File "H5FD.c", line 1839, in H5FD_lock
    driver lock request failed
  File "H5FDsec2.c", line 940, in H5FD_sec2_lock
    unable to lock file, errno = 11, error message = 'Resource temporarily unavailable'

End of HDF5 error back trace

Unable to open/create file '/zf14/yj9xs/2018_all.h5'

In [10]:
store.get(str(2018))

ClosedFileError: ../2018_all.h5 file is not open!