In [10]:
import pandas as pd
import glob
import os
import warnings
warnings.filterwarnings("ignore")


FILE_PATH = os.path.expanduser("~/git/dataport/metadata/electricity_egauge_hours/")
METADATA_PATH = os.path.expanduser("~/git/dataport/metadata/metadata.csv")
HDF_PATH = os.path.expanduser("~/monthly-12-17.h5")




In [2]:
# Dropping days when DST changed

dst_times = []
dst_times.append(('2012-03-11 02:00:00', '2012-03-11 03:00:00'))
dst_times.append(('2012-11-04 01:00:00', '2012-11-04 02:00:00'))
dst_times.append(('2013-03-10 02:00:00', '2013-03-10 03:00:00'))
dst_times.append(('2013-11-03 01:00:00', '2013-11-03 02:00:00'))
dst_times.append(('2014-03-09 02:00:00', '2014-03-09 03:00:00'))
dst_times.append(('2014-11-02 01:00:00', '2014-11-02 02:00:00'))
dst_times.append(('2015-03-08 02:00:00', '2015-03-08 03:00:00'))
dst_times.append(('2015-11-01 01:00:00', '2015-11-01 02:00:00'))
dst_times.append(('2016-03-13 02:00:00', '2016-03-13 03:00:00'))
dst_times.append(('2016-11-06 01:00:00', '2016-11-06 02:00:00'))
dst_times.append(('2017-03-12 02:00:00', '2017-03-12 03:00:00'))
dst_times.append(('2017-11-05 01:00:00', '2017-11-05 02:00:00'))

In [3]:
# Feeds to ignore
feed_ignore = ['gen', 'grid']


#list_of_buildings = glob.glob("/Users/nipunbatra/w/*.csv")
files = os.listdir(FILE_PATH)
file_size= {x:os.path.getsize(FILE_PATH+x) for x in  files if '.csv' in x}
file_series = pd.Series(file_size)
#file_series = file_series.drop(METADATA_PATH)
fs = file_series[file_series>1000]

In [4]:
store = pd.HDFStore(HDF_PATH, mode='a', complevel=9, complib='blosc')
count = 0

In [12]:
for building_number_csv in fs.index:
    print ("Done {} of {}".format(count, len(fs)))
    try:
        building_path = os.path.join(FILE_PATH, building_number_csv)
        building_number = int(building_number_csv[:-4])
        if building_number in store.keys():
            continue
        df = pd.read_csv(building_path)
        df.index = pd.to_datetime(df["localhour"])
        df = df.drop("localhour", 1)
        # Dropping feeds
        for feed in feed_ignore:
            if feed in df.columns:
                df = df.drop(feed, 1)

        df = df.mul(1000)

        # Dropping feeds with 0 sum
        cols_to_keep = df.sum()[df.sum()>0].index
        df = df[cols_to_keep]

        # Dropping dataid
        if "dataid" in df.columns:
            df = df.drop('dataid', 1)
        df = df.drop(['Unnamed: 0'], axis=1)


        # Fixing DST issues
        for start, end in dst_times:
            ix_drop = df[start:end].index
            df = df.drop(ix_drop)


        # Assigning local timezone
        df = df.tz_localize('US/Central')

        # Making data float32
        df = df.astype('float32')

        # Write in temp HDF5 store
        store.put(str(building_number), df, format='table')
        count = count + 1
    except Exception as e:
        print(e)





Done 8 of 834
Done 9 of 834
Done 10 of 834
Done 11 of 834
Done 12 of 834
2017-03-12 02:00:00
Done 12 of 834
2017-03-12 02:00:00
Done 12 of 834
Done 13 of 834
2017-03-12 02:00:00
Done 13 of 834
2017-03-12 02:00:00
Done 13 of 834
Done 14 of 834
2017-03-12 02:00:00
Done 14 of 834
2017-03-12 02:00:00
Done 14 of 834
2017-03-12 02:00:00
Done 14 of 834
2017-03-12 02:00:00
Done 14 of 834
Done 15 of 834
2017-03-12 02:00:00
Done 15 of 834
2017-03-12 02:00:00
Done 15 of 834
2017-03-12 02:00:00
Done 15 of 834
Done 16 of 834
Done 17 of 834
2017-03-12 02:00:00
Done 17 of 834
2017-03-12 02:00:00
Done 17 of 834
Done 18 of 834
Done 19 of 834
Done 20 of 834
2017-03-12 02:00:00
Done 20 of 834
2017-03-12 02:00:00
Done 20 of 834
2017-03-12 02:00:00
Done 20 of 834
Done 21 of 834
2017-03-12 02:00:00
Done 21 of 834
Done 22 of 834
Done 23 of 834
2017-03-12 02:00:00
Done 23 of 834
2017-03-12 02:00:00
Done 23 of 834
Done 24 of 834
2017-03-12 02:00:00
Done 24 of 834
Done 25 of 834
Done 26 of 834
Done 27 of 834
Ca

KeyboardInterrupt: 