In [2]:
from numpy import unique
from numpy import nan
from numpy import array
from numpy import savetxt
from pandas import read_csv

def to_chunks(values, chunk_ix=1):
    """Group the rows of an array by the identifier held in one column.

    Args:
        values: array indexed as ``values[row, column]``.
        chunk_ix: index of the column that carries the chunk identifier.

    Returns:
        dict mapping each distinct identifier (in the order produced by
        ``numpy.unique``) to the sub-array of rows carrying that identifier.
    """
    id_column = values[:, chunk_ix]
    # one boolean mask per distinct id selects that chunk's rows
    return {
        chunk_id: values[id_column == chunk_id, :]
        for chunk_id in unique(id_column)
    }

# split each chunk into train/test sets
def split_train_test(chunks, row_in_chunk_ix=2, cut_point=5 * 24):
    """Split every chunk into leading (train) and trailing (test) rows.

    Rows whose position-within-chunk value is at most ``cut_point`` go to the
    train split; rows with a larger position go to the test split. A chunk
    that would leave either split empty is reported on stdout and skipped.

    Only a subset of columns is kept, in this order: column 1 (chunk id),
    column ``row_in_chunk_ix`` (position within chunk), column 5 (hour) and
    every column from index 56 onwards (targets).

    Args:
        chunks: dict mapping chunk id to the array of that chunk's rows.
        row_in_chunk_ix: index of the column holding the position within the
            chunk; used both for splitting and as the stored position column.
        cut_point: last position that still belongs to the train split.
            Defaults to the first 5 days of hourly observations.

    Returns:
        (train, test): two lists of arrays, aligned chunk by chunk.
    """
    train, test = list(), list()
    # enumerate chunks
    for chunk_id, rows in chunks.items():
        positions = rows[:, row_in_chunk_ix]
        # two explicit comparisons: a NaN position satisfies neither of them
        train_rows = rows[positions <= cut_point, :]
        test_rows = rows[positions > cut_point, :]
        if len(train_rows) == 0 or len(test_rows) == 0:
            print('>dropping chunk=%d: train=%s, test=%s' % (chunk_id, train_rows.shape, test_rows.shape))
            continue
        # keep chunk id, position in chunk, hour and all targets; the position
        # column is the one the caller named, not a fixed index
        keep = [1, row_in_chunk_ix, 5] + list(range(56, rows.shape[1]))
        train.append(train_rows[:, keep])
        test.append(test_rows[:, keep])
    return train, test

# return a list of relative forecast lead times
def get_lead_times():
    """Return a fresh list of the relative forecast lead times."""
    # consecutive short horizons first, then the sparser long horizons
    short_horizons = list(range(1, 6))
    long_horizons = [10, 17, 24, 48, 72]
    return short_horizons + long_horizons

# convert the rows in a test chunk to forecasts
def to_forecasts(test_chunks, row_in_chunk_ix=1, cut_point=5 * 24, lead_times=None):
    """Pick, for every test chunk, the row observed at each forecast lead time.

    For each chunk and each lead time ``tau`` the row whose position column
    equals ``cut_point + tau`` is selected. When the chunk has no such row a
    placeholder row is emitted instead: NaN everywhere except the chunk id in
    column 0 and the wanted position in column ``row_in_chunk_ix``. The
    placeholder has the same width as the chunk's rows.

    Args:
        test_chunks: iterable of non-empty arrays, one per chunk, with the
            chunk id in column 0.
        row_in_chunk_ix: index of the column holding the position in chunk.
        cut_point: position of the last train row; lead times are counted
            from it. Defaults to the first 5 days of hourly observations.
        lead_times: iterable of relative lead times; when None the values
            from ``get_lead_times()`` are used.

    Returns:
        array with one row per (chunk, lead time) pair, chunk by chunk in
        lead-time order.
    """
    if lead_times is None:
        lead_times = get_lead_times()
    forecasts = list()
    # enumerate each chunk
    for rows in test_chunks:
        chunk_id = rows[0, 0]
        n_cols = rows.shape[1]
        positions = rows[:, row_in_chunk_ix]
        # enumerate each lead time
        for tau in lead_times:
            # position in chunk that corresponds to this lead time
            offset = cut_point + tau
            row_for_tau = rows[positions == offset, :]
            if len(row_for_tau) == 0:
                # placeholder as wide as the real rows so the stacked result
                # stays rectangular whatever the number of target columns
                row = [nan] * n_cols
                row[0] = chunk_id
                row[row_in_chunk_ix] = offset
                forecasts.append(row)
            else:
                # store the forecast row
                forecasts.append(row_for_tau[0])
    return array(forecasts)

In [3]:
# load the training CSV; header=0 takes the column names from the first line
dataset = read_csv('input/TrainingData.csv', header=0)
# array form of the frame (the helper functions above index columns by position)
values = dataset.values


In [6]:
# show the first rows as a quick check of how the columns were parsed
dataset.head()


Unnamed: 0,rowID,chunkID,position_within_chunk,month_most_common,weekday,hour,Solar.radiation_64,WindDirection..Resultant_1,WindDirection..Resultant_1018,WindSpeed..Resultant_1,...,target_4_6006,target_4_8003,target_5_6006,target_7_57,target_8_57,target_8_4002,target_8_6004,target_8_8003,target_9_4002,target_9_8003
0,1,1,1,10,Saturday,21,0.01,117.0,187.0,0.3,...,1.748424,,,5.130631,1.341606,2.138792,3.013752,,5.67928,
1,2,1,2,10,Saturday,22,0.01,231.0,202.0,0.5,...,2.14412,,,5.130631,1.195779,2.722099,3.888712,,7.426751,
2,3,1,3,10,Saturday,23,0.01,247.0,227.0,0.5,...,1.932469,,,5.136395,1.409658,3.11097,3.888712,,7.683732,
3,4,1,4,10,Sunday,0,0.01,219.0,218.0,0.2,...,2.088907,,,5.217102,1.477711,2.041574,3.208188,,4.831243,
4,5,1,5,10,Sunday,1,0.01,2.0,216.0,0.2,...,2.604232,,,5.217102,1.458267,2.138792,3.499841,,4.625658,


In [7]:
# list every column label; the helper functions above address columns by integer position
dataset.columns

Index(['rowID', 'chunkID', 'position_within_chunk', 'month_most_common',
       'weekday', 'hour', 'Solar.radiation_64', 'WindDirection..Resultant_1',
       'WindDirection..Resultant_1018', 'WindSpeed..Resultant_1',
       'WindSpeed..Resultant_1018', 'Ambient.Max.Temperature_14',
       'Ambient.Max.Temperature_22', 'Ambient.Max.Temperature_50',
       'Ambient.Max.Temperature_52', 'Ambient.Max.Temperature_57',
       'Ambient.Max.Temperature_76', 'Ambient.Max.Temperature_2001',
       'Ambient.Max.Temperature_3301', 'Ambient.Max.Temperature_6005',
       'Ambient.Min.Temperature_14', 'Ambient.Min.Temperature_22',
       'Ambient.Min.Temperature_50', 'Ambient.Min.Temperature_52',
       'Ambient.Min.Temperature_57', 'Ambient.Min.Temperature_76',
       'Ambient.Min.Temperature_2001', 'Ambient.Min.Temperature_3301',
       'Ambient.Min.Temperature_6005', 'Sample.Baro.Pressure_14',
       'Sample.Baro.Pressure_22', 'Sample.Baro.Pressure_50',
       'Sample.Baro.Pressure_52', 'Sample.Ba