In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
import csv
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
def load_data(data_path,data_file,file_type="csv",unchange_indices=None,indirect_indices=None,
                direct_indices=None,id_ind=0,target_ind=-1,val_prop=0.10,test_prop=0.10,
                opt_params=None,save_file=""):

    """
        Load a dataset, split it sequentially into train/validation/test
        partitions and min-max normalize the features with statistics taken
        from the training partition only.

        data_path: Path prefix of the data file. It is concatenated directly
                   with the file names (no separator is inserted), so it must
                   be empty or end with a path separator. The optional output
                   file is written to this location as well.

        data_file: File containing the data to be loaded.

        file_type: The type of file, either 'csv' or 'pkl'.

                   (1) 'csv' assumes the following:

                         a. The first line of the file is a header.
                         b. ALL VARIABLES ARE NUMERIC (including identifiers
                            and target).

                   (2) 'pkl' is assumed to be a file previously written by
                       this function (via save_file); its content is returned
                       unchanged and every other argument is ignored.

        unchange_indices: Column positions (in the csv file) of the
                          unchangeable features. None means no such features.

        indirect_indices: Column positions (in the csv file) of the indirectly
                          changeable features. None means no such features.

        direct_indices: Column positions (in the csv file) of the directly
                        changeable features. None means no such features.

        id_ind: Column position (in the csv file) of the instance identifier.

        target_ind: Column position (in the csv file) of the target variable.

        val_prop: Proportion of data to be used for the validation set.

        test_prop: Proportion of data to be used for the test set.

        opt_params: Arbitrary dictionary stored untouched in the result under
                    'opt_params'. None means an empty dictionary.

        save_file: If non-empty, the result dictionary is pickled to
                   data_path+save_file.

        Returns a dictionary with the keys:

            'train', 'val', 'test': dicts holding "X" (normalized features),
                                    "target" and "ids" of each partition.
            'train_indices', 'val_indices', 'test_indices': row positions of
                                    each partition. Rows are taken in file
                                    order: train first, then val, then test.
            'min_X', 'max_X': per-feature minimum / maximum of the training
                              features, used for the normalization.
            'opt_params': the opt_params argument.
            'xU_ind', 'xI_ind', 'xD_ind': positions of the unchangeable,
                              indirectly changeable and directly changeable
                              features inside "X" (i.e. after the id and
                              target columns have been removed).

        Raises Exception for an unsupported file_type and ValueError when the
        proportions leave no rows for the training partition.

    """

    if file_type == "pkl":
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # files that were produced by this function from a trusted source.
        with open(data_path+data_file,'rb') as rF:
            return pkl.load(rF)

    elif file_type == "csv":
        sep=","
    else:
        raise Exception("Unsupoorted file type {}. Support file types are 'csv' and 'pkl'.".format(file_type))

    # Fresh containers per call instead of shared mutable default arguments.
    unchange_indices = [] if unchange_indices is None else unchange_indices
    indirect_indices = [] if indirect_indices is None else indirect_indices
    direct_indices = [] if direct_indices is None else direct_indices
    opt_params = {} if opt_params is None else opt_params

    dset_df = pd.read_csv(data_path+data_file,sep=sep)

    header = dset_df.columns

    # Resolve the file-level column positions to column names first, because
    # the positions shift once the id and target columns are dropped.
    id_col_name = header[id_ind]
    target_col_name = header[target_ind]
    indirect_col_names = header[indirect_indices]
    direct_col_names = header[direct_indices]
    unchange_col_names = header[unchange_indices]

    dset_ids = dset_df[id_col_name].values
    dset_targets = dset_df[target_col_name].values
    X_data = dset_df.drop([id_col_name, target_col_name],axis=1)

    # Re-express the feature groups as positions inside the feature matrix.
    unchange_indices = [X_data.columns.get_loc(c) for c in unchange_col_names]
    indirect_indices = [X_data.columns.get_loc(c) for c in indirect_col_names]
    direct_indices = [X_data.columns.get_loc(c) for c in direct_col_names]

    X_data = X_data.values

    #Define train, val, test indices according to test_prop, val_prop
    n = dset_ids.shape[0]
    n_train = int(n*(1 - test_prop - val_prop))
    n_val = int(n*val_prop)
    n_test = int(n*test_prop)
    test_start = int(n*(1 - test_prop))

    if n_train <= 0:
        raise ValueError("Training partition is empty: {} rows with val_prop={} and "
                         "test_prop={}.".format(n, val_prop, test_prop))

    train_indices = list(range(n_train))
    val_indices = list(range(n_train, n_train + n_val))
    test_indices = list(range(test_start, test_start + n_test))

    #Obtain normalization values from the training partition only
    train_X = X_data[train_indices]
    min_X = np.amin(train_X,axis=0)
    max_X = np.amax(train_X,axis=0)

    # A feature that is constant in the training data has a zero range;
    # dividing by it would give NaN/inf. Use a divisor of 1 for such features
    # so their training values normalize to 0.
    range_X = max_X - min_X
    safe_range_X = np.where(range_X == 0, 1, range_X)

    def _partition(indices):
        #Slice one partition and normalize it with the training statistics
        return {"X":np.divide(X_data[indices] - min_X,safe_range_X),
                "target":dset_targets[indices],
                "ids":dset_ids[indices]}

    return_dict = {'train':_partition(train_indices),
                   'val':_partition(val_indices),
                   'test':_partition(test_indices),
                   'train_indices':train_indices,
                   'val_indices':val_indices,
                   'test_indices':test_indices,
                   'min_X':min_X,
                   'max_X':max_X,
                   'opt_params':opt_params,
                   'xU_ind':unchange_indices,
                   'xI_ind':indirect_indices,
                   'xD_ind':direct_indices
                   }

    #If a save file is defined, write the defined data out.
    if save_file != "":
        with open(data_path+save_file,'wb') as sF:
            pkl.dump(return_dict,sF)

    return return_dict

In [3]:
def load_indices(data_path,util_file):
    """
        Parse the index-designation file of a dataset.

        data_path: Path prefix of the data files. It is concatenated directly
                   with util_file (no separator is inserted).

        util_file: Name of the file containing the index designations, cost
                   parameters, and direction of change parameters. The file
                   has no header and every row should be of the form:

                        index, designation, cost increase, cost decrease, direction

                        e.g.:

                        0,id,,,
                        1,dir,0,2,-1
                        2,dir,3,0,1
                        3,dir,4,3,0
                        4,unch,,,
                        5,ind,,,
                         ...
                        p,target,,,

                   Recognized designations are 'id', 'target', 'ind', 'unch'
                   and 'dir'. The three trailing fields are only read for
                   'dir' rows and must be integers there. Completely blank
                   lines are skipped.

        Returns the tuple

            (unch_indices, ind_indices, dir_indices,
             cost_inc, cost_dec, direct_chg, id_ind, target_ind)

        where the first six entries are lists in file order (the last three
        of them aligned with dir_indices) and id_ind / target_ind stay -1 when
        the file contains no 'id' / 'target' row.

        Raises ValueError when a row carries an unrecognized designation.

    """

    unch_indices = []
    ind_indices = []
    dir_indices = []
    cost_inc = []
    cost_dec = []
    direct_chg = []
    id_ind = -1
    target_ind = -1
    # The 'U' mode flag was removed in Python 3.11 and makes open() raise
    # there; newline='' is the mode the csv module asks for.
    with open(data_path+util_file,'r',newline='') as rF:
        fReader = csv.reader(rF,delimiter=',')
        for i, row in enumerate(fReader):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue

            designation = row[1].strip() if len(row) > 1 else ''

            if designation == 'id':
                id_ind = int(row[0])
            elif designation == 'target':
                target_ind = int(row[0])
            elif designation == 'ind':
                ind_indices.append(int(row[0]))
            elif designation == 'unch':
                unch_indices.append(int(row[0]))
            elif designation == 'dir':
                dir_indices.append(int(row[0]))
                cost_inc.append(int(row[2]))
                cost_dec.append(int(row[3]))
                direct_chg.append(int(row[4]))
            else:
                # Report the offending designation (not the index column).
                raise ValueError("Problem loading index file. Unrecognized designation "
                                 "'{}' found on row {}".format(designation,i+1))

    return unch_indices,ind_indices,dir_indices,cost_inc,cost_dec,direct_chg,id_ind,target_ind

In [4]:
# Parse the index-designation file: feature groups, change costs/directions,
# and the positions of the id and target columns.
(
    unch_indices, indir_indices, dir_indices,
    cost_inc, cost_dec, direct_chg,
    id_ind, target_ind,
) = load_indices('', 'brazil_indices.csv')

In [5]:
# Bundle the cost and direction lists of the directly changeable features;
# load_data() stores this dictionary unchanged in its result.
opt_params = dict(cost_inc=cost_inc, cost_dec=cost_dec, direct_chg=direct_chg)

In [6]:
# Load the csv, split it 80/10/10 into train/val/test and normalize it.
# An empty save_file means nothing is written to disk.
data_dict = load_data(
    data_path='',
    data_file='brazil_weather.csv',
    file_type='csv',
    unchange_indices=unch_indices,
    indirect_indices=indir_indices,
    direct_indices=dir_indices,
    id_ind=id_ind,
    target_ind=target_ind,
    val_prop=0.10,
    test_prop=0.10,
    opt_params=opt_params,
    save_file="",
)

In [7]:
# Training partition: a dict with the keys "X", "target" and "ids".
train_dat = data_dict['train']

In [8]:
# Adam optimizer with a learning rate of 0.001.
# NOTE(review): `opt` is not referenced by any later cell visible here.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

In [9]:
# Number of feature columns in the normalized training matrix.
# NOTE(review): `in_dim` is not referenced by any later cell visible here.
in_dim = train_dat['X'].shape[1]

In [10]:
# Sequence model: an LSTM that emits one 32-unit vector per time step,
# followed by a dense layer that maps each step to a single value.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.Dense(units=1),
])

In [11]:
class WindowGenerator():
    """
    Describes how a window of consecutive time steps is divided into an
    input part and a label part, and splits / plots batches of such windows.

    A window spans input_width + shift time steps. The inputs are the first
    input_width steps; the labels are the last label_width steps.

    All column indices held by this class (column_indices,
    label_column_indices, plot_col) are positions along the last axis of the
    feature arrays, i.e. the 'xU_ind' / 'xI_ind' / 'xD_ind' values of
    data_dict.
    """

    def __init__(self, input_width, label_width, shift, data_dict):
        """
        input_width: Number of time steps in the input part of a window.

        label_width: Number of time steps in the label part of a window.

        shift: Number of time steps between the end of the inputs and the end
               of the window.

        data_dict: Dictionary providing 'train', 'val' and 'test' entries
                   (each with an "X" array) and the index lists 'xU_ind',
                   'xI_ind' and 'xD_ind'. The 'xD_ind' columns are the label
                   columns.
        """
        self.train_dat = data_dict['train']['X']
        self.val_dat = data_dict['val']['X']
        self.test_dat = data_dict['test']['X']

        self.label_column_indices = data_dict['xD_ind']
        self.column_indices = data_dict['xU_ind'] + data_dict['xI_ind'] + data_dict['xD_ind']

        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.label_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.label_slice]

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column indices: {self.label_column_indices}'
        ])

    def split_window(self, features):
        """
        Split a batch of windows into inputs and labels.

        features: Array/tensor indexed as (batch, time, feature) whose time
                  axis holds total_window_size steps.

        Returns (inputs, labels): inputs keeps every feature column of the
        first input_width steps; labels keeps only the label columns (in the
        order of label_column_indices) of the last label_width steps.
        """
        inputs = features[:, self.input_slice, :]
        label_window = features[:, self.label_slice, :]
        # label_column_indices already are positions on the feature axis, so
        # they are used directly. Looking them up through column_indices (a
        # concatenation of index lists) is only correct when that list
        # happens to be 0..p-1 in order.
        labels = tf.stack(
            [label_window[:, :, col] for col in self.label_column_indices],
            axis=-1
        )

        # Slicing drops the static time dimension; restore it.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def plot(self, model=None, plot_col=17, max_subplots=3):
        """
        Plot one feature column for up to max_subplots example windows.

        The example batch must have been attached beforehand as
        `self.example = (inputs, labels)`, e.g. the result of split_window().

        model: Optional callable applied to the example inputs; its output is
               drawn as predictions at the label time steps.

        plot_col: Feature-axis index of the column to plot. Labels and
                  predictions are only drawn when it is one of the label
                  columns; otherwise only the inputs are plotted.

        max_subplots: Maximum number of example windows to draw.

        Raises AttributeError when no example has been attached and
        ValueError when plot_col is not a known column index.
        """
        example = getattr(self, 'example', None)
        if example is None:
            raise AttributeError(
                "WindowGenerator has no example batch; assign "
                "`example = (inputs, labels)` (e.g. from split_window) before calling plot()."
            )
        inputs, labels = example

        if plot_col not in self.column_indices:
            raise ValueError(f'{plot_col} is not a known column index')

        # Position of plot_col inside the stacked labels, if it is a label.
        if plot_col in self.label_column_indices:
            label_col_index = self.label_column_indices.index(plot_col)
        else:
            label_col_index = None

        plt.figure(figsize=(12,8))
        max_n = min(max_subplots, inputs.shape[0])

        # The predictions do not depend on the subplot, so compute them once.
        predictions = None
        if model is not None and label_col_index is not None:
            predictions = model(inputs)

        for n in range(max_n):
            plt.subplot(max_n, 1, n+1)
            plt.ylabel(f'Column {plot_col} [normed]')
            plt.plot(
                self.input_indices,
                inputs[n, :, plot_col],
                label='Inputs', marker='.', zorder=-10
            )

            if label_col_index is not None:
                plt.scatter(
                    self.label_indices,
                    labels[n, :, label_col_index],
                    edgecolors='k', label='Labels', c='#2ca02c', s=64
                )

                if predictions is not None:
                    plt.scatter(
                        self.label_indices,
                        predictions[n, :, label_col_index],
                        marker='X', edgecolors='k', label='Predictions',
                        c='#ff7f0e', s=64
                    )

            if n==0:
                plt.legend()

        plt.xlabel('time')

In [12]:
# 48-step window: 24 input steps (0-23) and a single label step at index 47.
w1 = WindowGenerator(input_width=24, label_width=1, shift=24, data_dict=data_dict)
w1

Total window size: 48
Input indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Label indices: [47]
Label column indices: [17, 18, 19, 20, 21, 22, 23, 24]

In [13]:
# 7-step window: 6 input steps (0-5) and a single label step at index 6.
w2 = WindowGenerator(input_width=6, label_width=1, shift=1, data_dict=data_dict)
w2

Total window size: 7
Input indices: [0 1 2 3 4 5]
Label indices: [6]
Label column indices: [17, 18, 19, 20, 21, 22, 23, 24]

In [14]:
# Stack three windows of the training features, starting at rows 0, 100 and
# 200, into one (batch, time, features) example batch.
window_starts = (0, 100, 200)
example_window = tf.stack([
    train_dat['X'][start:start + w2.total_window_size]
    for start in window_starts
])

In [15]:
# Split the example batch into its input part and its label part.
example_inputs, example_labels = w2.split_window(example_window)

# Report the shapes before and after the split.
print('All shapes are: (batch, time, features)')
print(f'Window shape: {example_window.shape}')
print(f'Inputs shape: {example_inputs.shape}')
print(f'Labels shape: {example_labels.shape}')

All shapes are: (batch, time, features)
Window shape: (3, 7, 25)
Inputs shape: (3, 6, 25)
Labels shape: (3, 1, 8)


In [16]:
# Size of the batch axis, i.e. the number of stacked example windows.
example_inputs.shape[0]

3

In [17]:
# Attach the example batch to w2; WindowGenerator.plot() reads it from
# `self.example`.
w2.example = example_inputs, example_labels

In [None]:
# Plot the example windows with plot()'s defaults (no model, plot_col=17).
w2.plot()

In [None]:
# Label values of the first label column for the second example window.
example_labels[1, :, 0]