In [1]:
# import utility libraries
from netCDF4 import Dataset
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
%matplotlib inline

# import machine learning tools
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential, Graph
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Convolution2D, ZeroPadding2D, MaxPooling2D
from keras import backend as K

# import utilities and classes I wrote
from clustering import Location_Clusterer

Using Theano backend.


In [2]:
%%writefile nn_input.py
# import utility libraries
from netCDF4 import Dataset
import pandas as pd
import numpy as np
from collections import OrderedDict

# import machine learning tools
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, ZeroPadding2D, MaxPooling2D
from keras import backend as K

# import utilities and classes I wrote
from clustering import Location_Clusterer

class NN_Input(object):
    """
    Stores the input data ready for feeding into a keras neural network.

    Labels and features are read from netCDF files and kept in memory.
    Individual training samples are then cut out of them with
    ``get_features`` (one sample) or ``select`` (a random batch).

    To-Do:
    - add function to take the clustering data in some ways
    - change the output of "select" to fit the graph model of keras
    - add function to return the actual lat, lon, and time based on indices

    """
    def __init__(self, predict=2, history=2, box=5):
        """
        Initialize a class for storing neural network input data.

        Parameters
        ----------
        predict: int, number of time points ahead that the model will predict.
                 For example, if predict=2, the model will predict 2 time points away from the given time.
        history: int, number of past time points included as input in addition to the current one.
                 For example, if history=2, the model will receive 3 time points worth of data (current
                 time point, the previous time point, and the time point before that).
        box: int, number of grid cells taken on each side of the target cell when cutting out a map
             window; each window is (2*box + 1) cells wide in both latitude and longitude.
        """
        self.lons = None
        self.lats = None
        self.times = None

        self.labels = None
        self.features = {}
        self.feature_types = {}
        self.variables = []

        self.predict = predict
        self.history = history
        self.box = box

    def load_labels(self, f_path, var):
        """
        Load labels (and the lon/lat/time axes) from a netCDF file.

        The first ``history + predict`` time steps of the label variable are dropped so that
        label index ``i`` lines up with sample index ``i`` used by ``get_features``.

        Parameters
        ----------
        f_path: string, path to the netCDF file holding the labels.
        var: string, name of the label variable inside the netCDF file.
        """
        nc = Dataset(f_path, 'r')
        try:
            self.lons = nc.variables['lon'][:]
            self.lats = nc.variables['lat'][:]

            times = nc.variables['time'][:]
            # Explicit end index: a bare ``-self.predict`` would select nothing when predict == 0.
            stop = max(len(times) - self.predict, 0)
            self.times = times[self.history:stop]

            n = self.predict + self.history
            self.labels = nc.variables[var][n:, :, :]
        finally:
            # The slices above are already in memory, so the file handle can be released.
            nc.close()

    def load_features(self, f_path, var, name, feature_type):
        """
        Load feature values from netCDF files. Stores feature type information.

        Loading the same ``name`` again replaces the stored data instead of registering
        the feature a second time.

        Parameters
        ----------
        f_path: string, path to input netCDF file.
        var: string, variable name as appeared in the netCDF file.
        name: string, name of the variable to be stored.
        feature_type: string, must be one of the following: 'history_time_series', 'forecast_time_series',
        'multi_layers', 'single_layer'
        """
        nc = Dataset(f_path, 'r')
        try:
            temp_data = nc.variables[var][:]
        finally:
            nc.close()

        # Remember how this feature has to be sliced when samples are built.
        self.feature_types[name] = feature_type
        if name not in self.variables:
            self.variables.append(name)

        if feature_type == 'history_time_series':
            # Drop the trailing ``predict`` steps (explicit stop so predict == 0 keeps everything).
            stop = max(temp_data.shape[0] - self.predict, 0)
            self.features[name] = temp_data[:stop, :, :]
        elif feature_type == 'forecast_time_series':
            # Drop the leading ``history`` steps so index i starts at the "current" time point.
            self.features[name] = temp_data[self.history:, :, :]
        else:
            self.features[name] = temp_data

    def get_features(self, i, j, k):
        """
        Given indices for time point, latitude, and longitude, returns the associated feature data.

        Parameters
        ----------
        i: int, index of the time point desired.
        j: int, index of the latitude desired.
        k: int, index of the longitude desired.

        Returns
        -------
        None if any time-series window is incomplete (it would reach outside the stored
        time range or grid) or contains masked values. Otherwise a two-item list
        ``[maps, lst]``: ``maps`` is every time-series window stacked along axis 0 and
        ``lst`` is the remaining per-cell values appended into one flat array. Either item
        is None when no feature of that kind is loaded.
        """
        maps = None
        lst = None
        side = 2 * self.box + 1
        lat_win = slice(j - self.box, j + self.box + 1)
        lon_win = slice(k - self.box, k + self.box + 1)

        for feat in self.variables:
            kind = self.feature_types[feat]
            data = self.features[feat]
            expected = None
            if kind == 'history_time_series':
                temp_data = data[i:i + self.history + 1, lat_win, lon_win]
                expected = (self.history + 1, side, side)
            elif kind == 'forecast_time_series':
                temp_data = data[i:i + self.predict + 1, lat_win, lon_win]
                expected = (self.predict + 1, side, side)
            elif kind == 'multi_layers':
                temp_data = data[:, j, k].flatten()
            else:
                temp_data = data[j, k]

            if len(temp_data.shape) == 3:
                # A window cut short by the edge of the data would make samples ragged.
                if expected is not None and temp_data.shape != expected:
                    return None
                # getmaskarray also accepts plain ndarrays, which have no ``.mask`` attribute.
                if np.any(np.ma.getmaskarray(temp_data)):
                    return None
                elif maps is None:
                    maps = temp_data
                else:
                    maps = np.ma.concatenate((maps, temp_data), axis=0)
            else:
                if lst is None:
                    lst = temp_data
                else:
                    lst = np.append(lst, temp_data)
        return [maps, lst]

    def select(self, n, cutoff=None):
        """
        Randomly draw ``n`` valid samples.

        Time indices are drawn from ``[0, cutoff)``; latitude and longitude indices are drawn
        at least ``box`` cells away from the grid edge. Draws for which ``get_features``
        returns None, or whose label is masked, are discarded and redrawn.

        Parameters
        ----------
        n: int, number of samples to return.
        cutoff: int, exclusive upper bound for the time index. Defaults to half of the
                available time points.

        Returns
        -------
        Tuple of four numpy arrays: indices (rows of [i, j, k]), labels, maps, and lst.
        """
        if cutoff is None:
            # Floor division keeps this an int under both Python 2 and Python 3.
            cutoff = len(self.times) // 2

        indices, labels, output_maps, output_lst = [], [], [], []

        while len(labels) < n:
            i = np.random.randint(cutoff)
            j = np.random.randint(self.box, len(self.lats) - self.box)
            k = np.random.randint(self.box, len(self.lons) - self.box)
            features = self.get_features(i, j, k)
            if features is None:
                continue
            label = self.labels[i, j, k]
            # A masked label carries no target value; it would turn into NaN in the output array.
            if np.ma.is_masked(label):
                continue
            indices.append([i, j, k])
            labels.append(label)
            output_maps.append(features[0])
            output_lst.append(features[1])
        return np.array(indices), np.array(labels), np.array(output_maps), np.array(output_lst)

    def _check_mask(self, i, j, k):
        """Placeholder; not implemented yet and not called anywhere in this class."""
        pass

Overwriting nn_input.py


In [3]:
from nn_input import NN_Input

# Preparing a graph model neural network input

# Directory that holds every netCDF input file.
folder = '/home/ubuntu/dataset/'

# Input container: 2 steps of history, predicting 2 steps ahead, 20 cells of margin per side.
nn = NN_Input(predict=2, history=2, box=20)
nn.load_labels(folder + 'sign.label.nc', 'Band1')

# One row per feature: (file name, netCDF variable, stored name, feature type).
feature_specs = [
    ('all.ndvi.nc',         'Band1', 'ndvi',       'history_time_series'),
    ('all.max.of.Wind.nc',  'Wind',  'max_wind',   'forecast_time_series'),
    ('all.min.of.Tmin.nc',  'Tmin',  'min_tmin',   'forecast_time_series'),
    ('all.mean.of.Tmin.nc', 'Tmin',  'mean_tmin',  'forecast_time_series'),
    ('all.sum.of.Prec.nc',  'Prec',  'total_prec', 'forecast_time_series'),
    ('all.max.of.Tmax.nc',  'Tmax',  'max_tmax',   'forecast_time_series'),
    ('all.mean.of.Tmax.nc', 'Tmax',  'mean_tmax',  'forecast_time_series'),
    ('elev.nc',             'elev',  'elev',       'single_layer'),
    ('veg.nc',              'Cv',    'veg',        'multi_layers'),
]

# Column-wise views of the table, kept under their original names.
f_paths, variables, names, feature_types = [list(column) for column in zip(*feature_specs)]

# Features are registered in table order, which fixes their order in each sample.
for f_path, v, n, feature_type in feature_specs:
    nn.load_features(folder + f_path, v, n, feature_type)

In [52]:
# NN_Input.select draws its indices from numpy's global RNG; seeding it here makes the
# training sample (and anything fitted on it) repeatable across kernel restarts.
np.random.seed(42)

# Draw 1000 random samples: index triples, labels, stacked map windows, and flat per-cell features.
train_id, train_y, train_X_map, train_X_lst = nn.select(1000)



In [53]:
# Sanity check on one sample: shape of its map stack and of its flat feature vector,
# plus the container type returned by select().
print(train_X_map[0].shape)
print(train_X_lst[0].shape)
print(type(train_X_map))
# print(np.array(train_y))

(21, 41, 41)
(12,)
<type 'numpy.ndarray'>


In [54]:
# Recode -1 labels as 0 so the targets lie inside the (0, 1) range of the sigmoid output below.
# (In place on purpose: the training cell feeds `train_y` to the model.)
train_y[train_y == -1] = 0


# graph model with two inputs and one output
model = Graph()

# Shape of a single 2-D map layer (kept for reference; not used by the model below).
map_dimensions = train_X_map[0][0].shape

# Per-sample input shapes, read from the data instead of being hard-coded.
map_shape = train_X_map[0].shape
lst_shape = train_X_lst[0].shape

# two types of inputs: maps in 3D matrix and a list
model.add_input(name='maps', input_shape=map_shape)
model.add_input(name='lst', input_shape=lst_shape)

# adding layers to process the maps
model.add_node(Convolution2D(32, 3, 3, activation='relu', border_mode='same', input_shape=map_shape,
                             dim_ordering='th'),
               name='map_conv1', input='maps')
model.add_node(Convolution2D(16, 3, 3, activation='relu', border_mode='same', dim_ordering='th'),
               name='map_conv2', input='map_conv1')
model.add_node(Flatten(), name='map_flatten', input='map_conv2')
model.add_node(Dense(32), name='map_dense1', input='map_flatten')

# adding layers to process the lst
model.add_node(Dense(16), name='lst_dense1', input='lst')
model.add_node(Dense(8), name='lst_dense2', input='lst_dense1')

# merging the two branches
# Merge the *last* layer of each branch (32 + 8 = 40 units). The previous wiring merged
# 'map_flatten' and 'lst_dense1', which left 'map_dense1' and 'lst_dense2' unconnected.
model.add_node(Dense(40, activation='relu'), name='combine', inputs=['map_dense1', 'lst_dense2'],
               merge_mode='concat')
# Sigmoid, not softmax: softmax over a single unit always outputs 1.0, so nothing could be learned.
model.add_node(Dense(1, activation='sigmoid'), name='reduce', input='combine')
model.add_output(name='output', input='reduce')

model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])


In [55]:
# Show the shapes of everything handed to the model: map stacks, flat features, targets.
for training_array in (train_X_map, train_X_lst, train_y):
    print(training_array.shape)

# The Graph model takes inputs and targets in one dict keyed by node name.
graph_data = {'maps': train_X_map, 'lst': train_X_lst, 'output': train_y}
model.fit(graph_data, nb_epoch=3, batch_size=10, verbose=True)

(1000, 21, 41, 41)
(1000, 12)
(1000,)
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8b5dd73910>