In [1]:
# import utility libraries
from netCDF4 import Dataset
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
%matplotlib inline

# import machine learning tools
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, ZeroPadding2D, MaxPooling2D
from keras import backend as K

# import utilities and classes I wrote
from clustering import Location_Clusterer

Using Theano backend.


In [2]:
%%writefile nn_input.py
# import utility libraries
from netCDF4 import Dataset
import pandas as pd
import numpy as np
from collections import OrderedDict

# import machine learning tools
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, ZeroPadding2D, MaxPooling2D
from keras import backend as K

# import utilities and classes I wrote
from clustering import Location_Clusterer

class NN_Input(object):
    """
    Stores the input data ready for feeding into a keras neural network. 

    To-Do:
    - add function to take the clustering data in some ways
    - add function to return the actual lat, lon, and time based on indices
    
    """
    def __init__(self, predict=2, history=2, box=5):
        """
        Initialize a class for storing neural network input data. 
        
        Parameters
        ----------
        predict: int, number of time points ahead that the model will predict. 
                 For example, if predict=2, the model will predict 2 time points away from the given time. 
        history: int, number of time points for which data would be included as input.
                 For example, if data_length=3, the model will receive 3 time points worth of data (current time
                 point, the previous time point, and the timep point before that).
        """
        self.lons = None
        self.lats = None
        self.times = None
        
        self.labels = None
        self.features = {}
        self.feature_types = {}
        self.variables = []
        
        self.predict = predict
        self.history = history
        self.box = box
        
    def load_labels(self, f_path, var):
        """
        Load labels from netCDF file. 
        
        Parameters
        ----------
        f_path: string
        var: string
        """
        nc = Dataset(f_path, 'r')
        self.lons = nc.variables['lon'][:]
        self.lats = nc.variables['lat'][:]
        
        self.times = nc.variables['time'][self.history:-self.predict]
        n = self.predict + self.history
        self.labels = nc.variables[var][n:,:,:]
        
    def load_features(self, f_path, var, name, feature_type):
        """
        Load feature values from netCDF files. Stores feature type information. 
        
        Parameters
        ----------
        f_path: string, path to input netCDF file.
        var: string, variable name as appeared in the netCDF file. 
        name: string, name of the variable to be stored. 
        feature_type: string, must be one of the following: 'history_time_series', 'forecast_time_series', 
        'multi_layers', 'single_layer'
        """
        nc = Dataset(f_path, 'r')
        temp_data = nc.variables[var][:]
        
        # Storing information on whether the input features 
        self.feature_types[name] = feature_type
        self.variables.append(name)
        
        if self.feature_types[name] == 'history_time_series':
            self.features[name] = temp_data[:-self.predict, :, :]
        elif self.feature_types[name] == 'forecast_time_series':
            self.features[name] = temp_data[self.history:, :, :]
        else:
            self.features[name] = temp_data
        
    
    def get_features(self, i, j, k):
        """
        Given indices for latitude, longitude, and time point, returns the associated data from self.data. 
        
        Parameters
        ----------
        lat: int, index for the latitude desired. Must be within the range available in self.data. 
        lon: int, index for the longitude desired. Must be within the range available in self.data. 
        time: int, index for the time point desired. Must be within the range available in self.data. 
        """
        output = []
        masks = []
        for feat in self.variables:
            if self.feature_types[feat] == 'history_time_series':
                output.append(self.features[feat][i:i+self.history, j-5:j+5, k-5:k+5])
                masks.append(self.features[feat].mask[i:i+self.history, j-5:j+5, k-5:k+5])
            elif self.feature_types[feat] == 'forecast_time_series':
                output.append(self.features[feat][i:i+self.predict, j-5:j+5, k-5:k+5])
                masks.append(self.features[feat].mask[i:i+self.predict, j-5:j+5, k-5:k+5])
            elif self.feature_types[feat] == 'multi_layers':
                output.append(self.features[feat][:, j, k].flatten())
                masks.append(self.features[feat].mask[:, j, k].flatten())
            else: 
                output.append(self.features[feat][j, k])
                masks.append(self.features[feat].mask[j, k])
                
        return output, masks
        
    def select(self, n, cutoff=None):
        if cutoff is None:
            cutoff = len(self.times)/2
            
        output = []
        while len(output) < n:
            i = np.random.randint(cutoff)
            j = np.random.randint(self.box, len(self.lats)-self.box)
            k = np.random.randint(self.box, len(self.lons)-self.box)
            features, masks = self.get_features(i, j, k)
            if not np.any(masks):
                indices = (i, j, k)
                label = self.labels[i, j, k]
                output.append([indices, label, features])
        return output

    def _check_mask(self, i, j, k):
        pass

Overwriting nn_input.py


In [3]:
from nn_input import NN_Input

folder = '/home/ubuntu/dataset/'

nn = NN_Input(predict=2, history=3)
nn.load_labels(folder+'sign.label.nc', 'Band1')

f_paths = ['all.ndvi.nc','all.max.of.Wind.nc', 'all.min.of.Tmin.nc', 'all.mean.of.Tmin.nc', 'all.sum.of.Prec.nc',
           'all.max.of.Tmax.nc', 'all.mean.of.Tmax.nc','elev.nc', 'veg.nc']
variables = ['Band1', 'Wind', 'Tmin', 'Tmin', 'Prec', 'Tmax', 'Tmax', 'elev', 'Cv']
names = ['ndvi', 'max_wind', 'min_tmin', 'mean_tmin', 'total_prec', 'max_tmax', 'mean_tmax', 'elev', 'veg']
feature_types = ['history_time_series', 'forecast_time_series', 'forecast_time_series', 'forecast_time_series',
                 'forecast_time_series', 'forecast_time_series', 'forecast_time_series',
                'single_layer', 'multi_layers']

for f_path, v, n, feature_type in zip(f_paths, variables, names, feature_types):
    nn.load_features(folder+f_path, v, n, feature_type)

In [4]:
nn.select(1)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
data_folder = '/Users/Chiao/google-drive/projects/Galvanize/fall-foliage-finder/data/'

# Getting my labels
nc = Dataset(data_folder+'sign.label.nc', 'r')
labels = nc.variables['Band1'][4:][np.newaxis]
lons = nc.variables['lon'][:]
lats = nc.variables['lat'][:]
times = nc.variables['time'][:]

nc = Dataset(data_folder+'all.ndvi.nc')
ndvis = nc.variables['Band1'][:]

ndvis = np.ma.concatenate((ndvis[2:-2][np.newaxis], ndvis[1:-3][np.newaxis], ndvis[0:-4][np.newaxis]), axis=0)
mask = ndvis.mask[:]

ndvis = np.lib.pad(ndvis, ((0, 0), (0, 0), (5, 5), (5, 5)), 'constant', constant_values=(-3000))

nobs = len(times)*len(lats)*len(lons)

t0 = datetime.now()
flatten_data = np.zeros((nobs, 3, 10,10))
n = 0
for i, t in enumerate(times):
    for j, lat in enumerate(lats):
        for k, lon in enumerate(lons):
            flatten_data[n, :] = np.squeeze(ndvis[:, i, j:j+10, k:k+10])
            n += 1
print 'that took:', datetime.now()-t0, 'seconds'
            


In [3]:



f_paths = ['all.ndvi.nc', 'sign.label.nc', 'elev.nc']
names = ['ndvi', 'label', 'elev']


# util function to open, resize and format pictures into appropriate tensors
def preprocess_image(image_path):
    img = imresize(imread(image_path), (img_width, img_height))
    img = img.transpose((2, 0, 1)).astype('float64')
    img = np.expand_dims(img, axis=0)
    return img

# util function to convert a tensor into a valid image
def deprocess_image(x):
    x = x.transpose((1, 2, 0))
    x = np.clip(x, 0, 255).astype('uint8')
    return x

(437, 614, 927)
(437, 614, 927)


In [None]:
# # subsetting the first 100 time points to work with
# # 100 time points is around 2 years of data
# n = 100

# # flatten the 3D array into a 1D column
# coords = np.meshgrid(times[:n], lats[10:-10], lons[10:-10], indexing='ij')
# nobs = n*len(lats[10:-10])*len(lons[10:-10])
# flatten_labels = np.zeros((nobs, len(coords)+2))

# for i in xrange(len(coords)):
#     flatten_labels[:,i] = coords[i].flatten()
    
# # get the label from 2 timestamps away 
# flatten_labels[:,-2] = labels[2:n+2, 10:-10, 10:-10].flatten()

# # stack the mask together from current time point and the future time point
# # get the composite
# current_mask = labels.mask[:n, 10:-10, 10:-10].flatten()
# future_mask = labels.mask[2:n+2, 10:-10, 10:-10].flatten()
# flatten_labels[:,-1] = np.any(np.vstack((current_mask, future_mask)), axis=0)

In [None]:
# base_image = data['ndvi'][:20]
# result_prefix = 'result'

# # dimensions of the generated picture.
# img_width, img_height = data['label'].shape[-2:]



# # build the VGG16 network
# model = Sequential()

# model.add(ZeroPadding2D((1, 1), batch_input_shape=(1, 3, img_width, img_height)))

# model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_1'))
# model.add(ZeroPadding2D((1, 1)))
# model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_2'))
# model.add(MaxPooling2D((2, 2), strides=(2, 2)))

# model.add(ZeroPadding2D((1, 1)))
# model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_1'))
# model.add(ZeroPadding2D((1, 1)))
# model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_2'))
# model.add(MaxPooling2D((2, 2), strides=(2, 2)))


# # get the symbolic outputs of each "key" layer (we gave them unique names).
# layer_dict = dict([(layer.name, layer) for layer in model.layers])

# # continuity loss util function
# def continuity_loss(x):
#     assert K.ndim(x) == 4
#     a = K.square(x[:, :, :img_width-1, :img_height-1] - x[:, :, 1:, :img_height-1])
#     b = K.square(x[:, :, :img_width-1, :img_height-1] - x[:, :, :img_width-1, 1:])
#     return K.sum(K.pow(a + b, 1.25))

# # define the loss
# loss = K.variable(0.)
# for layer_name in settings['features']:
#     # add the L2 norm of the features of a layer to the loss
#     assert layer_name in layer_dict.keys(), 'Layer ' + layer_name + ' not found in model.'
#     coeff = settings['features'][layer_name]
#     x = layer_dict[layer_name].output
#     shape = layer_dict[layer_name].output_shape
#     # we avoid border artifacts by only involving non-border pixels in the loss
#     loss -= coeff * K.sum(K.square(x[:, :, 2: shape[2]-2, 2: shape[3]-2])) / np.prod(shape[1:])

# # add continuity loss (gives image local coherence, can result in an artful blur)
# loss += settings['continuity'] * continuity_loss(dream) / (3 * img_width * img_height)
# # add image L2 norm to loss (prevents pixels from taking very high values, makes image darker)
# loss += settings['dream_l2'] * K.sum(K.square(dream)) / (3 * img_width * img_height)

# # feel free to further modify the loss as you see fit, to achieve new effects...

# # compute the gradients of the dream wrt the loss
# grads = K.gradients(loss, dream)

# outputs = [loss]
# if type(grads) in {list, tuple}:
#     outputs += grads
# else:
#     outputs.append(grads)

# f_outputs = K.function([dream], outputs)
# def eval_loss_and_grads(x):
#     x = x.reshape((1, 3, img_width, img_height))
#     outs = f_outputs([x])
#     loss_value = outs[0]
#     if len(outs[1:]) == 1:
#         grad_values = outs[1].flatten().astype('float64')
#     else:
#         grad_values = np.array(outs[1:]).flatten().astype('float64')
#     return loss_value, grad_values

# # this Evaluator class makes it possible
# # to compute loss and gradients in one pass
# # while retrieving them via two separate functions,
# # "loss" and "grads". This is done because scipy.optimize
# # requires separate functions for loss and gradients,
# # but computing them separately would be inefficient.
# class Evaluator(object):
#     def __init__(self):
#         self.loss_value = None
#         self.grads_values = None

#     def loss(self, x):
#         assert self.loss_value is None
#         loss_value, grad_values = eval_loss_and_grads(x)
#         self.loss_value = loss_value
#         self.grad_values = grad_values
#         return self.loss_value

#     def grads(self, x):
#         assert self.loss_value is not None
#         grad_values = np.copy(self.grad_values)
#         self.loss_value = None
#         self.grad_values = None
#         return grad_values

# evaluator = Evaluator()

# # run scipy-based optimization (L-BFGS) over the pixels of the generated image
# # so as to minimize the loss
# x = preprocess_image(base_image_path)
# for i in range(5):
#     print('Start of iteration', i)
#     start_time = time.time()

#     # add a random jitter to the initial image. This will be reverted at decoding time
#     random_jitter = (settings['jitter'] * 2) * (np.random.random((3, img_width, img_height)) - 0.5)
#     x += random_jitter

#     # run L-BFGS for 7 steps
#     x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(),
#                                      fprime=evaluator.grads, maxfun=7)
#     print('Current loss value:', min_val)
#     # decode the dream and save it
#     x = x.reshape((3, img_width, img_height))
#     x -= random_jitter
#     img = deprocess_image(x)
#     fname = result_prefix + '_at_iteration_%d.png' % i
#     imsave(fname, img)
#     end_time = time.time()
#     print('Image saved as', fname)
#     print('Iteration %d completed in %ds' % (i, end_time - start_time))