In [37]:
from netCDF4 import Dataset
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
import subprocess
import os
%matplotlib inline

In [58]:
# Exploring all NVDI data that I have
data_folder = '/Users/Chiao/google-drive/projects/Galvanize/fall-foliage-finder/data/nvdi/nc/'
files = os.listdir(data_folder)

In [107]:
print 'Number of files =', len(files)
print 'Example file name:', files[0]
print 'Shape of all latitudes', lats.shape
print 'Shape of all longitudes', lons.shape
print 'Sahep of NVDI variable', var.shape

Number of files = 449
Example file name: 2002.01.01.mask.nc
Shape of all latitudes (614,)
Shape of all longitudes (927,)
Sahep of NVDI variable (614, 927)


In [81]:
nc = Dataset(data_folder+files[0], 'r')
lats = nc.variables['lat'][:]
lons = nc.variables['lon'][:]
nc.close()

data = np.zeros([len(lats), len(lons), len(files)])
for i, f in enumerate(files):
    nc = Dataset(data_folder+f, 'r')
    var = nc.variables['Band1'][:]
    data[:,:,i] = var
    nc.close()

In [82]:
data.shape

(614, 927, 449)

In [83]:
# Constructing a mask where True == somewhere in the time series there's a NaN value
is_nan = (data == -3000.)
mask = np.any(is_nan, axis=2)
mask.shape

(614, 927)

In [84]:
mask

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ..., 
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]], dtype=bool)

In [126]:
def plot_time_series(i, j, time_series):
    fig = plt.figure(figsize=(20,6))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(np.arange(len(time_series)), time_series)
    note = 'lat = {0}, lon = {1}'.format(lats[i], lons[j])
    ax.text(0.9, 2, note, fontsize=20)
    plt.show()

In [133]:
#plt.figure(figsize=(20,12))
# for i in np.arange(450, 460):
#     for j in np.arange(798, 800):
#         if not mask[i, j]:
#             time_series = data[i, j, :]
#             plot_time_series(i, j, time_series)
            

In [32]:
# Set default font size for matplotlib plots
mpl.rcParams.update({'font.size': 16})
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
mpl.rc('legend', fontsize=12)

In [195]:
def plot_nvdi(lats, lons, data)
    cmap = mpl.cm.get_cmap('BuGn')
    norm = mpl.colors.Normalize(0, 10000)
    area_thresh=25000
    land_color='grey'
    ocean_color='lightblue'

    fig = plt.figure(figsize=(20,12))

    ax = fig.add_subplot(1,1,1)
    m = Basemap(projection='cyl', llcrnrlat=lats.min(), llcrnrlon=lons.min(),
            urcrnrlat=lats.max(), urcrnrlon=lons.max(), resolution='h',
            area_thresh=area_thresh)
    xi, yi = m(lons, lats)
    xi, yi = np.meshgrid(xi, yi)
    m.pcolormesh(xi, yi, var, cmap=cmap, norm=norm)
    m.drawlsmask(land_color=land_color, ocean_color=ocean_color, lakes=True)
    m.drawcountries()
    m.drawcoastlines()


[-124.96875 -124.90615 -124.84355 -124.78095 -124.71835 -124.65575
 -124.59315 -124.53055 -124.46795 -124.40535 -124.34275 -124.28015
 -124.21755 -124.15495 -124.09235 -124.02975 -123.96715 -123.90455
 -123.84195 -123.77935 -123.71675 -123.65415 -123.59155 -123.52895
 -123.46635 -123.40375 -123.34115 -123.27855 -123.21595 -123.15335
 -123.09075 -123.02815 -122.96555 -122.90295 -122.84035 -122.77775
 -122.71515 -122.65255 -122.58995 -122.52735 -122.46475 -122.40215
 -122.33955 -122.27695 -122.21435 -122.15175 -122.08915 -122.02655
 -121.96395 -121.90135 -121.83875 -121.77615 -121.71355 -121.65095
 -121.58835 -121.52575 -121.46315 -121.40055 -121.33795 -121.27535
 -121.21275 -121.15015 -121.08755 -121.02495 -120.96235 -120.89975
 -120.83715 -120.77455 -120.71195 -120.64935 -120.58675 -120.52415
 -120.46155 -120.39895 -120.33635 -120.27375 -120.21115 -120.14855
 -120.08595 -120.02335 -119.96075 -119.89815 -119.83555 -119.77295
 -119.71035 -119.64775 -119.58515 -119.52255 -119.45995 -119.3

In [211]:
%%writefile clustering.py

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from netCDF4 import Dataset
import numpy as np
import pandas as pd

class Location_Clusterer(KMeans):
    def __init__(self):
        KMeans.__init__(self)
        self.source_files = []
        self.coords = None
        self.mask = None
        self.raw_data = None
        self.clean_data = None

    def read_data(self, file_name, var_name):
        '''
        Read data from netCDF input file (1 file 1 variable at a time)
        
        INPUT:
            file_name -> string, path to input file
            var_name -> string, name of variable as appeared in the netCDF file
        '''
        
        # Initialize the coordinate system and dimension if this is the first file read
        if len(self.source_files) == 0:
            nc = Dataset(file_name, 'r')
            self.source_files.append((file_name, var_name))
            
            lats = nc.variables['lat'][:]
            lons = nc.variables['lon'][:]
            self.coords = np.meshgrid(lons, lats)          
        
        # Reading the actual data
        var = nc.variables[var_name][:]
        # Check dimensions of the new input data against existing data
        if var.shape[-2:] != self.coords[0].shape:
            raise InputError('Dimensions of input data do not match existing data.')
        # Initialize self.raw_data if it is previously empty
        elif self.raw_data is None:
            self.raw_data = np.copy(var)
        # Adding data if there's already some data stored
        else:
            self.raw_data = np.append(self.raw_data, var, axis0)
        
        # Check the mask from this file and initialize or update current mask 
        if len(var.mask.shape) == 3:
            mask = var.mask[0]
        else:
            mask = var.mask
            
        if self.mask is None:
            self.mask = np.copy(mask)
        else:
            self.mask = np.any(np.append(self.mask, var.mask, axis=0), axis=0)
            
        
    def clean_data(self):
        pass
        
        
            
        

Overwriting clustering.py


In [217]:
from clustering import Location_Clusterer

#files = ['/Users/Chiao/google-drive/projects/Galvanize/fall-foliage-finder/data/veg.nc',
#        ]

lc = Location_Clusterer()
lc.read_data('/Users/Chiao/google-drive/projects/Galvanize/fall-foliage-finder/data/veg.nc', 'Cv')

print lc.raw_data.shape
print lc.mask.shape

(11, 614, 927)
(614, 927)


In [200]:
veg_types = pd.read_table('/Users/Chiao/google-drive/projects/Galvanize/fall-foliage-finder/data/LDAS_veg_lib')\
                                                                                                         ['COMMENT']


(11, 614, 927)
(614, 927)


In [170]:
df_cv = pd.DataFrame()
mask = Cv.mask[0]

for i, v_type in enumerate(veg_types):
    df_cv[v_type] = Cv[i].flatten()

In [171]:
df_cv = df_cv[df_cv < 1000000].dropna()

In [210]:
a = np.meshgrid(np.arange(10), np.arange(10))

[array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
        [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
        [6, 6, 6, 6, 6, 6, 6, 6, 6, 6],
        [7, 7, 7, 7, 7, 7, 7, 7, 7, 7],
        [8, 8, 8, 8, 8, 8, 8, 8, 8, 8],
        [9, 9, 9, 9, 9, 9, 9, 9, 9, 9]])]

In [187]:
len(df_cv.sum(axis=1) < 0.9)

332410