In [1]:
import gzip
from netCDF4 import Dataset
import numpy as np
import collections

In [2]:
from __future__ import print_function

In [3]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
% matplotlib inline

In [33]:
import copy
import os

General information

In [5]:
# Root folder holding one sub-folder per MRMS product; passed to MRMS_data as MRMS_Path.
MRMS_directory = '/home/htan/proj/TensorFlow/data/MRMS/'
# Date sub-folder name (YYYYMMDD/). NOTE(review): not referenced by any other cell visible here.
Data = '20160628/'
# Scratch folder that receives the uncompressed .nc files; passed to MRMS_data as Tempfile_Path.
temp_derectory = '/home/htan/proj/TensorFlow/notebook/temp/'


Information in each file folder

In [129]:
class MRMS_data:
    """Reader that turns gzipped MRMS NetCDF products into NumPy feature arrays.

    ``self.data`` is an ordered mapping with one entry per product family
    ('GaugeCoor', 'NLDN_LightningDensity', 'Reflectivity',
    'ReflectivityAtLowestAltitude', 'SeamlessHSR', 'VII').  Each entry holds

    * ``'info'`` -- folder name, temp file name and, once a time has been
      selected, the ``'date'`` and the list of ``'inputFileName'`` values;
    * ``'data'`` -- after reading, an ordered mapping from product name to a
      dict with the keys ``'var'``, ``'lat'`` and ``'lon'``.

    Typical use: ``readDataInTime(date, time)`` followed by ``getFeatures()``.
    """

    # (rows, columns) every product is reshaped to before stacking in getFeatures().
    _GRID_SHAPE = (3500, 7000)

    def __init__(self, MRMS_Path, Tempfile_Path):
        """Remember both directories and build the static product table.

        MRMS_Path     -- root folder containing one sub-folder per product.
        Tempfile_Path -- folder that receives the uncompressed ``.nc`` files.
        """
        # TODO: add check statement for two paths
        self.MRMS_Path = MRMS_Path
        self.Tempfile_Path = Tempfile_Path
        self.info_complete_flag = False   # True once file names have been selected
        self.data_complete_flag = False   # True once _read() finished successfully
        self._fillBasicInfo()

    def _fillBasicInfo(self):
        """Populate ``self.data`` with the folder / temp-file name of each product."""
        products = (
            ('GaugeCoor', 'MRMS_GaugeCoor', 'temp_gauge.nc'),
            ('NLDN_LightningDensity', 'MRMS_NLDN_LightningDensity', 'temp_NLDN.nc'),
            ('Reflectivity', 'MRMS_Reflectivity', 'temp_Reflectivity.nc'),
            ('ReflectivityAtLowestAltitude', 'MRMS_ReflectivityAtLowestAltitude',
             'temp_ReflectivityAtLowestAltitude.nc'),
            ('SeamlessHSR', 'MRMS_SeamlessHSR', 'temp_HSR.nc'),
            ('VII', 'MRMS_VII', 'temp_VII.nc'),
        )
        data = collections.OrderedDict()
        for key, folder_name, temp_name in products:
            data[key] = {'info': {'folderName': folder_name,
                                  'tempFileName': temp_name}}
        self.data = data

    def readDataInTime(self, date, time):
        '''Select, for every product, the file(s) closest to ``time`` and read them.

        date -- name of the date sub-folder (e.g. '20160628').
        time -- clock time as 'HHMMSS'.

        Nothing is returned; the loaded arrays are stored in ``self.data`` and
        can be stacked with ``getFeatures()``.
        '''
        for key in self.data:
            info = self.data[key]['info']
            info['date'] = date
            # os.path.join keeps this consistent with _read() whether or not
            # MRMS_Path ends with a path separator.
            folder = os.path.join(self.MRMS_Path, info['folderName'], str(date))
            info['inputFileName'] = sorted(_search_file(folder, time))
        self.info_complete_flag = True
        self._read()

    def readDataDuringTime(self, start, end):
        '''Stack the features of every time slice between ``start`` and ``end``.

        Returns an array of shape (time, lat, lon, features).

        Raises NotImplementedError until ``_duration`` is implemented.
        '''
        time_list = []
        for date, time in self._duration(start, end):
            self.readDataInTime(date, time)
            # getFeatures() returns (names, array); only the array is stacked.
            _, features = self.getFeatures()
            # Prepend a time axis instead of hard-coding the feature count.
            time_list.append(features[np.newaxis, ...])
        return np.vstack(time_list)

    def _duration(self, start, end):
        '''Yield the (date, time) pairs between ``start`` and ``end``.'''
        # TODO implementation duration fucntion
        raise NotImplementedError(
            'Enumerating (date, time) pairs between start and end is not implemented yet')

    def _read(self):
        '''Uncompress and load every selected file into ``self.data[key]['data']``.

        Raises ValueError when no file names have been selected yet.
        '''
        if not self.info_complete_flag:
            raise ValueError('The MRMS data infomation is note completed for reading')
        self.data_complete_flag = False
        for key in self.data:
            data_info = self.data[key]['info']
            loaded = collections.OrderedDict()
            self.data[key]['data'] = loaded
            date_text = str(data_info['date'])
            names = data_info['inputFileName']
            if isinstance(names, str):
                # A bare file name would otherwise be iterated character by character.
                names = [names]
            # Every file of one product family reuses the same temp file.
            nc_file_path = os.path.join(self.Tempfile_Path, data_info['tempFileName'])
            for name in names:
                gz_file_path = os.path.join(self.MRMS_Path, data_info['folderName'],
                                            date_text, name)
                self._uncompressData(gz_file_path, nc_file_path)
                # Product name = file name up to (excluding) the '_<date>' part.
                type_name = name.split(date_text)[0][:-1]
                loaded[type_name] = self._readNcFile(nc_file_path)
            print()
        self.data_complete_flag = True

    def searchDemo(self):
        '''The simple demo, do not use it in real application'''
        if self.info_complete_flag:
            return
        demo_files = (
            ('GaugeCoor', 'MRMS_GaugeCorr_QPE_01H_00.00_20160628-170000.nc.gz'),
            ('NLDN_LightningDensity', 'MRMS_NLDN_LightningDensity_015_min_20160628-170113.nc.gz'),
            ('Reflectivity', 'MRMS_Reflectivity_-5C_00.50_20160628-170040.nc.gz'),
            ('ReflectivityAtLowestAltitude',
             'MRMS_ReflectivityAtLowestAltitude_00.50_20160628-170040.nc.gz'),
            ('SeamlessHSR', 'MRMS_SeamlessHSR_00.00_20160628-170000.nc.gz'),
            ('VII', 'MRMS_VII_00.50_20160628-170040.nc.gz'),
        )
        for key in self.data:
            self.data[key]['info']['date'] = 20160628
        for key, file_name in demo_files:
            # _read() expects a list of file names per product.
            self.data[key]['info']['inputFileName'] = [file_name]
        self.info_complete_flag = True
        self._read()

    def getRawData(self):
        '''Return the nested ``self.data`` mapping itself (not a copy).'''
        return self.data

    def preprocess(self, features):
        '''Mask invalid values with NaN and standardize each feature channel.

        features -- float array of shape (time, lat, lon, features); it is
                    modified in place by the masking step.

        Returns ``(features - mean) / std`` where mean and std are computed
        per channel while ignoring NaN; both are stored on ``self.mean`` and
        ``self.std``.

        NOTE(review): the channel indices below match a 6-channel layout,
        whereas getFeatures() produced 13 channels in this notebook -- confirm
        the index/threshold mapping before relying on it.
        '''
        # TODO: add check function to check the shape
        # (channel index, threshold): values <= threshold are treated as missing.
        invalid_below = ((0, -2), (2, -999), (3, -999), (4, -999))
        for channel, threshold in invalid_below:
            band = features[:, :, :, channel]   # view: assignment edits `features`
            band[band <= threshold] = np.nan

        # standardize the dataset
        # TODO: check self.mean and self.std exsit or not. If exsit, directly use them to standardize the dataset
        self.mean = np.nanmean(features, axis=(0, 1, 2))
        self.std = np.nanstd(features, axis=(0, 1, 2))
        return (features - self.mean) / self.std

    def getFeatures(self):
        '''Stack every loaded product along a trailing feature axis.

        Returns ``(feature_name_list, array)`` where the array has shape
        (rows, columns, number of products) and the names follow the same
        order as the last axis.

        Raises ValueError when no data has been read yet.
        '''
        if not self.data_complete_flag:
            raise ValueError('No MRMS data has been read yet; '
                             'call readDataInTime() or searchDemo() first')
        rows, cols = self._GRID_SHAPE
        feature_name_list = []
        feature_list = []
        for key in self.data:
            for data_name, grid in self.data[key]['data'].items():
                feature_name_list.append(data_name)
                feature_list.append(grid['var'].reshape(rows, cols, 1))
        return feature_name_list, np.dstack(feature_list)

    def _uncompressData(self, gz_file_path, nc_file_path):
        '''Decompress ``gz_file_path`` into ``nc_file_path`` (overwritten if present).'''
        print('Uncompressing gzip file %s ...' % (gz_file_path))
        # Context managers close both handles even when decompression fails.
        with gzip.open(gz_file_path, 'rb') as inF:
            with open(nc_file_path, 'wb') as outF:
                outF.write(inF.read())
        print('Create temp file %s' % (nc_file_path))

    def _readNcFile(self, nc_file_path):
        '''Load the first three variables of a NetCDF file into memory.

        Returns a dict with the keys 'var', 'lat' and 'lon'.
        NOTE(review): assumes the file lists its variables in the order
        data field, latitude, longitude -- confirm against the MRMS files.
        '''
        print('Read Netcdf file %s ...' % (nc_file_path))
        ncF = Dataset(nc_file_path, mode='r')
        try:
            var = list(ncF.variables)
            # Slicing with [:] copies the values out of the file, so the
            # dataset can be closed before the temp file is overwritten.
            data = {'var': ncF.variables[var[0]][:],
                    'lat': ncF.variables[var[1]][:],
                    'lon': ncF.variables[var[2]][:]}
        finally:
            ncF.close()
        print('Finish reading data')
        return data
    
def _search_file(path, time):
    """Return the file name(s) in ``path`` whose time stamp is closest to ``time``.

    path -- directory to scan; only regular files are considered.
    time -- target clock time as 'HHMMSS'.

    File names are expected to end in '<date>-<HHMMSS><extension>'; the six
    characters after the last '-' are taken as the time stamp, so the date
    part may be any day.  Entries without such a stamp are ignored.

    Returns a list of names (in directory-listing order) that share the
    smallest absolute time difference to ``time``.

    Raises ValueError when the directory holds no time-stamped file.
    """
    target = _timeToSec(time)
    candidates = []
    for name in os.listdir(path):
        if not os.path.isfile(os.path.join(path, name)):
            continue
        stamp = name.rsplit('-', 1)[-1][:6]
        if len(stamp) != 6 or not stamp.isdigit():
            # Not a '<date>-<HHMMSS>' file name; skip instead of crashing.
            continue
        candidates.append((name, abs(_timeToSec(stamp) - target)))
    if not candidates:
        raise ValueError('No time-stamped file found in %s' % (path))
    smallest_gap = min(gap for _, gap in candidates)
    return [name for name, gap in candidates if gap == smallest_gap]
        
    
def _timeToSec(time):
    if len(str(time)) != 6:
        raise ValueError('The time formate is not correct at HHMMSS')
    hour = int(time[:2])
    minute = int(time[2:4])
    second = int(time[4:])
    return second + minute*60 + hour*3600
        
            

In [130]:
# Build the loader: raw MRMS archive as input, scratch folder for the uncompressed files.
test = MRMS_data(MRMS_Path=MRMS_directory, Tempfile_Path=temp_derectory)

In [131]:
# Load, for every product, the file(s) closest to 17:22:45 on 2016-06-28.
test.readDataInTime(date='20160628', time='172245')

Uncompressing gzip file /home/htan/proj/TensorFlow/data/MRMS//MRMS_GaugeCoor/20160628/MRMS_GaugeCorr_QPE_01H_00.00_20160628-170000.nc.gz ...
Create temp file /home/htan/proj/TensorFlow/notebook/temp//temp_gauge.nc
Read Netcdf file /home/htan/proj/TensorFlow/notebook/temp//temp_gauge.nc ...
Finish reading data
Uncompressing gzip file /home/htan/proj/TensorFlow/data/MRMS//MRMS_GaugeCoor/20160628/MRMS_GaugeCorr_QPE_03H_00.00_20160628-170000.nc.gz ...
Create temp file /home/htan/proj/TensorFlow/notebook/temp//temp_gauge.nc
Read Netcdf file /home/htan/proj/TensorFlow/notebook/temp//temp_gauge.nc ...
Finish reading data

Uncompressing gzip file /home/htan/proj/TensorFlow/data/MRMS//MRMS_NLDN_LightningDensity/20160628/MRMS_NLDN_LightningDensity_001_min_20160628-172243.nc.gz ...
Create temp file /home/htan/proj/TensorFlow/notebook/temp//temp_NLDN.nc
Read Netcdf file /home/htan/proj/TensorFlow/notebook/temp//temp_NLDN.nc ...
Finish reading data
Uncompressing gzip file /home/htan/proj/TensorFlow

In [139]:
# fn: product names, one per channel; feature: the products stacked along the last axis.
fn, feature = test.getFeatures()
# Last expression of the cell, so the shape is displayed as the cell output.
feature.shape

(3500, 7000, 13)

In [140]:
from six.moves import cPickle as pickle

# Store the stacked feature cube next to the raw MRMS data so it can be reloaded
# without uncompressing and parsing the NetCDF files again.
pickle_file = MRMS_directory + '1timeslice.pickle'

# A descriptive handle name keeps the open file from shadowing any array
# variable in the notebook namespace.
with open(pickle_file, 'wb') as pickle_out:
    pickle.dump(feature, pickle_out, pickle.HIGHEST_PROTOCOL)


In [136]:
# All channels at grid cell (0, 0). Index `feature` directly: the name `f` is
# not the feature array after a fresh Restart & Run All.
feature[0, 0, :]

array([  -3.,   -3.,    0.,    0.,    0., -999., -999., -999., -999.,
       -999., -999., -999.,   -1.], dtype=float32)

In [None]:
# Channel 0 with values <= -2 treated as missing. Work on a copy so that
# `feature` itself stays untouched and the cell can be re-run safely.
f0 = feature[:, :, 0].copy()
f0[f0 <= -2] = np.nan


In [None]:
# Show channel 0 as an image with its colour scale (explicit figure/axes interface).
fig, ax = plt.subplots(figsize=(15, 10))
image = ax.imshow(f0)
fig.colorbar(image, ax=ax)

In [None]:
# Channel 1 with values <= 0 treated as missing; masked on a copy so `feature`
# is left unchanged and the cell is idempotent.
f1 = feature[:, :, 1].copy()
f1[f1 <= 0] = np.nan
plt.figure(figsize = (15, 10))
plt.imshow(f1)
plt.colorbar()

In [None]:
# Channel 2 with values <= -999 treated as missing; masked on a copy so
# `feature` is left unchanged and the cell is idempotent.
f2 = feature[:, :, 2].copy()
f2[f2 <= -999] = np.nan
plt.figure(figsize = (15, 10))
plt.imshow(f2)
plt.colorbar()

In [None]:
# Channel 3 with values <= -999 treated as missing; masked on a copy so
# `feature` is left unchanged and the cell is idempotent.
f3 = feature[:, :, 3].copy()
f3[f3 <= -999] = np.nan
plt.figure(figsize = (15, 10))
plt.imshow(f3)
plt.colorbar()

In [None]:
# Channel 4 with values <= -999 treated as missing; masked on a copy so
# `feature` is left unchanged and the cell is idempotent.
f4 = feature[:, :, 4].copy()
f4[f4 <= -999] = np.nan
plt.figure(figsize = (15, 10))
plt.imshow(f4)
plt.colorbar()

In [None]:
# Channel 5 with values <= -1 treated as missing; masked on a copy so
# `feature` is left unchanged and the cell is idempotent.
f5 = feature[:, :, 5].copy()
f5[f5 <= -1] = np.nan
plt.figure(figsize = (15, 10))
plt.imshow(f5)
plt.colorbar()

In [None]:
# Scratch check on the first value of channel 0; read it from `feature`, since
# the name `f` is not the feature array after a fresh Restart & Run All.
feature[0, 0, 0]/23