# Scan MERRA-2 atmospheric properties during one Year
----------------------------------------------------------------------------------

- author: Sylvie Dagoret-Campagne
- creation January 12 2017
- update April 25th 2018

Link:

http://disc.sci.gsfc.nasa.gov/datareleases/merra_2_data_release

### purpose:

Scan one year (2018) of MERRA-2 predictions of the dataset inst1_2d_asm_Nx_M2I1NXASM.
Extract the relevant atmospheric variables.
Build the corresponding time series and dataset in pandas.
Plot the variables. Save the pandas dataset into a file.
Convert the pandas dataset into an astropy fits table and save into a fits file as well.

## 1) python libraries
---------------------------

In [37]:
# Set up matplotlib and use a nicer set of plot parameters
%config InlineBackend.rc = {}
import matplotlib
import matplotlib as mpl
matplotlib.rc_file("templates/matplotlibrc")
import matplotlib.pyplot as plt
%matplotlib inline

In [38]:
import datetime

In [39]:
from matplotlib.dates import MonthLocator, WeekdayLocator,DateFormatter
from matplotlib.dates import MONDAY

In [40]:
# Date-axis helpers: a tick on the first day of each month, a tick on each
# Monday, and a label format such as "Jan '18" (abbreviated month, 2-digit year).
months = MonthLocator(bymonth=range(1, 13), bymonthday=1, interval=1)
mondays = WeekdayLocator(byweekday=MONDAY)
monthsFmt = DateFormatter("%b '%y")

In [41]:
import os
import re
import numpy as np
from mpl_toolkits.basemap import Basemap
from matplotlib import colors
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd

In [42]:
from astropy import units as u
from astropy.coordinates import SkyCoord

from astropy.table import Table

In [43]:
import h5py

In [44]:
import libGMAOMERRA2Data as merra2  # My own library

In [45]:
############################################################################
def ensure_dir(f):
    """Create the directory `f` (and any missing parents) if it does not exist.

    Nothing is done when `f` already exists, whatever kind of path it is.

    Parameters
    ----------
    f : str
        Path of the directory to create.

    Raises
    ------
    OSError
        If the directory cannot be created (for instance an empty path or
        insufficient permissions).
    """
    if os.path.exists(f):
        return
    try:
        os.makedirs(f)
    except OSError:
        # Another process may have created the directory between the check
        # and the call; only re-raise when the directory is really missing.
        if not os.path.isdir(f):
            raise
#########################################################################

## 2)  Configuration
-------------------------

In [46]:
# SELECT OBSERVATORY
OBS_NAME='ohp'

In [47]:
# where are the HDF files
#HDFEOS_ZOO_DIR="/Volumes/LaCie2/DATA/MERRA-2/inst1_2d_asm_Nx_M2I1NXASM"
# For Simulation of CTIO atmosphere in May-Jun 2017
#HDFEOS_ZOO_DIR="/sps/lsst/data/AtmosphericCalibration/MERRA-2/May-Jun-2017/subset_M2I1NXASM_V5.12.4_20180424_201411"
# Data 2018
HDFEOS_ZOO_DIR="/sps/lsst/data/MERRA2/M2I1NXASM.5.12.4/2018"

In [48]:
path=HDFEOS_ZOO_DIR

### Here I describe the content of the input files

In [49]:
DATA_TAG=['PS','T10M','T2M','TO3','TOX','TQI','TQV','TS','U10M','U2M','U50M','V10M','V2M','V50M' ]

In [50]:
# Human-readable titles, used in the plot titles; entry i describes DATA_TAG[i].
DATA_TITLE=['Surface Pressure',
            '10 meter air temperature',
            '2 meter air temperature',
            'total column ozone',
            'total column odd oxygen',
            'total precipitable ice water',
            # DATA_TAG[6] is 'TQV' (water vapor); the title used to say
            # "liquid water", which is the TQL variable and is not extracted here.
            'total precipitable water vapor',
            'Surface Temperature skin',
            '10 meter eastward wind',
            '2 meter eastward wind',
            '50 meter eastward wind',
            '10 meter northward wind',
            '2 meter northward wind',
            '50 meter northward wind'
           ]

In [51]:
NB_DATAFIELDS=len(DATA_TAG)

### List of output files

In [52]:
# The selected data field
DATA_NAME =  'inst1_2d_asm_Nx_M2I1NXASM'   # 

In [53]:
pandas_filename='MERRA2_2018_'+DATA_NAME+'_'+OBS_NAME+'_'+'AllYear'+'.csv'

In [54]:
fits_filename='MERRA2_2018_'+DATA_NAME+'_'+OBS_NAME+'_'+'AllYear' +'.fits'

In [55]:
hdf5_filename='MERRA2_2018_'+DATA_NAME+'_'+OBS_NAME+'_'+'AllYear'+'.h5'

In [56]:
# 14 output files for figures, one per plotted variable.
# Every name shares the same stem and extension; only the variable suffix differs.
_figfile_stem='GMAO_MERRA2_2018_'+DATA_NAME+'_'+OBS_NAME+'_'+'AllYear'
_figfile_ext='.jpg'

# pressure, temperatures and column quantities
figfile_ps=_figfile_stem+'_ps'+_figfile_ext
figfile_t10=_figfile_stem+'_t10'+_figfile_ext
figfile_t2=_figfile_stem+'_t2'+_figfile_ext
figfile_to3=_figfile_stem+'_to3'+_figfile_ext
figfile_tox=_figfile_stem+'_tox'+_figfile_ext
figfile_tqi=_figfile_stem+'_tqi'+_figfile_ext
figfile_tql=_figfile_stem+'_tql'+_figfile_ext
figfile_ts=_figfile_stem+'_ts'+_figfile_ext

# eastward (u) wind components
figfile_u10m=_figfile_stem+'_u10m'+_figfile_ext
figfile_u2m=_figfile_stem+'_u2m'+_figfile_ext
figfile_u50m=_figfile_stem+'_u50m'+_figfile_ext

# northward (v) wind components
figfile_v10m=_figfile_stem+'_v10m'+_figfile_ext
figfile_v2m=_figfile_stem+'_v2m'+_figfile_ext
figfile_v50m=_figfile_stem+'_v50m'+_figfile_ext

### Select where in the world

In [57]:
# Select observatory
loc=merra2.observatory_location(OBS_NAME)

In [58]:
loc

(5.71222222222, 43.9316666667, 650.0)

### 2.2) Getting the list of the files
------------------------------

In [59]:
nc4_files = [f for f in os.listdir(path) if f.endswith('.nc4')]  

In [60]:
nc4_files[:5]

['MERRA2_400.inst1_2d_asm_Nx.20180227.nc4',
 'MERRA2_400.inst1_2d_asm_Nx.20180322.nc4',
 'MERRA2_400.inst1_2d_asm_Nx.20180329.nc4',
 'MERRA2_400.inst1_2d_asm_Nx.20180224.nc4',
 'MERRA2_400.inst1_2d_asm_Nx.20180528.nc4']

### 2.3) Select files of a given year

In [61]:
keysel_filename='^MERRA2_400.inst1_2d_asm_Nx.2018.*'

In [62]:
# A single pre-formatted string prints the same text under the Python 2
# print statement and the Python 3 print function.
print('Selection key %s' % keysel_filename)

Selection key ^MERRA2_400.inst1_2d_asm_Nx.2018.*


In [63]:
# Keep only the file names in which the selection key finds a match,
# and store them as a numpy array.
nc4_files2 = np.array(
    [name for name in nc4_files if re.search(keysel_filename, name) is not None]
)

In [64]:
nc4_files2

array(['MERRA2_400.inst1_2d_asm_Nx.20180227.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180322.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180329.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180224.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180528.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180115.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180429.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180216.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180402.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180306.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180110.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180428.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180405.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180101.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180430.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180321.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180312.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180504.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180330.nc4',
       'MERRA2_400.inst1_2d_asm_Nx.20180410.nc4',


### 2.4) Sort files by increasing time

In [65]:
nc4_files=np.sort(nc4_files2)

### 2.5) Build the full filename before reading

In [None]:
# Number of selected files, and each file name joined onto the data directory.
NBFILES=len(nc4_files)
full_nc4files=[os.path.join(path, name) for name in nc4_files]

## 3)  Extract data and write them into pandas dataset and time series
--------------------------------------------------------------------------------------

In [None]:
# One DataFrame per input file (one column per DATA_TAG entry, indexed by time).
# The list is concatenated into the final dataset in the next cell.
df_inst1_2d_asm_Nx=[]

for nc4_path in full_nc4files: # loop on the input data files

    # Retrieve the 1D parameters longitude, latitude, time;
    # each call is unpacked as (array, unit, name)
    (m_lat,m_un_lat,m_nm_lat) = merra2.Get1DData(nc4_path,'lat') # latitude
    m_latitude = m_lat[:]
    (m_lon,m_un_lon,m_nm_lon) = merra2.Get1DData(nc4_path,'lon') # longitude
    m_longitude = m_lon[:]
    (m_tim,m_un_tim,m_nm_tim)= merra2.Get1DData(nc4_path,'time') # time
    m_time=m_tim[:]

    NbDataPerFile=m_time.shape[0] # number of data samples in this file

    # The time unit string is expected to look like "minutes since <start time>";
    # the captured part is used as the first timestamp of the file.
    start_time = re.findall("^minutes since[ ]([0-9.].+[0-9.].+[0-9.].+)",m_un_tim)
    if not start_time:
        raise ValueError("Cannot extract the start time from time unit %r of file %s"
                         % (m_un_tim, nc4_path))
    time_rng = pd.date_range(start_time[0], periods=NbDataPerFile, freq='H') # one sample per hour

    # Build the mesh-grid in longitude and latitude and find the bin of the site
    m_X,m_Y=np.meshgrid(m_longitude,m_latitude)
    (sel_long, sel_lat)=merra2.GetBinIndex(m_X,m_Y,loc[0],loc[1])

    # One time series per requested variable, taken at the site's grid bin.
    columns={}
    for tag in DATA_TAG:
        # m_data is indexed below as [time, latitude bin, longitude bin]
        (m_data,m_unit,m_longname)=merra2.GetGeoRefData(nc4_path,tag)
        columns[tag]=pd.Series(m_data[:,sel_lat,sel_long], index=time_rng)

    # Create the dataframe for this file
    df_inst1_2d_asm_Nx.append(pd.DataFrame(columns, index=time_rng))
    

### Concatenation

In [None]:
df_inst1_2d_asm_Nx=pd.concat(df_inst1_2d_asm_Nx)

In [None]:
df_inst1_2d_asm_Nx.info()

## 4) Plot the time dependence of time series and dataset

####  Helper function to plot one variable of the dataset versus time

In [None]:
def PlotTimeSeries(index,figfile,save=False):
    """Plot one variable of the dataset as a time series.

    Parameters
    ----------
    index : int
        Position of the variable in DATA_TAG / DATA_TITLE.
    figfile : str
        Output file name for the figure; only used when `save` is true.
    save : bool, optional
        When true, the figure is also written to `figfile`. The default
        (False) only draws the figure, as this function did before the
        option was added.
    """
    tag=DATA_TAG[index]
    plt.figure(figsize=(20,5))
    df_inst1_2d_asm_Nx[tag].plot(lw=3,color='red')
    plt.xlabel('time')
    plt.ylabel(tag)
    title= "Hourly {} at {} in 2018".format(DATA_TITLE[index],OBS_NAME)
    plt.title(title,fontsize=15)
    plt.suptitle('NASA GMAO GES (MERRA-2)', y=1.02, fontsize=13)
    plt.grid(True)
    if save:
        # The suptitle sits at y=1.02, i.e. above the figure box; a tight
        # bounding box is requested so that it is not cropped in the file.
        plt.savefig(figfile, bbox_inches='tight')

### 4.1) Pressure

In [None]:
PlotTimeSeries(0,figfile_ps)

###  4.2)  T10
----------------

In [None]:
PlotTimeSeries(1,figfile_t10)

### 4.3) t2

In [None]:
PlotTimeSeries(2,figfile_t2)

### 4.4) to3

In [None]:
PlotTimeSeries(3,figfile_to3)

### 4.5) tox

In [None]:
PlotTimeSeries(4,figfile_tox)

### 4.6 TQI

In [None]:
PlotTimeSeries(5,figfile_tqi)

### 4.7) TQV

In [None]:
PlotTimeSeries(6,figfile_tql)

### 4.8) TS

In [None]:
PlotTimeSeries(7,figfile_ts)

### 4.9) U10M

In [None]:
PlotTimeSeries(8,figfile_u10m)

In [None]:
PlotTimeSeries(9,figfile_u2m)

In [None]:
PlotTimeSeries(10,figfile_u50m)

In [None]:
PlotTimeSeries(11,figfile_v10m)

In [None]:
PlotTimeSeries(12,figfile_v2m)

In [None]:
PlotTimeSeries(13,figfile_v50m)

## 5) Output

In [None]:
df_inst1_2d_asm_Nx.index.name='time'
df_inst1_2d_asm_Nx.describe()

## 5)  Save dataset  in file pandas (csv)
----------------------------------------

In [None]:
dataset=df_inst1_2d_asm_Nx

In [None]:
dataset.index.name='time'

In [None]:
dataset.describe()

In [None]:
dataset.head()

### write pandas

In [None]:
dataset.to_csv(pandas_filename)

### write hdf5

In [None]:
dataset.to_hdf(hdf5_filename,key='df',mode="w")

### Check

In [None]:
saved_dataset=pd.read_csv(pandas_filename)

In [None]:
saved_dataset.head()

## 6) Convert dataset into a table and then save in a fits file
--------------------------------------------------------------------------

In [None]:
table = Table.from_pandas(saved_dataset)

In [None]:
table

In [None]:
table.write(fits_filename,format='fits',overwrite=True)