In [10]:
"""
Name: extract_ams
Author: Tania Lopez-Cantu
Date: 06/13/2020
-------------------------
This notebook writes to a csv file the annual maximum series (AMS) at each grid cell
of a gridded dataset, using the grid ids that were created and stored by the
extract_grid_center_indices notebook.

CSV format: rows: year; columns: grid cell id
"""

import os

from netCDF4 import Dataset
import numpy as np
import pandas as pd

In [15]:
"""
Variables to modify:
file_path --> path to .nc file 
save_path --> directory where to store the output of this notebook
grid_info --> path to csv with grid_id info was stored
"""
file_path = "cheswx_prcp_1948_2015.experimental.2017-08-14.nc"
save_path = "output/"
grid_info = "output/historical_ches_gridcells.csv"

In [16]:
def aggregate_by_frequency(df, fr, cols):
    """Return the per-period maximum of the selected columns.

    The rows of df are binned on its "date" column at the frequency fr and
    the maximum of cols is taken inside every bin.

    df   --> DataFrame holding a datetime column named "date"
    fr   --> frequency handed to pd.Grouper, e.g. "3h", "6h", "12h", "24h";
             it is not limited to hours: weekly, monthly or yearly bins work
             too, depending on the frequency of the original data
    cols --> label(s) of the column(s) to aggregate
    """
    by_period = pd.Grouper(key='date', freq=fr)
    binned = df.groupby(by_period)
    return binned[cols].max()

In [31]:
def get_timestamps(nc_file, origin, unit="h", time_var="time"):
    """
    Build a timestamp series from the time variable of an nc file and its origin
    ---
    input:
        nc_file  --> open .nc file (any object exposing a `variables` mapping)
        origin   --> reference date the stored offsets are counted from;
                     anything pd.Timestamp accepts, e.g. "1948-01-01"
        unit     --> pandas time unit of the stored offsets; defaults to "h"
                     (hours), which was previously hard-coded
        time_var --> name of the time variable in the file; defaults to "time"
    output: DatetimeIndex with one timestamp per entry of the time variable,
            computed as origin + offset
    """
    # Read every offset stored in the time variable
    offsets = nc_file.variables[time_var][:]

    return pd.to_datetime(offsets, unit=unit, origin=pd.Timestamp(origin))

In [57]:
# Load only the grid id column. It is selected by name (the same name the
# rest of the notebook uses to read it) so a change in the csv column order
# cannot silently load a different column.
df_grid_ids = pd.read_csv(grid_info, usecols=["grid_id"])

In [14]:
# Open the dataset read-only. file_path is taken from the "Variables to modify"
# cell so the path is defined in one place only and edits there take effect.
nc_file = Dataset(file_path, "r")

In [42]:
"""
Verify time and precipitation variables and their units in the
nc file
"""

print("Time: {}".format(nc_file.variables["time"].units))
print("Precipitation: {}".format(nc_file.variables["prcp"].units))

Units
Time: hours since 1948-01-01
Precipitation: mm


In [32]:
# Get timestamps
# The units attribute has the form "<unit> since <reference date>". Everything
# after "since" is kept, so a reference that also carries a time of day
# (e.g. "hours since 1948-01-01 00:00:00") is not cut down to its last token.
origin = nc_file.variables["time"].units.split("since", 1)[-1].strip()
timestamps = get_timestamps(nc_file, origin)

In [33]:
# Preview the timestamps built from the time variable of the nc file
timestamps

DatetimeIndex(['1948-01-01 07:00:00', '1948-01-02 07:00:00',
               '1948-01-03 07:00:00', '1948-01-04 07:00:00',
               '1948-01-05 07:00:00', '1948-01-06 07:00:00',
               '1948-01-07 07:00:00', '1948-01-08 07:00:00',
               '1948-01-09 07:00:00', '1948-01-10 07:00:00',
               ...
               '2015-12-22 07:00:00', '2015-12-23 07:00:00',
               '2015-12-24 07:00:00', '2015-12-25 07:00:00',
               '2015-12-26 07:00:00', '2015-12-27 07:00:00',
               '2015-12-28 07:00:00', '2015-12-29 07:00:00',
               '2015-12-30 07:00:00', '2015-12-31 07:00:00'],
              dtype='datetime64[ns]', length=24837, freq=None)

In [46]:
# Read the precipitation variable as an array.
# This might take some time because the matrix is (24837, 222, 207)
# NOTE(review): np.ma.getdata is applied to the netCDF variable itself and the
# result is sliced with [:] afterwards; presumably the intent is to get the
# raw (unmasked) values of the whole variable — confirm against the netCDF4
# and numpy.ma documentation.
prcp = np.ma.getdata(nc_file.variables["prcp"])[:]

In [55]:
# Check how a grid id splits into its two index strings, using the first id of
# the df_grid_ids table (the name `grid_ids` is never defined in this notebook).
df_grid_ids.iloc[0, 0].lstrip("id_").split("_")

['0', '0']

In [62]:
# Collect one precipitation series per grid cell, keyed by grid id, so the
# result can be turned into a DataFrame
r = {}
for cell_id in df_grid_ids['grid_id'].values:
    # lstrip removes the leading "i", "d" and "_" characters, so what is left
    # is "<lat index>_<lon index>"
    index_tokens = cell_id.lstrip("id_").split("_")
    lat_idx, lon_idx = int(index_tokens[0]), int(index_tokens[1])
    r[cell_id] = np.ma.getdata(prcp[:, lat_idx, lon_idx])

In [63]:
# Assemble all precipitation records into a single DataFrame, one column per
# grid id. This might take some time.
full_domain = pd.DataFrame(data=r)
# Keep the grid id column labels for the aggregation step
cols = full_domain.columns

In [65]:
# Add the time stamps that we extracted earlier as a "date" column; the
# aggregation function defined at the start of this notebook groups on it.
full_domain['date'] = timestamps


In [66]:
# Get AMS (the yearly maximum of every grid cell) using the function defined
# at the start of this notebook. This might take some time.
# The year-end frequency is passed as an offset object instead of the 'A'
# string alias: recent pandas releases deprecate that alias, while the offset
# object is accepted by old and new versions alike.
df_ams = aggregate_by_frequency(full_domain, pd.offsets.YearEnd(), cols)

In [67]:
# Save df_ams into the directory specified at the beginning of the notebook.
# os.path.join adds the path separator when save_path has no trailing "/".
name_csv = "historical_ches_AMS"
df_ams.to_csv(os.path.join(save_path, f"{name_csv}.csv"))