# In [1]:
"""
Usage:
    To run the script, use the following command in the terminal:
    python ucrb_diversions.py

    Make sure to have the environment ucrb_diversions installed (see ucrb_diversions.yml in scripts folder)
"""

import sys
# sys.path.append('../../ucrb_utils/python_packages_static')
sys.path.append('python_packages_static')
import os
import shutil
import platform
import requests
import re
from io import StringIO
import json

import numpy as np
import pandas as pd
import geopandas as gp
from shapely.geometry import Point

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from calendar import monthrange
import matplotlib.dates as mdates
# Module-level matplotlib date-axis helpers: tick locators placed every
# 1, 5, 10 and 20 years, and a tick-label formatter that prints the
# four-digit year.
years1 = mdates.YearLocator()
years5 = mdates.YearLocator(5)
years10 = mdates.YearLocator(10)
years20 = mdates.YearLocator(20)
years_fmt = mdates.DateFormatter('%Y')
from datetime import date

#import spnspecs
#spnspecs.set_graph_specifications()

# Line terminator used by this module: '\n' when the platform string
# contains "window", '\r\n' on every other platform.
# NOTE(review): presumably this relies on Windows text-mode writes expanding
# '\n' to '\r\n', so output ends up CRLF everywhere -- confirm with the writers.
newln = '\n' if 'window' in platform.platform().lower() else '\r\n'


def divfilter(df=None, mindiff=25., minfact=2., sig=3.):
    """
    Function to identify time series outliers using an initial and secondary filter.

    Parameters
    ----------

    df : pandas dataframe
        must contain two columns:
            "datetime" of type np.datetime64 (anything pd.to_datetime can parse is accepted)
            "discharge_cfs" of type float
        The dataframe is modified in place (columns are added) and also returned.
        (default is None, which raises ValueError)

    mindiff : float
        minimum amount (in cfs) greater than the median annual max at which a record
        can be deemed an outlier in the first pass filter
        (default is 25.)

    minfact : float
        minimum factor by which a record must be greater than the median annual max
        to be deemed an outlier in the first pass filter
        (default is 2.)

    sig : float
        number of standard deviations above the median annual max at which a record can be deemed
        an outlier in the second pass filter
        (default is 3.)

    Returns
    ----------
    df : pandas dataframe
        the input dataframe with additional columns:
            "year" of type int, calendar year of "datetime"
            "mdflag" of type int, where a value of 1 indicates records identified as outliers in the first pass filter
            "sdflag" of type int, where a value of 1 indicates records identified as outliers in the second pass filter

    aym : float
        median of the annual maximum discharge, computed from records with discharge_cfs > 0

    Raises
    ------
    ValueError
        if df is None

    See Also
    --------

    Notes
    -----
    The first pass filter uses the median of the annual maximum values and a minimum difference and minimum factor
    to identify potential outliers. Records meeting the criteria of potential outliers are not included in the
    calculation of the standard deviation of the annual maximum values.

    The second pass filter uses a provided sigma distance to identify potential outliers that are greater than a specified
    number of standard deviations above the median annual maximum rate. The standard deviation value is calculated on the
    values from the first pass that did not meet the potential outlier criteria of minimum difference and minimum factor

    Only records with discharge_cfs > 0 contribute to the annual maxima. Records with a missing discharge
    are never flagged because comparisons against NaN are False.

    Examples
    --------
    >>> flagged, aym = divfilter(daily_df, mindiff=25., minfact=2., sig=3.)
    >>> clean = flagged.loc[(flagged["mdflag"] == 0) & (flagged["sdflag"] == 0)]
    """
    if df is None:
        raise ValueError('divfilter requires a dataframe with "datetime" and "discharge_cfs" columns')

    flow = df["discharge_cfs"]
    positive = flow > 0.
    df.loc[:, "year"] = pd.to_datetime(df["datetime"]).dt.year

    # median of the annual maxima, ignoring zero/negative/missing discharge
    annual_max = df.loc[positive].groupby("year")["discharge_cfs"].max()
    aym = np.median(annual_max)

    # first pass: a record must exceed the median annual max by BOTH an absolute
    # margin (mindiff) and a multiplicative factor (minfact)
    first_pass = ((flow - aym) > mindiff) & (flow > minfact * aym)
    df.loc[:, "mdflag"] = first_pass.astype("int64").to_numpy()

    # spread of the annual maxima once first-pass outliers are excluded
    kept = (~first_pass) & positive
    kept_annual_max = df.loc[kept].groupby("year")["discharge_cfs"].max()
    aymstd = np.std(kept_annual_max)

    # second pass: more than `sig` standard deviations above the median annual max
    second_pass = flow > (aym + sig * aymstd)
    df.loc[:, "sdflag"] = second_pass.astype("int64").to_numpy()

    return df, aym


def gaplengths(vals):
    """
    Function to identify the length of consecutive null values surrounding each null value in a series of values.
    This function is used by fill_missing_diversion_values() to identify gaps to be filled that are less than 90 days.
    Gaps longer than 90 days are not filled to avoid interpolating values from irrigation season into non-irrigation season.

    Parameters
    ----------

    vals : iterable of float values
        values in time order; missing values must be NaN

    Returns
    ----------
    cts : list of int values
        one entry per input value: 0 for a non-null value, otherwise the total
        length of the run of consecutive NaN values that the entry belongs to.
        Leading and trailing runs are reported with their full length as well.

    See Also
    --------

    Notes
    -----
    Every member of a gap carries the full gap length, so a threshold test such as
    "gap < 90" accepts or rejects a gap as a whole.

    Examples
    --------
    >>> gaplengths([1.0, np.nan, np.nan, 2.0, np.nan])
    [0, 2, 2, 0, 1]
    """
    cts = []
    run = 0  # length of the NaN run currently being scanned

    for v in vals:
        if np.isnan(v):
            run += 1
            continue
        # a non-null value closes the current run: emit the run length once
        # per member of the run, then 0 for the value itself
        cts.extend([run] * run)
        cts.append(0)
        run = 0

    # close a run that extends to the end of the series
    cts.extend([run] * run)
    return cts


def build_spdf(start="19791231", end="20220930", spfreq="M", tsfreq="D"):
    """
    Build a calendar dataframe of stress periods and time steps covering a date range.

    Parameters
    ----------

    start : str
        first date of the range, formatted "%Y%m%d"
        (default is "19791231")

    end : str
        last date of the range, formatted "%Y%m%d"
        (default is "20220930")

    spfreq : str
        stress period frequency. Only "M" (one stress period per calendar month)
        is handled; for any other value the "sp" and "ts" columns are not created
        (default is "M")

    tsfreq : str
        pandas frequency string giving the spacing of the rows
        (default is "D", one row per day)

    Returns
    ----------
    df : pandas dataframe
        indexed by timestamp, with columns "year", "month" and "day",
        "sp" (0-based stress period number, one per year/month pair),
        "ts" (0-based position of the row within its stress period) and
        "totim" (1-based running row count over the whole range)

    See Also
    --------

    Notes
    -----

    Examples
    --------
    >>> build_spdf(start="20200130", end="20200302")["sp"].tolist()[:3]
    [0, 0, 1]
    """
    stamps = pd.date_range(start=pd.to_datetime(start), end=pd.to_datetime(end), freq=tsfreq)
    calendar_parts = {part: getattr(stamps, part) for part in ("year", "month", "day")}
    df = pd.DataFrame(data=calendar_parts, index=stamps)

    if spfreq == "M":
        # number the (year, month) pairs, then count rows inside each of them
        df["sp"] = df.groupby(["year", "month"]).ngroup()
        df["ts"] = df.groupby("sp").cumcount()

    df["totim"] = range(1, len(df) + 1)
    return df
    

def format_sites_df(df=None):
    """
    Attach point geometries to a diversion site table and reproject them to the model CRS.

    Parameters
    ----------

    df : pandas dataframe
        diversion site table with decimal-degree columns "siteLong" and "siteLat".
        A "geometry" column is added to this dataframe in place.
        (default is None)

    Returns
    ----------
    df : geopandas GeoDataFrame
        the site table with a point "geometry" per site plus "utmX" and "utmY"
        columns holding the projected point coordinates. The "siteLat" and
        "siteLong" columns are kept.

    See Also
    --------

    Notes
    -----
    Input coordinates are declared as EPSG:4269. The target CRS is read from the
    module-level name model_epsg, which is not defined inside this function
    (UTM 12N according to the original documentation -- confirm where it is set).

    Examples
    --------
    Need to add
    """
    # one point per site, built from its longitude/latitude pair
    df.loc[:, "geometry"] = df.apply(lambda site: Point(site.siteLong, site.siteLat), axis=1)

    sites = gp.GeoDataFrame(df, geometry="geometry", crs="epsg:4269")
    sites.to_crs(epsg=model_epsg, inplace=True)

    # expose the projected coordinates as plain numeric columns
    for column, axis_name in (("utmX", "x"), ("utmY", "y")):
        sites.loc[:, column] = sites.loc[:, "geometry"].apply(
            lambda pt, axis_name=axis_name: getattr(pt, axis_name)
        )

    return sites


_UTDWR_BASE_URL = "https://www.waterrights.utah.gov"
_UTDWR_TIMEOUT_S = 60  # seconds; without a timeout requests can wait forever
_UTDWR_LOOKUP_COLUMNS = ["siteID", "siteName", "siteUse", "siteLat", "siteLong",
                         "siteSource", "siteFile", "startDate", "endDate",
                         "noFillYears", "shortID", "destinationCode", "destinationFlag"]


def _legacy_setting(name, explicit, fallback=None):
    """Return `explicit` if given, else a module-level global called `name`, else `fallback`."""
    if explicit is not None:
        return explicit
    value = globals().get(name)
    return fallback if value is None else value


def _fetch_utdwr_daily_records(station_id, record_year):
    """Download the daily comma-delimited record of one UTDWR station as a date-indexed dataframe."""
    landing_url = (f"{_UTDWR_BASE_URL}/cgi-bin/dvrtview.exe?STATION_ID={station_id}"
                   f"&RECORD_YEAR={record_year}&Modinfo=Daily_Comma")
    landing = requests.get(landing_url, timeout=_UTDWR_TIMEOUT_S)
    landing.raise_for_status()

    # The landing page points at the CSV on the line that starts with
    # "Daily comma delimited"; the link is the quoted text on that line.
    # Start from None for every station so a page without that line can never
    # reuse the link found for a previous station.
    link_parts = None
    for line in StringIO(landing.text):
        if line.startswith("Daily comma delimited"):
            link_parts = re.findall('"([^"]*)"', line)
    if not link_parts:
        raise ValueError("no daily comma delimited link found for station {0}".format(station_id))

    download = requests.get(_UTDWR_BASE_URL + "".join(map(str, link_parts)),
                            timeout=_UTDWR_TIMEOUT_S)
    download.raise_for_status()

    records = pd.read_csv(StringIO(download.text))
    records.columns = ["year", "month", "day", "discharge_cfs"]
    records.index = pd.to_datetime(records[["year", "month", "day"]]).rename("date")
    return records.drop(columns=["year", "month", "day"])


def _combine_with_historical(auto_df, site_file, sp_df, hst_dir, hst_dir1=None):
    """Lay manually retrieved records over the sp_df calendar and fill remaining gaps from auto_df."""
    if sp_df is None:
        raise ValueError("sp_df is required to combine historical records")

    try:
        hist = pd.read_csv(os.path.join(hst_dir, site_file))
    except (OSError, ValueError):
        # missing or unreadable file: try the optional fallback directory
        if hst_dir1 is None:
            raise
        hist = pd.read_csv(os.path.join(hst_dir1, site_file))

    hist.index = pd.to_datetime(hist.pop("date"))
    merged = sp_df.join(hist, how="left")

    # spread each monthly volume (cfs-days) evenly over the days of its month
    monthly_rows = merged.loc[merged["monthly_cfsd"].notnull()]
    for _, row in monthly_rows.iterrows():
        in_month = (merged["year"] == row.year) & (merged["month"] == row.month)
        days_in_month = monthrange(int(row.year), int(row.month))[1]
        merged.loc[in_month, "discharge_cfs"] = row.monthly_cfsd / days_in_month

    # historical values win; automatically retrieved values only fill the gaps
    merged = merged.join(auto_df.rename(columns={"discharge_cfs": "auto_cfs"}), how="left")
    merged.loc[:, "discharge_cfs"] = merged.loc[:, "discharge_cfs"].fillna(merged.loc[:, "auto_cfs"])
    return merged.filter(["discharge_cfs"]).rename_axis("date")


def get_cdss_diversion_data(dst_dir=os.path.join("..", "output", "cdss_raw_data"),
                            sites_ifp=os.path.join("..", "input", "ucrb_diversion_master_table.csv"),
                            apiKey=None,
                            sp_df=None,
                            hst_dir=None,
                            comb_dir=None,
                            hst_dir1=None):
    """
    Function to pull diversion records from Utah Department of Water Resources website

    NOTE(review): the function name and the dst_dir default refer to CDSS, but the
    body only processes sites whose "dataSource" is "UTDWR" -- confirm which is intended.

    Parameters
    ----------

    dst_dir : str
        relative path location to directory to save downloaded data
        (default is "../output/cdss_raw_data")

    sites_ifp : str
        relative path location to csv file containing all UCRB diversion sites.
        This function only attempts to pull data for records with "dataSource" attribute of "UTDWR"
        and a non-null "utdwrID"
        (default is "../input/ucrb_diversion_master_table.csv")

    apiKey : str
        not used by this function
        (default is None)

    sp_df : pandas dataframe
        dataframe containing one record per day within period of interest (with "year" and "month"
        columns), used for combining automatically retrieved data with manually retrieved data
        located in hst_dir directory. Required for sites whose "historicalRecord" is "y".
        (default is None: a module-level sp_df is used if one exists)

    hst_dir : str
        relative path location to directory containing manually-retrieved historical records
        (default is None: a module-level hst_dir if one exists, else "../input/utdwr_historical_data")

    comb_dir : str
        relative path location to directory where combined automatically-retrieved and manually-retrieved
        records will be saved
        (default is None: a module-level comb_dir if one exists, else "../output/utdwr_combined_data")

    hst_dir1 : str
        optional second directory searched when a historical file cannot be read from hst_dir
        (default is None: a module-level hst_dir1 if one exists, else no fallback)

    Exports
    ----------
    Microsoft Excel CSV file "utdwr_diversion_sites.csv" (written next to dst_dir) containing site
    information of every site for which daily records were pulled

    1 CSV file per site in dst_dir containing the downloaded daily diversion records, and
    1 CSV file per site in comb_dir containing the combined daily records

    Returns
    ----------
    None

    See Also
    --------

    Notes
    -----
    Earlier versions of this function read sp_df, hst_dir, comb_dir and hst_dir1 as
    module-level names. They are now keyword parameters; when left as None the
    module-level value of the same name is still honoured, so existing callers
    behave as before.

    A failure for one site is reported and the site is skipped; it does not stop
    the remaining sites from being processed.

    Examples
    --------
    >>> get_cdss_diversion_data(sp_df=build_spdf())
    """
    sp_df = _legacy_setting("sp_df", sp_df)
    hst_dir = _legacy_setting("hst_dir", hst_dir,
                              os.path.join("..", "input", "utdwr_historical_data"))
    comb_dir = _legacy_setting("comb_dir", comb_dir,
                               os.path.join("..", "output", "utdwr_combined_data"))
    hst_dir1 = _legacy_setting("hst_dir1", hst_dir1)

    # existing directories are reused as they are, never cleared
    print("downloading UTDWR diversion record data to directory {0}".format(dst_dir))
    os.makedirs(dst_dir, exist_ok=True)

    print("combining UTDWR diversion records into directory {0}".format(comb_dir))
    os.makedirs(comb_dir, exist_ok=True)

    # import table of UTDWR diversion sites
    sites = pd.read_csv(sites_ifp)
    sites = sites.loc[(sites["dataSource"] == "UTDWR") & sites["utdwrID"].notnull()]

    record_year = date.today().strftime("%Y")
    lookup_rows = []

    # retrieve UT DWR data
    for _, site in sites.iterrows():
        site_file = "{0}.csv".format(site.siteName)
        try:
            records = _fetch_utdwr_daily_records(site.utdwrID, record_year)
            records.to_csv(os.path.join(dst_dir, site_file))

            if site.historicalRecord == "y":
                combined = _combine_with_historical(records, site_file, sp_df, hst_dir, hst_dir1)
            else:
                combined = records
            combined.to_csv(os.path.join(comb_dir, site_file))

            print(site.siteName)
            lookup_rows.append({
                "siteID": site.utdwrID,
                "siteName": site.siteName,
                "siteUse": site.siteUse,
                "siteLat": site.decLat,
                "siteLong": site.decLong,
                "siteSource": site.dataSource,
                "siteFile": site_file,
                "startDate": site.startDate,
                "endDate": site.endDate,
                "noFillYears": site.no_fill_years,
                "shortID": site.shortID,
                "destinationCode": site.destinationCode,
                "destinationFlag": site.destinationFlag,
            })
        except Exception as err:
            # best effort: report the site and carry on with the next one
            print("could not download or process data from UTDWR diversion site: {0}".format(site.siteName))
            print("    reason: {0}: {1}".format(type(err).__name__, err))

    # build and export diversion site lookup table for use in build_diversion_tabfiles()
    lookup = pd.DataFrame(lookup_rows, columns=_UTDWR_LOOKUP_COLUMNS)
    lookup.loc[:, "siteFolder"] = os.path.split(comb_dir)[-1]
    df_out = format_sites_df(lookup)
    df_out.to_csv(os.path.join(dst_dir, "..", "utdwr_diversion_sites.csv"))


def get_wy_diversion_data(
    dst_dir=os.path.join("..", "output", "wyseo_raw_data"),
    sp_df=None,
    sites_ifp=os.path.join("..", "input", "ucrb_diversion_master_table.csv"),
    hst_dir=os.path.join("..", "input", "wyseo_historical_data"),
    comb_dir=os.path.join("..", "output", "wyseo_combined_data"),
):
    """
    Placeholder for pulling Wyoming (WYSEO) diversion records.

    At present the body only prints a confirmation message; the arguments are
    accepted but not read.

    Parameters
    ----------

    dst_dir : str
        (default is "../output/wyseo_raw_data")

    sp_df : pandas dataframe
        (default is None)

    sites_ifp : str
        (default is "../input/ucrb_diversion_master_table.csv")

    hst_dir : str
        (default is "../input/wyseo_historical_data")

    comb_dir : str
        (default is "../output/wyseo_combined_data")

    Returns
    ----------
    None
    """
    print("Function is defined correctly")