# Load Libraries

In [61]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os, glob, linecache, uuid
import xml.etree.ElementTree as ETE
import seaborn as sns
# import matplotlib.pyplot as plt
import shutil
from scipy.stats import zscore
import re
import ipdb

# A single random (version 4) UUID, created once when this cell runs.
# NOTE(review): despite the name this is a UUID value, not a generator, and it
# is not referenced anywhere in the visible part of this notebook.
uuid_gen = uuid.uuid4()

# Pull Data from LEVs into CSVs for Upload

## Access LEVs

In [2]:
# Input folder: the LEV search in this notebook globs "<data_dir>/*/*.lev",
# so the .lev files are expected one sub-folder below this directory.
data_dir = "C:/Users/jinsu.elhance/Desktop/2023_06_16_WellSurvey/Well Data"

# Output folder for the Dendra upload files. The export cell in this notebook
# writes "<well>_dendra_xle_lev.csv" files here (CSV, not JSON).
# The value already ends with a path separator.
dendra_dir = "C:\\Users\\jinsu.elhance\\Box\\000. Jinsu Elhance\\DendraWork\\Data\\"


In [13]:
# Find all LEV files located exactly one sub-folder below data_dir
# (the pattern is not recursive). Despite the "_dir_" in its name, this list
# holds file paths, not directories.
LEV_dir_list = glob.glob(f"{data_dir}/*/*.lev")

In [14]:
#Define helper fn
def find_data_rows(lines):
    """Return the index of the ``[Data]`` section marker within *lines*.

    Parameters
    ----------
    lines : sequence of str
        Lines of a LEV file, with or without their line terminators.

    Returns
    -------
    int or None
        Zero-based position of the first line whose content is exactly
        ``[Data]``, or ``None`` when no such line exists.
    """
    for i, line in enumerate(lines):
        # Strip only the line terminator so that CRLF, LF and an unterminated
        # final line are all recognised; other whitespace must still match.
        if line.rstrip("\r\n") == "[Data]":
            return i
    return None
        
def find_discrete_ts(df, max_gap=None):
    """Split a time series into contiguous segments separated by large gaps.

    A new segment starts at every row whose ``timestamp_utc`` lies more than
    *max_gap* after the previous row's timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``timestamp_utc`` column. Rows are taken
        in their existing order (no sorting is performed). A ``tdelta`` column
        holding the difference to the previous row is added to *df* in place,
        as in the original implementation.
    max_gap : pandas.Timedelta, optional
        Smallest gap that is *not* treated as a break is anything up to and
        including this value. Defaults to 7 days.

    Returns
    -------
    list of pandas.DataFrame
        One slice of *df* per contiguous segment, in order. Every row of *df*
        belongs to exactly one segment. An empty *df* yields an empty list.
    """
    if max_gap is None:
        max_gap = pd.Timedelta(days=7)

    df['tdelta'] = df['timestamp_utc'].diff()
    if df.empty:
        return []

    # Work with row positions rather than index labels so the split is correct
    # for any index, not only a default RangeIndex. The first row's tdelta is
    # NaT, which compares False, so position 0 never appears here.
    gap_positions = np.flatnonzero((df['tdelta'] > max_gap).to_numpy())

    # Segment i spans [bounds[i], bounds[i + 1]); iloc slicing is end-exclusive,
    # so the stop of one segment is the start of the next.
    bounds = np.concatenate(([0], gap_positions, [len(df)]))

    return [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

In [15]:
# Accumulators filled by the LEV parsing loop in this notebook.
# WellDevices: one row of header metadata is appended per parsed LEV file.
WellDevices = pd.DataFrame()
# WellData: maps a normalised well location name to a dict of DataFrames keyed
# by data source (the parsing loop uses the key 'xle_lev').
WellData = {}

In [16]:
#Parse each LEV to extract data and metadata
# For every .lev path whose name does not contain "Compensated":
#   1. read the header lines and collect "key=value" pairs as metadata,
#   2. read the fixed-width data table that follows the [Data] marker,
#   3. move the level/temperature readings into unit-specific columns,
#   4. append the readings to WellData[location]['xle_lev'] and the metadata
#      to WellDevices.
for lev in LEV_dir_list:
    if "Compensated" in lev:
        continue

    # Only the raw lines are needed from this handle, so it is closed before
    # any processing. The encoding matches the one passed to read_fwf below.
    with open(lev, newline="\n", encoding="iso-8859-1") as lev_text:
        lev_lines = lev_text.readlines()

    _marker = find_data_rows(lev_lines)
    if _marker is None:
        # Fail with the file name instead of an opaque "None + int" TypeError.
        raise ValueError(f"No [Data] section found in LEV file: {lev}")
    # Data rows begin two lines after the [Data] marker.
    _dataStart = _marker + 2

    #Pull out metadata from datafile
    _metadata = {}
    for _mdx in lev_lines[10:_dataStart]:
        _spltKeyDat = _mdx.replace(" ", "").strip().split("=")
        if len(_spltKeyDat) > 1:
            # "Unit" appears more than once in the header: the first
            # occurrence is stored as LevelUnit, later ones as TemperatureUnit.
            if _spltKeyDat[0] == "Unit" and "LevelUnit" in _metadata:
                _spltKeyDat[0] = "TemperatureUnit"
            elif _spltKeyDat[0] == "Unit":
                _spltKeyDat[0] = "LevelUnit"
            _metadata[_spltKeyDat[0]] = _spltKeyDat[1]

    #Which well?
    location = (_metadata.get("Location") or "Unknown").replace("/", "").replace("#", "").replace(" ","").lower()
    # Fall back to "Unknown" when the header has no instrument type, so the
    # string concatenations below never receive None.
    instrumentType = _metadata.get('Instrumenttype') or "Unknown"

    ## Find Data Pointer
    _df = pd.read_fwf(lev, skiprows=_dataStart, names=["date", "time", "level", "temperature"], encoding='iso-8859-1')
    # Drop the last parsed row (presumably a non-data trailer line -- confirm
    # against a sample LEV file).
    _df = _df.iloc[:-1]
    # Row id built from date, time, location and instrument type.
    # NOTE(review): hash() of a str is salted per Python process unless
    # PYTHONHASHSEED is fixed, so these ids are not reproducible between runs.
    _df = _df.set_index(_df.agg(('{0[date]}{0[time]}' + f"{location}{instrumentType}").format, axis=1).apply(lambda x: hash(x)))
    _df['date'] = pd.to_datetime(_df['date'])

    #Update Metadata
    _metadata['Data_start_date'] = min(_df['date'])
    _metadata['Data_end_date'] = max(_df['date'])
    # Keep only the final character of the unit string (compared with "C"/"F" below).
    _metadata['TemperatureUnit'] = _metadata['TemperatureUnit'][-1]
    _metadata["Location"] = location
    _metadata['Data_source'] = "lev"

    #Save data_units to different columns
    # If the unit is neither "C" nor "F" the column keeps the name 'temperature'.
    _df = _df.dropna(subset='temperature')
    if _metadata['TemperatureUnit'] == "C":
        _df = _df.rename(columns = {'temperature': 'temperature_C'})
        _df['temperature_F'] = np.nan
    elif _metadata['TemperatureUnit'] == "F":
        _df = _df.rename(columns = {'temperature': 'temperature_F'})
        _df['temperature_C'] = np.nan

    # If the unit is neither "ft" nor "m" the column keeps the name 'level'.
    _df = _df.dropna(subset='level')
    if _metadata['LevelUnit'] == "ft":
        _df = _df.rename(columns = {'level': 'level_ft'})
        _df['level_m'] = np.nan
    elif _metadata['LevelUnit'] == "m":
        _df = _df.rename(columns = {'level': 'level_m'})
        _df['level_ft'] = np.nan

    # Combine the separate date and time columns into a single TIMESTAMP.
    _df['TIMESTAMP'] = pd.to_datetime(_df['date'].astype(str) + " " + _df['time'].astype(str))
    _df = _df.drop(['date', 'time'], axis=1)
    _df = _df.dropna(subset='TIMESTAMP')

    #Save data to dataframes
    # Several LEV files may belong to the same well; their rows are stacked.
    _well_frames = WellData.setdefault(location, {})
    if 'xle_lev' in _well_frames:
        _well_frames['xle_lev'] = pd.concat([_well_frames['xle_lev'], _df], axis=0)
    else:
        _well_frames['xle_lev'] = _df

    # ignore_index=True discards the instrumentType+location label, so
    # WellDevices keeps a plain 0..n-1 index.
    WellDevices = pd.concat([WellDevices, pd.DataFrame(_metadata, index=[instrumentType+location])], ignore_index = True)

In [17]:
# Write one CSV per well that has LEV-derived data: name the index "seq_id",
# drop rows whose column values are exact duplicates (keeping the first), store
# the de-duplicated frame back into WellData, and save it under dendra_dir.
for well, _sources in WellData.items():
    if 'xle_lev' not in _sources:
        continue

    _frame = _sources['xle_lev']
    _frame.index.names = ["seq_id"]
    _frame = _frame.drop_duplicates(keep="first")
    _sources['xle_lev'] = _frame

    _frame.to_csv(f"{dendra_dir}/{well}_dendra_xle_lev.csv")

# Upload data to Dendra by sending it to Scott Smith

**Only proceed when data is uploaded**

# Verify that Dendra has Stations for each Dataset

# Create Datastreams if necessary
1. Ensure you hide xlsx datastreams, empty datastreams, and datastreams on empty stations.

In [None]:
# datastreams = [("level", "xle_lev"), ("temperature", "xle_lev"), ("temperature", "xlsx"), ("level", "xlsx")]
# datastream_template_paths = ["xle_lev_level.json", "xlsx_level.json", "xle_lev_temp.json", "xlsx_temp.json"]
# datastream_templates = []

# for t in datastream_template_paths: 
#     with open(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Datastreams/{t}") as template_raw:
#         datastream_templates.append(deepcopy(json.load(template_raw)))
        
# #get station names and slugs
# datastreams_glob = glob.glob(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Datastreams/Requests/*.json")
# station_glob = glob.glob(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Stations/64*.json")

# for station_path in station_glob:
#     with open(station_path, "r") as station_file:
#         station_data = json.load(station_file)
#         station_name = station_data['slug'].lower().replace("dangermond-", "")
#         for i in range(4):
#             datastream = datastreams[i]
#             if os.path.exists(f"C:/Users/jinsu.elhance/Box/Wells/WellsDatasheets/HistoricalWellSynthesis/Data/Dendra_Uploads/{station_name}_dendra_{datastream[1]}.csv"):
# #                 print(station_name, datastream)
#                 datastream_template = deepcopy(datastream_templates[i])
#                 datastream_template['datapoints_config'][0]["params"]["query"]["fc"] = datastream_template['datapoints_config'][0]["params"]["query"]["fc"].replace("WELL", station_name)
#                 datastream_template['description'] = datastream_template['description'].replace("STATIONNAME", station_data['full_name'].replace("Dangermond ", ""))
#                 datastream_template['datapoints_config_refd'][0]['params']['query']['fc'] = datastream_template['datapoints_config'][0]["params"]["query"]["fc"]
#                 datastream_template['station_id'] = station_data['_id']
    #              Write the JSON objects to a file
#                 with open(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Datastreams/Requests/{station_name}.{datastream[0]}.{datastream[1]}.datastream.json", 'w') as json_file:
#                     json.dump(datastream_template, json_file, indent=4)

#Hide datastreams on hidden stations
# hidden_stations = []

# station_glob = glob.glob(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Stations/64*.json")

# for station_path in station_glob:
#     with open(station_path, "r") as station_file:
#         station_data = json.load(station_file)
#         if station_data['is_hidden'] == True:
#             hidden_stations.append(station_data["_id"])
            
# datastreams_glob = glob.glob(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Datastreams/Requests/64*.json")

# for datastream_path in datastreams_glob:
#     with open(datastream_path, "r") as datastream_file:
#         datastream_data = json.load(datastream_file)
#         if datastream_data['station_id'] in hidden_stations:
#             datastream_data['is_hidden'] = True
#     #              Write the JSON objects to a file
#             with open(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Datastreams/Requests/patch.{datastream_data['_id']}.datastream.json", 'w') as json_file:
#                 json.dump(datastream_data, json_file, indent=4)

# Annotate Survey Dates

In [None]:
#Using well data uploaded to dendra, find starts and ends of data gaps to create annotations with

#The code below writes gap_markers
# well_data_list = []

# for data_f in dendra_data_glob:
#     well = os.path.basename(data_f).replace("_dendra", "").replace("_xle_lev.csv", "").replace("_xlsx.csv", "")
#     well_data = pd.read_csv(data_f)
#     well_data['well'] = well
#     well_data['TIMESTAMP'] = pd.to_datetime(well_data['TIMESTAMP'])
#     well_data['tdelta'] = well_data['TIMESTAMP'].diff()
#     well_data_list.append(well_data)
    
# well_data_full = pd.concat(well_data_list)
# well_data_full.head()

# gap_finder = well_data_full.loc[well_data_full['tdelta'] > timedelta(days=1)]
# gap_finder['gap_start'] = gap_finder['TIMESTAMP'] - gap_finder['tdelta']
# gap_ends = gap_finder.groupby(gap_finder['TIMESTAMP'].map(lambda x: (x.year, x.month, x.day)))['well'].agg(lambda x: set(x))
# gap_starts = gap_finder.groupby(gap_finder['gap_start'].map(lambda x: (x.year, x.month, x.day)))['well'].agg(lambda x: set(x))

# with open(f"C:\\Users\\jinsu.elhance\\Box\\000. Jinsu Elhance\\DendraWork\\Annotations\\template.annotation.json", encoding="utf-8") as anno_template_file:
    
#     anno_template_json = json.load(anno_template_file)
#     anno_template_json['station_ids'] = []
#     anno_template_json['intervals'] = []
    
#     for i, anno in gap_markers.iterrows():
#         anno_template = deepcopy(anno_template_json)
#         wells = anno['wells'].split("-")
#         anno_template['intervals'].append({
#             "begins_at":f"{anno['BEGIN_DATE']}T00:00:00.000Z",
#             "ends_before":f"{anno['END_DATE']}T23:59:00.000Z",
#         })
#         anno_template['title'] = anno_template['title'].replace("START", str(anno['BEGIN_DATE'])).replace("END", str(anno['END_DATE']))
#         anno_template['station_ids'] = [stations_dict[well] for well in wells]
        
#       Write the JSON objects to a file
#         with open(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Annotations/{anno['BEGIN_DATE']}.annotation.json", 'w') as json_file:
#             json.dump(anno_template, json_file, indent=4)

# Identify and Annotate Outliers on Datastreams

In [None]:
# # datastream_df_dict = {}

# #Create empty anno json
# with open(f"C:\\Users\\jinsu.elhance\\Box\\000. Jinsu Elhance\\DendraWork\\Annotations\\outlier_template.annotation.json", encoding="utf-8") as anno_template_file:
    
#     anno_template_json = json.load(anno_template_file)
    
# anno_template_json = deepcopy(anno_template_json)
# anno_template_json['datastream_ids'] = []

# #Interval looks like:
# """
# 'intervals':[
# {'begins_at' : '2010-02-23T19:30:00.000',
#  'ends_before' : '2010-02-23T19:30:00.000'},
#  ...
# ]
# """

# #Query Dendra API to fetch Datapoints for each stream (from Station IDs)
# for station_name in stations_dict:
#     station_id = stations_dict[station_name]
#     station_datastreams = dendra.list_datastreams_by_station_id(station_id)
    
#     for datastream in station_datastreams: 
#         datastream_id = datastream['_id']
#         dendra_fetch = dendra.get_datapoints(datastream_id, begins_at="2000-01-01T00:00:00", time_type="local").reset_index()
#         dendra_fetch = dendra_fetch.rename(columns={dendra_fetch.columns[2]:"v"})
        
#         if dendra_fetch.shape[0] == 0:
#             continue
            
#         outlier_anno = deepcopy(anno_template_json)
#         outlier_indices = []
        
#         #Iterate over any contiguous sections of timeseries data and find outliers.
#         for ts in find_discrete_ts(dendra_fetch):
#             clf = IsolationForest(random_state=0, contamination=0.0005).fit(ts[['v']])
#             ts.loc[:, 'outlier'] = clf.predict(ts[['v']])
#             outlier_indices = np.append(outlier_indices, ts.loc[ts['outlier'] == -1].index)
        
#         #Add indices where values are non-positive
#         outlier_indices = np.append(outlier_indices, list(dendra_fetch.loc[dendra_fetch.v <= 0].index.values)).astype(int)
        
#         if len(outlier_indices) == 0:
#             continue
        
#         #Create outlier anno timestamp objects
#         time_stamps = [{
#             'begins_at': str(dendra_fetch.iloc[index]['timestamp_utc']).replace(" ","T").replace("+00:00",".000Z"),
#             'ends_before': str(dendra_fetch.iloc[index]['timestamp_utc']).replace(" ","T").replace("+00:00",".000Z")
#         } for index in outlier_indices]
        
#         #Create annotation object
#         outlier_anno['datastream_ids'] = [datastream_id]
#         outlier_anno['intervals'] = time_stamps
#         outlier_anno['station_ids'] = [station_id]
#         outlier_anno['title'] = f"Outlier Filter for {station_id}:{datastream['name']}"
#         outlier_anno['description'] = "Isolation Forest Outlier Filter + Removing Negative Points"

#         print(f"{station_name}:{datastream['name']}:{len(time_stamps)} outliers found")
# #         Write the JSON objects to a file
#         with open(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Annotations/{datastream_id}.outliers.annotation.json", 'w') as json_file:
#             json.dump(outlier_anno, json_file, indent=4)

# Create Derived Datastreams

In [None]:
# #Iterate through stations
#     #Find level datastreams
#     #Find temperature datastreams
   
# with open(f"C:\\Users\\jinsu.elhance\\Box\\000. Jinsu Elhance\\DendraWork\\Datastreams\\Derived\\level.template.json", encoding="utf-8") as level_template_file:
    
#     level_template_json = json.load(level_template_file)
    
# level_template_json = deepcopy(level_template_json)

# with open(f"C:\\Users\\jinsu.elhance\\Box\\000. Jinsu Elhance\\DendraWork\\Datastreams\\Derived\\temp.template.json", encoding="utf-8") as temp_template_file:
    
#     temp_template_json = json.load(temp_template_file)
    
# temp_template_json = deepcopy(temp_template_json)

# for station_name in stations_dict:
#     station_id = stations_dict[station_name]
#     station_datastreams = dendra.list_datastreams_by_station_id(station_id)
#     station_name_C = dendra.get_meta_station_by_id(station_id)['full_name']
        
#     level_xle, level_xlsx, temp_xle, temp_xlsx = None, None, None, None
#     level_json = deepcopy(level_template_json)
#     temp_json = deepcopy(temp_template_json)
#     to_hide = True
    
#     for datastream in station_datastreams: 
#         datastream_id = datastream['_id']
        
#         if datastream['name'] == "Well Water Level xle/lev":
#             level_xle = datastream
#         elif datastream['name'] == "Well Water Level xlsx":
#             level_xlsx = datastream
#         elif datastream['name'] == "Well Water Temperature xle/lev":
#             temp_xle = datastream
#         elif datastream['name'] == "Well Water Temperature xlsx":
#             temp_xlsx = datastream

#     #Create Derived Level datastream
#     level_json['description'] = f"Derived datastream for {station_name_C} Well Water Level"
#     level_json['derived_from_datastream_ids'] = [level_xle['_id'], level_xlsx['_id']]
#     level_json['name'] = "Well Water Level"
#     level_json['station_id'] = station_id
#     level_json['is_hidden'] = dendra.get_meta_datastream_by_id(level_xle['_id'])['is_hidden'] and dendra.get_meta_datastream_by_id(level_xlsx['_id'])['is_hidden']
    
#     with open(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Datastreams/Derived/{station_name}.level.derived.json", 'w') as json_file:
#         json.dump(level_json, json_file, indent=4)
    
#     #Create Derived temperature datastream
#     temp_json['description'] = f"Derived datastream for {station_name_C} Well Water Temperature"
#     temp_json['derived_from_datastream_ids'] = [temp_xle['_id'], temp_xlsx['_id']]
#     temp_json['name'] = "Well Water Temperature"
#     temp_json['station_id'] = station_id
#     temp_json['is_hidden'] = dendra.get_meta_datastream_by_id(temp_xle['_id'])['is_hidden'] and dendra.get_meta_datastream_by_id(temp_xlsx['_id'])['is_hidden']
    
#     with open(f"C:/Users/jinsu.elhance/Box/000. Jinsu Elhance/DendraWork/Datastreams/Derived/{station_name}.temp.derived.json", 'w') as json_file:
#         json.dump(temp_json, json_file, indent=4)

# Barometrically Correct (if possible)

# Use Survey123 Data to Create Station Attributes