# Load modules

In [None]:
#import modules
import xarray as xr

#general modules
import numpy as np
import json
import datetime
import math
from shapely.geometry import Point

#statistical modules
from scipy.stats import linregress
import scipy.stats
from dtw import dtw,accelerated_dtw
from scipy.signal import hilbert, butter, filtfilt
from scipy.fftpack import fft,fftfreq,rfft,irfft,ifft

#machine learning modules
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn import cluster, datasets

#gridded & tabular data
import xarray as xr
import geopandas as gpd
import pandas as pd
import salem
from geocube.api.core import make_geocube

#Visualisation
import holoviews as hv
from holoviews import opts
import geoviews as gv
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import gmaps
import ipywidgets as widgets
import seaborn as sns

#projection
from pyproj import Proj

#Widgets & Ipython Stuff
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display_html

#holoviews
hv.extension('matplotlib')
hv.config.image_rtol = 0.1
opts.defaults(opts.Scatter3D(color='Value', cmap='fire', edgecolor='black', s=50))
renderer = hv.plotting.mpl.MPLRenderer.instance(dpi=120)
#hv.renderer('matplotlib')

#gmaps
# The Google Maps API key is read from the environment instead of being stored
# in the notebook. Export GOOGLE_MAPS_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded here has been committed and
# should be treated as leaked (revoke / rotate it).
import os
gmaps.configure(api_key=os.environ.get('GOOGLE_MAPS_API_KEY'))

#change modules parameter
# "last_expr" shows only the last expression of a cell ("all" would show every one).
# To silence unwanted output, end the line with ";" or assign the result to "_".
InteractiveShell.ast_node_interactivity = "last_expr"
#check module versions

# Load data from drive

In [None]:
# Base directories for input data; cells below address them by index (paths[0] .. paths[3]).
# NOTE(review): these are absolute, machine-specific locations.
paths = [
    'C://Users//USER//Desktop//master-thesis-master//',
    'D://jupy_data//',
    'C://Users//USER//Desktop//Masterarbeit//DATA//master_data//',
    'C:\\Users\\USER\\Desktop\\Masterarbeit\\DATA\\master_data\\',
]

## create city names

In [None]:
# Ried center (longitude, latitude)
lon_c, lat_c = 8.5262, 49.7238

# Load the city-name table, turn every row into a point and store the
# result as a shapefile in the working directory.

from geopandas import GeoDataFrame
from shapely.geometry import Point
import fiona

df = pd.read_csv(paths[0] + 'json//germany_city_names.csv', sep=None, encoding='utf-8')
print(df.head())

# One point per row, built from the lon / lat columns
crs = {'init': 'epsg:4326'}
geometry = list(map(Point, zip(df.lon, df.lat)))
geo_df = GeoDataFrame(df, crs=crs, geometry=geometry)

geo_df.to_file(filename='germany_city_names.shp', driver='ESRI Shapefile')

In [None]:
# Read the city layer back from the shapefile and keep only rows whose 'place' is 'city'
city_names = gpd.read_file('germany_city_names.shp')
city_names_filtered = city_names.loc[city_names['place'] == 'city']

## Time Series Gridded data

### Soil Moisture & Precipitation & real evapotranspiration (1km x 1km)

In [None]:
# Gridded model time series; later cells read soil_moisture_1km,
# precipitation_1km and real_evapotranspiration from it.
model_data_file = paths[1] + 'xr_model_data.nc'
xr_model_data = xr.open_dataset(model_data_file)

In [None]:
# Temporal mean of soil moisture over the whole model extent with the AOI outline.
# plot_xr_da_mean expects (da, vec, vec_clc_1, title, name, ...); the land-cover
# overlay is not wanted here, so None is passed for vec_clc_1. Without it the
# title is taken as vec_clc_1 and the call fails because `name` is missing.
# NOTE(review): plot_xr_da_mean and vec_aoi are defined in cells further down;
# run those cells first on a fresh kernel.
plot_xr_da_mean(xr_model_data.soil_moisture_1km, vec_aoi, None, 'Soil Moisture Time Series Mean (2015.04 - 2019.12)', 'mean_ts_model_sm', cmap='plasma_r')

### NDVI & other indices

In [None]:
# Vegetation-index datasets (Sentinel-2 L1C, Landsat 7 SR, Landsat 8 SR).
# The 100 m NDVI variable is renamed to 'NDVI' so all products share one name.
s2_1c = xr.open_dataset(paths[0] + 'indizes//s2_1c.nc')
s2_1c_ndvi_100m = xr.open_dataset(paths[0] + 'indizes//s2_1c_ndvi_100m.nc').rename_vars({'ndvi_100m' : 'NDVI'})
l7_sr_ndvi_100m = xr.open_dataset(paths[0] + 'indizes//l7_sr_ndvi_100m.nc').rename_vars({'ndvi_100m' : 'NDVI'})
# Previously this opened the Landsat 7 file a second time (copy-paste slip).
# NOTE(review): assumes the Landsat 8 file follows the same naming scheme
# ('l8_sr_ndvi_100m.nc') and variable name ('ndvi_100m') — confirm on disk.
l8_sr_ndvi_100m = xr.open_dataset(paths[0] + 'indizes//l8_sr_ndvi_100m.nc').rename_vars({'ndvi_100m' : 'NDVI'})
ndvi_datasets = [s2_1c, s2_1c_ndvi_100m, l7_sr_ndvi_100m, l8_sr_ndvi_100m]

## Load soil map data

In [None]:
# Soil map polygons for the AOI, read from the Bodenkarte_200 shapefile
soil_map_file = paths[3] + 'Bodenkarte_200\\aoi2020\\Boden_2020.shp'
soil_map_data = gpd.read_file(soil_map_file)

In [None]:
def _distinct_values(column):
    """Return the distinct entries of one soil-map column, in order of first appearance."""
    return soil_map_data[column].drop_duplicates().values.tolist()

# Distinct category values per soil-map attribute
Hauptgruppe = _distinct_values('HAUPTGRUPP')
Gruppe = _distinct_values('GRUPPE')
Untergruppe = _distinct_values('UNTERGRUPP')
Bodeneinheit = _distinct_values('BODENEINHE')
Substrat = _distinct_values('SUBSTRAT')

In [None]:
# Column name -> list of category values, handed to make_geocube as categorical_enums
c = dict(
    HAUPTGRUPP=Hauptgruppe,
    GRUPPE=Gruppe,
    UNTERGRUPP=Untergruppe,
    BODENEINHE=Bodeneinheit,
)

## Convert categorical data to Grid

In [None]:
# Rasterise the soil map; the categorical columns listed in `c` are encoded via categorical_enums.
soil_pixel_deg = 0.0008983152841195215  # pixel size in degrees, used for both axes
xr_soil_map_100 = make_geocube(
    vector_data=soil_map_data,
    output_crs="+init=epsg:4326",
    resolution=(-soil_pixel_deg, soil_pixel_deg),
    categorical_enums=c,
)

## Load desired AOI as shapefile with geopandas

In [None]:
import geopandas as gpd
import holoviews as hv

In [None]:
# Outlines of the two study regions: the full AOI and the Hessisches Ried
aoi_shapefile = paths[0] + 'aoi2020//aoi_2020.shp'
ried_shapefile = paths[2] + 'Ried_225_222//hessisches_ried.shp'
vec_aoi = gpd.read_file(aoi_shapefile)
vec_ried = gpd.read_file(ried_shapefile)

## Subset the xarray dataset with the salem accessor

In [None]:
# Cut the model dataset down to each region with the salem accessor
xr_model_aoi, xr_model_ried = (
    xr_model_data.salem.subset(shape=region) for region in (vec_aoi, vec_ried)
)

In [None]:
# AOI-wide mean soil moisture, without any outline or land-cover overlay
plot_xr_da_mean(
    xr_model_aoi.soil_moisture_1km,
    None,
    None,
    title='Soil Moisture Time Series Mean for Nördl. Oberrheingraben(2015.04 - 2019.12)',
    name='mean_ts_aoi_sm',
    cmap='plasma_r',
    lw_1=4,
    fsc=12,
)

In [None]:
# Ried mean soil moisture with the Ried outline, no land-cover overlay
plot_xr_da_mean(
    xr_model_ried.soil_moisture_1km,
    vec_ried,
    None,
    title='Soil Moisture Time Series Mean for Hessisches Ried(2015.04 - 2019.12)',
    name='mean_ts_ried_sm',
    cmap='plasma_r',
    lw_1=4,
    fsc=16,
)

In [None]:
# Ried mean soil moisture; polygons whose raster_grp is not 2 are drawn hatched.
# NOTE(review): vec_clc_ried is created in a later cell of this notebook.
non_agri_polygons = vec_clc_ried[vec_clc_ried['raster_grp'] !=2]
plot_xr_da_mean(
    xr_model_ried.soil_moisture_1km,
    vec_ried,
    non_agri_polygons,
    title='Soil Moisture Time Series Mean for Hessisches Ried(2015.04 - 2019.12)',
    name='mean_ts_ried_sm_clc',
    cmap='plasma_r',
    lw_1=4,
    lw_2=1,
    fsc=16,
)

In [None]:
# Same map, but soil moisture is kept only where the land-cover code is below 300.
# NOTE(review): clc_100_ried and vec_clc_ried are created in later cells of this notebook.
sm_ried_clc_below_300 = xr_model_ried.soil_moisture_1km.where(clc_100_ried < 300)
plot_xr_da_mean(
    sm_ried_clc_below_300,
    vec_ried,
    vec_clc_ried[vec_clc_ried['raster_grp'] !=2],
    title='Soil Moisture Time Series Mean for Hessisches Ried(2015.04 - 2019.12) \n with agriculture areas',
    name='mean_ts_ried_sm_clc_f',
    lw_1=4,
    lw_2=1,
    fsc=16,
)

In [None]:
kwargs={'title' : 'Soil Moisture Mean cm³/cm³ over time (2015.04 - 2019.12)', 'titelsize' : 12}
# Colour-bar settings read by plot_xr_da_mean (module-level on purpose)
cbar_kwargs = {'label': 'soil moisture cm³/cm³', 'pad' : 0.1, 'shrink' : 0.5} #, 'drawedges': True
legend_dict={'fontsize' : 15}


def _plot_boundary(layer, ax, **style):
    """Draw the boundary of ``layer`` on ``ax``.

    Overlays are optional: if ``layer`` is None or cannot be drawn, a note is
    printed and False is returned instead of raising. Returns True on success.
    """
    if layer is None:
        print('no geometry detect')
        return False
    try:
        layer.geometry.boundary.plot(color=None, edgecolor='k', ax=ax, **style)
    except Exception as err:  # best effort, but report what went wrong
        print('no geometry detect: %s' % err)
        return False
    return True


def plot_xr_da_mean(da,vec,vec_clc_1,title, name, cmap='plasma_r', lw_1 = 4,lw_2 = 2, fsc=14): #plasma_r
    """Plot the temporal mean of ``da`` as a map and save it as a PNG.

    Parameters
    ----------
    da : DataArray with 'time', 'latitude' and 'longitude' dimensions.
    vec : object with a ``geometry`` attribute whose boundary is drawn with
        line width ``lw_1``, or None to skip it.
    vec_clc_1 : object with a ``geometry`` attribute drawn hatched with line
        width ``lw_2`` and listed in the legend as 'non agriculture land',
        or None to skip it.
    title : axes title.
    name : file stem; the figure is written to 'figures//<name>.png'.
    cmap : colormap name or object for the mean map.
    fsc : font size of the city labels.

    Reads the module-level ``cbar_kwargs`` and ``city_names_filtered``.
    Returns None (the result of ``plt.show()``).
    """
    fig, ax = plt.subplots(dpi=300)

    # Mean over time per grid cell. x / y are given explicitly so the map
    # orientation does not depend on the dimension order of `da`.
    da.mean('time').plot(x='longitude', y='latitude', ax=ax, cbar_kwargs=cbar_kwargs, cmap=cmap)

    _plot_boundary(vec, ax, linewidth=lw_1)
    if _plot_boundary(vec_clc_1, ax, linewidth=lw_2, hatch='\\\\\\\\'):
        circ1 = mpatches.Patch(facecolor='white',alpha=1, hatch=r'\\\\',label='non agriculture land')
        ax.legend(handles = [circ1],loc='upper right', fontsize='large')

    # City markers and labels. The label text is passed positionally because
    # the keyword was renamed from `s` to `text` in newer matplotlib releases.
    city_names_filtered.plot(ax=ax)
    for _, city in city_names_filtered.iterrows():
        ax.annotate(city['name'], xy=city.geometry.centroid.coords[0], ha='center', fontsize=fsc)

    ax.set_title(title, fontsize=15,  pad=10)
    ax.tick_params('both', labelsize=11)
    ax.set_xlabel('longitude', fontsize=13,  labelpad=10)
    ax.set_ylabel('latitude', fontsize=13,  labelpad=10)

    # Triple the default figure size; forward=True also resizes an open window
    width, height = fig.get_size_inches()
    fig.set_size_inches(width*3, height*3, forward=True)
    fig.tight_layout()

    fig.savefig('figures//%s.png' %(name))
    return plt.show()

In [None]:
# Mean soil moisture over the AOI (diverging colormap) with city labels and the Ried outline
fig, ax = plt.subplots(dpi=300)

# Mean over time per grid cell; x / y are explicit so the orientation does not
# depend on the dimension order. The colormap is passed by name.
xr_model_aoi.soil_moisture_1km.mean('time').plot(x='longitude', y='latitude', ax=ax, cbar_kwargs=cbar_kwargs, cmap='RdBu_r')

city_names_filtered.plot(ax=ax)
# The label text is passed positionally: the keyword was renamed from `s` to
# `text` in newer matplotlib releases.
for _city_idx, _city in city_names_filtered.iterrows():
    ax.annotate(_city['name'], xy=_city.geometry.centroid.coords[0], ha='center', fontsize=12)
vec_ried.geometry.boundary.plot(color=None,edgecolor='k', linewidth = 4, ax=ax)

ax.set_title('Soil Moisture Time Series Mean (2015.04 - 2019.12)', fontsize=15,  pad=10)
ax.tick_params('both', labelsize=11)
ax.set_xlabel('longitude', fontsize=13,  labelpad=10)
ax.set_ylabel('latitude', fontsize=13,  labelpad=10)

# Triple the default figure size; forward=True also resizes an open window
F = fig
Size = F.get_size_inches()
F.set_size_inches(Size[0]*3, Size[1]*3, forward=True)

fig.savefig('figures//sm_mean_aoi_RB.png')

plt.show()

# Machine learning

In [None]:
# Display the land-cover grid on the AOI model grid.
# NOTE(review): the only visible assignment of clc_1000_aoi is in the later
# "Interpolate coordinates" cell; on a fresh kernel that cell has to run first.
clc_1000_aoi

In [None]:
# Flatten the AOI dataset into long tables: once unfiltered, once restricted
# to cells whose land-cover code is below 300.
stacked_df = xr_model_aoi.to_dataframe().reset_index()
stacked_df_f = xr_model_aoi.where(clc_1000_aoi.data < 300).to_dataframe().reset_index()

# Per-cell id used by the feature extraction below.
# NOTE(review): the id is the numeric sum longitude + latitude; distinct cells can
# share a sum, so uniqueness is not guaranteed — verify for this grid.
for _table in (stacked_df, stacked_df_f):
    _table['lonlat'] = _table['longitude'] + _table['latitude']

In [None]:
# Keep only rows with a finite soil-moisture value and collect the cell ids left over

# unfiltered table
mask_na_sm = np.isfinite(stacked_df['soil_moisture_1km'].tolist())
stacked_df_sm = stacked_df.loc[mask_na_sm]
lonlat_list = stacked_df_sm['lonlat'].unique()

# land-cover filtered table
mask_na_sm_f = np.isfinite(stacked_df_f['soil_moisture_1km'].tolist())
stacked_df_sm_f = stacked_df_f.loc[mask_na_sm_f]
lonlat_list_f = stacked_df_sm_f['lonlat'].unique()

In [None]:
# Display the land-cover filtered long table (finite soil-moisture rows only)
stacked_df_sm_f

In [None]:
#get soil map as Y
# Sample the rasterised soil map at the Ried model grid (nearest neighbour).
# NOTE(review): the grid is the Ried subset but rows are filtered with lonlat_list,
# which comes from the AOI table — confirm this combination is intended.
y = xr_soil_map_100.interp(y=xr_model_ried.latitude, x=xr_model_ried.longitude, method='nearest')

def _soil_label_table(variable):
    """Long table of one soil-map variable, restricted to ids present in lonlat_list."""
    table = y[variable].to_dataframe().reset_index()
    table['lonlat'] = table['longitude'] + table['latitude']
    return table[table['lonlat'].isin(lonlat_list)]

y_Bodeneinheit = _soil_label_table('BODENEINHE')
y_Hauptgruppe = _soil_label_table('HAUPTGRUPP')


In [None]:
# tsfresh features of the soil-moisture series, one row per 'lonlat' id (unfiltered table)
extracted_features_sm = extract_features(
    stacked_df_sm,
    column_id="lonlat",
    column_sort="time",
    column_value="soil_moisture_1km",
)



In [None]:
# tsfresh features of the soil-moisture series, one row per 'lonlat' id (land-cover filtered table)
extracted_features_sm_f = extract_features(
    stacked_df_sm_f,
    column_id="lonlat",
    column_sort="time",
    column_value="soil_moisture_1km",
)

In [None]:
#concat y to features
# Work on a copy: a plain assignment would alias extracted_features_sm, and the
# label column would then also appear in the table used for clustering and
# imputation in other cells.
extracted_features_sm_e = extracted_features_sm.copy()
#extracted_features_sm_e['Bodeneinheit'] = np.array(y_Bodeneinheit['BODENEINHE'].tolist()).astype('int')
# NOTE(review): labels are attached by position, so y_Hauptgruppe must have one row
# per feature row, in the same order — verify.
extracted_features_sm_e['Hauptgruppe'] = np.array(y_Hauptgruppe['HAUPTGRUPP'].tolist()).astype('int')

#select only valid samples
# -1 marks samples without a valid class; .copy() makes the subset independent so
# that the in-place impute() of the next cell does not operate on a view.
extracted_features_sm_s = extracted_features_sm_e[extracted_features_sm_e['Hauptgruppe'] != -1].copy()

In [None]:
#remove all nan values
# impute() replaces NaN / inf values in place
impute(extracted_features_sm_s)
#select only relevant features
# The label column is removed by name rather than by position, so the split stays
# correct even if another column has been appended to the table.
features_filtered_sm = select_features(
    extracted_features_sm_s.drop(columns='Hauptgruppe'),
    extracted_features_sm_s['Hauptgruppe'],
)

In [None]:
# Replace NaN / inf values in both feature tables (impute works in place)
for _feature_table in (extracted_features_sm, extracted_features_sm_f):
    impute(_feature_table)

In [None]:
# Number of rows (one per 'lonlat' id) in the unfiltered and the filtered feature table
len(extracted_features_sm), len(extracted_features_sm_f)

In [None]:
#K means clustering
# Fixed seed so that cluster labels (and every figure built on them) are
# reproducible across kernel restarts.
KMEANS_SEED = 42
KMEANS_KS = (3, 4, 5, 7)


def _fit_and_plot_kmeans(features, figure_name):
    """Fit one KMeans model per k in KMEANS_KS and plot them side by side.

    Each panel scatters the first two feature columns coloured by cluster
    label; the title shows k and the inertia (J). The figure is saved to
    'figures//<figure_name>.png'. Returns the fitted models in KMEANS_KS order.
    """
    models = [cluster.KMeans(n_clusters=k, random_state=KMEANS_SEED).fit(features) for k in KMEANS_KS]
    fig_km, axes_km = plt.subplots(ncols=len(KMEANS_KS), figsize=(24, 8), squeeze=False)
    for panel, k, model in zip(axes_km[0], KMEANS_KS, models):
        panel.scatter(features[:, 0], features[:, 1], c=model.labels_)
        panel.set_title("K=%d, J=%.2f" % (k, model.inertia_))
    fig_km.savefig('figures//%s.png' % figure_name)
    return models


# NOTE(review): the unfiltered run clusters on the first 2 feature columns, the
# filtered run on the first 20 — confirm the difference is intended.
X = np.array(extracted_features_sm.iloc[:,:2])
km3, km4, km5, km7 = _fit_and_plot_kmeans(X, 'kmeans_aoi_all')
km3_p, km4_p, km5_p, km7_p = (model.predict(X) for model in (km3, km4, km5, km7))

X_f = np.array(extracted_features_sm_f.iloc[:,:20])
km3_f, km4_f, km5_f, km7_f = _fit_and_plot_kmeans(X_f, 'kmeans_aoi_all_2_f')
km3_f_p, km4_f_p, km5_f_p, km7_f_p = (model.predict(X_f) for model in (km3_f, km4_f, km5_f, km7_f))

#np.unique(y_iris)

In [None]:
#get groups for classification with kmeans
# The k=7 labels are wrapped in their own Series, indexed like the feature table,
# instead of being written into the feature table. The feature tables stay
# unchanged, so re-running the clustering cell sees the same columns.
result = pd.Series(km7_p, index=extracted_features_sm.index, name='group')
groups = stacked_df_sm.drop_duplicates('lonlat').join(result, on='lonlat')[['longitude', 'latitude', 'group']]

result_f = pd.Series(km7_f_p, index=extracted_features_sm_f.index, name='group')
groups_f = stacked_df_sm_f.drop_duplicates('lonlat').join(result_f, on='lonlat')[['longitude', 'latitude', 'group']]

#create geometry
# One point per grid cell, carrying its cluster label
gdf_groups_aoi = gpd.GeoDataFrame(groups, geometry=gpd.points_from_xy(groups.longitude, groups.latitude))
gdf_groups_f_aoi = gpd.GeoDataFrame(groups_f, geometry=gpd.points_from_xy(groups_f.longitude, groups_f.latitude))
# NOTE(review): later cells read gdf_groups_ried / gdf_groups_f_ried, which are not
# assigned in any visible cell (probably the missing "intersection" step).

#intersection


In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# 2x2 comparison figure for k=5: feature-space scatter on top, Ried cluster maps below
# (left: all cells, right: land-cover filtered cells).
cbar_kwargs = {'label': 'soil moisture cm³/cm³', 'pad' : 0.1, 'shrink' : 0.5} #, 'drawedges': True
legend_kwds={'label': "Kmeans cluster group"}  # not passed to any call in this cell

# Point layers built from the longitude / latitude columns
gpd_kmeans_ried = gpd.GeoDataFrame(gdf_groups_ried, geometry=gpd.points_from_xy(gdf_groups_ried.longitude, gdf_groups_ried.latitude))
gpd_kmeans_f_ried = gpd.GeoDataFrame(gdf_groups_f_ried, geometry=gpd.points_from_xy(gdf_groups_f_ried.longitude, gdf_groups_f_ried.latitude))

fig, ax = plt.subplots(ncols = 2, nrows=2, dpi=200)

# top row: first two feature columns coloured by cluster label
for _panel, _features, _model in ((ax[0,0], X, km5), (ax[0,1], X_f, km5_f)):
    _panel.scatter(_features[:, 0], _features[:, 1], c=_model.labels_)

# bottom row: cluster maps, each with its own colour-bar axis and the Ried outline
for _panel, _points in ((ax[1,0], gpd_kmeans_ried), (ax[1,1], gpd_kmeans_f_ried)):
    _cax = make_axes_locatable(_panel).append_axes("right", size="5%", pad=0.1)
    _points.plot('group', markersize=120, legend=True, ax=_panel, alpha=1, cax=_cax)
    vec_ried.geometry.boundary.plot(color=None,edgecolor='k', linewidth = 3, ax=_panel)

# Triple the default figure size; forward=True also resizes an open window
_width, _height = fig.get_size_inches()
fig.set_size_inches(_width*3, _height*3, forward=True)
ax[0,0].set_title('Soil Moisture Clustering by Feature Extraction from Time Series (5 Groups)')
ax[0,1].set_title('Soil Moisture Clustering by Feature Extraction from Time Series on Agriculture Surfaces (5 Groups)')
fig.tight_layout()
fig.savefig('figures//%s.png' %('kmeans_ried_all_compare_results_5'))


In [None]:
# 2x2 comparison figure for k=5: feature-space scatter on top, AOI cluster maps below
# (left: all cells, right: land-cover filtered cells).
cbar_kwargs = {'label': 'soil moisture cm³/cm³', 'pad' : 0.1, 'shrink' : 0.5} #, 'drawedges': True
legend_kwds={'label': "Kmeans cluster group"}  # not passed to any call in this cell

# Point layers built from the longitude / latitude columns
gpd_kmeans_ried = gpd.GeoDataFrame(gdf_groups_ried, geometry=gpd.points_from_xy(gdf_groups_ried.longitude, gdf_groups_ried.latitude))
gpd_kmeans_f_ried = gpd.GeoDataFrame(gdf_groups_f_ried, geometry=gpd.points_from_xy(gdf_groups_f_ried.longitude, gdf_groups_f_ried.latitude))

gpd_kmeans_aoi = gpd.GeoDataFrame(gdf_groups_aoi, geometry=gpd.points_from_xy(gdf_groups_aoi.longitude, gdf_groups_aoi.latitude))
gpd_kmeans_f_aoi = gpd.GeoDataFrame(gdf_groups_f_aoi, geometry=gpd.points_from_xy(gdf_groups_f_aoi.longitude, gdf_groups_f_aoi.latitude))

fig, ax = plt.subplots(ncols = 2, nrows=2, dpi=200)

# top row: first two feature columns coloured by cluster label
for _panel, _features, _model in ((ax[0,0], X, km5), (ax[0,1], X_f, km5_f)):
    _panel.scatter(_features[:, 0], _features[:, 1], c=_model.labels_)

# bottom row: cluster maps, each with its own colour-bar axis and the Ried outline
for _panel, _points in ((ax[1,0], gpd_kmeans_aoi), (ax[1,1], gpd_kmeans_f_aoi)):
    _cax = make_axes_locatable(_panel).append_axes("right", size="5%", pad=0.1)
    _points.plot('group', markersize=60, legend=True, ax=_panel, alpha=1, cax=_cax)
    vec_ried.geometry.boundary.plot(color=None,edgecolor='k', linewidth = 3, ax=_panel)

# Triple the default figure size; forward=True also resizes an open window
_width, _height = fig.get_size_inches()
fig.set_size_inches(_width*3, _height*3, forward=True)
ax[0,0].set_title('Soil Moisture Clustering by Feature Extraction from Time Series (5 Groups)')
ax[0,1].set_title('Soil Moisture Clustering by Feature Extraction from Time Series on Agriculture Surfaces (5 Groups)')
fig.tight_layout()
fig.savefig('figures//%s.png' %('kmeans_aoi_all_compare_results_5'))


In [None]:
# 2x2 comparison figure for k=7: feature-space scatter on top, AOI cluster maps below
# (left: all cells, right: land-cover filtered cells).
cbar_kwargs = {'label': 'soil moisture cm³/cm³', 'pad' : 0.1, 'shrink' : 0.5} #, 'drawedges': True
legend_kwds={'label': "Kmeans cluster group"}  # not passed to any call in this cell

# Point layers built from the longitude / latitude columns
gpd_kmeans_ried = gpd.GeoDataFrame(gdf_groups_ried, geometry=gpd.points_from_xy(gdf_groups_ried.longitude, gdf_groups_ried.latitude))
gpd_kmeans_f_ried = gpd.GeoDataFrame(gdf_groups_f_ried, geometry=gpd.points_from_xy(gdf_groups_f_ried.longitude, gdf_groups_f_ried.latitude))

gpd_kmeans_aoi = gpd.GeoDataFrame(gdf_groups_aoi, geometry=gpd.points_from_xy(gdf_groups_aoi.longitude, gdf_groups_aoi.latitude))
gpd_kmeans_f_aoi = gpd.GeoDataFrame(gdf_groups_f_aoi, geometry=gpd.points_from_xy(gdf_groups_f_aoi.longitude, gdf_groups_f_aoi.latitude))

fig, ax = plt.subplots(ncols = 2, nrows=2, dpi=200)

# top row: first two feature columns coloured by cluster label
for _panel, _features, _model in ((ax[0,0], X, km7), (ax[0,1], X_f, km7_f)):
    _panel.scatter(_features[:, 0], _features[:, 1], c=_model.labels_)

# bottom row: cluster maps, each with its own colour-bar axis and the Ried outline
for _panel, _points in ((ax[1,0], gpd_kmeans_aoi), (ax[1,1], gpd_kmeans_f_aoi)):
    _cax = make_axes_locatable(_panel).append_axes("right", size="5%", pad=0.1)
    _points.plot('group', markersize=25, legend=True, ax=_panel, alpha=1, cax=_cax)
    vec_ried.geometry.boundary.plot(color=None,edgecolor='k', linewidth = 3, ax=_panel)

# Triple the default figure size; forward=True also resizes an open window
_width, _height = fig.get_size_inches()
fig.set_size_inches(_width*3, _height*3, forward=True)
ax[0,0].set_title('Soil Moisture Clustering by Feature Extraction from Time Series (7 Groups)')
ax[0,1].set_title('Soil Moisture Clustering by Feature Extraction from Time Series on Agriculture Surfaces (7 Groups)')
fig.tight_layout()
fig.savefig('figures//%s.png' %('kmeans_aoi_all_compare_results_7'))


In [None]:
# Export the filtered AOI cluster points, coloured by 'group', as an HTML map via mplleaflet.
# Note: this rebinds the module-level name `ax` to the axes of the new plot.
import mplleaflet
ax = gpd_kmeans_f_aoi.plot(column='group')
mplleaflet.show(fig=ax.figure, path='figures//gpd_kmeans_aoi_f_map_7.html')

https://towardsdatascience.com/playing-with-time-series-data-in-python-959e2485bff8

## Single band data 

In [None]:
def _open_single_band(relative_path):
    """Open a GeoTIFF below paths[0], drop length-1 dimensions and rename x / y to longitude / latitude."""
    raster = xr.open_rasterio(paths[0] + relative_path)
    return raster.squeeze(drop = True).rename({'x' : 'longitude', 'y' : 'latitude'})

# CORINE land cover 2018 rasters (file names end in _100 and _1000)
clc_100 = _open_single_band('corine_land_cover//corine_land_cover_2018_100.tif')
clc_1000 = _open_single_band('corine_land_cover//corine_land_cover_2018_1000.tif')

In [None]:
# Display the land-cover raster loaded from the *_1000.tif file
clc_1000

In [None]:
# Read the land-cover attribute file (JSON content despite the .txt suffix)
with open(paths[0] + 'corine_land_cover//clc_2018_attributes.txt') as json_file:
    clc_2018_attributes = json.load(json_file)

# One row per land-cover class: full name and numeric code
_clc_properties = clc_2018_attributes['properties']
df_attributes = pd.DataFrame({
    'landcover_class_names' : _clc_properties['landcover_class_names'],
    'landcover_class_values' : _clc_properties['landcover_class_values'],
})
# Class label = text before the first ';' of the full name
df_attributes['landcover_class'] = [
    full_name.partition(";")[0] for full_name in df_attributes['landcover_class_names']
]
# Top-level group = first digit of the class code
df_attributes['landcover_lvl0_values'] = [
    int(str(code)[0]) for code in df_attributes['landcover_class_values']
]

#### List of Datasets

## vector data

## corine landcover to geodatabase

In [None]:
# Land-cover polygons for the AOI
_clc_aoi_file = paths[0] + 'corine_land_cover//vec_clc_aoi.shp'
vec_clc_aoi = gpd.GeoDataFrame(gpd.read_file(_clc_aoi_file))

# Land-cover polygons for the Ried = intersection of the Ried outline with the AOI polygons
vec_clc_ried = gpd.overlay(gpd.GeoDataFrame(vec_ried), gpd.GeoDataFrame(vec_clc_aoi), how='intersection')

# Tag both layers as EPSG:4326
for _clc_layer in (vec_clc_aoi, vec_clc_ried):
    _clc_layer.crs = {'init' :'epsg:4326'}

In [None]:
def _landcover_class_of(raster_value):
    """Look up the class label of one land-cover code in df_attributes (first match)."""
    hits = df_attributes['landcover_class'][df_attributes['landcover_class_values'] == raster_value]
    return hits.values[0]

def _with_class_columns(layer):
    """Drop polygons with raster_val 0.0 and add the 'raster_grp' and 'landcover_class' columns."""
    layer = layer[layer['raster_val'] != 0.0]
    # top-level group = first digit of the class code
    layer['raster_grp'] = layer['raster_val'].apply(lambda code: int(str(code)[0]))
    layer['landcover_class'] = layer['raster_val'].apply(_landcover_class_of)
    return layer

vec_clc_aoi = _with_class_columns(vec_clc_aoi)
vec_clc_ried = _with_class_columns(vec_clc_ried)

## Interpolate coordinates (longitude & latitude) in order to get the same dimension size and spatial extent

In [None]:
#original resolution ~1km
# Sample the land-cover rasters at the model grid coordinates (nearest neighbour).
# NOTE(review): clc_1000_aoi is sampled from clc_100 while clc_1000_ried is sampled
# from clc_1000 — confirm the asymmetry is intended.
_aoi_grid = dict(latitude=xr_model_aoi['latitude'], longitude=xr_model_aoi['longitude'], method='nearest')
_ried_grid = dict(latitude=xr_model_ried['latitude'], longitude=xr_model_ried['longitude'], method='nearest')
clc_1000_aoi = clc_100.interp(**_aoi_grid)
clc_1000_ried = clc_1000.interp(**_ried_grid)
clc_100_ried = clc_100.interp(**_ried_grid)

def _leading_digit(value):
    """First digit of a land-cover code, or NaN when the value prints as 'nan'."""
    text = str(value)
    return np.nan if text == 'nan' else int(text[0])

# Top-level group per cell, attached as a coordinate and then split into a dataset
_clc_grp = np.array([_leading_digit(v) for v in np.ravel(clc_1000_aoi.values.tolist())])
clc_1000_aoi['clc_grp'] = (['latitude', 'longitude'], _clc_grp.reshape(clc_1000_aoi.shape))
clc_1000_grp = clc_1000_aoi.to_dataset(name='clc_class').reset_coords(['clc_grp'])


def _refined_axis(dim, factor):
    """Evenly spaced coordinates over the AOI extent with `factor` times as many points along `dim`."""
    coord = xr_model_aoi[dim]
    return np.linspace(coord[0], coord[-1], xr_model_aoi.dims[dim] * factor)

#lat & long for 500m resolution
lon_500 = _refined_axis('longitude', 2)
lat_500 = _refined_axis('latitude', 2)

#lat & long for 250m resolution
lon_250 = _refined_axis('longitude', 4)
lat_250 = _refined_axis('latitude', 4)

#interpolating resolution ~500m & ~250m
clc_500_aoi = clc_100.interp(latitude=lat_500, longitude=lon_500, method='nearest').astype(int)
clc_250_aoi = clc_100.interp(latitude=lat_250, longitude=lon_250, method='nearest').astype(int)

## Load DataFrame from csv

In [None]:
# Tabular data per region, read from the csv_new folder relative to the working directory.
# Later cells read the columns lat, lon, lonlat, clc_category, ndvi and slope_lineregress.
aoi_db = pd.read_csv('csv_new\\aoi_1000_v1.csv')
ried_db = pd.read_csv('csv_new\\ried_1000_v1.csv')

## Show measurement positions on Heatmap

In [None]:
# One point per distinct 'lonlat' id of the Ried table, then restricted to points inside `polygon`.
# NOTE(review): `polygon` is not assigned in any visible cell — it has to exist in the
# kernel before this cell runs.
gpd_location_ried = gpd.GeoDataFrame(ried_db, geometry=gpd.points_from_xy(ried_db.lon, ried_db.lat)).drop_duplicates('lonlat')
gpd_location_ried = gpd_location_ried[gpd_location_ried.within(polygon)]


In [None]:
#generate (latitude, longitude) pairs
def _unique_lat_lon(table):
    """Distinct (lat, lon) rows of a table."""
    return np.unique(table[['lat','lon']], axis=0)

def _slope_weights(table):
    """Absolute mean slope per 'lonlat' id, scaled by 100."""
    mean_slopes = table.groupby('lonlat').agg({'slope_lineregress': "mean"}).slope_lineregress.values.tolist()
    return [abs(x) * 100 for x in mean_slopes]

# Subsets with clc_category == 2
_ried_cat2 = gpd_location_ried[gpd_location_ried['clc_category'] == 2]
_aoi_cat2 = aoi_db[aoi_db['clc_category'] == 2]

# NOTE(review): locations come from np.unique on (lat, lon) while weights come from a
# groupby on 'lonlat'; nothing here ensures both are in the same order or (for
# location_3, trimmed by two rows) of the same length — verify the pairing.
location_1 = _unique_lat_lon(gpd_location_ried)
weights_1 = _slope_weights(gpd_location_ried)
location_2 = _unique_lat_lon(_ried_cat2)
weights_2 = _slope_weights(_ried_cat2)
location_3 = _unique_lat_lon(_aoi_cat2)[:-2]
weights_3 = _slope_weights(_aoi_cat2)
location_4 = _unique_lat_lon(gpd_location_ried[(gpd_location_ried['clc_category'] == 2)&(gpd_location_ried['ndvi'].between(0,0.2))& (gpd_location_ried['slope_lineregress'] < 0)])

# Colour ramp shared by all heatmaps (transparent -> cyan -> blue -> red)
_heatmap_gradient = [
    'rgba(0, 255, 255, 0)',
    'rgba(0, 255, 255, 1)',
    'rgba(0, 191, 255, 1)',
    'rgba(0, 127, 255, 1)',
    'rgba(0, 63, 255, 1)',
    'rgba(0, 0, 255, 1)',
    'rgba(0, 0, 223, 1)',
    'rgba(0, 0, 191, 1)',
    'rgba(0, 0, 159, 1)',
    'rgba(0, 0, 127, 1)',
    'rgba(63, 0, 91, 1)',
    'rgba(127, 0, 63, 1)',
    'rgba(191, 0, 31, 1)',
    'rgba(255, 0, 0, 1)',
]
_figure_layout = {'width': '400px', 'height': '600px', 'padding': '3px','border': '1px solid black'}

# One satellite map with a weighted heatmap per (locations, weights) pair
figures = []
for loc, w in ((location_1, weights_1), (location_2, weights_2), (location_3, weights_3)):
    heatmap_layer = gmaps.heatmap_layer(loc, w, max_intensity=13, dissipating=False, point_radius=0.01)
    heatmap_layer.gradient = list(_heatmap_gradient)
    gmap = gmaps.figure(map_type='SATELLITE', layout=dict(_figure_layout), zoom_level=8, center=location_1[0])
    gmap.add_layer(heatmap_layer)
    figures.append(gmap)
    


In [None]:
# Show the heatmap figures side by side under a common heading.
# Note: this rebinds the module-level name `title`.
title = widgets.HTML('<h3>Soil Moisture Measurements with mean of slope for single locations!</h3>')
widgets.VBox([title, widgets.HBox(figures, layout={'width': '100%'})])

In [None]:
# Export the first heatmap figure as a standalone HTML page.
# The import is placed here because the only other import of embed_minimal_html
# sits in a later cell; without it this cell fails on a fresh kernel.
from ipywidgets.embed import embed_minimal_html

embed_minimal_html('figures//slope_ried.html', views=[figures[0]])

In [None]:
# Satellite map centred on a fixed coordinate, with one symbol per location_2 entry
center = (49.77611923217773, 8.490663528442383)
gmap = gmaps.figure(center=center, map_type='SATELLITE', zoom_level=10, layout={'width': '800px', 'height': '600px'})

# Each location row becomes a (row[0], row[1]) pair; the info box shows that pair as text
location = location_2
_pairs = [(row[0], row[1]) for row in location]
_pair_labels = [str(pair) for pair in _pairs]
marker = gmaps.marker_layer(locations=list(_pairs), info_box_content=list(_pair_labels))
symbol = gmaps.symbol_layer(locations=list(_pairs), info_box_content=list(_pair_labels))
# Only the symbol layer is added; `marker` is built but not shown
gmap.add_layer(symbol)

#display map
print('unique locations : ', len(location_1))
print('unique locations RIED (with agriculture surface derived from corine land cover): ', len(location_2))
print('unique locations AOI (with agriculture surface derived from corine land cover): ', len(location_3))
display(gmap)

In [None]:
# Export the current `gmap` figure as a standalone HTML page
from ipywidgets.embed import embed_minimal_html

embed_minimal_html('figures//ried_map_circle.html', views=[gmap])

In [None]:
# Hand-picked point lists; later cells read each tuple as (latitude, longitude)
# (index 0 is passed as latitude, index 1 as longitude).
# NOTE(review): the first two entries of r2 are identical.
r2 = [(49.64369583129883, 8.407676696777344), (49.64369583129883, 8.407676696777344),(49.71588134765625, 8.50103759765625),(49.72792434692383, 8.50103759765625),(49.93307876586914, 8.573651313781737) ,(49.92098617553711, 8.594398498535156)]
# r1: ten points that share one latitude
r1 = [(49.8484992980957, 8.490663528442383),(49.8484992980957, 8.480290412902832),(49.8484992980957, 8.469917297363281),(49.8484992980957, 8.459543228149414),(49.8484992980957, 8.449170112609862),(49.8484992980957, 8.438796997070312),(49.8484992980957, 8.428422927856445),(49.8484992980957, 8.418049812316895), (49.8484992980957, 8.407676696777344), (49.8484992980957, 8.397302627563478)]
r3 = [(49.64369583129883, 8.428422927856445), (49.64369583129883, 8.418049812316895), (49.64369583129883, 8.407676696777344),(49.67977905273438, 8.511410713195799),(49.67977905273438, 8.521783828735353),(49.667747497558594, 8.521783828735353)]
r4_good_collection = [((49.78817367553711, 8.438796997070312)), (49.78817367553711, 8.490663528442383), (49.67977905273438, 8.50103759765625), (49.691810607910156, 8.573651313781737), (49.59563446044922, 8.573651313781737), (49.75201416015625, 8.584024429321289), (49.88473129272461, 8.418049812316895), (49.860572814941406, 8.407676696777344), (49.8484992980957, 8.407676696777344), (49.83642959594727, 8.407676696777344), (49.908897399902344, 8.469917297363281)]

In [None]:
# Satellite map centred on a fixed coordinate, with one marker per point of r3
center = (49.77611923217773, 8.490663528442383)
gmap = gmaps.figure(center=center, map_type='SATELLITE', zoom_level=10, layout={'width': '800px', 'height': '600px'})

# Each entry becomes a (entry[0], entry[1]) pair; the info box shows that pair as text
location = r3
_marker_pairs = [(entry[0], entry[1]) for entry in location]
marker = gmaps.marker_layer(
    locations=_marker_pairs,
    info_box_content=[str(pair) for pair in _marker_pairs],
)
gmap.add_layer(marker)
display(gmap)

In [None]:
def _series_at(points):
    """Select the nearest Ried grid cell for every (latitude, longitude) point.

    Returns the list of selected datasets and the same selections as DataFrames.
    Points further than the tolerance (0.0001) from a grid coordinate raise.
    """
    selections = [
        xr_model_ried.sel(longitude=lon, latitude=lat, method='nearest', tolerance=0.0001)
        for lat, lon in points
    ]
    return selections, [selection.to_dataframe() for selection in selections]

xrl_ried_r1, pdl_ried_r1 = _series_at(r1)
xrl_ried_r2, pdl_ried_r2 = _series_at(r2)
xrl_ried_r3, pdl_ried_r3 = _series_at(r3)

In [None]:
# Yearly panels (rows 2015-2019) for the r3 points, 7-day means:
# column 0 soil moisture, column 1 real evapotranspiration, column 2 precipitation.
fig, ax = plt.subplots(ncols=3, nrows=5, figsize=(45,25), gridspec_kw={'width_ratios': [3, 1,1]})
# The title is set after the figure is created; before, it was applied to whatever
# figure `fig` referred to from an earlier cell.
# NOTE(review): the title speaks of 10 points on the same latitude, but the loop plots pdl_ried_r3.
fig.suptitle('Time Series for Soil Moisture, Precipitation & Real Evapotranspiration from 10 points on same latitude with rolling mean of 7 days', fontsize=16, y=1.08)

plot_years = [2015,2016,2017,2018,2019]
for line in pdl_ried_r3:
    line = line.resample('7D').mean()
    # soil moisture without non-finite values so the line is not broken by gaps
    line_sm = line.soil_moisture_1km[np.isfinite(line.soil_moisture_1km)]
    # legend label: first four characters of the point's longitude (.iloc = first row)
    point_label = '%s' %(str(line.longitude.iloc[0])[:4])
    for i,years in enumerate(plot_years):
        in_year = line.index.year == years
        line_sm[line_sm.index.year == years].plot(ax=ax[i,0], linewidth=1.3, legend=True, label=point_label, marker='o', linestyle='-')
        line[in_year].real_evapotranspiration.plot(ax=ax[i,1], linewidth=0.2)
        line[in_year].precipitation_1km.plot(ax=ax[i,2], linewidth=0.2)

# Axis decoration once per row. The y-labels follow what is plotted: column 1 shows
# evapotranspiration and column 2 precipitation (the labels used to be swapped).
for i in range(len(plot_years)):
    ax[i,0].legend(loc='right')
    ax[i,0].tick_params('both', labelsize=11)
    ax[i,0].set_ylabel('Soil Moisture cm³/cm³', fontsize=13,  labelpad=10)
    ax[i,1].set_ylabel('Real Evapotranspiration', fontsize=13,  labelpad=10)
    ax[i,2].set_ylabel('Precipitation mm', fontsize=13,  labelpad=10)

fig.tight_layout()

# bbox_inches='tight' keeps the title (placed above the axes area, y=1.08) inside the saved image
fig.savefig('figures//%s.png' %('ts_r3_7d'), bbox_inches='tight')


In [None]:
from cycler import cycler

# Colour sequence that successive lines on an axes cycle through.
default_cycler = cycler(color=['blue', 'red', 'black', 'yellow', 'orange', 'purple'])

# Install it as the default for all axes created from here on.
plt.rcParams['axes.prop_cycle'] = default_cycler

In [None]:
# 14-day mean time series for every point of transect r3, one row per year:
# soil moisture (wide left column), real evapotranspiration, precipitation.
fig, ax = plt.subplots(ncols=3, nrows=5, figsize=(45,20), gridspec_kw={'width_ratios': [3, 1,1]}, dpi=150)
colors=['blue', 'red', 'black', 'yellow', 'orange']  # not used by this cell
plot_years = [2015,2016,2017,2018,2019]
for line in pdl_ried_r3[::1]:
    line = line.resample('14d').mean()
    # drop the NaN bins so the soil-moisture line is not broken
    line_m = np.isfinite(line.soil_moisture_1km)
    line_sm = line.soil_moisture_1km[line_m]
    # legend entry: first four characters of the point's longitude
    point_label = '%s' %(str(line.longitude.iloc[0])[:4])
    for i,years in enumerate(plot_years):
        line_sm[line_sm.index.year == years].plot(ax=ax[i,0], linewidth=2, legend=True, label=point_label, marker='o', linestyle='-')
        line[line.index.year == years].real_evapotranspiration.plot(ax=ax[i,1], linewidth=0.2)
        line[line.index.year == years].precipitation_1km.plot(ax=ax[i,2], linewidth=0.2)

# Axis formatting does not depend on the plotted line, so it is done once per
# row. The y-labels now match the variable drawn in each column.
for i in range(len(plot_years)):
    ax[i,0].legend(loc='right',fontsize=16)
    ax[i,0].tick_params('both', labelsize=15)
    ax[i,0].set_ylabel('Soil Moisture cm³/cm³', fontsize=16,  labelpad=10)
    ax[i,1].set_ylabel('Real Evapotranspiration', fontsize=16,  labelpad=10)
    ax[i,2].set_ylabel('Precipitation mm', fontsize=16,  labelpad=10)

plt.tight_layout()
plt.savefig('figures//%s.png' %('ts_r3_14d'))

In [None]:
# Seven (lat, lon) points on one latitude, ordered from east to west
# (longitude decreases along the list).
collection_se = [
    (49.57161712646485, lon_east_to_west)
    for lon_east_to_west in (
        8.656639099121094,
        8.646265983581543,
        8.635891914367676,
        8.625518798828125,
        8.615145683288574,
        8.604771614074707,
        8.594398498535156,
    )
]
# Four individual (lat, lon) reference points.
r_1 = (49.667747497558594, 8.397302627563478)
r_2 = (49.63167572021485, 8.407676696777344)
r_3 = (49.59563446044922, 8.428422927856445)
r_4 = (49.908897399902344, 8.532157897949219)

http://geopandas.org/gallery/plotting_basemap_background.html#sphx-glr-gallery-plotting-basemap-background-py

# Find locations for soil samples 

## Priority list: (high  -> low) 
**1. cells with 0 & 0.5 & 1 mm rain**    
**2. cells with NDVI between (0.036 - 0.2) ~bare soil**  
**3. cells with specific clc_2018 category (211-216??)**  
**4. cells with highest count on different measurements on same coordinate pair**     
**5. cells with highest count on sm values within a period**  

## Rating DataFrame

## Add period score

In [None]:
#scores
# Weights used when the per-period 'score' column is built further down.
sm_measurements_score = 0.2  # factor for the sm_measurements column
ndvi_score = 0.2  # factor for the ndvi column
ndvi_bare = 1  # added where ndvi lies between 0 and 0.2
lonlat_count_score = 0.2  # factor for the lonlat_count column

In [None]:
# Scratch check: two contributions of -0.5 add up to -1.0.
sum([-0.5,-0.5])

In [None]:
# Per-period score = weighted measurement count + weighted location count
# + weighted NDVI (missing NDVI contributes 0) + bonus for NDVI in [0, 0.2].
# 'location_count' is then the sum of those scores over all periods that
# share the same 'lonlat' key.
for scored_db in (aoi_db, ried_db):
    weighted_counts = (scored_db.sm_measurements * sm_measurements_score
                       + scored_db.lonlat_count * lonlat_count_score)
    weighted_ndvi = scored_db.ndvi.mul(ndvi_score, fill_value=0)
    low_ndvi_bonus = scored_db.ndvi.between(0,0.2).mul(ndvi_bare)
    scored_db['score'] = weighted_counts + weighted_ndvi + low_ndvi_bonus
    scored_db['location_count'] = scored_db.groupby(by='lonlat').score.transform('sum')

# Sum of the regression slopes of all periods at the same location (Ried only).
ried_db['slope_score_sum'] = ried_db.groupby(by='lonlat').slope_lineregress.transform('sum')

ried_db.head()

In [None]:
# ried_db with a point geometry per row (x = lon, y = lat).
gpd_ried_db_complete = gpd.GeoDataFrame(
    ried_db,
    geometry=gpd.points_from_xy(x=ried_db.lon, y=ried_db.lat),
)

In [None]:

# Funnel of candidate locations: total rows, then rows left after each
# additional filter (CLC category 2, NDVI in [0, 0.2], negative slope).
in_clc_2 = gpd_location_ried['clc_category'] == 2
low_ndvi = gpd_location_ried['ndvi'].between(0,0.2)
negative_slope = gpd_location_ried['slope_lineregress'] < 0

print(len(gpd_location_ried))
print(len(gpd_location_ried[in_clc_2]))
print(len(gpd_location_ried[in_clc_2 & low_ndvi]))
print(len(gpd_location_ried[in_clc_2 & low_ndvi]))  # same count printed twice, as before
print(len(gpd_location_ried[in_clc_2 & low_ndvi & negative_slope]))

# Rows that pass all three filters.
goal = gpd_location_ried[in_clc_2 & low_ndvi & negative_slope]
goal

In [None]:
# Row-wise boolean lists: True where clc_category equals 2.
mask_clc_ried = ried_db['clc_category'].eq(2).tolist()
mask_clc_aoi = aoi_db['clc_category'].eq(2).tolist()

In [None]:
# Keep only rows whose clc_category is 2 (agriculture).
# The condition is evaluated on the frame itself rather than on the list
# masks built in the previous cell: those have the length of the unfiltered
# frame, so re-running this cell with them fails. `.copy()` makes the result
# an independent frame so that later column assignments do not write into a
# slice of the original (SettingWithCopyWarning).
aoi_db = aoi_db[aoi_db['clc_category'] == 2].copy()
ried_db = ried_db[ried_db['clc_category'] == 2].copy()

In [None]:
# Total number of soil-moisture measurements per location, repeated on every
# period row of that location.
aoi_db['sum_measurements_location'] = (
    aoi_db.drop(columns='Unnamed: 0')
    .groupby('lonlat')
    .sm_measurements
    .transform('sum')
)
# One row per location, turned into a point GeoDataFrame (x = lon, y = lat).
aoi_db_llw = aoi_db.drop_duplicates(subset='lonlat')
gpd_aoi_db = gpd.GeoDataFrame(
    aoi_db_llw[['lonlat_count', 'sum_measurements_location']],
    geometry=gpd.points_from_xy(aoi_db_llw.lon, aoi_db_llw.lat),
)

In [None]:
import math

In [None]:
# mplleaflet is otherwise imported only in later cells, so this cell failed
# on a fresh kernel when run in order.
import mplleaflet

# Marker size grows with the square root of the summed measurements.
ax = gpd_aoi_db.plot(markersize=[math.sqrt(x)*4 for x in gpd_aoi_db.sum_measurements_location], legend=True)
# Export the matplotlib figure as an interactive HTML map.
mplleaflet.show(fig=ax.figure, path='figures//gpd_aoi_db.html')

## Build time series for tslearn and other machine learning modules  
Subset the time series to the dry periods, otherwise the noise is too strong; ideally also restrict the comparison to periods with equal evapotranspiration (evp) values.

### aoi update_3

In [None]:
# Read the stored aoi records (JSON content in a .txt file).
with open(paths[0] + 'json//update_3_v1_aoi_clc1000m.txt', mode='r') as json_file:
    update_3_aoi = json.loads(json_file.read())

### ried update_3

In [None]:
# Read the stored Ried records (JSON content in a .txt file).
# NOTE(review): this name contains 'clcl1000m' while the aoi file uses
# 'clc1000m' - confirm the file on disk really is spelled this way.
with open(paths[0] + 'json//update_3_v1_ried_clcl1000m.txt', mode='r') as json_file:
    update_3_ried = json.loads(json_file.read())

In [None]:
# Lists of per-period sequences (these become object columns in pandas, so
# they are kept as plain lists here).
# rec[4][2] holds three parallel sequences per period; going by the target
# names: day numbers, soil moisture, precipitation.
days_number_list_aoi = [rec[4][2][0] for rec in update_3_aoi]
# Daily dates of the period: start at rec[2], span rec[4][0] days, and drop
# the start date itself.
days_list_aoi = [pd.date_range(start=rec[2], periods=rec[4][0] + 1)[1:] for rec in update_3_aoi]
sm_list_aoi = [rec[4][2][1] for rec in update_3_aoi]
pp_list_aoi = [rec[4][2][2] for rec in update_3_aoi]

In [None]:
# Lists of per-period sequences (these become object columns in pandas, so
# they are kept as plain lists here).
# rec[4][2] holds three parallel sequences per period; going by the target
# names: day numbers, soil moisture, precipitation.
days_number_list_ried = [rec[4][2][0] for rec in update_3_ried]
# Daily dates of the period: start at rec[2], span rec[4][0] days, and drop
# the start date itself.
days_list_ried = [pd.date_range(start=rec[2], periods=rec[4][0] + 1)[1:] for rec in update_3_ried]
sm_list_ried = [rec[4][2][1] for rec in update_3_ried]
pp_list_ried = [rec[4][2][2] for rec in update_3_ried]

In [None]:
# Compare the number of table rows with the number of per-period series.
len(ried_db), len(days_list_ried), len(days_number_list_ried)

In [None]:
# --- per-period sequences as numpy arrays --------------------------------
days_number_list_aoi = np.array(days_number_list_aoi)
days_number_list_ried = np.array(days_number_list_ried)

days_list_aoi = np.array(days_list_aoi)
days_list_ried = np.array(days_list_ried)

sm_list_aoi = np.array(sm_list_aoi)
sm_list_ried = np.array(sm_list_ried)

pp_list_aoi = np.array(pp_list_aoi)
pp_list_ried = np.array(pp_list_ried)

# --- one scalar per period, pulled out of the raw records -----------------
# record layout as used below: [0] lon, [1] lat, [2] event date, [3] event
# precipitation, [4] (duration, mean ET, series), [5] polyfit (slope,
# intercept), [6] linregress (slope, intercept), [7] (clc_2018, soil map),
# [9] ndvi values
lon_aoi = [rec[0] for rec in update_3_aoi]
lon_ried = [rec[0] for rec in update_3_ried]

lat_aoi = [rec[1] for rec in update_3_aoi]
lat_ried = [rec[1] for rec in update_3_ried]

event_date_aoi = [pd.Timestamp(rec[2]) for rec in update_3_aoi]
event_date_ried = [pd.Timestamp(rec[2]) for rec in update_3_ried]

event_pp_aoi = [rec[3] for rec in update_3_aoi]
event_pp_ried = [rec[3] for rec in update_3_ried]

event_et_mean_aoi = [rec[4][1] for rec in update_3_aoi]
event_et_mean_ried = [rec[4][1] for rec in update_3_ried]

periode_duration_aoi = [rec[4][0] for rec in update_3_aoi]
periode_duration_ried = [rec[4][0] for rec in update_3_ried]

# number of finite soil-moisture values inside each period
sm_measurements_aoi = [np.count_nonzero(np.isfinite(rec[4][2][1])) for rec in update_3_aoi]
sm_measurements_ried = [np.count_nonzero(np.isfinite(rec[4][2][1])) for rec in update_3_ried]

slope_polyfit1d_aoi = [rec[5][0] for rec in update_3_aoi]
slope_polyfit1d_ried = [rec[5][0] for rec in update_3_ried]

intercept_polyfit1d_aoi = [rec[5][1] for rec in update_3_aoi]
intercept_polyfit1d_ried = [rec[5][1] for rec in update_3_ried]

slope_lineregress_aoi = [rec[6][0] for rec in update_3_aoi]
slope_lineregress_ried = [rec[6][0] for rec in update_3_ried]

intercept_lineregress_aoi = [rec[6][1] for rec in update_3_aoi]
intercept_lineregress_ried = [rec[6][1] for rec in update_3_ried]

# NaN-ignoring mean of the NDVI values stored with each period
ndvi_aoi = [np.nanmean(rec[9]) for rec in update_3_aoi]
ndvi_ried = [np.nanmean(rec[9]) for rec in update_3_ried]

clc_2018_aoi = [rec[7][0] for rec in update_3_aoi]
clc_2018_ried = [rec[7][0] for rec in update_3_ried]

soil_map_aoi = [rec[7][1] for rec in update_3_aoi]
soil_map_ried = [rec[7][1] for rec in update_3_ried]

# Location key = lon + lat.
# NOTE(review): a plain sum is not guaranteed to be unique per coordinate
# pair - confirm that collisions cannot occur on this grid.
lonlat_aoi = [lon_value + lat_value for lon_value, lat_value in zip(lon_aoi, lat_aoi)]
lonlat_ried = [lon_value + lat_value for lon_value, lat_value in zip(lon_ried, lat_ried)]

In [None]:
# Row counts of the two period tables (aoi first, then Ried).
print('dry_periods on agriculture land: ', len(aoi_db), len(ried_db))

In [None]:
# True when all per-period aoi lists have the same length.
len({
    len(column)
    for column in (
        lonlat_aoi, soil_map_aoi, clc_2018_aoi, ndvi_aoi,
        intercept_lineregress_aoi, slope_lineregress_aoi,
        intercept_polyfit1d_aoi, slope_polyfit1d_aoi,
        sm_measurements_aoi, periode_duration_aoi, event_et_mean_aoi,
        event_pp_aoi, event_date_aoi, lat_aoi, lon_aoi,
    )
}) == 1

In [None]:
def get_ts(days, sm):
    """Strip non-finite soil-moisture samples and bundle the aoi metadata.

    Parameters
    ----------
    days : sequence of sequences
        One sequence of day values per period, parallel to ``sm``.
    sm : sequence of sequences
        One sequence of soil-moisture values per period.

    Returns
    -------
    tuple
        ``(sm, days, evp, slope, count_sm, lonlat, clc, ndvi, slope, lon,
        lat)``. ``sm`` and ``days`` are lists of numpy arrays that keep only
        the positions where the soil-moisture value is finite. The remaining
        entries are the module-level ``*_aoi`` lists, passed through
        unchanged; the slope list appears twice.
    """
    keep_masks = [np.isfinite(series) for series in sm]
    sm_finite = [np.array(series)[keep] for series, keep in zip(sm, keep_masks)]
    days_finite = [np.array(series)[keep] for series, keep in zip(days, keep_masks)]
    return (
        sm_finite,
        days_finite,
        event_et_mean_aoi,
        slope_lineregress_aoi,
        sm_measurements_aoi,
        lonlat_aoi,
        clc_2018_aoi,
        ndvi_aoi,
        slope_lineregress_aoi,
        lon_aoi,
        lat_aoi,
    )

In [None]:
# get_ts returns the slope list at two positions, so `slope` is bound twice
# in this unpacking (both times to the same list).
sm, days, evp, slope, count_sm, lonlat, clc, ndvi,slope, lon, lat = get_ts(days_number_list_aoi,sm_list_aoi)
len(sm), len(days), len(evp)

In [None]:
#dataframe with lat lon and mean of slopes
slopes = pd.DataFrame({'lonlat' : lonlat, 'slope' : slope, 'lon' : lon, 'lat': lat,'clc' : clc})
# CLC-filtered variant.
# NOTE(review): `< 300` keeps the 1xx codes as well as the 2xx codes, while
# the figure built from this data is titled "agriculture surface" - confirm
# the intended range.
slopes_clc = slopes[slopes['clc'] < 300]
# one row per location: mean of every column over the periods at that location
slopes_mean = slopes.groupby('lonlat').mean().reset_index(drop=True)
slopes_mean_clc = slopes_clc.groupby('lonlat').mean().reset_index(drop=True)
print(len(slopes_clc), len(slopes))

#find minimum sm value within dry periods:
# a period without any finite soil-moisture value yields NaN instead of
# raising on an empty sequence
sm_minimum = pd.DataFrame({'lonlat' : lonlat, 'lon' : lon, 'lat': lat,'clc' : clc, 'sm' : [min(x) if len(x) else np.nan for x in sm]})
sm_minimum_mean = sm_minimum.groupby('lonlat').mean().reset_index(drop=True)

#create gpd multipoint with shaply geometry from lon & lat equal to [Point(x, y) for x, y in zip(df.Longitude, df.Latitude)]
gpd_slopes = gpd.GeoDataFrame(slopes_mean, geometry=gpd.points_from_xy(slopes_mean.lon, slopes_mean.lat))
# built from the CLC-filtered means; it used to be a second wrapper around
# the unfiltered slopes_mean, which made the "clc" maps identical to the
# unfiltered ones
gpd_slopes_clc = gpd.GeoDataFrame(slopes_mean_clc, geometry=gpd.points_from_xy(slopes_mean_clc.lon, slopes_mean_clc.lat))
gpd_sm_min_mean = gpd.GeoDataFrame(sm_minimum_mean, geometry=gpd.points_from_xy(sm_minimum_mean.lon, sm_minimum_mean.lat))

#intersection: keep the points that lie inside the first geometry of vec_ried
polygon = vec_ried.geometry[0]
gpd_slopes_ried = gpd_slopes[gpd_slopes.within(polygon)]
gpd_slopes_ried_clc = gpd_slopes_clc[gpd_slopes_clc.within(polygon)]
gpd_sm_min_mean_ried = gpd_sm_min_mean[gpd_sm_min_mean.within(polygon)]

gdf_groups_ried = gdf_groups_aoi[gdf_groups_aoi.within(polygon)]
gdf_groups_f_ried = gdf_groups_f_aoi[gdf_groups_f_aoi.within(polygon)]

In [None]:
# Two maps side by side, coloured by the 'sm' column: gpd_sm_mean on the
# left, gpd_sm_mean_ried on the right, each with the vec_ried boundary.
# NOTE(review): gpd_sm_mean / gpd_sm_mean_ried are not created in the nearby
# cells (those build gpd_sm_min_mean*), the titles say "Min", and the output
# file name is the same one the following cell writes - confirm whether this
# cell is still needed.
fig, ax = plt.subplots(ncols=2,dpi=300)

divider = make_axes_locatable(ax[0])
divider_2 = make_axes_locatable(ax[1])

# colour-bar axes appended to the right of each map
cax = divider.append_axes("right", size="5%", pad=0.1)
cax_2 = divider_2.append_axes("right", size="5%", pad=0.1)

#soil_map_data.plot('HAUPTGRUPP', figsize=(24,12), markersize=120, cmap='Accent', legend=True, ax=ax)
gpd_sm_mean.plot(column='sm', markersize=20, legend=True, ax=ax[0], cax=cax, alpha=1, cmap='plasma_r')
gpd_sm_mean_ried.plot(column='sm', markersize=60, legend=True, ax=ax[1],cax=cax_2, alpha=1, cmap='plasma_r')
vec_ried.geometry.boundary.plot(color=None,edgecolor='k', linewidth = 3, ax=ax[0])
vec_ried.geometry.boundary.plot(color=None,edgecolor='k', linewidth = 3, ax=ax[1])

# triple the current figure size
F = plt.gcf()
Size = F.get_size_inches()
F.set_size_inches(Size[0]*3, Size[1]*3, forward=True) # Set forward to True to resize window along with plot in figure
ax[0].set_title('Min Soil Moisture  within dry periods')
ax[1].set_title('Min Soil Moisture  within dry periods')
plt.tight_layout()
plt.savefig('figures//%s.png' %('sm_min_dry_periods'))  # writes figures//sm_min_dry_periods.png

In [None]:
# Per-location mean of the minimum soil moisture reached in dry periods:
# left = all locations, right = locations inside the Ried polygon.
fig, ax = plt.subplots(dpi=300, ncols=2)

# colour-bar axes appended to the right of each map
divider, divider_2 = (make_axes_locatable(panel) for panel in ax)
cax, cax_2 = (d.append_axes("right", size="5%", pad=0.1) for d in (divider, divider_2))

# colour scale capped at 0.3 on both maps
map_style = dict(column='sm', legend=True, alpha=1, cmap='plasma_r', vmax=0.3)
gpd_sm_min_mean.plot(markersize=20, ax=ax[0], cax=cax, **map_style)
gpd_sm_min_mean_ried.plot(markersize=85, ax=ax[1], cax=cax_2, **map_style)
for panel in ax:
    vec_ried.geometry.boundary.plot(color=None, edgecolor='k', linewidth=3, ax=panel)
    panel.set_title('Min Soil Moisture  within dry periods')

# triple the current figure size (forward=True also resizes the window)
F = plt.gcf()
Size = F.get_size_inches()
F.set_size_inches(Size[0]*3, Size[1]*3, forward=True)
plt.tight_layout()
plt.savefig('figures//%s.png' %('sm_min_dry_periods'))

In [None]:
import mplleaflet

# Plot the points coloured by their 'group' column and export the figure
# as an interactive HTML map.
ax = gpd_kmeans_f_aoi.plot(column='group')
mplleaflet.show(
    fig=ax.figure,
    path='figures//gpd_kmeans_aoi_f_map_7.html',
)

In [None]:
import mplleaflet

# Plot the mean minimum soil moisture (colour scale capped at 0.3) and export
# the figure as an interactive HTML map.
ax = gpd_sm_min_mean.plot(
    column='sm',
    markersize=20,
    legend=True,
    alpha=1,
    cmap='plasma_r',
    vmax=0.3,
)
mplleaflet.show(
    fig=ax.figure,
    path='figures//mean_min_sm.html',
)

In [None]:
# Per-location mean regression slope: left = all locations, right = locations
# inside the Ried polygon.
fig, ax = plt.subplots(dpi=300, ncols=2)

# colour-bar axes appended to the right of each map
divider, divider_2 = (make_axes_locatable(panel) for panel in ax)
cax, cax_2 = (d.append_axes("right", size="5%", pad=0.1) for d in (divider, divider_2))

map_style = dict(column='slope', legend=True, alpha=1, cmap='gist_rainbow')
gpd_slopes.plot(markersize=20, ax=ax[0], cax=cax, **map_style)
gpd_slopes_ried.plot(markersize=60, ax=ax[1], cax=cax_2, **map_style)
for panel in ax:
    vec_ried.geometry.boundary.plot(color=None, edgecolor='k', linewidth=3, ax=panel)
    panel.set_title('Mean slope from regression line within dry periods ')

# triple the current figure size (forward=True also resizes the window)
F = plt.gcf()
Size = F.get_size_inches()
F.set_size_inches(Size[0]*3, Size[1]*3, forward=True)
plt.tight_layout()
plt.savefig('figures//%s.png' %('Mean_slope_dry_periods'))

In [None]:
# Same slope maps as the unfiltered version, drawn from the *_clc frames:
# left = gpd_slopes_clc, right = gpd_slopes_ried_clc.
fig, ax = plt.subplots(dpi=300, ncols=2)

# colour-bar axes appended to the right of each map
divider, divider_2 = (make_axes_locatable(panel) for panel in ax)
cax, cax_2 = (d.append_axes("right", size="5%", pad=0.1) for d in (divider, divider_2))

map_style = dict(column='slope', legend=True, alpha=1, cmap='gist_rainbow')
gpd_slopes_clc.plot(markersize=20, ax=ax[0], cax=cax, **map_style)
gpd_slopes_ried_clc.plot(markersize=60, ax=ax[1], cax=cax_2, **map_style)
for panel in ax:
    vec_ried.geometry.boundary.plot(color=None, edgecolor='k', linewidth=3, ax=panel)

# the two titles differ only by a trailing blank and are kept as they were
ax[0].set_title('Mean slope from regression line within dry periods over agriculture surface ')
ax[1].set_title('Mean slope from regression line within dry periods over agriculture surface')

# triple the current figure size (forward=True also resizes the window)
F = plt.gcf()
Size = F.get_size_inches()
F.set_size_inches(Size[0]*3, Size[1]*3, forward=True)
plt.tight_layout()
plt.savefig('figures//%s.png' %('Mean_slope_dry_periods_clc'))

In [None]:
# Slope maps with the colour scale capped at 0 (vmax=0). The data itself is
# not filtered; positive slopes are still drawn, at the top colour.
fig, ax = plt.subplots(dpi=300, ncols=2)

# colour-bar axes appended to the right of each map
divider, divider_2 = (make_axes_locatable(panel) for panel in ax)
cax, cax_2 = (d.append_axes("right", size="5%", pad=0.1) for d in (divider, divider_2))

map_style = dict(column='slope', legend=True, alpha=1, cmap='gist_rainbow', vmax=0)
gpd_slopes.plot(markersize=20, ax=ax[0], cax=cax, **map_style)
gpd_slopes_ried.plot(markersize=60, ax=ax[1], cax=cax_2, **map_style)
for panel in ax:
    vec_ried.geometry.boundary.plot(color=None, edgecolor='k', linewidth=3, ax=panel)
    panel.set_title('Mean slope from regression line within dry periods - only negative slopes')

# triple the current figure size (forward=True also resizes the window)
F = plt.gcf()
Size = F.get_size_inches()
F.set_size_inches(Size[0]*3, Size[1]*3, forward=True)
plt.tight_layout()
plt.savefig('figures//%s.png' %('Mean_slope_dry_periods_negative_slopes'))

In [None]:
#K means clustering on the first 20 extracted feature columns
X = np.array(extracted_features_sm_s.iloc[:,:20])
y_iris = np.array(extracted_features_sm_s['Hauptgruppe'])
# A fixed random_state makes the cluster assignment repeatable between runs.
km2 = cluster.KMeans(n_clusters=4, random_state=0).fit(X)
km3 = cluster.KMeans(n_clusters=3, random_state=0).fit(X)
km4 = cluster.KMeans(n_clusters=7, random_state=0).fit(X)
plt.figure(figsize=(9, 3))
# One panel per model: first two feature columns, coloured by cluster label.
# K in the title is read from the fitted model, so it can no longer disagree
# with the n_clusters actually used (the first panel used to say K=2 for a
# 4-cluster model). J is the model's inertia.
for subplot_code, model in zip((131, 132, 133), (km2, km3, km4)):
    plt.subplot(subplot_code)
    plt.scatter(X[:, 0], X[:, 1], c=model.labels_)
    plt.title("K=%d, J=%.2f" % (model.n_clusters, model.inertia_))
# distinct values of the 'Hauptgruppe' column
np.unique(y_iris)

In [None]:
# One row per dry period; 'day' and 'sm' hold the per-period arrays.
ts_data = pd.DataFrame(dict(day=days, sm=sm, lonlat=lonlat, clc=clc, ndvi=ndvi))
ts_data.head(10)

In [None]:
# tslearn demo on synthetic random walks; it does not touch the project data.
# TimeSeriesKMeans and to_time_series_dataset were used without an import.
from tslearn.clustering import TimeSeriesKMeans
from tslearn.generators import random_walks
from tslearn.utils import to_time_series_dataset

X = random_walks(n_ts=50, sz=32, d=1)

# Euclidean k-means
km = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                      random_state=0).fit(X)
km.cluster_centers_.shape
# noted result: (3, 32, 1)

# DTW k-means
km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                          max_iter_barycenter=5,
                          random_state=0).fit(X)
km_dba.cluster_centers_.shape
# noted result: (3, 32, 1)

# soft-DTW k-means
km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                           max_iter_barycenter=5,
                           metric_params={"gamma": .5},
                           random_state=0).fit(X)
km_sdtw.cluster_centers_.shape
# noted result: (3, 32, 1)

# series of unequal length
X_bis = to_time_series_dataset([[1, 2, 3, 4],
                                [1, 2, 3],
                                [2, 5, 6, 7, 8, 9]])
km = TimeSeriesKMeans(n_clusters=2, max_iter=5,
                      metric="dtw", random_state=0).fit(X_bis)
km.cluster_centers_.shape
# noted result: (2, 3, 1) - may depend on the installed tslearn version

## Display location with figures from periods within the location 

In [None]:
# Inputs for the Ried subset: the period table plus the per-period series,
# which are addressed below with the table's index labels.
df_lonlat = ried_db
days_number_list = days_number_list_ried
days_list = days_list_ried
sm_list = sm_list_ried
pp_list = pp_list_ried
ndvi = ndvi_ried
event_et_mean = event_et_mean_ried
slope_lineregress = slope_lineregress_ried
intercept_lineregress = intercept_lineregress_ried

# For every (lat, lon) point of transect r3: show a satellite map with a box
# around the point, then one panel per dry period at that point.
for location in r3:
    #filter periods for location
    df_location = df_lonlat[(df_lonlat['lon'].isin([location[1]])) & (df_lonlat['lat'].isin([location[0]]))]
    if df_location.empty:
        #no period stored for this point: nothing to map or plot
        continue
    #sort
    df_location_sort = df_location.sort_values('sm_measurements', ascending=False) #sm_measurements
    #get index of df for locations to acces the sm, pp, days lists
    df_index = df_location_sort.index.tolist()
    #side length of the smallest square grid that holds every period
    #(rounding down silently dropped the periods that did not fit)
    periods_len = math.ceil(math.sqrt(len(df_index)))
    #gmaps image with a rectangle around lat/lon
    lat = df_location_sort.lat.tolist()[0]
    lon = df_location_sort.lon.tolist()[0]
    center = (lat, lon)
    lon_transform = 0.0156109 / 2
    lat_transform = 0.00915375 / 2
    aoi = [(lat + lat_transform, lon + lon_transform), (lat + lat_transform, lon - lon_transform), (lat - lat_transform, lon - lon_transform), (lat - lat_transform, lon + lon_transform), (lat + lat_transform, lon + lon_transform)]
    gmap = gmaps.figure(center=center, map_type='SATELLITE', zoom_level=14, layout={'width': '300', 'height': '300px'})
    pol = gmaps.Polygon(aoi, stroke_color='red', stroke_weight=4)
    drawing = gmaps.drawing_layer(features=[pol])
    gmap.add_layer(drawing)
    #create widget for displaying gmaps
    display(gmap)
    #create figure & axis
    fig, axs = plt.subplots(ncols=periods_len, nrows=periods_len, figsize=(35,25))
    #grid axes captured before twin axes are appended to fig.axes
    grid_axes = list(fig.axes)

    #loop through all periods within same location
    for index, ax in zip(df_index, grid_axes):
        #get single period
        df_period = pd.DataFrame({'time' : days_list[index], 'days' : days_number_list[index], 'soil_moisture' : sm_list[index],
                                  'precipitation' : pp_list[index], 'ndvi' : ndvi[index], 'et_r' :  event_et_mean[index],
                                  'slope_lineregress' : slope_lineregress[index], 'intercept_lineregress' : intercept_lineregress[index]})

        #mask the NaN samples so the soil-moisture line is not broken
        mask = [np.isfinite(x) for x in sm_list[index]]

        #plot lineregression
        n_days = len(df_period)
        lgx = np.linspace(0, n_days, n_days)
        slope = df_period.slope_lineregress*lgx+df_period.intercept_lineregress
        df_slope = pd.DataFrame({'lgx' : lgx, 'slope' : slope})
        ax.plot('lgx','slope', data = df_slope, color='black', alpha=0.5)

        #plot time (x) vs. soil moisture (y)
        ax.plot('days', 'soil_moisture', data=df_period[mask], marker='o',markerfacecolor='blue' ,linestyle='dotted', color='red', linewidth=6, markersize=15)

        #create label for axes x & y & #change tick color
        ax.set_ylabel('soil moisture m³/m³', color='black', fontsize=18)
        ax.set_xlabel('time', color='black', fontsize=15)
        ax.set_ylim(0,0.8)
        ax.tick_params(axis='y', labelcolor='red', labelsize=18)
        ax.tick_params(axis='x', labelsize=15)

        #summary table above the panel: start date (YY-MM-DD), period
        #length, number of finite sm samples, mean ndvi, mean ET
        table = ax.table(cellText=[['p_start', 'p.length', 'sm.count', 'ndvi', 'evapoT_r' ],
                           [str(df_period.time.min())[2:10], str(n_days), str(len(df_period[mask])), str(df_period[mask].ndvi.mean())[0:4], str(df_period.et_r.mean())[0:4] ]],
                             cellLoc='center', loc=16)
        table.set_fontsize(20)
        ax.legend(loc='upper left', fontsize='large')

        # create the xaxis label
        plt.setp(ax.xaxis.get_label(), visible=True, text='time (days)')

        # instantiate a second axes that shares the same x-axis and plot precipitation data
        ax2 = ax.twinx()
        ax2.bar('days', 'precipitation', data=df_period, alpha=0.45, color='blue', label='precipitation ' )

        #create label for axis (y2)
        ax2.set_ylabel('precipitation radolan 1km² (mm/24h)', color='black', fontsize=18)
        ax2.set_ylim(0,1)

        #change tick color
        ax2.tick_params(axis='y', labelcolor='blue', labelsize=18)

        ax2.legend(loc=0, fontsize='large')

    #hide the grid cells that have no period
    for spare_ax in grid_axes[len(df_index):]:
        spare_ax.set_visible(False)

    fig.tight_layout(h_pad=4.5, w_pad = 3.0)

    #file name: the location tuple with brackets removed and '.'/', ' replaced by '_'
    plt.savefig('figures//locs//r3//%s.png' %(str(location).replace('(','').replace(')', '').replace(', ', '_').replace('.','_')))
    plt.show()
    #release the figure; one is created per location
    plt.close(fig)


In [None]:
# Inputs for the whole aoi: the period table plus the per-period series,
# which are addressed below with the table's index labels.
df_lonlat = aoi_db
days_number_list = days_number_list_aoi
days_list = days_list_aoi
sm_list = sm_list_aoi
pp_list = pp_list_aoi
ndvi = ndvi_aoi
event_et_mean = event_et_mean_aoi
slope_lineregress = slope_lineregress_aoi
intercept_lineregress = intercept_lineregress_aoi

# For the five locations with the highest 'location_count': show the period
# table, a satellite map with a box around the point, then one panel per
# dry period at that point.
for location in df_lonlat.sort_values('location_count', ascending=False)['lonlat'].unique()[0:5]:
    #filter periods for location
    df_location = df_lonlat[df_lonlat['lonlat'].isin([location])]
    if df_location.empty:
        #no period stored for this key: nothing to map or plot
        continue
    #sort
    df_location_sort = df_location.sort_values('sm_measurements', ascending=False)
    display(df_location_sort)
    #get index of df for locations to acces the sm, pp, days lists
    df_index = df_location_sort.index.tolist()
    #side length of the smallest square grid that holds every period
    #(rounding down silently dropped the periods that did not fit)
    periods_len = math.ceil(math.sqrt(len(df_index)))
    #gmaps image with a rectangle around lat/lon
    lat = df_location_sort.lat.tolist()[0]
    lon = df_location_sort.lon.tolist()[0]
    center = (lat, lon)
    lon_transform = 0.0156109 / 2
    lat_transform = 0.00915375 / 2
    aoi = [(lat + lat_transform, lon + lon_transform), (lat + lat_transform, lon - lon_transform), (lat - lat_transform, lon - lon_transform), (lat - lat_transform, lon + lon_transform), (lat + lat_transform, lon + lon_transform)]
    gmap = gmaps.figure(center=center, map_type='SATELLITE', zoom_level=13, layout={'width': '400', 'height': '300px'})
    pol = gmaps.Polygon(aoi, stroke_color='red')
    drawing = gmaps.drawing_layer(features=[pol])
    gmap.add_layer(drawing)
    #create widget for displaying gmaps
    display(gmap)
    #create figure & axis
    fig, axs = plt.subplots(ncols=periods_len, nrows=periods_len, figsize=(25,15))
    #grid axes captured before twin axes are appended to fig.axes
    grid_axes = list(fig.axes)

    #loop through all periods within same location
    for index, ax in zip(df_index, grid_axes):
        #get single period
        df_period = pd.DataFrame({'time' : days_list[index], 'days' : days_number_list[index], 'soil_moisture' : sm_list[index],
                                  'precipitation' : pp_list[index], 'ndvi' : ndvi[index], 'et_r' :  event_et_mean[index],
                                  'slope_lineregress' : slope_lineregress[index], 'intercept_lineregress' : intercept_lineregress[index]})

        #mask the NaN samples so the soil-moisture line is not broken
        mask = [np.isfinite(x) for x in sm_list[index]]

        #plot lineregression
        n_days = len(df_period)
        lgx = np.linspace(0, n_days, n_days)
        slope = df_period.slope_lineregress*lgx+df_period.intercept_lineregress
        df_slope = pd.DataFrame({'lgx' : lgx, 'slope' : slope})
        ax.plot('lgx','slope', data = df_slope, color='black', alpha=0.5)

        #plot time (x) vs. soil moisture (y)
        ax.plot('days', 'soil_moisture', data=df_period[mask], marker='o', linestyle='dotted', color='red')

        #create label for axes x & y & #change tick color
        ax.set_ylabel('soil moisture m³/m³', color='black')
        ax.set_ylim(0,0.8)
        ax.tick_params(axis='y', labelcolor='red')

        #summary table above the panel: start date (YY-MM-DD), period
        #length, number of finite sm samples, mean ndvi, mean ET
        ax.table(cellText=[['p_start', 'p.length', 'sm.count', 'ndvi', 'evapoT_r' ],
                           [str(df_period.time.min())[2:10], str(n_days), str(len(df_period[mask])), str(df_period[mask].ndvi.mean())[0:4], str(df_period.et_r.mean())[0:4] ]],
                             cellLoc='center', loc=16).auto_set_font_size(False)

        ax.legend(loc='upper left')

        # create the xaxis label
        plt.setp(ax.xaxis.get_label(), visible=True, text='time (days)')

        # instantiate a second axes that shares the same x-axis and plot precipitation data
        ax2 = ax.twinx()
        ax2.bar('days', 'precipitation', data=df_period, alpha=0.45, color='blue', label='precipitation ' )

        #create label for axis (y2)
        ax2.set_ylabel('precipitation radolan 1km² (mm/24h)', color='black')
        ax2.set_ylim(0,1)

        #change tick color
        ax2.tick_params(axis='y', labelcolor='blue')

        ax2.legend(loc=0)

    #hide the grid cells that have no period
    for spare_ax in grid_axes[len(df_index):]:
        spare_ax.set_visible(False)

    fig.tight_layout(h_pad=4.5, w_pad = 3.0)
    plt.show()
    #release the figure; one is created per location
    plt.close(fig)

## Find sample location with NDVI and corine land cover 

In [None]:
# Three hand-picked (lat, lon) sample points.
coord_collection = [
    (49.848499, 8.428423),
    (49.62977731115676, 8.428339501487915),
    (49.63273697221979, 8.434362993266106),
]

In [None]:
# NDVI of the first dataset as a flat table without NaN rows.
df_ndvi = (
    ndvi_datasets[0].NDVI
    .to_dataframe()
    .reset_index()
    .dropna()
)
# Location key = latitude + longitude.
# NOTE(review): a plain sum is not guaranteed to be unique per coordinate
# pair - confirm that collisions cannot occur on this grid.
df_ndvi['latlon'] = df_ndvi['latitude'].add(df_ndvi['longitude'])
# Mean per location key, lowest mean NDVI first.
locations_low_ndvi = (
    df_ndvi.groupby('latlon')
    .mean()
    .sort_values(by='NDVI')
    .reset_index()
)

In [None]:
# The 25 locations with the lowest mean NDVI as (lat, lon) pairs.
lowest_25 = locations_low_ndvi[0:25]
marker_positions = list(zip(lowest_25.latitude.tolist(), lowest_25.longitude.tolist()))

# Centre the view on the location with the lowest NDVI.
center = (locations_low_ndvi.latitude.tolist()[0], locations_low_ndvi.longitude.tolist()[0])
gmap = gmaps.figure(center=center, map_type='SATELLITE', zoom_level=11, layout={'width': '800px', 'height': '600px'})
# One marker per location; its info box shows the coordinate pair.
marker = gmaps.marker_layer(
    locations=marker_positions,
    info_box_content=[str(position) for position in marker_positions],
)
gmap.add_layer(marker)
display(gmap)

In [None]:
# Integer axes 1..12 and 1..366.
time_year = np.arange(1, 13)
time_doy = np.arange(1, 367)
# Month starts from 2018-01 up to, but not including, 2018-12.
time_week = np.arange('2018-01', '2018-12', dtype='datetime64[M]')

In [None]:
def mask_na(array):
    """Return a boolean mask that is True where ``array`` is finite.

    NaN and +/-inf map to False; the result has the same shape as the input.
    """
    finite_mask = np.isfinite(array)
    return finite_mask

In [None]:
#Create figure and subplots
# Grid with at least one panel per coordinate. The previous layout of
# int(sqrt(n)) rows by int(sqrt(n)) + 1 columns gave only two panels for the
# three coordinates, so the last one was never drawn.
n_rows = max(1, int(math.sqrt(len(coord_collection))))
n_cols = math.ceil(len(coord_collection) / n_rows)
fig, ax = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(len(coord_collection)*8,len(coord_collection)*3))
#correct x axes time view
fig.autofmt_xdate()

grid_axes = list(fig.axes)
# `panel` instead of `ax` so the axes returned by plt.subplots is not overwritten
for loc, panel in zip(coord_collection, grid_axes):
    #ndvi data at the grid cell nearest to the (lat, lon) point
    ndvi_cell = ndvi_datasets[0].sel(latitude = loc[0], longitude = loc[1], method='nearest')
    ndvi = ndvi_cell.NDVI.values
    mask_ndvi = np.isfinite(ndvi)
    time_ndvi = ndvi_cell.time.values
    #soil_moisture data: mean per 'time.week' group
    sm_cell = xr_model_aoi.soil_moisture_1km.sel(latitude = loc[0], longitude = loc[1], method='nearest')
    sm = sm_cell.groupby('time.week').apply(lambda x: x.mean()).values
    mask_sm = np.isfinite(sm)
    # One x position per group, masked the same way as the values so both
    # arrays always have equal length (a fixed 53 entries were used before).
    # NOTE(review): the positions are built as a month sequence starting at
    # the first model timestamp while the values are grouped by week and the
    # label says "monthly" - confirm which time axis is intended.
    time_sm = np.arange(np.datetime64(xr_model_aoi.time.min().values, 'M'), len(sm), dtype='datetime64[M]')
    #plot data on axes
    panel.plot(time_ndvi[mask_ndvi], ndvi[mask_ndvi], 'gx', label='ndvi')
    panel.plot(time_sm[mask_sm], sm[mask_sm], 'ro--', label = 'soil_moisture monthly mean ')
    panel.legend()
    panel.grid(axis='both')

#hide grid cells that have no coordinate
for spare_panel in grid_axes[len(coord_collection):]:
    spare_panel.set_visible(False)

# Visualizations

### Selection1d paired

## Set up Geoviews

In [None]:
def da_to_hv(da):
    """Check that ``da`` can be cloned onto the xarray data interface.

    Calls ``da.clone(datatype=['xarray'])`` and prints the type of the
    clone's ``.data``. If that fails, a message with the error is printed
    instead. The function is a diagnostic only.

    Returns
    -------
    None
        In every case.
    """
    try:
        xr_img = da.clone(datatype=['xarray'])
        print(type(xr_img.data))
    except Exception as err:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate; the cause is shown, not discarded.
        print('xarray interface could not be imported.', repr(err))
    return

def img_to_hv(img, **kwag):
    """Wrap ``img`` in an ``hv.Image``, forwarding every keyword argument."""
    hv_image = hv.Image(img, **kwag)
    return hv_image

def ds_to_gv(ds):
    """Wrap ``ds`` in a ``gv.Dataset`` and return it."""
    return gv.Dataset(ds)

def ds_to_hv(ds, **kwag):
    """Wrap ``ds`` in an ``hv.Dataset``, forwarding every keyword argument."""
    hv_dataset = hv.Dataset(ds, **kwag)
    return hv_dataset

### Gridded Data 

In [None]:
# Key dimensions (coordinates) and value dimensions (data variables).
kdims = [
    'longitude',
    'latitude',
    'time',
]
vdims = [
    'soil_moisture_1km',
    'precipitation_1km',
    'real_evapotranspiration',
]

In [None]:
# Wrap the three AOI model variables as HoloViews datasets (via ds_to_hv).
# For soil moisture, time steps whose values are all NaN are dropped first.
hvD_aoi_sm = ds_to_hv(xr_model_aoi['soil_moisture_1km'].dropna(how='all', dim='time'))
hvD_aoi_pp = ds_to_hv(xr_model_aoi['precipitation_1km'])
hvD_aoi_evp = ds_to_hv(xr_model_aoi['real_evapotranspiration'])

In [None]:
# Same wrapping as for the AOI, applied to the Ried subset of the model data.
# For soil moisture, time steps whose values are all NaN are dropped first.
hvD_ried_sm = ds_to_hv(xr_model_ried['soil_moisture_1km'].dropna(how='all', dim='time'))
hvD_ried_pp = ds_to_hv(xr_model_ried['precipitation_1km'])
hvD_ried_evp = ds_to_hv(xr_model_ried['real_evapotranspiration'])

### Images

In [None]:
# Corine Land Cover rasters as HoloViews images; name/label are passed through to hv.Image.
hvI_sub_clc_100 = img_to_hv(clc_100, name='clc_100', label='Corine Land Cover 2018 100m')
hvI_sub_clc_1000 = img_to_hv(clc_1000, name='clc_1000', label='Corine Land Cover 2018 1000m')

## Geometries

In [None]:
# Keep only the polygons whose raster group equals 2 (the group treated as agriculture here).
vec_agri_ried = vec_clc_ried.loc[vec_clc_ried['raster_grp'].eq(2)]
vec_agri_aoi = vec_clc_aoi.loc[vec_clc_aoi['raster_grp'].eq(2)]

In [None]:
# Ried outline as HoloViews polygons; the bare name on the last line displays it.
hvP_ried = hv.Polygons(vec_ried)
hvP_ried

In [None]:
# Polygon layers for the overlays further down: AOI outline, land-cover polygons
# for AOI and Ried, and the agriculture-only subset of the Ried.
hvP_aoi = hv.Polygons(vec_aoi)
hvP_clc_aoi = hv.Polygons(vec_clc_aoi)
hvP_clc_ried = hv.Polygons(vec_clc_ried)
hvP_clc_agri_ried = hv.Polygons(vec_agri_ried)

### properties

#### Area

In [None]:
def add_area_km2(gdf, metric_epsg=32633, out_epsg=4326):
    """Return `gdf` reprojected to `out_epsg` with an "area" column in km².

    The area is measured in the metric CRS `metric_epsg` (m², divided by 10**6).
    The earlier Web-Mercator (EPSG:3857) area was dropped: it was overwritten
    immediately and Mercator areas are distorted anyway.

    Parameters
    ----------
    gdf : GeoDataFrame with a 'geometry' column and a defined CRS.
    metric_epsg : int
        EPSG code of the projected CRS used for the measurement.
        NOTE(review): 32633 is UTM zone 33N (kept from the original code);
        confirm it is the right zone for the study area -- UTM 32N
        (EPSG:32632) covers 6°E-12°E.
    out_epsg : int
        EPSG code of the CRS of the returned frame (4326, as before).
    """
    # to_crs(epsg=...) replaces the deprecated {'init': 'epsg:...'} dict syntax.
    projected = gdf.to_crs(epsg=metric_epsg)
    projected["area"] = projected['geometry'].area / 10**6
    return projected.to_crs(epsg=out_epsg)


vec_clc_aoi = add_area_km2(vec_clc_aoi)
vec_clc_ried = add_area_km2(vec_clc_ried)

https://stackoverflow.com/questions/38337918/plot-pie-chart-and-table-of-pandas-dataframe

## Create Visualisations

#### multiple plots

In [None]:
# Facet grid of soil-moisture maps, one panel per time step between 2018-01-03 and 2018-01-18 (4 per row).
g_simple = xr_model_data.sel(time=slice('2018-01-03', '2018-01-18')).soil_moisture_1km.plot(x='longitude', y='latitude', col='time', col_wrap=4)
plt.savefig('figures//xr_before_merge.png') # save the facet grid


### Vector

#### Agriculture Surfaces

In [None]:
# Display the 100 m Corine Land Cover raster for inspection.
clc_100

In [None]:
def clc_grp(value):
    """Return the leading character of `value`'s string form as an int.

    For a three-digit land-cover code such as 211 this yields the top-level
    group (2). Raises ValueError when the first character is not a digit
    (e.g. for NaN or negative values).
    """
    leading_char = str(value)[0]
    return int(leading_char)

# Compute the first-digit group for every raster cell and attach the result to
# clc_100 as 'landcover_grp' on the (latitude, longitude) grid.
shape = clc_100.values.shape
arr = np.asarray(clc_100.values.tolist()).ravel()
arr_grp = [clc_grp(cell_value) for cell_value in arr]
arr_ = np.reshape(arr_grp, shape)

clc_100['landcover_grp'] = (['latitude', 'longitude'], arr_)

In [None]:
# Total "area" per landcover class for AOI and Ried.
landcover_area_aoi = vec_clc_aoi.groupby('landcover_class').sum().area.values
landcover_area_ried = vec_clc_ried.groupby('landcover_class').sum().area.values
# Class labels are taken from the AOI grouping only.
# NOTE(review): they are reused for the Ried pie below -- confirm both frames contain the same classes.
landcover_class = vec_clc_aoi.groupby('landcover_class').sum().index.values
# Offset of each pie wedge; four entries, so the pies below must receive four values.
explode = (0.1,0,0,0)
import matplotlib.gridspec as gridspec


def func(pct, allvals):
    """Format a pie-wedge label: percentage plus the absolute area it represents.

    Parameters
    ----------
    pct : float
        Wedge size in percent, as passed by matplotlib's `autopct` callback.
    allvals : array-like
        The values the pie was drawn from; their sum is the 100 % reference.

    Returns
    -------
    str
        "<pct with one decimal>%" and, on a second line, the absolute value
        truncated to an integer. The unit is km² because the "area" column is
        computed as m² / 10**6 (the label previously said m³).
    """
    absolute = int(pct/100.*np.sum(allvals))
    return "{:.1f}%\n({:d} km²)".format(pct, absolute)

# One figure with three panels on a 2x4 grid: the two pie charts on top and the
# landcover map centred below. The figure is created empty (no plt.subplots
# grid) so no unused axes end up underneath the GridSpec panels.
fig = plt.figure(figsize=(24,16))
fig.suptitle('Landcover Classes from Corine Land Cover 2018 in Nördl. Oberrheingraben / Ried', fontsize=17)
gs = gridspec.GridSpec(2, 4)
ax1 = fig.add_subplot(gs[0, :2])
ax2 = fig.add_subplot(gs[0, 2:])
ax3 = fig.add_subplot(gs[1, 1:3])

# NOTE(review): the last landcover class is excluded from both pies ([:-1]) -- confirm this is intended.
pie_style = dict(explode=explode, labels=landcover_class[:-1], shadow=True, startangle=90,
                 colors=['grey','red', 'green', 'blue'], textprops={'fontsize':15})
for pie_ax, areas in ((ax1, landcover_area_aoi[:-1]), (ax2, landcover_area_ried[:-1])):
    # The absolute value in each label must refer to the same values the wedges
    # are drawn from, hence the sliced array is passed to func as well.
    pie_ax.pie(areas, autopct=lambda pct, areas=areas: func(pct, areas), **pie_style)
    pie_ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

# 'marple' is not a valid matplotlib colour name; 'purple' is assumed to be the intended one.
clc_100.landcover_grp.plot.imshow(ax=ax3, levels=5, colors=['red','grey', 'green', 'blue', 'purple'], add_colorbar=True, cbar_kwargs={'shrink' : 0.8} )

vec_ried.boundary.geometry.plot(ax=ax3, edgecolor='black', linewidth=3)

plt.savefig('figures//landcover_pie_chart.png')


In [None]:

# NOTE(review): this cell only creates an empty figure and calls tight_layout --
# it looks like an unfinished placeholder.
fig, ax = plt.subplots(figsize=(24,8))

plt.tight_layout()


## Masking 

In [None]:
# Two panels: left = agricultural polygons of the AOI (light green) with the Ried
# subset drawn on top in red; right = the Ried agricultural polygons alone.
fig, ax = plt.subplots(ncols=2, figsize=(24,12))
vec_agri_aoi.plot(ax=ax[0], color='lightgreen')
vec_agri_ried.plot(ax=ax[0], color='red')
vec_agri_ried.plot(ax=ax[1], color='lightgreen')
ax[0].set_title('Agriculture Surfaces AOI')
ax[1].set_title('Agriculture Surfaces Ried')

In [None]:
#create HoloMap Slider for time Dimension: longitude/latitude (kdims[0:2]) become the image axes
# NOTE(review): ndloc[:, :, :10] presumably keeps only the first 10 steps along the third axis -- confirm it is time.
# The colorbar/fig_size options are commented out for the soil-moisture map only.
hmS_aoi_sm = hvD_aoi_sm.ndloc[:, :, :10].to(hv.Image, kdims=kdims[0:2], dynamic=False)#.opts(colorbar=True, fig_size=200)
hmS_aoi_pp = hvD_aoi_pp.ndloc[:, :, :10].to(hv.Image, kdims=kdims[0:2], dynamic=False).opts(colorbar=True, fig_size=200)
hmS_aoi_evp = hvD_aoi_evp.ndloc[:, :, :10].to(hv.Image, kdims=kdims[0:2], dynamic=False).opts(colorbar=True, fig_size=200)

In [None]:
#create HoloMap Slider for time Dimension (Ried subset): longitude/latitude (kdims[0:2]) become the image axes
# NOTE(review): ndloc[:, :, :10] presumably keeps only the first 10 steps along the third axis -- confirm it is time.
# The soil-moisture map is created without a colorbar, unlike the other two.
hmS_ried_sm = hvD_ried_sm.ndloc[:, :, :10].to(hv.Image, kdims=kdims[0:2], dynamic=False).opts(fig_size=200)
hmS_ried_pp = hvD_ried_pp.ndloc[:, :, :10].to(hv.Image, kdims=kdims[0:2], dynamic=False).opts(colorbar=True, fig_size=200)
hmS_ried_evp = hvD_ried_evp.ndloc[:, :, :10].to(hv.Image, kdims=kdims[0:2], dynamic=False).opts(colorbar=True, fig_size=200)

## Display Visualisations

In [None]:
# Overlay: Ried soil-moisture maps with the agricultural polygons drawn semi-transparently on top.
hmS_ried_sm * hvP_clc_agri_ried.opts(alpha=0.5)

In [None]:
# Render the AOI soil-moisture HoloMap through IPython's display_html.
display_html(hmS_aoi_sm)

In [None]:
# Load the bokeh plotting extension for HoloViews (the setup cell loaded 'matplotlib').
# NOTE(review): objects above were styled with matplotlib options such as fig_size -- confirm they still render under bokeh.
hv.extension('bokeh')

In [None]:
# Overlay of the Ried outline, the AOI land-cover polygons (both semi-transparent) and the AOI soil-moisture maps.
hvP_ried.opts(alpha=0.5) * hvP_clc_aoi.opts(alpha=0.5) * hmS_aoi_sm  

In [None]:
# Same overlay as the previous cell, but with the Ried soil-moisture maps, shown explicitly via display().
display(hvP_ried.opts(alpha=0.5) * hvP_clc_aoi.opts(alpha=0.5) * hmS_ried_sm)

## calendar information 

In [None]:
# Days per month for each supported calendar. Index 0 is a placeholder so that
# a month number (1-12) can be used directly as the list index.
dpm = {
    # calendars with a 28-day February in the table
    **{name: [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
       for name in ('noleap', '365_day', 'standard', 'gregorian', 'proleptic_gregorian')},
    # calendars in which every February has 29 days
    **{name: [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
       for name in ('all_leap', '366_day')},
    # twelve months of 30 days
    '360_day': [0] + [30] * 12,
}

## calendar functions to determine the number of days in each month

In [None]:
def leap_year(year, calendar='standard'):
    """Determine if year is a leap year.

    Parameters
    ----------
    year : int
    calendar : str
        Only 'standard', 'gregorian', 'proleptic_gregorian' and 'julian' can
        have leap years here; every other calendar name yields False.

    Returns
    -------
    bool

    Rules
    -----
    * julian: every year divisible by 4.
    * proleptic_gregorian: divisible by 4, except century years not divisible by 400.
    * standard / gregorian (mixed calendar): Julian rule before 1583, Gregorian
      rule from 1583 on. The previous version tested ``year < 1583`` here,
      which applied the century exception to the wrong era (1900 was reported
      as a leap year, 1500 as a common year).
    """
    if calendar not in ('standard', 'gregorian', 'proleptic_gregorian', 'julian'):
        return False
    if year % 4 != 0:
        return False

    # Century years that the Gregorian rule excludes (1700, 1800, 1900, 2100, ...).
    century_exception = (year % 100 == 0) and (year % 400 != 0)
    if calendar == 'proleptic_gregorian':
        return not century_exception
    if calendar in ('standard', 'gregorian'):
        return not (century_exception and year >= 1583)
    return True  # julian

def get_dpm(time, calendar='standard'):
    """
    Return an integer array with the number of days in the month of each entry of `time`.

    Parameters
    ----------
    time : object with a length and `.month` / `.year` sequences
        (called below with `ds.time.to_index()`).
    calendar : str
        Key into the module-level `dpm` table; a KeyError is raised for unknown names.

    February gets one extra day when `leap_year(year, calendar)` is true.
    """
    # Plain `int` instead of `np.int`: the alias was deprecated in NumPy 1.20 and removed in 1.24.
    month_length = np.zeros(len(time), dtype=int)

    cal_days = dpm[calendar]

    for i, (month, year) in enumerate(zip(time.month, time.year)):
        month_length[i] = cal_days[month]
        if leap_year(year, calendar=calendar) and month == 2:
            month_length[i] += 1
    return month_length

## Seasonal Mean

In [None]:
def season_mean(ds, calendar='standard'):
    """Seasonal mean of `ds`, weighting every time step by the length of its month.

    Parameters
    ----------
    ds : xarray object with a `time` coordinate.
    calendar : str
        Calendar name forwarded to `get_dpm`.

    Returns
    -------
    The weighted mean per 'time.season' group (dimension `season`).

    Missing values are excluded from the average: the weighted sum is divided by
    the sum of the weights of the valid samples only. Previously NaN samples
    counted as 0 while their weight stayed in the denominator, which biased the
    mean low; cells without any valid sample in a season now come out as NaN
    instead of 0.
    """
    # Make a DataArray with the number of days in each month, size = len(time)
    month_length = xr.DataArray(get_dpm(ds.time.to_index(), calendar=calendar),
                                coords=[ds.time], name='month_length')
    # Calculate the weights by grouping by 'time.season'
    weights = month_length.groupby('time.season') / month_length.groupby('time.season').sum()

    # Test that the sum of the weights for each season is 1.0
    # (this assumes all four seasons are present in `ds`)
    np.testing.assert_allclose(weights.groupby('time.season').sum().values, np.ones(4))

    # Weighted sum of the data and of the weights that belong to valid samples
    weighted_sum = (ds * weights).groupby('time.season').sum(dim='time')
    valid_weight = (ds.notnull() * weights).groupby('time.season').sum(dim='time')

    # Calculate the weighted average over the valid samples
    return weighted_sum / valid_weight

In [None]:
# Seasonal means of the three model variables, computed from xr_model_data.
season_mean_sm = season_mean(xr_model_data.soil_moisture_1km, calendar='standard')
season_mean_pp = season_mean(xr_model_data.precipitation_1km, calendar='standard')
season_mean_evp = season_mean(xr_model_data.real_evapotranspiration, calendar='standard')

In [None]:
# Seasonal-mean soil moisture (from xr_model_data): one map per season on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24,16))
seasons = ['DJF', 'MAM', 'JJA', 'SON']
for i, (season, ax) in enumerate(zip(seasons, fig.axes)):
    season_mean_sm.sel(season=season).plot.pcolormesh(cbar_kwargs={"label": 'cm³/cm³'}, ax=ax)
    ax.tick_params('both', labelsize=15)
    ax.set_title(season, fontsize=16)
    ax.set_ylabel('latitude', fontsize=16)
    ax.set_xlabel('longitude', fontsize=16)
plt.tight_layout()
plt.savefig('figures//mean_seasonal_ca.png')


In [None]:
# Seasonal-mean precipitation (from xr_model_data): one map per season on a 2x2 grid.
# NOTE(review): the colorbar label 'cm³/cm³' is copied from the soil-moisture plot -- confirm the unit for precipitation.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24,16))
seasons = ['DJF', 'MAM', 'JJA', 'SON']
for i, (season, ax) in enumerate(zip(seasons, fig.axes)):
    season_mean_pp.sel(season=season).plot.pcolormesh(cbar_kwargs={"label": 'cm³/cm³'}, ax=ax)
    ax.tick_params('both', labelsize=15)
    ax.set_title(season, fontsize=16)
    ax.set_ylabel('latitude', fontsize=16)
    ax.set_xlabel('longitude', fontsize=16)
plt.tight_layout()

In [None]:
# Seasonal-mean real evapotranspiration (from xr_model_data); only strictly positive values are drawn.
# NOTE(review): the colorbar label 'cm³/cm³' is copied from the soil-moisture plot -- confirm the unit for evapotranspiration.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24,16))
seasons = ['DJF', 'MAM', 'JJA', 'SON']
for i, (season, ax) in enumerate(zip(seasons, fig.axes)):
    positive_evp = season_mean_evp.where(season_mean_evp > 0)
    positive_evp.sel(season=season).plot.pcolormesh(cbar_kwargs={"label": 'cm³/cm³'}, ax=ax)
    ax.tick_params('both', labelsize=15)
    ax.set_title(season, fontsize=16)
    ax.set_ylabel('latitude', fontsize=16)
    ax.set_xlabel('longitude', fontsize=16)
plt.tight_layout()

In [None]:
# Seasonal means of the three model variables, computed from the AOI subset (xr_model_aoi).
season_mean_sm_aoi = season_mean(xr_model_aoi.soil_moisture_1km, calendar='standard')
season_mean_pp_aoi = season_mean(xr_model_aoi.precipitation_1km, calendar='standard')
season_mean_evp_aoi = season_mean(xr_model_aoi.real_evapotranspiration, calendar='standard')

In [None]:
# Seasonal-mean soil moisture for the AOI, one map per season, with the Ried boundary drawn on top.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24,16))
seasons = ['DJF', 'MAM', 'JJA', 'SON']
for i, (season, ax) in enumerate(zip(seasons, fig.axes)):
    season_mean_sm_aoi.sel(season=season).plot.pcolormesh(cbar_kwargs={"label": 'cm³/cm³'}, ax=ax)
    ax.tick_params('both', labelsize=15)
    ax.set_title(season, fontsize=16)
    ax.set_ylabel('latitude', fontsize=16)
    ax.set_xlabel('longitude', fontsize=16)
    vec_ried.boundary.geometry.plot(linewidth=3.5, edgecolor='black', ax=ax)
plt.tight_layout()
plt.savefig('figures//mean_seasonal_aoi.png')


In [None]:
# Seasonal-mean precipitation for the AOI, one map per season, with the Ried polygon overlaid semi-transparently.
# NOTE(review): the colorbar label 'cm³/cm³' is copied from the soil-moisture plot -- confirm the unit for precipitation.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24,16))
seasons = ['DJF', 'MAM', 'JJA', 'SON']
for i, (season, ax) in enumerate(zip(seasons, fig.axes)):
    season_mean_pp_aoi.sel(season=season).plot.pcolormesh(cbar_kwargs={"label": 'cm³/cm³'}, ax=ax)
    ax.tick_params('both', labelsize=15)
    ax.set_title(season, fontsize=16)
    ax.set_ylabel('latitude', fontsize=16)
    ax.set_xlabel('longitude', fontsize=16)
    vec_ried.plot(linewidth=3.5, edgecolor='black', alpha=0.4, ax=ax)
plt.tight_layout()


In [None]:
# Seasonal-mean real evapotranspiration for the AOI, one map per season, with the Ried polygon overlaid semi-transparently.
# NOTE(review): the colorbar label 'cm³/cm³' is copied from the soil-moisture plot -- confirm the unit for evapotranspiration.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24,16))
seasons = ['DJF', 'MAM', 'JJA', 'SON']
for i, (season, ax) in enumerate(zip(seasons, fig.axes)):
    season_mean_evp_aoi.sel(season=season).plot.pcolormesh(cbar_kwargs={"label": 'cm³/cm³'}, ax=ax)
    ax.tick_params('both', labelsize=15)
    ax.set_title(season, fontsize=16)
    ax.set_ylabel('latitude', fontsize=16)
    ax.set_xlabel('longitude', fontsize=16)
    vec_ried.plot(linewidth=3.5, edgecolor='black', alpha=0.2, ax=ax)
plt.tight_layout()

## Standard deviation

https://docs.scipy.org/doc/numpy/reference/generated/numpy.std.html#numpy.std  
Use `ddof=1` (sample standard deviation) so that the spread is not underestimated when the population distribution is unknown.

In [None]:
# Standard deviation of soil moisture over time, per grid cell of the AOI.
fig,ax = plt.subplots(figsize=(25,18))
# Reducing over 'time' directly gives one value per (latitude, longitude) cell.
# The former groupby('latitude','longitude') passed 'longitude' as groupby's
# second positional argument (not a second grouping key) and only added a slow
# per-latitude loop around the same reduction.
# NOTE(review): the markdown note recommends ddof=1; this call keeps the default (ddof=0) as before.
sm_std = xr_model_aoi.soil_moisture_1km.std('time')
sm_std.plot(ax=ax)
plt.savefig('figures//std_aoi.png')


## Histogram plotting

In [None]:
# Per-cell standard deviation of soil moisture over time as flat tables (one row per grid cell).
# Reducing over 'time' directly replaces groupby('longitude', 'latitude').std('time'):
# the second positional argument of groupby was never a second grouping key, and
# the reset_index/rename round trip that followed depended on how a particular
# xarray version names reset dimension coordinates.
dataframe_sm_aoi = xr_model_aoi.soil_moisture_1km.std('time').to_dataframe().reset_index()
dataframe_sm_ried = xr_model_ried.soil_moisture_1km.std('time').to_dataframe().reset_index()

# The same statistic kept as gridded arrays (dimensions latitude/longitude)
xr_sm_ried = xr_model_ried.soil_moisture_1km.std('time')
xr_sm_aoi = xr_model_aoi.soil_moisture_1km.std('time')

xr_ndvi_ried = s2_1c.NDVI.std('time')

#align coordinates: sample the std maps on the clc_1000 grid (nearest neighbour)
#clc_2018_1km_interp = clc_2018_1km.interp(longitude=xr_sm_ried['longitude'], latitude=xr_sm_ried['latitude'])
xr_sm_ried_interp = xr_sm_ried.interp(longitude=clc_1000['longitude'], latitude=clc_1000['latitude'], method='nearest')

xr_ndvi_ried_interp = xr_ndvi_ried.interp(longitude=clc_1000['longitude'], latitude=clc_1000['latitude'], method='nearest')


In [None]:
# Flatten the three co-located grids to 1-D numpy arrays (despite the df_ prefix
# these are arrays, not DataFrames) ...
df_sm_ried_interp = xr_sm_ried_interp.values.ravel()
df_clc_1000_interp = clc_1000.values.ravel()
df_ndvi_ried_interp = xr_ndvi_ried_interp.values.ravel()
# ... and combine them column-wise, one row per grid cell.
# NOTE(review): rows only line up if all three grids share the same dimension order -- confirm.
df_clc_sm = pd.DataFrame({
    'sm': df_sm_ried_interp,
    'clc': df_clc_1000_interp,
    'ndvi': df_ndvi_ried_interp,
})

In [None]:
# Scatter of land-cover class against soil-moisture std; rows with any NaN are dropped first.
df_clc_sm.dropna(how='any').plot.scatter(x='clc', y='sm', c='clc', cmap='viridis', title='coreine land cover class vs. sm std')

In [None]:
# Scatter of soil-moisture std against NDVI std (both columns hold standard deviations over time).
df_clc_sm.dropna(how='any').plot.scatter(x='sm', y='ndvi', c='ndvi', cmap='viridis', title='ndvi vs. soil moisture')

In [None]:
# Sanity check: number of cells in each flattened grid / table.
# NOTE(review): the first two lines print the length of the same array (clc_1000)
# under two different labels -- one of them was presumably meant for another raster.
print('clc_2018 values: ', len(clc_1000.values.reshape(-1)))
print('clc_2018_1km values: ', len(clc_1000.values.reshape(-1)))
print('xr_sm_ried_interp: ', len(xr_sm_ried_interp.values.reshape(-1)))
print('dataframe_sm_ried: ', len(dataframe_sm_ried))

## Correlation

In [None]:
# Keep AOI model values only where the land-cover group equals 2; all other cells become NaN.
xr_model_aoi_clc2 = xr_model_aoi.where(clc_1000_grp['clc_grp'] == 2)

In [None]:
def get_x_y(cords, dataset):
    """Collect the time series of the three model variables at each coordinate.

    Parameters
    ----------
    cords : iterable of sequences
        Each entry is indexed as (latitude, longitude) = (loc[0], loc[1]).
    dataset : object with ``sel(longitude=..., latitude=..., method='nearest')``
        The selection must expose `time`, `soil_moisture_1km`,
        `precipitation_1km` and `real_evapotranspiration`, each with `.values`.

    Returns
    -------
    tuple of ten lists, one entry per coordinate, in this order:
        times of finite soil moisture, precipitation, evapotranspiration;
        finite soil moisture, precipitation, evapotranspiration (as float64);
        unfiltered soil moisture, precipitation, evapotranspiration;
        unfiltered times.
    """
    variables = ('soil_moisture_1km', 'precipitation_1km', 'real_evapotranspiration')
    finite_times = {name: [] for name in variables}
    finite_values = {name: [] for name in variables}
    raw_values = {name: [] for name in variables}
    raw_times = []

    for loc in cords:
        # nearest grid cell to the requested coordinate
        point = dataset.sel(longitude=loc[1], latitude=loc[0], method='nearest')
        times = point.time.values
        raw_times.append(times)

        for name in variables:
            values = getattr(point, name).values
            is_finite = np.isfinite(values)
            raw_values[name].append(values)
            finite_times[name].append(times[is_finite])
            finite_values[name].append(values[is_finite].astype('double'))

    sm, pp, ept = variables
    return (finite_times[sm], finite_times[pp], finite_times[ept],
            finite_values[sm], finite_values[pp], finite_values[ept],
            raw_values[sm], raw_values[pp], raw_values[ept],
            raw_times)

In [None]:
# Time series for every coordinate in r3, taken from the dataset masked to land-cover group 2.
x, x1, x2, y, y1, y2, y_nan, y1_nan, y2_nan,x_nan = get_x_y(r3, xr_model_aoi_clc2)

# Empty statistics table: one float column per variable/statistic combination.
# A single all-NaN row fixes the column names and dtypes and is dropped again right away.
df_stat = pd.DataFrame({
    f'{variable}_{statistic}': [np.nan]
    for variable in ('sm', 'pp', 'ept')
    for statistic in ('mean', 'std', 'var')
})
df_stat = df_stat.dropna().reset_index(drop=True)

In [None]:
# Number of coordinates in r3 (= number of rows in the figure below).
len(r3)

In [None]:
# One row per coordinate in r3; for each variable a time-series panel followed by a histogram panel.
# squeeze=False keeps `ax` two-dimensional even when r3 holds a single coordinate.
fig, ax = plt.subplots(ncols=6, nrows=len(r3), figsize=(40,len(r3)*4.5), squeeze=False)


def _plot_series_and_hist(ax_series, ax_hist, times, values, label):
    """Plot `values` over `times`, and its histogram overlaid with a fitted normal pdf."""
    ax_series.plot(times, values, label=label)
    ax_series.legend()
    ax_hist.hist(values)
    # normal density with the sample mean / sample std, drawn on a secondary y axis
    mean, std = values.mean(), values.std(ddof=1)
    x_nd = np.linspace(mean - 3*std, mean + 3*std, 10)
    ax_hist.twinx().plot(x_nd, scipy.stats.norm.pdf(x_nd, mean, std))


#loop through coordinates collection and plot timeseries, hists for values and collect statistics
stat_rows = []
for i, (time, time_pp, time_ept, sm, pp, ept) in enumerate(zip(x, x1, x2, y, y1, y2)):
    stat_rows.append({'sm_mean': sm.mean(), 'sm_std': sm.std(ddof=1), 'sm_var': sm.var(ddof=1),
                      'pp_mean': pp.mean(), 'pp_std': pp.std(ddof=1), 'pp_var': pp.var(ddof=1),
                      'ept_mean': ept.mean(), 'ept_std': ept.std(ddof=1), 'ept_var': ept.var(ddof=1)})
    _plot_series_and_hist(ax[i][0], ax[i][1], time, sm, 'soil moisture')
    _plot_series_and_hist(ax[i][2], ax[i][3], time_pp, pp, 'precipitation')
    _plot_series_and_hist(ax[i][4], ax[i][5], time_ept, ept, 'real Evapotranspiration ')

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat instead.
df_stat = pd.concat([df_stat, pd.DataFrame(stat_rows)], ignore_index=True)
display(df_stat)

## Covariance & Spearman & pearson correlation coefficient

In [None]:
fig,ax=plt.subplots(ncols = len(x_nan), figsize=(len(x_nan)*8,4))

# `panel` (not `ax`) is the loop name, so the axes returned by plt.subplots stay bound to `ax`.
for i, (time, sm, pp, ept, panel) in enumerate(zip(x_nan, y_nan, y1_nan, y2_nan, fig.axes)):
    # `df` stays bound after the loop: the cells below read df.sm / df.pp / df.ept
    # of the last coordinate.
    df = pd.DataFrame({'time' : time, 'sm' : sm, 'pp' : pp, 'ept' : ept})
    # Correlate the three data columns explicitly so the datetime column is left out.
    overall_pearson_r = df[['sm', 'pp', 'ept']].corr(method='pearson')
    print(f"Pandas computed overall Pearson r: \n{overall_pearson_r}")

    # scipy cannot handle NaN: drop rows where sm or pp is missing. Restricting the
    # dropna to these two columns keeps the same pairs pandas uses for its sm/pp entry
    # (dropping on all columns also removed rows where only ept or time was missing).
    sm_pp = df[['sm', 'pp']].dropna()
    r, p = scipy.stats.pearsonr(sm_pp.sm, sm_pp.pp)
    print('')
    print(f"Scipy computed overall Pearson r: \n{r} and p-value: \n{p}")

    # Centered rolling median over 12 samples; only its finite soil-moisture values are plotted.
    rolling_median = df.set_index('time').rolling(window=12,center=True).median()
    rolling_median.sm[np.isfinite(rolling_median.sm)].plot(ax=panel)
    panel.set(xlabel='Time',ylabel='soil moisture, rolling median (window=12)')
    panel.set(title=f"Overall Pearson r \n{np.round(overall_pearson_r,2)}")
    
    ## Set window size to compute moving window synchrony.
    #r_window_size = 7
    ## Interpolate missing data.
    #df_interpolated = df.interpolate()
    ## Compute rolling window synchrony
    #rolling_r = abs(df_interpolated.sm.rolling(window=7, center=True).corr(df_interpolated.pp))
    #rolling_r2 = abs(df_interpolated.sm.rolling(window=15, center=True).corr(df_interpolated.pp))
    #rolling_r3 = abs(df_interpolated.sm.rolling(window=30, center=True).corr(df_interpolated.pp))
    #rolling_r4 = abs(df_interpolated.sm.rolling(window=60, center=True).corr(df_interpolated.pp))
    #
    #f,ax=plt.subplots(2,1,figsize=(14,6),sharex=True)
    #df.set_index('time').rolling(window=15,center=True).median().plot(ax=ax[0])
    #ax[0].set(xlabel='Frame',ylabel='values ')
    #print(rolling_r)
    #rolling_r.plot(ax=ax[1], linewidth=0.3, grid=True)
    #rolling_r2.plot(ax=ax[1], linewidth=0.4)
    #rolling_r3.plot(ax=ax[1], linewidth=1.3, grid=True)
    #rolling_r4.plot(ax=ax[1], linewidth=2, grid=True)
    #ax[1].set(xlabel='Frame',ylabel='Pearson r sm & pp')
    #plt.suptitle("sm & pp data and rolling window (30days) correlation")
    

## Cross Correlation
find leader and follower

In [None]:
# `HTML` is not among the names imported in the setup cell (only display_html is),
# so import it here to keep this cell runnable on a fresh kernel.
from IPython.display import HTML

# Embed the illustration GIF; the path is relative to the notebook's directory.
HTML('<img src="1mWsGTGVdAsy6KoF3n3MyLA.gif">')

In [None]:
def crosscorr(datax, datay, lag=0, wrap=False):
    """ Lag-N cross correlation.

    Correlates `datax` with `datay` shifted by `lag` samples.

    Parameters
    ----------
    datax, datay : pandas.Series objects of equal length
    lag : int, default 0
        Positive values shift `datay` forward (towards later positions),
        negative values shift it backward.
    wrap : bool, default False
        False: positions vacated by the shift are filled with NaN and ignored
        by the correlation.
        True: the shift is circular -- samples pushed out at one end re-enter
        at the other -- so all samples take part for any lag.

    Returns
    ----------
    crosscorr : float
    """
    if wrap:
        # np.roll is a circular shift for positive, zero and negative lags alike.
        # The earlier slice-based fill raised for lag=0 (empty target slice) and
        # left the NaN tail in place for negative lags, i.e. did not wrap.
        shiftedy = pd.Series(np.roll(datay.values, lag), index=datay.index)
        return datax.corr(shiftedy)
    else:
        return datax.corr(datay.shift(lag))

# Lagged cross-correlation between precipitation (d1) and soil moisture (d2).
# NOTE(review): `df` is whatever DataFrame an earlier cell left bound (hidden state).
d1 = df.pp
d2 = df.sm
# Pearson r for lags 0..59 samples (soil moisture is shifted by `lag`).
rs = [crosscorr(d1,d2, lag) for lag in range(60)]
# NOTE(review): the lags start at 0, so position len(rs)/2 is lag 30, not zero lag;
# `offset` and the 'Center' line only make sense for a symmetric lag range -- confirm intent.
offset = np.ceil(len(rs)/2)-np.argmax(rs)
f,ax=plt.subplots(figsize=(14,3))
ax.plot(rs)
ax.axvline(np.ceil(len(rs)/2),color='k',linestyle='--',label='Center', linewidth=3)
# lag with the highest correlation
ax.axvline(np.argmax(rs),color='red',linestyle='solid',label='Peak synchrony', linewidth=3)
#ax.set(title=f'Offset = {offset} frames\nS1 leads <> S2 leads',ylim=[.1,.5],xlim=[0,1901], xlabel='Offset',ylabel='Pearson r')
#ax.set_xticks([0, 50, 100, 151, 201, 251, 301,351,401,451,501,551,601,651,701,751,801,851,901,951,1001,1051,1101,1151,1201,1251,1301,1351,1401,1451,1501,1551,1601,1651,1701,1751,1801,1801])
#ax.set_xticklabels([-900,-850,-800,-750,-700,-650,-600,-550,-500,-450,-400,-350,-300,-250,-200,-150, -100, -50, 0, 50, 100, 150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900]);
plt.legend()

## Windowed time lagged cross correlation

https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9

https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9

In [None]:
# Windowed time lagged cross correlation: the series is cut into consecutive
# windows and the lagged correlation (lags -days_w..+days_w) is computed per window.
days_w = 14
no_splits = int(len(df)/days_w)
# float number of samples per window
samples_per_split = df.shape[0]/no_splits
rss=[]
for t in range(0, no_splits):
    # NOTE(review): .loc slices by index label (end label included) using float bounds --
    # this presumably relies on df having a default integer index; confirm.
    d1 = df.pp.loc[(t)*samples_per_split:(t+1)*samples_per_split]
    d2 = df.sm.loc[(t)*samples_per_split:(t+1)*samples_per_split]
    rs = [crosscorr(d1,d2, lag) for lag in range(-int(days_w),int(days_w+1))]
    rss.append(rs)
# rows = windows, columns = lags
rss = pd.DataFrame(rss)
f,ax = plt.subplots(figsize=(10,8))
sns.heatmap(rss,cmap='RdBu_r',ax=ax)
# NOTE(review): rss has 2*days_w+1 columns, but limits, ticks and labels below cover
# only 2*days_w of them (labels end at days_w-1) -- confirm.
ax.set(title=f'Windowed Time Lagged Cross Correlation',xlim=[0,days_w*2], xlabel='Offset',ylabel='Window epochs')
ax.set_xticks(range(0,days_w*2))
ax.set_xticklabels(range(-days_w,days_w));
#ax.set_xticks([0, 5, 10, 15, 20, 25, 30])
#ax.set_xticklabels([-15, -10, -5, 0, 5, 10, 15]);

## Rolling Window time lag cross correlation

In [None]:
# Rolling window time lagged cross correlation
days_w = 7 # largest lag (in samples) evaluated on either side of zero
window_size = 400 #samples
t_start = 0
t_end = t_start + window_size
step_size = 14
# Every lag from -days_w to +days_w; the tick layout below is derived from this
# range so it stays correct when days_w is changed.
lags = range(-int(days_w), int(days_w + 1))
rss=[]
# Slide a window of window_size samples over the series in steps of step_size.
while t_end < len(df):
    d1 = df.pp.iloc[t_start:t_end]
    d2 = df.sm.iloc[t_start:t_end]
    rs = [crosscorr(d1,d2, lag, wrap=False) for lag in lags]
    rss.append(rs)
    t_start = t_start + step_size
    t_end = t_end + step_size
# rows = window positions (epochs), columns = lags
rss = pd.DataFrame(rss)

f,ax = plt.subplots(figsize=(10,10))
sns.heatmap(rss,cmap='RdBu_r',ax=ax)
ax.set(title=f'Rolling Windowed Time Lagged Cross Correlation',xlim=[0,len(lags)], xlabel='Offset',ylabel='Epochs')
ax.set_xticks(range(len(lags)))
ax.set_xticklabels(lags);
plt.savefig('figures//Rolling_Windowed_Time_Lagged_Cross_Correlation.png')


## DTW Dynamic Time Warping

In [None]:
# Dynamic time warping between precipitation (d1) and soil moisture (d2).
# Gaps are filled by interpolation first; then only samples with soil moisture > 0
# are kept (the comparison is False for NaN, so remaining NaN in d2 are dropped too).
d1 = df.pp.interpolate().values
d2 = df.sm.interpolate().values
mask_d2 = d2 > 0
d1 = d1[mask_d2]
d2 = d2[mask_d2]

# d: DTW distance; path: the warping path as index arrays into d1 and d2
d, cost_matrix, acc_cost_matrix, path = accelerated_dtw(d1,d2, dist='euclidean')

# accumulated cost matrix with the warping path drawn in white
plt.imshow(acc_cost_matrix.T, origin='lower', cmap='gray', interpolation='gaussian')
plt.plot(path[0], path[1], 'w')
plt.xlabel('precipitation')
plt.ylabel('soil moisture')
plt.title(f'DTW Minimum Path with minimum distance: {np.round(d,2)}')
plt.savefig('figures//DTW_Minimum_Path_with_minimum_distance.png') # save before plt.show()


plt.show()

In [None]:

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    `lowcut` and `highcut` are given in the same units as the sampling rate
    `fs`; they are normalised by the Nyquist frequency (fs / 2) before being
    passed to `scipy.signal.butter`.

    Returns the filter coefficients ``(b, a)``.
    """
    nyquist = fs * 0.5
    band = [lowcut / nyquist, highcut / nyquist]
    numerator, denominator = butter(order, band, btype='band')
    return numerator, denominator


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter `data` with the coefficients from `butter_bandpass`.

    The filter is applied with `scipy.signal.filtfilt`; the filtered signal
    is returned.
    """
    coefficients = butter_bandpass(lowcut, highcut, fs, order=order)
    return filtfilt(*coefficients, data)

# Band-pass settings handed to butter_bandpass_filter
lowcut  = .01
highcut = .5
fs = 30.
order = 1
# Fill gaps by interpolation, then keep only samples with soil moisture > 0
# (the comparison is False for NaN, so remaining NaN in d2 are dropped too).
d1 = df.pp.interpolate().values   # precipitation
d2 = df.sm.interpolate().values   # soil moisture
d3 = df.ept.interpolate().values  # evapotranspiration
mask_d2 = d2 > 0
d1 = d1[mask_d2]
d2 = d2[mask_d2]
d3 = d3[mask_d2]

y1 = butter_bandpass_filter(d1,lowcut=lowcut,highcut=highcut,fs=fs,order=order)
y2 = butter_bandpass_filter(d2,lowcut=lowcut,highcut=highcut,fs=fs,order=order)
y3 = butter_bandpass_filter(d3,lowcut=lowcut,highcut=highcut,fs=fs,order=order)

# Instantaneous phase angle of each filtered signal (angle of its analytic signal)
al1 = np.angle(hilbert(y1),deg=False)
al2 = np.angle(hilbert(y2),deg=False)
al3 = np.angle(hilbert(y3),deg=False)

# Phase synchrony between precipitation and soil moisture: 1 when the phases agree
phase_synchrony = 1-np.sin(np.abs(al1-al2)/2)
N = len(al1)

# Plot results
f,ax = plt.subplots(3,1,figsize=(14,7),sharex=True)
# y1 is the precipitation series and y2 the soil-moisture series; the legend
# labels used to be swapped.
ax[0].plot(y1,color='r',label='pp')
ax[0].plot(y2,color='b',label='sm')
ax[0].plot(y3,color='g',label='evp')

ax[0].legend(bbox_to_anchor=(0., 1.02, 1., .102),ncol=2)
ax[0].set(xlim=[0,N], title='Filtered Timeseries Data')
ax[1].plot(al1,color='r')
ax[1].plot(al2,color='b')
ax[1].plot(al3,color='g')
ax[1].set(ylabel='Angle',title='Angle at each Timepoint',xlim=[0,N])
ax[2].plot(phase_synchrony)
ax[2].set(ylim=[0,1.1],xlim=[0,N],title='Instantaneous Phase Synchrony',xlabel='Time',ylabel='Phase Synchrony')
plt.tight_layout()
# Save before showing: once plt.show() has displayed the figure, a following
# savefig would write an empty canvas.
plt.savefig('figures//Instantaneous_Phase_Synchrony.png')
plt.show()


Problem: pandas handles NaN values transparently, whereas NumPy and SciPy do not; NaN values must be dropped before passing data to them.

https://soilgrids.org/#!/?layer=ORCDRC_M_sl2_250m&vector=1

## http://jinhyuncheong.com/jekyll/update/2017/12/10/Timeseries_synchrony_tutorial_and_simulations.html

# Clustering

In [None]:
# K-means with 2, 3 and 4 clusters, adapted from the scikit-learn iris example.
# NOTE(review): this cell mixes the iris example with `df_cluster`, which is not
# defined in this part of the notebook -- several statements below look broken.
iris = datasets.load_iris()
print(iris.target)
# NOTE(review): .loc[:, :2] slices by column *label*; presumably the first two
# feature columns were meant (.iloc) -- confirm against df_cluster.
X = df_cluster.loc[:,:2]
# NOTE(review): `target` is an attribute of the iris bunch, not of X -- this
# presumably fails unless df_cluster has a column named 'target'.
y_iris = X.target
print(y_iris)
km2 = cluster.KMeans(n_clusters=2).fit(X)
km3 = cluster.KMeans(n_clusters=3).fit(X)
km4 = cluster.KMeans(n_clusters=4).fit(X)
# One scatter panel per cluster count, coloured by cluster label; J = inertia of the fit.
plt.figure(figsize=(9, 3))
plt.subplot(131)
# NOTE(review): X[:, 0] is numpy-style indexing; it is not valid if X is a DataFrame.
plt.scatter(X[:, 0], X[:, 1], c=km2.labels_)
plt.title("K=2, J=%.2f" % km2.inertia_)
plt.subplot(132)
plt.scatter(X[:, 0], X[:, 1], c=km3.labels_)
plt.title("K=3, J=%.2f" % km3.inertia_)
plt.subplot(133)
plt.scatter(X[:, 0], X[:, 1], c=km4.labels_)#.astype(np.float))
plt.title("K=4, J=%.2f" % km4.inertia_)
X

In [None]:
# One soil-moisture time series per coordinate in r3 (nearest grid cell), as columns of a DataFrame.
sm_coords = pd.DataFrame([xr_model_aoi.sel(longitude=x[1], latitude=x[0], method='nearest').soil_moisture_1km.to_series() for x in r3]).transpose()
column_names_new = [str(x) for x in r3]
column_names_old = sm_coords.columns.tolist()
# Label the columns by position. Renaming through an old->new dict collapses
# when the old names repeat (every series carries the same variable name), which
# would give all columns the label of the last coordinate.
sm_coords.columns = column_names_new
# Daily resampling with interpolation, then every 120th day for a readable line plot.
sm_coords_i = sm_coords.resample('D').interpolate()[::120]
fig, ax = plt.subplots(figsize=(26,8))
plot = sm_coords_i.plot(style='o-', ax=ax)
# The former recolouring loop iterated over `lines`, which is never defined
# (its assignment had been commented out) and raised a NameError; it was removed.

ax.set(ylabel='soil moisture cm³/cm³')
plt.savefig('figures//ts_r3_lineplot.png')


In [None]:
# Soil moisture of the second coordinate (column 1) without missing values.
# (The names `diet` / `gym` are kept from the tutorial this section follows.)
diet = sm_coords.iloc[:,1].dropna()
# yearly means and a 15-sample rolling mean of the same series
diet_resamp_yr = diet.resample('A').mean()
diet_roll_yr = diet.rolling(15).mean()
fig, ax = plt.subplots(figsize=(26,8))

diet.plot(alpha=0.5, style='-',ax = ax) # raw series, semi-transparent
diet_resamp_yr.plot(style=':', label='Resample at year frequency', ax=ax)
# The label states the window that is actually used above (it used to say 12).
diet_roll_yr.plot(style='--', label='Rolling average (smooth), window size=15', ax=ax)
ax.set(ylabel='soil moisture cm³/cm³')
ax.legend()
plt.savefig('figures//rolling_average-example.png')


In [None]:
# Manual moving average over `diet` with a window of `win` samples.
# A dedicated name is used for the array so the notebook-level `x` (the time
# axes returned by get_x_y, read again by later cells) is not overwritten.
diet_values = np.asarray(diet)
win = 12
win_half = int(win / 2)
# Window [idx - win_half, idx + win_half); towards the end of the series the
# window is cut off by the array bounds and therefore averages fewer samples.
diet_smooth = np.array([diet_values[(idx-win_half):(idx+win_half)].mean() for idx in np.arange(win_half, len(diet_values))])
fig, ax = plt.subplots(figsize=(26,8))
plt.plot(diet_smooth)

In [None]:
# All coordinate columns, restricted to rows without any missing value.
gym = sm_coords.iloc[:].dropna()
# 12-sample rolling means of the single series `diet` and of all columns in `gym`, side by side.
# NOTE(review): `diet` is also one of the columns of `gym`, so its label presumably appears twice -- confirm.
df_avg = pd.concat([diet.rolling(12).mean(), gym.rolling(12).mean()], axis=1)
fig, ax = plt.subplots(figsize=(26,8))
df_avg.plot(ax=ax)

plt.xlabel('Year')
plt.savefig('figures//ts_r3_rolling_mean.png') # save the rolling-mean plot


In [None]:
# Detrending: subtract the rolling means in df_avg from the first two coordinate series.
# NOTE(review): the subtraction aligns on column labels; df_avg holds more columns
# than the two selected here (and possibly a repeated label) -- confirm the result is as intended.
df_dtrend = sm_coords.iloc[:,0:2].dropna() - df_avg
fig, ax = plt.subplots(figsize=(26,8))

df_dtrend.plot(ax=ax)
plt.xlabel('Year')

## First-order differencing: Seasonal Patterns

In [None]:
# diff = original - shifted data
# (the first term is excluded from the check because it is NaN on both sides)
assert np.all((diet.diff() == diet - diet.shift())[1:])
fig, ax = plt.subplots(figsize=(26,8))

# First-order differences of columns 2 and 3 of `df`.
# NOTE(review): `df` is the frame left bound by an earlier cell; if it has the
# columns time/sm/pp/ept, these two are pp and ept -- confirm.
df.iloc[:,2:4].diff().plot(ax=ax)
plt.xlabel('timesteps')
plt.savefig('figures//First-order_differencing.png') # save the differencing plot


In [None]:
# Pearson and Spearman tests between weekly mean evapotranspiration and weekly
# mean precipitation, for every coordinate.
#
# The NaN-preserving series (y1_nan, y2_nan) are used on purpose:
#  * they all share the same time axis, so "week k" means the same samples for
#    both variables (the NaN-filtered series can differ in length);
#  * the names y1 / y2 are rebound by the phase-synchrony cell above, so they no
#    longer hold the per-coordinate series when the notebook is run top to bottom.
for pp_nan, ept_nan in zip(y1_nan, y2_nan):
    pp_nan = np.asarray(pp_nan, dtype=float)
    ept_nan = np.asarray(ept_nan, dtype=float)

    #weekly average (blocks of about 7 consecutive samples), ignoring missing values
    n_weeks = len(pp_nan) // 7
    ept_week_mean = np.array([np.nanmean(week) for week in np.array_split(ept_nan, n_weeks)])
    pp_week_mean = np.array([np.nanmean(week) for week in np.array_split(pp_nan, n_weeks)])

    # keep the weeks in which both variables have a value
    both_valid = np.isfinite(ept_week_mean) & np.isfinite(pp_week_mean)
    ept_week_mean_mask = ept_week_mean[both_valid]
    pp_week_mean_mask = pp_week_mean[both_valid]

    #assume normal distribution
    # (the setup cell imports `scipy.stats`; a bare `stats` name is not imported there)
    stat, p = scipy.stats.pearsonr(ept_week_mean_mask, pp_week_mean_mask )
    print('pearsonr')
    print('stat=%.3f, p=%.3f' % (stat, p))
    if p > 0.05:
        print('Probably independent')
    else:
        print('Probably dependent')

    # Spearman's Rank Correlation Test (no normality assumption)
    stat, p = scipy.stats.spearmanr(ept_week_mean_mask, pp_week_mean_mask )
    print('spearmanr')
    print('stat=%.3f, p=%.3f' % (stat, p))
    if p > 0.05:
        print('Probably independent')
    else:
        print('Probably dependent')


In [None]:
# Simulation: compare instantaneous phase synchrony with windowed correlation
# synchrony on two synthetic sine signals.
# NOTE(review): this cell rebinds the notebook-level names x, y1, y2, N, f and ax.
N = 600 # number of samples
T = 1.0 / 800.0 # sample spacing
x = np.linspace(0.0, N*T, N)
window_sizes = np.arange(10,51,10).astype(int)  # 10, 20, 30, 40, 50
phase_y1_1,phase_y1_2, phase_y2 = 80., 50., 60
amp_y1, amp_y2 = 1., 1.
# y1: sum of two sines (80 and 50 cycles per unit of x); y2: one sine (60 cycles)
y1 = amp_y1*np.sin(phase_y1_1 * 2.0*np.pi*x) + amp_y1*np.sin(phase_y1_2 * 2.0*np.pi*x)
y2 = amp_y2*np.sin(phase_y2 * 2.0*np.pi*x)
al1 = np.angle(hilbert(y1),deg=False)
al2 = np.angle(hilbert(y2),deg=False)
f = plt.figure(figsize=(20,8))
# plt.GridSpec instead of mpl.gridspec.GridSpec: the setup cell binds matplotlib
# to the name `mlp`, so `mpl` is undefined.
gs = plt.GridSpec(4,8)
ax=[f.add_subplot(gs[0,:5]), f.add_subplot(gs[1,:5]),
    f.add_subplot(gs[2,:5]),f.add_subplot(gs[3,:5]),
    f.add_subplot(gs[:2,5:]), f.add_subplot(gs[2:,5:])]
ax[0].plot(y1,color='r',label='y1')
ax[0].plot(y2,color='b',label='y2')
ax[0].legend(bbox_to_anchor=(0., 1.02, 1., .102),ncol=2)
ax[0].set(xlim=[0,N], title='Timeseries Data')
ax[1].plot(al1,color='r')
ax[1].plot(al2,color='b')
ax[1].set(ylabel='Angle', xlim=[0,N],title='Angle at each Timepoint')
phase_synchrony = 1-np.sin(np.abs(al1-al2)/2)
ax[2].plot(phase_synchrony)
ax[2].set(ylim=[0,1],xlim=[0,N],title='Instantaneous Phase Synchrony',ylabel='Phase\nSynchrony')
window_corr_synchrony = pd.DataFrame(columns=window_sizes,index=np.arange(0,N))
for window_size in window_sizes:
    # NOTE(review): `rolling_correlation` is not defined in this part of the notebook --
    # confirm it is defined in an earlier cell.
    window_corr_synchrony[window_size]=rolling_correlation(data=pd.DataFrame({'y1':y1,'y2':y2}),wrap=True,window=window_size,center=True)
window_corr_synchrony.plot(ax=ax[3])
ax[3].legend(bbox_to_anchor=(0., 1.02, 1., .102),ncol=3)
ax[3].set(ylim=[-1.1,1.1],xlim=[0,N],title='Windowed Correlation Synchrony',xlabel='Time',ylabel='Correlation\nSynchrony')

# Amplitude spectra of both signals (first half of the FFT output)
ticksteps = 30
yf1,yf2 = fft(y1),fft(y2) # perform FFT
amp1,amp2 = 2.0/N * np.abs(yf1),2.0/N * np.abs(yf2)
ax[4].plot(amp1[:N//2],color='r',label='y1')
ax[4].plot(amp2[:N//2],color='b',label='y2')
ax[4].set(xticks=(np.arange(0,N//2,ticksteps)), ylabel='Amplitude',xlabel='Frequency (Hz)',title='FFT result',xlim=[0,N//2])
ax[4].set_xticklabels([int(_tick) for _tick in np.round(fftfreq(N,T)[:N//2],2)[::ticksteps]],rotation=0)
ax[4].legend(bbox_to_anchor=(0., 1.02, 1., .102),ncol=3)

# Correlation between phase synchrony and windowed correlation synchrony, per window size
rs_per_window = pd.DataFrame(columns=['rs'],index=np.arange(min(window_sizes),max(window_sizes)+1,1))
rs_per_window['rs']=np.nan
for window_size in window_sizes:
    # scipy.stats.pearsonr: the setup cell imports `scipy.stats`, not a bare `stats`
    rs_per_window.loc[window_size,'rs'] = (np.round(scipy.stats.pearsonr(phase_synchrony,window_corr_synchrony[window_size].values.ravel())[0],2))
rs_per_window.interpolate(method='index').plot(ax=ax[5],legend=False)
ax[5].set(xticks=window_sizes,xticklabels=[int(w) for w in window_sizes],ylim=[-1.1,1.1],xlabel='Window Size',title='Association between Phase and Window correlation Synchrony')
ax[5].set_ylabel('Correlation between\nPhase and Window\nSynchrony',rotation=0,labelpad=70)
# highlight the window size with the highest correlation (and its direct neighbours)
rs_bool = rs_per_window==rs_per_window.max()
rs_bool.loc[rs_per_window.idxmax().values[0]-1:rs_per_window.idxmax().values[0]+1]=True
ax[5].fill_between(np.arange(min(window_sizes),max(window_sizes)+1,1),-1.1,1.1,where=rs_bool.values.ravel(),facecolor='red',alpha=.5)
plt.tight_layout()
plt.show()

Covariance https://scikit-learn.org/stable/modules/covariance.html#covariance  
https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/

In [None]:
# Shapiro-Wilk Normality Test, applied to each entry of `data`
from scipy.stats import shapiro
# NOTE(review): `x` and `y` are whatever earlier cells left bound (both names are
# reassigned several times in this notebook) -- confirm which series are meant.
data = [x,y]
for n, variable in enumerate(data):
    stat, p = shapiro(variable)
    # show the first and last two values to identify the variable
    print('variable: ', variable[0:2],'...',variable[-2:])
    print('stat=%.3f, p=%.3f' % (stat, p))
    # normality is not rejected at the 5 % level when p > 0.05
    if p > 0.05:
        print('Probably Gaussian')
    else:
        print('Probably not Gaussian')

In [None]:
dataset

In [None]:

# Author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD

def isotonic_reg(cords):
    """Fit an isotonic and a linear regression to the y-series of ``cords`` and plot both.

    The x-values returned by ``get_x_y`` are used only for their length; the
    models are fitted against the positional index 0..n-1.

    Returns the isotonic fit (result of ``IsotonicRegression.fit_transform``).
    """
    x_raw, y = get_x_y(cords)
    n = len(x_raw)
    x = np.arange(0, n)

    # fit both models on (index, y)
    y_ = IsotonicRegression().fit_transform(x, y)
    x_col = x[:, np.newaxis]  # LinearRegression needs a 2-D feature matrix
    lr = LinearRegression()
    lr.fit(x_col, y)

    # vertical segments joining every observation with its isotonic fit
    lc = LineCollection([[[i, y[i]], [i, y_[i]]] for i in range(n)], zorder=0)
    lc.set_array(np.ones(len(y)))
    lc.set_linewidths(np.full(n, 0.5))

    fig = plt.figure()
    plt.plot(x, y, 'r.', markersize=12)
    plt.plot(x, y_, 'b.-', markersize=12)
    plt.plot(x, lr.predict(x_col), 'b-')
    plt.gca().add_collection(lc)
    plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
    plt.title('Isotonic regression')
    return y_

In [None]:
y_ = isotonic_reg(coord_collection)

**does not assume any form for the target function such as linearity. For comparison a linear regression is also presented**  
https://scikit-learn.org/stable/auto_examples/plot_isotonic_regression.html#sphx-glr-auto-examples-plot-isotonic-regression-py  

In statistics, isotonic regression or monotonic regression is the technique of fitting a free-form line to a sequence of observations under the following constraints: the fitted free-form line has to be non-decreasing (or non-increasing) everywhere, and it has to lie as close to the observations as possible.

## Get locations where ndvi indicates bare_soil

In [None]:
# NOTE(review): test_ar_mean is only assigned two cells further down; on a fresh kernel this cell raises NameError
test_ar_mean

In [None]:
# Boolean mask of the pixels whose NDVI mean is NaN ...
mask = test_ar_mean.isnull()
# ... and its inverse: True where a valid NDVI mean exists
~mask

In [None]:
# Mean of the NDVI variable of s2_1c_ndvi_100m over the time dimension (one value per pixel)
test_ar_mean = s2_1c_ndvi_100m.NDVI.mean( dim='time')
test_ar_mean

## precipitation analysis

In [None]:
def get_coord_pairs(ds):
    """Return the longitude and latitude coordinate values of ``ds`` as two Python lists."""
    return tuple(getattr(ds, axis).to_series().tolist() for axis in ('longitude', 'latitude'))

def get_dates(ds):
    """Return the values of the ``time`` coordinate of ``ds`` as a Python list."""
    return ds.time.to_series().tolist()

def get_combinations(lon, lat, time, ds):
    """Collect every (lon, lat, time) cell whose precipitation exceeds 10.

    Cells are visited time-major, then longitude, then latitude. Each hit is
    returned as ``[lon, lat, time, precipitation]``.
    """
    def _precip(x, y, i):
        return float(ds.sel(longitude=[x], latitude=[y], time=[i]).radolan_precipitation_1km.values)

    cells = ([x, y, i, _precip(x, y, i)] for i in time for x in lon for y in lat)
    return [cell for cell in cells if cell[3] > 10]

def clean_combinations(comb, ds, threshold):
    """Keep only the events whose following day has precipitation below ``threshold``.

    ``comb`` entries are ``[lon, lat, time, ...]``; the day after ``time`` is
    looked up in ``ds`` at the same coordinates.
    """
    next_day = datetime.timedelta(days=1)

    def _dry_next_day(entry):
        following = ds.sel(longitude=entry[0], latitude=entry[1], time=entry[2] + next_day)
        return following.radolan_precipitation_1km.values < threshold

    return [entry for entry in comb if _dry_next_day(entry)]

def get_con_days(comb, ds, threshold):
    """Attach the dry period that follows each precipitation event.

    For every entry ``[lon, lat, time, ...]`` of ``comb`` the precipitation of
    the window ``time + 1 day .. time + 40 days`` is read from ``ds``. The dry
    period is the leading run of samples that do not exceed ``threshold``.

    Returns a new list in which each entry is extended by
    ``[count, evapo_r_mean, [index, soil_moisture, precipitation]]`` where
    ``count`` is the number of leading dry samples, ``evapo_r_mean`` the mean
    of ``evapo_r`` over the dry period and the three inner lists hold the
    positional index, soil moisture and precipitation of the dry period.

    If no sample in the window exceeds ``threshold`` the whole window counts
    as dry (previously ``count`` was left undefined or stale in that case).
    """
    one_day = datetime.timedelta(days=1)
    max_window = datetime.timedelta(days=40)

    dry_days = list()
    for combi in comb:
        t_start = combi[2] + one_day
        window_end = combi[2] + max_window

        # precipitation of the look-ahead window at this cell
        pp = ds.sel(longitude=combi[0], latitude=combi[1], time=slice(t_start, window_end)).radolan_precipitation_1km.values

        # index of the first sample above the threshold == length of the leading dry run
        wet = np.where(pp > threshold)[0]
        count = int(wet[0]) if len(wet) else len(pp)

        # dry period excludes the first wet sample
        # assumes one sample per day in ds — TODO confirm
        t_end = t_start + datetime.timedelta(days=count - 1)
        ds_s = ds.sel(longitude=combi[0], latitude=combi[1], time=slice(t_start, t_end))

        evapo_r = np.mean(ds_s.evapo_r.values.tolist())

        x = list(range(0, len(ds_s.time.values.tolist())))
        y = ds_s.soil_moisture_1km.values.tolist()
        z = ds_s.radolan_precipitation_1km.values.tolist()
        dry_days.append([count, evapo_r, [x, y, z]])

    # extend every event with its dry-period record
    comb_ext = [event + [record] for event, record in zip(comb, dry_days)]

    return comb_ext

def clean_con_days(comb_ext):
    """Keep the entries whose soil-moisture series has at least two non-NaN values.

    Two valid samples are the minimum needed to compute a slope. The series is
    read from ``entry[4][2][1]``.
    """
    def _valid_samples(entry):
        soil_moisture = entry[4][2][1]
        return np.count_nonzero(~np.isnan(soil_moisture))

    return [entry for entry in comb_ext if _valid_samples(entry) >= 2]
 
def polyfit_xr(values):
    """Fit a first-degree polynomial to the non-NaN soil-moisture samples.

    ``values[0]`` is the time index and ``values[1]`` the soil-moisture series
    (both indexable with an integer array). Returns ``[slope, intercept]``.
    """
    valid = np.where(~np.isnan(values[1]))[0]
    slope, intercept = np.polyfit(values[0][valid], values[1][valid], 1)
    return [slope, intercept]


def slope_xr(values):
    """Linear regression of soil moisture against the time index, ignoring NaN samples.

    ``values[0]`` is the time index and ``values[1]`` the soil-moisture series.
    Returns ``[slope, intercept, r_value, p_value, std_err]`` from ``linregress``.
    """
    valid = np.where(~np.isnan(values[1]))[0]
    fit = linregress(values[0][valid], values[1][valid])
    return [fit[0], fit[1], fit[2], fit[3], fit[4]]


def get_clc_soil(values):
    """Look up the CLC and soil-map raster values nearest to a coordinate.

    ``values[0]`` is used as raster ``x`` and ``values[1]`` as raster ``y``.
    Both lookups read band 1 with nearest-neighbour matching and a tolerance of 1000.
    Returns ``(clc_value, soil_value)``.
    """
    lookup = dict(band=1, y=values[1], x=values[0], method='nearest', tolerance=1000)
    raster_value_clc = clc_2018.sel(**lookup).values.tolist()
    raster_value_soil = soil_map_small_scale.sel(**lookup).values.tolist()
    return raster_value_clc, raster_value_soil

def get_ndvi(values):
    """Return the mean NDVI around an event from the first source that has data.

    ``values`` is ``[lon, lat, time, ...]``. The search window runs from 15 days
    before to 20 days after ``time``. The sources are tried in the order
    s2_1c_ndvi_100m, s2_1c, l7_sr_ndvi_100m, l8_sr_ndvi_100m; for each, the NaN-aware
    mean of the NDVI at the nearest pixel is computed.

    Returns ``[mean]`` for the first source with a finite mean, or ``[[nan]]``
    when no source has one. Previously the first source's value was returned
    even when it was NaN and a later source had data.
    """
    event = pd.Timestamp(values[2])
    window = slice(event - datetime.timedelta(days=15), event + datetime.timedelta(days=20))

    for source in (s2_1c_ndvi_100m, s2_1c, l7_sr_ndvi_100m, l8_sr_ndvi_100m):
        pixel = source.sel(time=window).sel(longitude=values[0], latitude=values[1], method='nearest')
        mean_ndvi = np.nanmean(pixel.NDVI.values.tolist())
        if np.isfinite(mean_ndvi):
            return [mean_ndvi]

    # no source delivered a usable value; keep the historical nested-NaN shape
    return [[np.nan]]

**Wäre es zulässig alle sm Einträge zur zeit von d0 auf 0.75 zu setzten weil ja der boden zu diesem zeitpunkt aus niederschlagssicht volle kanne erwischt wurde**

In [None]:
# Coordinate axes of `subset` as plain lists (an earlier run printed x: 37, y: 33)
longitude, latitude = get_coord_pairs(subset)
print('x: ', len(longitude), 'y: ', len(latitude))

In [None]:
# Same coordinate extraction as the previous cell (an earlier run printed x: 37, y: 33)
longitude, latitude = get_coord_pairs(subset)
print('x: ', len(longitude), 'y: ', len(latitude))

# Disabled: building the combinations here would need `time`, which is only assigned in the next cell
#comb = get_combinations(longitude, latitude, time, subset)
#print('Overall cell count: ', len(comb))

In [None]:
# Time axis of `subset` as a list (an earlier run printed x: 37, y: 33, time: 1812)
time = get_dates(subset)
print('x: ', len(longitude), 'y: ', len(latitude), 'time: ', len(time))
print(subset.dims)

In [None]:
# Scan every (time, lon, lat) cell and keep those with precipitation > 10 (earlier run: 87471)
# One ds.sel call per cell, so this cell is slow
comb = get_combinations(longitude, latitude, time, subset)
print('Overall cell count: ', len(comb))

In [None]:
# Keep only events whose following day has precipitation below 1 (earlier run: 43850)
comb_clean = clean_combinations(comb, subset, 1)
print('possible dry periods d>0 cell count: ', len(comb_clean))

In [None]:
# Attach the dry-period record (length, mean evapo_r, series) to every event (earlier run: 43850)
comb_ext = get_con_days(comb_clean, subset, 1)
print('possible dry periods d>0 cell count: ', len(comb_ext))

In [None]:
# Drop dry periods with fewer than two non-NaN soil-moisture samples (earlier run: 6206)
comb_ext_clean = clean_con_days(comb_ext) # 6206
print('possible dry periods d>0 & sm values not nan >= 2 cell count: ', len(comb_ext_clean))

In [None]:
# Append [slope, intercept] from np.polyfit as element 5 of every entry
comb_calc = [x + [polyfit_xr(np.array(x[4][2]))] for x in comb_ext_clean]
print('calculated polyfit1d slop & intercept')

In [None]:
# Append the linregress result [slope, intercept, r, p, std_err] as element 6 of every entry
comb_calc_1 = [x + [slope_xr(np.array(x[4][2]))] for x in comb_calc]
print('calculated lineregression slop & intercept')

In [None]:
# Append the CLC value (element 7) and the soil-map value (element 8) looked up at (lon, lat)
update_1 = [x + list(get_clc_soil(x[:2])) for x in comb_calc_1]
print('read corresponding clc_2018 & soil map value for coordinates')

In [None]:
# Append the number of non-NaN soil-moisture samples of each dry period.
# x[4][2] is [day_index, soil_moisture, precipitation]; soil moisture is element 1
# (the former x[4][-1][-1] counted the precipitation series instead).
update_2 = [x + [np.count_nonzero(~np.isnan(x[4][2][1]))] for x in update_1]
print('add the amount of sm values not nan')

In [None]:
# Append the NDVI lookup result as the last element of every entry (earlier run: 642 non-null)
update_3 = [x + list(get_ndvi(x)) for x in update_2] #642 not-null
# NOTE(review): both prints evaluate the same expression; only the label differs
print('add ndvi to list, periods without nan/overall periods: ', np.sum(np.isfinite([np.sum(x[-1]) for x in update_3])) , '/' , len(update_3))
print('means not nan / overall amount of means: ', np.sum(np.isfinite([np.sum(x[-1]) for x in update_3])) , '/' , len(update_3))

In [None]:
ried_model_v1 = update_3

## Create DataFrame from final json

In [None]:
# One flat list per future DataFrame column, extracted by position from update_3.
# Entry layout: 0 lon, 1 lat, 2 event time, 3 event precipitation,
# 4 [period length, mean evapo_r, [day index, soil moisture, precipitation]],
# 5 polyfit [slope, intercept], 6 linregress result, 7 CLC value, 8 soil-map value, last: NDVI
lon = [x[0] for x in update_3]
lat = [x[1] for x in update_3]
event_date = [pd.Timestamp(x[2]) for x in update_3]
event_pp = [x[3] for x in update_3]
event_et_mean = [x[4][1] for x in update_3]
periode_duration = [x[4][0] for x in update_3]
sm_measurements = [np.count_nonzero(~np.isnan(x[4][2][1])) for x in update_3]
slope_polyfit1d =  [x[5][0] for x in update_3]
intercept_polyfit1d =  [x[5][1] for x in update_3]
slope_lineregress = [x[6][0] for x in update_3]
intercept_lineregress = [x[6][1] for x in update_3]
ndvi = [np.nanmean(x[-1]) for x in update_3]
# NOTE(review): this rebinds the global name clc_2018, which get_clc_soil reads as a raster;
# re-running get_clc_soil after this cell will fail until the raster is reloaded
clc_2018 = [x[7] for x in update_3]
soil_map = [x[8] for x in update_3]

# Per-period series (lists of lists); these become object columns in pandas and are kept out of the frame
days_number_list = [x[4][2][0] for x in update_3]
# dates of the dry period: the days after the event date, one per counted day
days_list = [pd.date_range(x[2], periods=(x[4][0] + 1))[1:] for x in update_3]
sm_list = [x[4][2][1] for x in update_3]
pp_list = [x[4][2][2] for x in update_3]

In [None]:
# Assemble the per-event table from the column lists of the previous cell
ried_db = pd.DataFrame({'lon' : lon,
                        'lat' : lat,
                        'event_date' : event_date,
                        'event_pp' : event_pp,
                        'event_et_mean' : event_et_mean,
                        'periode_duration' : periode_duration,
                        'sm_measurements' : sm_measurements,
                        'slope_polyfit1d' : slope_polyfit1d,
                        'intercept_polyfit1d' : intercept_polyfit1d,
                        'slope_lineregress' : slope_lineregress,
                        'intercept_lineregress' : intercept_lineregress,
                        'clc_2018' : clc_2018,
                        'soil_map' : soil_map,
                        'ndvi' : ndvi
                       })
# DataFrame.info() prints its summary itself and returns None, so it must not be wrapped in print()
ried_db.info()
ried_db.head()

In [None]:
ried_db.info()

**Group Devis: Erst grob anfangen, also ndvi (0.0 - 0.3), wenn ergebnisse unscharf dann verfeinern der Gruppen ndvi (0.1 -0.2, 0.2-0.3 ...)**

Priority (high -> low) : 1) equal NDVI ranges, 2) clc same category

## Create grouped layer for data with equal ndvi intervals

In [None]:
# Split the events into 6 equal-width NDVI intervals and summarise the sample counts per interval
grp_v1 = ried_db.groupby(pd.cut(ried_db['ndvi'],6))
grp_v1.sm_measurements.describe()

## Show measurement positions on gmap

In [None]:
mask_v1 = (ried_db['ndvi'] < 0.2) & (ried_db['ndvi'] > 0.036) 

In [None]:
# Rows of ried_db selected by mask_v1 (NDVI strictly between 0.036 and 0.2)
db_measurements = ried_db[mask_v1]
print('length of db with mask v1:', len(db_measurements), '/', len(ried_db))

In [None]:
db_sort = db_measurements.sort_values(by=['sm_measurements'], ascending=False)

In [None]:
ried_db[1:2]

In [None]:
def show_thumbnail(db, location, zoom, label, index=None, col_names=None):
    """Show a satellite map marker for ``location`` plus a table of the first row of ``db``.

    Parameters
    ----------
    db : DataFrame whose first row (``db[0:1]``) and column names fill the table.
    location : coordinate tuple used as map centre and marker position
        (callers in this notebook pass ``(lat, lon)``).
    zoom : zoom level handed to ``gmaps.figure``.
    label : marker label, converted with ``str``.
    index, col_names : accepted for backward compatibility; not used.

    Returns
    -------
    ``(gmap, plt.show())`` — the gmaps figure and the return value of ``plt.show()``.
    """
    # satellite map with a single labelled marker
    gmap = gmaps.figure(center=location, map_type='SATELLITE', zoom_level=zoom, layout={'width': '400px', 'height': '300px'})
    gmap.add_layer(gmaps.marker_layer([location], label=str(label)))

    # table-only figure: axes are hidden, the table is the whole content
    fig, ax = plt.subplots()
    ax.axis('off')
    table = ax.table(cellText=db[0:1].values, colLabels=db.columns, cellLoc ='center')
    # rotate the header row (row 0) so long column names fit
    for (row, _col), cell in table.get_celld().items():
        if row == 0:
            cell.get_text().set_rotation(90)

    return gmap, plt.show()


In [None]:
# First row of ried_db as a plain Python list
table = ried_db[0:1].values.tolist()[0]
table


In [None]:

# Experimental: render the first row of ried_db as a matplotlib table.
# NOTE(review): the data passed here is a list wrapping a one-row list of lists while the index
# holds two labels — the shapes look inconsistent; confirm the intended layout before relying on this cell
dcsummary = pd.DataFrame([ried_db[0:1].values.tolist()],index=ried_db.columns.tolist()[0:2])

plt.table(cellText=dcsummary.values,colWidths = [0.25]*len(ried_db.columns),
          rowLabels=dcsummary.index,
          colLabels=dcsummary.columns,
          cellLoc = 'center', rowLoc = 'center',
          loc='top')

In [None]:
# Thumbnail for the event with the most soil-moisture samples: centre is (lat, lon) = columns 1 and 0,
# the label is column 13 (ndvi in ried_db's column order).
# NOTE(review): col_names is not assigned anywhere in the visible part of the notebook — confirm it exists
gmap, fig = show_thumbnail(ried_db, (db_sort.iloc[0][1],db_sort.iloc[0][0]), 15, db_sort.iloc[0][13], db_sort.iloc[0].name, col_names)
gmap

**A small amount of rain <1mm is enough to increase soil moisture up to +0.2**
**Better to set minimal amount of rain to 0.15**

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = db_sort.iloc[1]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 15, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = db_sort.iloc[2]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = db_sort.iloc[3]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = points.iloc[3]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = points.iloc[78]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = points.iloc[88]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = points.iloc[102]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = points.iloc[11]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# show_thumbnail expects (db, location, zoom, label, index, col_names); the table frame comes first
row = points.iloc[250]
gmap, fig = show_thumbnail(ried_db, (row.iloc[1], row.iloc[0]), 18, row.iloc[-2], row.name, ried_db.columns)
gmap

In [None]:
# Heatmap of all event positions on a satellite base map
locations = list(zip(lat, lon))  # (latitude, longitude) tuples

heatmap_layer = gmaps.heatmap_layer(locations) #, weights=soil_map
fig = gmaps.figure(map_type='SATELLITE')
fig.add_layer(heatmap_layer)
fig

**Where do the two patches come from?**

In [None]:
# Histogram of the number of soil-moisture samples per dry period
plt.axis([2, 13, 0, 1500])
arr = plt.hist(sm_measurements)

# arr[0] holds the bin counts and arr[1] the bin edges; zip pairs each count with its left edge.
# Dedicated loop names keep the notebook-level x and y series untouched.
for bin_count, bin_left in zip(arr[0], arr[1]):
    print('sm value amount: ', int(bin_left), 'measurements: ', bin_count)

**Worse Distribution ca. 80% der slopes aus nur aus Messwerten**

**Ich muss mir die einzelnen Perioden anschauen und innerhalb derer nach unterschieden im slope suchen so sind alle anderen parameter gleichzusetzten (Temperatur, Wind, Jahreszeit)**

In [None]:
# Entries of update_2 whose last element (the sample count appended in the update_2 cell) equals 10
boundary_0 = [x for x in update_2 if x[-1] == 10]

**Beispiel für Wald boundary_0[1]**

**Beispiel für Acker vielleicht bewässer boundary_0[20]**

funktion schreiben die für die einzelne messreihe das google maps bild bekommt mit graph drunter von sm 
https://stackoverflow.com/questions/7490491/capture-embedded-google-map-image-with-python-without-using-a-browser/50536888#50536888

**211 is agriculture land without irrigation**

In [None]:
# update_1 appends the CLC value flat at index 7 (and the soil value at index 8),
# so the CLC class is read from x[7] rather than from a (clc, soil) pair
agriculture = [x for x in update_1 if x[7] == 211]
print('corresponding clc to 211: ', len(agriculture))

In [None]:
# CLC class 211 (x[7]) with a mean evapo_r (x[4][1]) strictly between 0 and 5
agriculture_1 = [x for x in update_1 if (x[7] == 211) and (0 < x[4][1] < 5)]
print('corresponding clc to 211 & evapo_r between 0-5: ', len(agriculture_1))

In [None]:
agriculture_1[0][4][-1][-1]

In [None]:
# Positional layout of an update_1 entry: 4 -> [period length, mean evapo_r, series],
# 5 -> polyfit [slope, intercept], 8 -> soil-map value (appended flat after the CLC value)
slopes = [x[5][0] for x in agriculture]
evapo_r = [x[4][1] for x in agriculture]
color = [x[8] for x in agriculture]
soil = [x[8] for x in agriculture]

slopes_1 = [x[5][0] for x in agriculture_1]
evapo_r_1 = [x[4][1] for x in agriculture_1]
color_1 = [x[8] for x in agriculture_1]
soil_1 = [x[8] for x in agriculture_1]

print(len(slopes) == len(evapo_r) == len(color))
print(len(set(color)), set(color))
print(len(set(color_1)), set(color_1))


In [None]:
# Mean polyfit slope (x[5][0]) of the agricultural entries per soil-map class (x[8])
soil_class = [1,3,4,5,7,9,10,13]
slope_mean = [np.mean([x[5][0] for x in agriculture if x[8] == soil_id]) for soil_id in soil_class]
# keep the individual names available for cells that still refer to them
s1, s2, s3, s4, s5, s6, s7, s8 = slope_mean
soil_types_legend = {1:'Niedermoore, Hochmoore', 2:'Vega, Auengleye, örtl. Anmoorgleye', 3:'Tschernoseme', 4:'Parabraunerden', 5:'Braunerden mit Bändern, Bänder-Parabraunerden, örtl. Podsol-Braunerden', 7:'Pararendzinen, Braunerden mit Bändern, örtl. Bänder-Parabraunerden', 8:'Braunerden mit Bändern, Bänder-Parabraunerden, örtl. Podsol-Braunerden', 9:'Pararendzinen', 13: 'Parabraunerden, örtl. Pseudogley-Parabraunerden',25: 'Braunerden, Ranker-Braunerden, Regosol-Braunerden' , 34 : 'Braunerden, Braunerde-Pseudogleye, örtl. Podsol-Braunerden'} # legend

In [None]:
plt.scatter(soil_class, slope_mean)

**slope test m=(y2-y1)/(x2-x1) example: 0.3/8=0,0375 realistic example when you have 8 day difference between 2 sm values**

**Werte für die Tage von d=1 auf d=0.1 reduzieren** Darstellung ändert sich ?

In [None]:
plt.plot(soil, slopes, 'ro', markersize=0.4 )

In [None]:
# Slope against mean evapo_r for the filtered agricultural entries, coloured by the values in color_1
# (colour scale fixed to 0..7)
plt.axis([-0.1, 0.1, 2, 5])
plt.scatter(slopes_1, evapo_r_1, c=color_1, norm = plt.Normalize(vmin=0, vmax=7), cmap = "nipy_spectral")

In [None]:
# Same scatter for all agricultural entries, without fixed axis limits
plt.scatter(slopes, evapo_r, c=color, norm = plt.Normalize(vmin=0, vmax=7), cmap = "nipy_spectral")

**The horizontal lines are based on the fact that, where slopes are available, the number of similar evapo_r values is high, because it was a long dry event and the evapo_r value is a mean value for each period.**  

**

## Geographical description & subsetting   
https://georepository.com/crs_6933/WGS-84-NSIDC-EASE-Grid-2-0-Global.html


### Extent

In [None]:
# Adding the latitude and longitude coordinates broadcasts them into a 2-D array that covers the
# extent of each dataset; the summed values themselves are only used as a plotting layer below
coords_ds = dataset.latitude + dataset.longitude
subset_ds = subset.latitude + subset.longitude
coords_subset_min = subset_overlapping_sm.latitude + subset_overlapping_sm.longitude

# CRS object for EPSG:6933 (the EASE grid referenced in the section heading)
# NOTE(review): ccrs is not among the imports visible at the top of the notebook — confirm it is imported
EASE_crs = ccrs.epsg(6933)

# Dataset coordinate axes zipped pairwise into a MultiPoint geometry
# NOTE(review): MultiPoint is not among the visible imports (only Point is) — confirm it is imported
ds_lon = dataset.longitude.to_series().tolist()
ds_lat = dataset.latitude.to_series().tolist()
ds_multipoint = list(zip(ds_lon, ds_lat))
ds_multipoint = MultiPoint(ds_multipoint)

In [None]:
# Three map panels side by side, all in a PlateCarree projection
fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20,10), subplot_kw=dict(projection=ccrs.PlateCarree()), gridspec_kw=dict(wspace=0., hspace =0.))
panels = (ax1, ax2, ax3)

# Stamen terrain background at zoom level 8 on every panel
stamen_terrain = cimgt.Stamen('terrain-background')
for panel in panels:
    panel.add_image(stamen_terrain, 8)

# gridlines on every panel
for panel in panels:
    panel.gridlines()

# extent layers: full dataset (left), subset (right), overlapping subset (middle)
xr.plot.pcolormesh(darray = coords_ds,  ax=ax1, alpha=0.1)
xr.plot.pcolormesh(darray = subset_ds,  ax=ax3, alpha=0.1)
xr.plot.pcolormesh(darray = coords_subset_min,  ax=ax2, alpha=0.1)

## Variables, Dimensions & Coordinates

In [None]:
# Names of the two analysed variables and the matching DataArrays of `subset`;
# var1 is also read as a global by show_values
var1 = 'radolan_precipitation_1km'
var_1 = subset[var1]
var2 = 'soil_moisture_1km'
var_2 = subset[var2]


# Dimension names of the datasets
dim1 = 'longitude'
dim2 = 'latitude'
dim3 = 'time'

## Plot all values in single charts

To-Do:
Xarray struggles when plotting large data arrays; switching to another plotting module would be better...  
remove all the xaxes labels and show only the last one

In [None]:
def show_values(ds):
    """Plot the spatial mean of every variable of ``ds`` for every year, one panel each.

    Panels are stacked vertically in year-major order. The variable named by
    the global ``var1`` is drawn as a line; every other variable is drawn as
    'x' markers on a fixed y-range of 0.0 to 0.6.
    """
    years = np.unique(ds['time.year'].data)
    n_panels = len(years) * len(ds)
    fig, axes = plt.subplots(nrows=n_panels, figsize=(20,24))
    # plt.subplots returns a bare Axes for a single panel; normalise to an array
    axes = np.atleast_1d(axes)

    panel = 0
    for year in years:
        for var in ds:
            yearly_mean = ds[var].sel(time=str(year)).mean(['latitude', 'longitude'])
            ax = axes[panel]
            if var == var1:
                yearly_mean.plot(ax=ax)
            else:
                ax.set_autoscaley_on(False)
                ax.set_ylim(0.0,0.6)
                yearly_mean.plot(ax=ax, marker='x', linestyle='')
            # wrap around after the last panel instead of relying on a caught IndexError
            panel = (panel + 1) % n_panels
    return

In [None]:
show_values(subset)

In [None]:
def show_random_date(ds):
    """Plot every variable of ``ds`` at a randomly drawn time index (5 or later), one panel per variable.

    Returns the list of objects returned by the individual ``plot`` calls.
    """
    fig, axes = plt.subplots(ncols=len(ds), figsize=(12,4))
    candidates = np.arange(5, ds.time.shape[0])
    # a fresh random index is drawn for each variable
    return [
        ds[var].isel(time=np.random.choice(candidates)).plot(ax=axes[pos])
        for pos, var in enumerate(ds)
    ]



In [None]:
show_random_date(subset)

## Univariate Description

### Mean

In [None]:
subset

In [None]:
def get_mean(ds):
    """Draw two 2x2 figures: the per-location mean and standard deviation over time.

    Each variable of ``ds`` gets one panel per figure (at most four variables
    fit the grid). Nothing is returned.
    """
    for reducer, title_suffix in (('mean', 'location mean'), ('std', 'standart deviation')):
        fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(25,15))
        for var, axs in zip(ds, fig.axes):
            # reduce along time, then plot the remaining spatial field
            getattr(ds, reducer)(dim='time')[var].plot(ax = axs)
            axs.title.set_text(var + title_suffix)

In [None]:
get_mean(subset)

In [None]:
get_mean(dataset)

### Frequency Tables and Histograms

In [None]:
def show_hist(ds):
    """Plot a plain (left) and a cumulative (right) histogram for each variable of ``ds``.

    Uses a fixed 2x2 grid, one row per variable. Returns the list of results
    of the ``plot.hist`` calls in row order (plain, cumulative, plain, ...).
    """
    fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(13,10))
    plots = []
    for row, var in enumerate(ds):
        for col, extra in ((0, {}), (1, {'cumulative': True})):
            plots.append(ds[var].plot.hist(ax=axes[row,col], histtype ='bar', align='mid', **extra))
    return plots

def grouped_bins_rad(ds,var):
    """Group ``ds[var]`` by its own values using fixed bin edges from 0 to 1000."""
    edges = [0. , 0.1, 1,5,10,20,50,100,1000]
    return ds[var].groupby_bins(group=ds[var], bins=edges)
def grouped_bins_smap(ds,var):
    """Group ``ds[var]`` by its own values using fixed bin edges from 0 to 0.6 in steps of 0.1."""
    edges = [0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
    return ds[var].groupby_bins(group=ds[var], bins=edges)
    
def get_freq_table(ds,var):
    """Return a dict mapping each bin label 'left - right' to the sum of the values in that bin.

    NOTE(review): this calls ``grouped_bins``, which is not defined in the visible
    part of the notebook (only ``grouped_bins_rad`` and ``grouped_bins_smap`` are) — confirm it exists.
    """
    return {
        '%s - %s' %(interval.left,interval.right): members.to_series().sum()
        for interval, members in sorted(grouped_bins(ds,var))
    }

In [None]:
# Draws all histograms, then displays one element of the fourth hist result:
# [3] fourth plot call, [2] third item of its return value, [0] first element of that item
show_hist(subset)[3][2][0]

In [None]:
# Frequency table for the variable named by var1
print(var1, 'interval value count:')
get_freq_table(subset,var1)

In [None]:
# Frequency table for the variable named by var2
print(var2, 'interval value count:')
get_freq_table(subset,var2)

## Bivariate Description

### Comparing distribution

### Scatterplots

In [None]:
# Scatter plot of two dataset variables against each other
# NOTE(review): 'radolan_precipitation_1km.month' does not look like a variable name of `subset` — confirm the intended x
subset.plot.scatter(x='radolan_precipitation_1km.month', y='soil_moisture_1km')

In [None]:
# NOTE(review): isotonic_reg as defined in this notebook takes a single `cords` argument — confirm the intended input
isotonic_reg(x,y)

https://scikit-learn.org/stable/modules/covariance.html

# Preprocessing

## Subsetting (Rectangle) & ROI (Polygon)

In [None]:
# Read the three shapefiles with salem (pass cached=True to reuse a pickled copy):
# area of interest, the Hessisches Ried outline and the CLC vector layer
sl_aoi = salem.read_shapefile(paths[0] + 'aoi2020//aoi_2020.shp') #use cached=True for load pickle into temp
sl_ried = salem.read_shapefile(paths[2] + 'Ried_225_222//hessisches_ried.shp')
sl_clc_aoi = salem.read_shapefile(paths[0] + 'corine_land_cover//vec_clc_aoi.shp')

In [None]:
# Cut every gridded dataset to the extent of the AOI shape via the salem accessor;
# for the first three, time steps that contain only NaN are dropped afterwards
# (variable suffixes suggest ls_grid order sm, pp, evp, clc, clc_km — TODO confirm)
xr_aoi_sm = ls_grid[0].salem.subset(shape=sl_aoi).dropna(dim='time', how='all')
xr_aoi_pp = ls_grid[1].salem.subset(shape=sl_aoi).dropna(dim='time', how='all')
xr_aoi_evp = ls_grid[2].salem.subset(shape=sl_aoi).dropna(dim='time', how='all')

xr_aoi_clc = ls_grid[3].salem.subset(shape=sl_aoi)
xr_aoi_clc_km = ls_grid[4].salem.subset(shape=sl_aoi)

In [None]:
# Align the three AOI datasets on all dimensions except time (join='right' uses the indexes of the
# last dataset) and plot soil moisture of the aligned third dataset at time index 150
fig, ax = plt.subplots(figsize=(24,24))
xr.align(*[xr_aoi_evp, xr_aoi_pp, xr_aoi_sm], join='right', exclude=['time'])[2].soil_moisture_1km.isel(time=150).plot(ax=ax)

In [None]:
# Narrow the AOI datasets to the Ried shape: subset() crops to the shape's extent, roi() is then
# applied with the same shape; for the time series, all-NaN time steps are dropped afterwards
xr_ried_sm = xr_aoi_sm.salem.subset(shape=sl_ried).salem.roi(shape=sl_ried).dropna(dim='time', how='all')
xr_ried_pp = xr_aoi_pp.salem.subset(shape=sl_ried).salem.roi(shape=sl_ried).dropna(dim='time', how='all')
xr_ried_evp = xr_aoi_evp.salem.subset(shape=sl_ried).salem.roi(shape=sl_ried).dropna(dim='time', how='all')

xr_ried_clc = xr_aoi_clc.salem.subset(shape=sl_ried).salem.roi(shape=sl_ried)
xr_ried_clc_km = xr_aoi_clc_km.salem.subset(shape=sl_ried).salem.roi(shape=sl_ried)

## Interpolating

#### ~250m Trick: erst np.linspace mit xr sub sm dann mit complete mode l

#### AOI

In [None]:
# Fill NaN gaps by linear interpolation along longitude (limit=6)
xr_model_complete_i = xr_model_complete.interpolate_na(dim='longitude', method='linear', limit=6)

#### Ried

In [None]:
# Fill NaN gaps by linear interpolation along longitude (limit=4)
xr_model_complete_i_ried = xr_model_complete_ried.interpolate_na(dim='longitude', method='linear', limit=4)

In [None]:
# Fill NaN gaps along longitude with the nearest valid value (limit=3)
xr_model_data = xr_model_ried.interpolate_na(dim='longitude', method='nearest', limit=3)

In [None]:
# Soil-moisture comparison grid: one column per processing stage
# NOTE(review): xr_model_aoi is plotted on three panels ([0,0], [1,0], [0,1]) — possibly a copy-paste leftover; confirm
fig, ax = plt.subplots(ncols=5,nrows=2, figsize=(32,8))
xr_model_aoi.isel(time=1).soil_moisture_1km.plot(ax=ax[0,0])
xr_model_aoi.isel(time=1).soil_moisture_1km.plot(ax=ax[1,0])
xr_model_aoi.isel(time=1).soil_moisture_1km.plot(ax=ax[0,1])
xr_model_ried.isel(time=1).soil_moisture_1km.plot(ax=ax[1,1])
xr_i_aoi_sm.isel(time=1).soil_moisture_1km.plot(ax=ax[0,2])
xr_i_ried_sm.isel(time=1).soil_moisture_1km.plot(ax=ax[1,2])
# the complete-model panels use time index 94 instead of 1
xr_model_complete.isel(time=94).soil_moisture_1km.plot(ax=ax[0,3])
xr_model_complete_ried.isel(time=94).soil_moisture_1km.plot(ax=ax[1,3])
xr_model_complete_i.isel(time=94).soil_moisture_1km.plot(ax=ax[0,4])
xr_model_complete_i_ried.isel(time=94).soil_moisture_1km.plot(ax=ax[1,4])

In [None]:
# Precipitation at time index 1 through the processing stages, one panel per stage.
# Each dataset is drawn once; the duplicated calls drew the first two panels twice.
fig, ax = plt.subplots(ncols=5, figsize=(24,8))
precipitation_1km.isel(time=1).precipitation_1km.plot(ax=ax[0])
xr_sub_pp.isel(time=1).precipitation_1km.plot(ax=ax[1])
xr_i_sub_pp.isel(time=1).precipitation_1km.plot(ax=ax[2])
xr_model_complete.isel(time=1).precipitation_1km.plot(ax=ax[3])
xr_model_complete_i.isel(time=1).precipitation_1km.plot(ax=ax[4])

In [None]:
# Real evapotranspiration at time index 1 through the processing stages, one panel per stage
fig, ax = plt.subplots(ncols=5, figsize=(24,8))
real_evapotranspiration.isel(time=1).real_evapotranspiration.plot(ax=ax[0])
xr_sub_evp.isel(time=1).real_evapotranspiration.plot(ax=ax[1])
xr_i_sub_evp.isel(time=1).real_evapotranspiration.plot(ax=ax[2])
xr_model_complete.isel(time=1).real_evapotranspiration.plot(ax=ax[3])
xr_model_complete_i.isel(time=1).real_evapotranspiration.plot(ax=ax[4])

In [None]:
xr_i_ried_sm

In [None]:
# Side-by-side view of Ried soil moisture at time index 1: raw (left) and interpolated (right)
fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(16,8))
xr_ried_sm.soil_moisture_1km.isel(time=1).plot(ax=ax1)
ax1.set_title('Raw data')
xr_i_ried_sm.soil_moisture_1km.isel(time=1).plot(ax=ax2)
ax2.set_title('Interpolated data')

# Precipitation Analysis

In [None]:
def get_coord_pairs(ds):
    """Return ``(longitudes, latitudes)`` of ``ds`` as two plain Python lists."""
    lon_values = ds.longitude.to_series().tolist()
    lat_values = ds.latitude.to_series().tolist()
    return (lon_values, lat_values)

def get_dates(ds):
    """Return the time coordinate of ``ds`` as a plain Python list."""
    return ds.time.to_series().tolist()

def get_combinations(lon, lat, time, ds):
    """Return ``[lon, lat, time, precipitation]`` for every cell with precipitation above 10.

    Cells are visited time-major, then longitude, then latitude; one
    ``ds.sel`` lookup is made per cell.
    """
    comb = []
    for i in time:
        for x in lon:
            for y in lat:
                amount = float(ds.sel(longitude=[x], latitude=[y], time=[i]).precipitation_1km.values)
                if amount > 10:
                    comb.append([x, y, i, amount])
    return comb

#def get_combinations(lon, lat, time, ds):
#    #make list of all possible fields
#    comb = list()
#    for i in time:
#        for x in lon:
#            for y in lat:
#                pp = float(ds.sel(longitude=[x], latitude=[y], time=[i]).precipitation_1km.values)
#                if pp > 10:
#                    comb.append([x,y,i,pp])
#                else:
#                    continue
#                #comb.append([x,y,i])
#    return comb

def clean_combinations(comb, ds, threshold):
    """Return the events of ``comb`` whose next day has precipitation below ``threshold``.

    Each event is ``[lon, lat, time, ...]``; the lookup uses the same cell one day later.
    """
    one_day = datetime.timedelta(days=1)
    kept = []
    for event in comb:
        next_day_pp = ds.sel(longitude=event[0], latitude=event[1], time=event[2] + one_day).precipitation_1km.values
        if next_day_pp < threshold:
            kept.append(event)
    return kept

def get_con_days(comb, ds, threshold):
    """Attach the dry period that follows each precipitation event.

    For every entry ``[lon, lat, time, ...]`` of ``comb`` the precipitation of
    the window ``time + 1 day .. time + 40 days`` is read from ``ds``. The dry
    period is the leading run of samples that do not exceed ``threshold``.
    A ``time`` given as a string is parsed from its first ten characters
    (``YYYY-MM-DD``) and written back into the entry.

    Returns a new list in which each entry is extended by
    ``[count, evapo_mean, [index, soil_moisture, precipitation]]`` where
    ``count`` is the number of leading dry samples and ``evapo_mean`` the mean
    of ``real_evapotranspiration`` over the dry period.

    If no sample in the window exceeds ``threshold`` the whole window counts
    as dry (previously ``count`` was left undefined or stale in that case).
    """
    one_day = datetime.timedelta(days=1)
    max_window = datetime.timedelta(days=40)

    dry_days = list()
    for combi in comb:

        # normalise string dates in place so later date arithmetic works
        if isinstance(combi[2], str):
            combi[2] = datetime.datetime.strptime(combi[2][0:10], '%Y-%m-%d')

        t_start = combi[2] + one_day
        window_end = combi[2] + max_window

        # precipitation of the look-ahead window at this cell
        pp = ds.sel(longitude=combi[0], latitude=combi[1], time=slice(t_start, window_end)).precipitation_1km.values

        # index of the first sample above the threshold == length of the leading dry run
        wet = np.where(pp > threshold)[0]
        count = int(wet[0]) if len(wet) else len(pp)

        # dry period excludes the first wet sample
        # assumes one sample per day in ds — TODO confirm
        t_end = t_start + datetime.timedelta(days=count - 1)
        ds_s = ds.sel(longitude=combi[0], latitude=combi[1], time=slice(t_start, t_end))

        evapo_r = np.mean(ds_s.real_evapotranspiration.values.tolist())

        x = list(range(0, len(ds_s.time.values.tolist())))
        y = ds_s.soil_moisture_1km.values.tolist()
        z = ds_s.precipitation_1km.values.tolist()
        dry_days.append([count, evapo_r, [x, y, z]])

    # extend every event with its dry-period record
    comb_ext = [event + [record] for event, record in zip(comb, dry_days)]

    return comb_ext

def clean_con_days(comb_ext):
    """Keep only the periods that have at least two non-NaN soil-moisture values.

    Two values are the minimum needed to fit a slope. The soil-moisture
    series of a row is read from ``row[4][2][1]``.
    """
    clean_list = []
    for row in comb_ext:
        sm_series = row[4][2][1]
        n_valid = np.count_nonzero(~np.isnan(sm_series))
        if n_valid >= 2:
            clean_list.append(row)

    return clean_list
 
def polyfit_xr(values):
    """Fit a straight line to the non-NaN soil-moisture samples with ``np.polyfit``.

    Parameters
    ----------
    values : array-like
        ``values[0]`` holds the time-step indices and ``values[1]`` the
        soil-moisture values; further rows are ignored. Nested lists are
        accepted as well as arrays.

    Returns
    -------
    list
        ``[slope, intercept]`` of the degree-1 fit.
    """
    values = np.asarray(values, dtype=float)

    # use only the time steps that have a soil-moisture observation
    valid = ~np.isnan(values[1])
    s, i = np.polyfit(values[0][valid], values[1][valid], 1)

    return [s, i]


def slope_xr(values):
    """Linear regression (``scipy.stats.linregress``) over the non-NaN soil-moisture samples.

    ``values[0]`` holds the time-step indices, ``values[1]`` the soil-moisture
    values. Returns ``[slope, intercept, r_value, p_value, std_err]``.
    """
    # keep only the time steps that have a soil-moisture observation
    has_sm = ~np.isnan(values[1])
    regression = linregress(values[0][has_sm], values[1][has_sm])
    return [regression.slope,
            regression.intercept,
            regression.rvalue,
            regression.pvalue,
            regression.stderr]

def get_clc_soil(values):
    """Look up the land-cover value and the soil-map classes at a coordinate.

    ``values[0]`` is used as longitude / x and ``values[1]`` as latitude / y.
    Both rasters are notebook globals (``clc_1000`` and ``xr_soil_map_100``).
    The land-cover lookup takes the nearest cell without a distance limit; the
    soil-map lookup takes the nearest cell within ``tolerance=1000``.
    NOTE(review): the same coordinate pair is used for ``longitude``/``latitude``
    and for ``x``/``y`` -- presumably both rasters share one CRS; confirm.

    Returns a tuple ``(clc_value, [HAUPTGRUPP, GRUPPE, UNTERGRUPP, BODENEINHE])``.
    """
    lon, lat = values[0], values[1]

    raster_value_clc = clc_1000.sel(latitude=lat, longitude=lon, method='nearest').values.tolist()

    # soil-map variables, read in the order they are returned
    raster_value_soil = [
        getattr(xr_soil_map_100, level).sel(y=lat, x=lon, method='nearest', tolerance=1000).values.tolist()
        for level in ('HAUPTGRUPP', 'GRUPPE', 'UNTERGRUPP', 'BODENEINHE')
    ]
    return raster_value_clc, raster_value_soil

def get_ndvi(values):
    """Return the mean NDVI around an event from the first source that has data.

    The NDVI time series of the nearest cell is averaged (NaN-aware) over the
    window from 15 days before to 20 days after the event date ``values[2]``
    for each of the notebook-global datasets, in this order:
    ``s2_1c_ndvi_100m``, ``s2_1c``, ``l7_sr_ndvi_100m``, ``l8_sr_ndvi_100m``.

    The first finite mean in that order is returned. Previously the first
    source was always returned as soon as any source was finite, so a valid
    value from a later source was lost whenever the first one was NaN.

    Parameters
    ----------
    values : sequence
        ``values[0]`` longitude, ``values[1]`` latitude, ``values[2]`` event
        date (anything ``pd.Timestamp`` accepts).

    Returns
    -------
    list
        ``[ndvi_mean]`` when at least one source is finite, otherwise
        ``[[np.nan]]`` (shape kept as before for existing callers).
    """
    # valid time span to search for NDVI observations
    t_delta_f = datetime.timedelta(days=20)
    t_delta_b = datetime.timedelta(days=15)
    event_time = pd.Timestamp(values[2])
    time_window = slice(event_time - t_delta_b, event_time + t_delta_f)

    # sources in priority order
    sources = [s2_1c_ndvi_100m, s2_1c, l7_sr_ndvi_100m, l8_sr_ndvi_100m]

    ndvi = []
    for source in sources:
        in_window = source.sel(time=time_window)
        series = in_window.sel(longitude=values[0], latitude=values[1], method='nearest').NDVI.values.tolist()
        # nanmean yields NaN when the window holds no valid observation
        ndvi.append(np.nanmean(series))

    # positions of the sources that delivered a finite mean
    index = np.argwhere(np.isfinite(np.array(ndvi).flatten()))

    if len(index) > 0:
        ndvi = ndvi[int(index[0][0])]
    else:
        ndvi = [np.nan]
    return [ndvi]

## Functions

In [None]:
# Dataset the pipeline cells below operate on (xr_model_aoi is defined outside this section)
model = xr_model_aoi

In [None]:
# Get the longitude and latitude values of the model (an earlier run printed x: 37, y: 33)
longitude, latitude = get_coord_pairs(model)
print('x: ', len(longitude), 'y: ', len(latitude))

In [None]:
# Get the dates of the model (an earlier run printed x: 37, y: 33, time: 1812)
time = get_dates(model)
print('x: ', len(longitude), 'y: ', len(latitude), 'time: ', len(time))
print(model.dims)

In [None]:
# Combine coordinates and dates into all possible combinations, i.e. access to every single cell/date.
# Counts noted from earlier runs: 87471 (ried, 1 km, V1), 365692 (aoi, 1 km), 87994 (ried)
comb = get_combinations(longitude, latitude, time, model)
print('Overall cell count: ', len(comb))

In [None]:
# Delete all entries where rain on the first day is > 1 mm (threshold passed as the third argument).
# Counts noted from earlier runs: 43850 (ried, V1), 42364 (ried)
comb_clean = clean_combinations(comb, model, 1)
print('possible dry periods d>0 cell count: ', len(comb_clean))

In [None]:
# Get the number of consecutive days with no more than 1 mm rain after each event and
# attach the dry-period record to every row.
# Counts noted from earlier runs: 43850, 185893 (aoi), 42364 (ried)
# NOTE(review): the printed label is identical to the one of the previous cell.
comb_ext = get_con_days(comb_clean, model, 1)
print('possible dry periods d>0 cell count: ', len(comb_ext))

In [None]:
# Keep only periods with at least two non-NaN soil-moisture values.
# Counts noted from earlier runs: 6206 (ried), 49610 (aoi), 8275 (ried)
comb_ext_clean = clean_con_days(comb_ext)
print('possible dry periods d>0 & sm values not nan >= 2 cell count: ', len(comb_ext_clean))

In [None]:
# Append [slope, intercept] from polyfit_xr to every row (becomes x[5]);
# x[4][2] is the [step index, soil moisture, precipitation] record of the dry period
comb_calc = [x + [polyfit_xr(np.array(x[4][2]))] for x in comb_ext_clean]
print('calculated polyfit1d slop & intercept')

In [None]:
# Append [slope, intercept, r_value, p_value, std_err] from slope_xr to every row (becomes x[6])
comb_calc_1 = [x + [slope_xr(np.array(x[4][2]))] for x in comb_calc]
print('calculated lineregression slop & intercept')

In [None]:
# Append the (clc value, soil-map classes) pair looked up at the row's lon/lat (becomes x[7])
update_1 = [x + [get_clc_soil(x[:2])] for x in comb_calc_1]
print('read corresponding clc_2018 & soil map value for coordinates')

In [None]:
# Append the number of finite soil-moisture values of the dry period (becomes x[8])
update_2 = [x + [np.count_nonzero(np.isfinite(x[4][2][1]))] for x in update_1]
print('add the amount of sm values not nan')

In [None]:
# Append the NDVI value returned by get_ndvi to every row (becomes x[9]).
# Counts noted from earlier runs: 642 not-null, 4405 (ried)
# NOTE(review): both print statements below evaluate the same expression under different labels.
update_3 = [x + list(get_ndvi(x)) for x in update_2]
print('add ndvi to list, periods without nan/overall periods: ', np.sum(np.isfinite([np.sum(x[-1]) for x in update_3])) , '/' , len(update_3))
print('means not nan / overall amount of means: ', np.sum(np.isfinite([np.sum(x[-1]) for x in update_3])) , '/' , len(update_3))

## Create DataFrame from final json

In [None]:
# Scalar columns: one value per dry period (one row of update_3).
# Row layout: [lon, lat, event date, event precipitation,
#              [period length, mean evapotranspiration, [step index, sm, pp]],
#              polyfit result, linregress result, (clc, soil map), sm count, ndvi]
lon = [x[0] for x in update_3]
lat = [x[1] for x in update_3]
event_date = [pd.Timestamp(x[2]) for x in update_3]
event_pp = [x[3] for x in update_3]
event_et_mean = [x[4][1] for x in update_3]
periode_duration = [x[4][0] for x in update_3]
sm_measurements = [np.count_nonzero(np.isfinite(x[4][2][1])) for x in update_3]
slope_polyfit1d = [x[5][0] for x in update_3]
intercept_polyfit1d = [x[5][1] for x in update_3]
slope_lineregress = [x[6][0] for x in update_3]
intercept_lineregress = [x[6][1] for x in update_3]
# x[9] is either a scalar or the list [nan]; nanmean reduces both to a scalar
ndvi = [np.nanmean(x[9]) for x in update_3]
clc_2018 = [x[7][0] for x in update_3]
soil_map = [x[7][1] for x in update_3]
# Location key per grid cell. A string '<lon>_<lat>' is used instead of the numeric sum
# lon + lat, because different cells can add up to the same number and would then be
# merged when the key is used for grouping or for counting unique locations.
lonlat = ['{}_{}'.format(cell_lon, cell_lat) for cell_lon, cell_lat in zip(lon, lat)]

# list of lists == in pandas = objects, to be avoided if possible
days_number_list = [x[4][2][0] for x in update_3]
# dates of the dry period: the days after the event date, one per counted day
days_list = [pd.date_range(x[2], periods=(x[4][0] + 1))[1:] for x in update_3]
sm_list = [x[4][2][1] for x in update_3]
pp_list = [x[4][2][2] for x in update_3]

In [None]:
#save as json
#convert values that json cannot serialize while dumping (datetime is not JSON serializable)
def myconverter(o):
    """``default=`` hook for ``json.dump``: convert objects json cannot serialize.

    Dates and datetimes become their ``str()`` form, numpy datetimes their
    ISO string and other numpy scalars the matching Python scalar. Any other
    type raises ``TypeError`` instead of being written silently as ``null``,
    which is what an implicit ``None`` return used to cause.
    """
    if isinstance(o, (datetime.datetime, datetime.date)):
        return o.__str__()
    if isinstance(o, np.datetime64):
        return str(o)
    if isinstance(o, np.generic):
        return o.item()
    raise TypeError('Object of type {} is not JSON serializable'.format(type(o).__name__))

# Write update_3 to disk as JSON; values json cannot serialize are passed to myconverter
with open('C:\\Users\\USER\\Desktop\\master-thesis-master\\json\\update_3_v1_aoi_clc1000m.txt', 'w') as filehandle:
    json.dump(update_3, filehandle, default = myconverter)

In [None]:
# Load the stored ried result list from JSON into update_3.
# NOTE(review): the following cell rebinds update_3 to the aoi file, so run only the one that is needed.
with open(paths[0] + 'json//update_3_v1_ried_clcl1000m.txt') as json_file:
    update_3 = json.load(json_file)

In [None]:
#load json file into variable
with open(paths[0] + 'json//update_3_v1_aoi_clc1000m.txt') as json_file:
    update_3 = json.load(json_file)

# Rebuild every column list from the loaded records. Previously only the four
# list-of-list variables were refreshed here, so the DataFrame cell below relied
# on lon, lat, ... still being in memory from the pipeline run (or failed on a
# fresh kernel).
lon = [x[0] for x in update_3]
lat = [x[1] for x in update_3]
event_date = [pd.Timestamp(x[2]) for x in update_3]  # dates come back from JSON as strings
event_pp = [x[3] for x in update_3]
event_et_mean = [x[4][1] for x in update_3]
periode_duration = [x[4][0] for x in update_3]
sm_measurements = [np.count_nonzero(np.isfinite(x[4][2][1])) for x in update_3]
slope_polyfit1d = [x[5][0] for x in update_3]
intercept_polyfit1d = [x[5][1] for x in update_3]
slope_lineregress = [x[6][0] for x in update_3]
intercept_lineregress = [x[6][1] for x in update_3]
# x[9] is either a scalar or the list [nan]; nanmean reduces both to a scalar
ndvi = [np.nanmean(x[9]) for x in update_3]
clc_2018 = [x[7][0] for x in update_3]
soil_map = [x[7][1] for x in update_3]
# location key per grid cell; a string key cannot collide the way the numeric sum lon + lat can
lonlat = ['{}_{}'.format(cell_lon, cell_lat) for cell_lon, cell_lat in zip(lon, lat)]

#list of lists == in pandas = objects to be avoided if possible
days_number_list = [x[4][2][0] for x in update_3]
days_list = [pd.date_range(x[2], periods=(x[4][0] + 1))[1:] for x in update_3]
sm_list = [x[4][2][1] for x in update_3]
pp_list = [x[4][2][2] for x in update_3]

In [None]:
#create Pandas DataFrame: one row per dry period, one column per list built above
ried_db = pd.DataFrame({'lon' : lon,
                        'lat' : lat,
                        'event_date' : event_date,
                        'event_pp' : event_pp,
                        'event_et_mean' : event_et_mean,
                        'periode_duration' : periode_duration,
                        'sm_measurements' : sm_measurements,
                        'slope_polyfit1d' : slope_polyfit1d,
                        'intercept_polyfit1d' : intercept_polyfit1d,
                        'slope_lineregress' : slope_lineregress,
                        'intercept_lineregress' : intercept_lineregress,
                        'clc_2018' : clc_2018,
                        'soil_map' : soil_map,
                        'ndvi' : ndvi,
                        'lonlat' : lonlat
                       })
# info() prints its summary itself and returns None; wrapping it in print() added a stray "None" line
ried_db.info()
ried_db[0:5]

## Add derived columns

In [None]:
# Add the CLC category: the first character of the clc_2018 value, converted to int
ried_db['clc_category'] = ried_db['clc_2018'].apply(lambda x: int(str(x)[:1]))

# Add, for every row, the number of rows that share the same lonlat key
ried_db['lonlat_count'] = ried_db.groupby('lonlat')['lonlat'].transform('count')

print('Unique cell locations:', len(ried_db['lonlat'].unique()), 'periods:', len(ried_db))
ried_db.head()

In [None]:
# Export the table as CSV (path is relative to the working directory); dates are written as YYYY-MM-DD
ried_db.to_csv('csv_new\\ried_1000_v1.csv', date_format='%Y-%m-%d')

**Grouping guideline: start coarse, e.g. NDVI (0.0 - 0.3); if the results are too blurry, refine the NDVI groups (0.1 - 0.2, 0.2 - 0.3, ...)**

Priority (high -> low): 1) equal NDVI ranges, 2) same CLC category

## Create grouped layer for data with equal ndvi intervals

In [None]:
# Group the rows into 6 equal-width NDVI intervals (pd.cut with an integer bin count)
# and show summary statistics per interval; ried_db_agri is defined outside this section
grp_v1 = ried_db_agri.groupby(pd.cut(ried_db_agri['ndvi'],6))
grp_v1.describe()

#### Date

In [None]:
# Time stamps of xr_sub_sm as a list, plus the start and end date strings of the period of interest
ls_sm_dates = list(xr_sub_sm.time.values)
date_start = '2018-01-01'
date_end = '2019-01-01'

In [None]:
# Three parallel lists of gridded layers in the same order; judging by the variable names:
# soil moisture, precipitation, evapotranspiration, CLC 100 m, CLC 1000 m,
# for the full extent, the aoi subset and the ried subset -- TODO confirm
ls_grid = [soil_moisture_1km, precipitation_1km, real_evapotranspiration, clc_2018_100, clc_2018_1000]
ls_grid_sub = [xr_aoi_sm, xr_aoi_pp, xr_aoi_evp, xr_aoi_clc,  xr_aoi_clc_km]
ls_grid_ried = [xr_ried_sm, xr_ried_pp, xr_ried_evp, xr_ried_clc, xr_ried_clc_km]

In [None]:
# Base font size; tick labels are scaled relative to it
size=15
params = {'legend.fontsize': 'large',
          'figure.figsize': (20,8),
          'axes.labelsize': size,
          'axes.titlesize': size,
          'xtick.labelsize': size*0.75,
          'ytick.labelsize': size*0.75,
          'axes.titlepad': 25}
# Reset to Matplotlib's defaults first and apply the overrides afterwards. The previous
# order (update, then rcdefaults) discarded the overrides again, and it referenced the
# name `matplotlib`, which the notebook's import cell does not bind (it imports
# matplotlib as `mlp` and pyplot as `plt`).
plt.rcdefaults()
plt.rcParams.update(params)