### Comparing the SM dataset

Using the yearly SM datasets to the comparison due to computation limits

In [1]:
import xarray as xr
import pandas as pd
import dask.dataframe as dd
import os
import numpy as np

path2data = "/Users/tejasvi/Dropbox/Database/Hydrology/era5_land_soil_moisture/"
path2out = "/Users/tejasvi/Dropbox/Database/Hydrology/era5_land_soil_moisture/processed/"
path2Wang = "/Users/tejasvi/Dropbox/Database/Hydrology/Wang_2021_Soil_Moisture/processed/"

In [2]:
# Load the ERA5-Land dataset
ds_era = xr.open_dataset(path2out + "swvl_yearly_avg.nc")
# Print to see structure (optional)
#print(ds)

# Select specific variables to include (e.g., swvl1, swvl2)
selected_vars = ['swvl_0_100', 'swvl_0_289']

# Convert selected variables to a long-format DataFrame
df_era = ds_era[selected_vars].to_dataframe().reset_index()

print(df_era.shape)

# Convert to Dask DataFrame
df_era = dd.from_pandas(df_era, npartitions=4)

#Add relevant time vars
# Ensure the date_column is of string type
df_era['time'] = df_era['time'].astype(str)

# Extract 'yearmon' (first 7 characters: 'YYYY-MM')
df_era['yearmon'] = df_era['time'].str[:7]

# Extract 'year' (first 4 characters: 'YYYY')
df_era['year'] = df_era['time'].str[:4].astype('int64')

# Extract 'month' (characters at position 6-7: 'MM')
df_era['month'] = df_era['time'].str[5:7]

df_era['lon'] = df_era['lon'].astype(float).round(2)
df_era['lat'] = df_era['lat'].astype(float).round(2)

df_era['lon'] = (df_era['lon'] + 180) % 360 - 180

df_era = df_era.dropna(subset=['lon', 'lat', 'year'])

# Select and re-order the columns as requested
selected_columns = [
    'lon', 'lat', 'yearmon', 'year', 'month', 
    'swvl_0_100', 'swvl_0_289'
]

# Re-order the DataFrame based on the selected columns
df_era = df_era[selected_columns]

# Preview the DataFrame
print(df_era.head())

(11923200, 5)
     lon    lat  yearmon  year month  swvl_0_100  swvl_0_289
0 -180.0 -89.75  1979-01  1979    01    0.205914    0.166020
1 -179.5 -89.75  1979-01  1979    01    0.205157    0.165758
2 -179.0 -89.75  1979-01  1979    01    0.204446    0.165513
3 -178.5 -89.75  1979-01  1979    01    0.203720    0.165261
4 -178.0 -89.75  1979-01  1979    01    0.203007    0.165014


In [3]:
# Load the Wang dataset
ds_wang = xr.open_dataset(path2Wang + "yearly_avg.nc")
# Print to see structure (optional)
#print(ds)

# Select specific variables to include (e.g., swvl1, swvl2)
selected_vars = ['sm_0_100']

# Convert selected variables to a long-format DataFrame
df_wang = ds_wang[selected_vars].to_dataframe().reset_index()

print(df_wang.shape)

# Convert to Dask DataFrame
df_wang = dd.from_pandas(df_wang, npartitions=4)

#Add relevant time vars
# Ensure the date_column is of string type
df_wang['time'] = df_wang['time'].astype(str)

# Extract 'yearmon' (first 7 characters: 'YYYY-MM')
df_wang['yearmon'] = df_wang['time'].str[:7]

# Extract 'year' (first 4 characters: 'YYYY')
df_wang['year'] = df_wang['time'].str[:4].astype('int64')

# Extract 'month' (characters at position 6-7: 'MM')
df_wang['month'] = df_wang['time'].str[5:7]

df_wang['lon'] = df_wang['lon'].astype(float).round(2)
df_wang['lat'] = df_wang['lat'].astype(float).round(2)

df_wang['lon'] = (df_wang['lon'] + 180) % 360 - 180

df_wang = df_wang.dropna(subset=['lon', 'lat', 'year'])

# Select and re-order the columns as requested
selected_columns = [
    'lon', 'lat', 'yearmon', 'year', 'month', 
    'sm_0_100'
]

# Re-order the DataFrame based on the selected columns
df_wang = df_wang[selected_columns]

# Preview the DataFrame
print(df_wang.head())


(8552684, 4)
      lon    lat  yearmon  year month  sm_0_100
0 -179.75 -54.75  1970-01  1970    01       NaN
1 -179.25 -54.75  1970-01  1970    01       NaN
2 -178.75 -54.75  1970-01  1970    01       NaN
3 -178.25 -54.75  1970-01  1970    01       NaN
4 -177.75 -54.75  1970-01  1970    01       NaN


In [5]:
print(df_era.dtypes)
print(df_wang.dtypes)

# Convert to sets and compute
year_common = set(df_era['year'].unique().compute()) & set(df_wang['year'].unique().compute())
lon_common = set(df_era['lon'].unique().compute()) & set(df_era['lon'].unique().compute())
lat_common = set(df_era['lat'].unique().compute()) & set(df_era['lat'].unique().compute())


print("Common year:", len(year_common))
print("Common lon:", len(lon_common))
print("Common lat:", len(lat_common))

lon           float64
lat           float64
yearmon        object
year            int64
month          object
swvl_0_100    float64
swvl_0_289    float64
dtype: object
lon         float64
lat         float64
yearmon      object
year          int64
month        object
sm_0_100    float64
dtype: object
Common year: 38
Common lon: 720
Common lat: 360


In [18]:
# Merge the two DataFrames on columns 'col1', 'col2', 'col3'
df_merged = dd.merge(df_era, df_wang, on=['lon', 'lat', 'year'], how='inner')

# Compute the result (this triggers the actual computation and brings the data into memory)
df_merged = df_merged.compute()

# Preview the DataFrame
print(df_merged.shape)
print(df_merged.head())

(0, 10)
Empty DataFrame
Columns: [lon, lat, yearmon_x, year, month_x, swvl_0_100, swvl_0_289, yearmon_y, month_y, sm_0_100]
Index: []


In [7]:
# Get a sample of keys
keys_era = df_era[['lon', 'lat', 'year']].drop_duplicates().compute()
keys_wang = df_wang[['lon', 'lat', 'year']].drop_duplicates().compute()

# Merge keys only to see if there's any overlap
key_matches = keys_era.merge(keys_wang, on=['lon', 'lat', 'year'], how='inner')
print(f"Number of matching key combinations: {len(key_matches)}")

Number of matching key combinations: 0


In [15]:
print("ERA lat range & step:", keys_era['lat'].min(), keys_era['lat'].max(), keys_era['lat'].diff().unique())
print("WANG lat range & step:", keys_wang['lat'].min(), keys_wang['lat'].max(), keys_wang['lat'].diff().unique())

print("ERA lon range & step:", keys_era['lon'].min(), keys_era['lon'].max(), keys_era['lon'].diff().unique())
print("WANG lon range & step:", keys_wang['lon'].min(), keys_wang['lon'].max(), keys_wang['lon'].diff().unique())

ERA lat range & step: -89.75 89.75 [   nan    0.     0.5 -179.5]
WANG lat range & step: -54.75 78.75 [   nan    0.     0.5 -133.5]
ERA lon range & step: -180.0 179.5 [   nan    1.5    1.     4.5    2.     0.5   11.     3.     2.5   10.
    8.     4.     7.5    5.5    3.5    5.     7.     8.5    9.     6.
 -353.     6.5   12.5 -354.5 -355.  -358.5 -358.  -353.5 -354.     9.5
 -359.5 -359.  -350.  -357.5 -355.5]
WANG lon range & step: -179.75 179.75 [   nan    1.5    0.5    1.     5.5    2.     2.5    3.5    3.     4.
    4.5   16.5   10.5    6.     5.     7.5    6.5   10.  -358.  -357.5
    8.    22.    13.     7.  -355.  -356.     8.5 -359.  -354.5    9.
   11.    26.5 -358.5 -355.5 -357.  -359.5]


In [None]:
df_era_pandas = df_era.compute()
df_wang_pandas = df_wang.compute()
df_merged_pandas = pd.merge(df_era_pandas, df_wang_pandas, on=['lon', 'lat', 'year'], how='inner')
print(df_merged_pandas)