# Dataset information

In [1]:
import xarray as xr
import pandas as pd
from urllib.parse import urljoin

In [None]:
# Function to extract the period from the filename
def extract_period(filename):
    period = filename.split("_")[-1].replace(".ncml", "")
    return period

# Function to extract the dataset name
def extract_dataset_name(filename):
    name = filename.split("_")[1]  # Extract the name part (e.g., AEMET, CHELSA, etc.)
    return name

In [17]:
# Base URL for the 'observations' directory
base_url = "https://data.meteo.unican.es/thredds/dodsC/PTI-clima/observations/"

# List of files to process
files = [
    "OBSERVATIONS_AEMET-5KM-regular_Iberia_day_20240319.ncml",
    "OBSERVATIONS_CHELSA-W5E5v1.0_Canarias_day_20240319.ncml",
    "OBSERVATIONS_CHELSA-W5E5v1.0_Iberia_day_20240319.ncml",
    "OBSERVATIONS_PTI-grid-v0_Canarias_day_20240319.ncml",
    "OBSERVATIONS_PTI-grid-v0_Iberia_day_20240319.ncml"
]

# List to store the dataset information
dataset_info = []

# Iterate over the files
for filename in files:
    file_url = urljoin(base_url, filename)
    
    try:
        # Open the dataset with xarray
        ds = xr.open_dataset(file_url)
        
        # Extract the period from the filename
        period = extract_period(filename)
        
        # Extract the dataset name
        dataset_name = extract_dataset_name(filename)
                
        # Get the time (start and end)
        time = ds.time
        start_time = pd.to_datetime(time.values[0]).strftime("%d/%m/%Y %H:%M:%S")  # Convert to desired format
        end_time = pd.to_datetime(time.values[-1]).strftime("%d/%m/%Y %H:%M:%S")  # Convert to desired format
        
        # Calculate temporal resolution
        if len(time) > 1:
            temporal_diff = pd.to_timedelta(time.values[1] - time.values[0])
            if temporal_diff == pd.Timedelta(days=1):
                temporal_resolution = "Day"
            elif temporal_diff == pd.Timedelta(hours=1):
                temporal_resolution = "Hour"
            else:
                temporal_resolution = f"{temporal_diff}"
        else:
            temporal_resolution = "Unknown"
        
        # Get the resolution from the lat and lon coordinates (if available)
        lat = ds.coords.get('lat', None)
        lon = ds.coords.get('lon', None)
        if lat is not None and lon is not None:
            resolution = f"{abs(lat[1] - lat[0]):.2f}° x {abs(lon[1] - lon[0]):.2f}°"
        else:
            resolution = "Unknown"
        
        # Gather data variables information
        data_vars_info = []
        for var_name, var_data in ds.data_vars.items():
            dims = var_data.dims  # Get the dimensions
            data_vars_info.append(f"{var_name} ({', '.join(dims)})")
        
        # Add the dataset information to the list
        dataset_info.append({
            "Dataset": dataset_name,
            "Period": period,
            "Start Time": start_time,
            "End Time": end_time,
            "Temporal Resolution": temporal_resolution,
            "Resolution": resolution,
            "Data Variables": " ; ".join(data_vars_info),
            "Type": "Observation"
        })
        
    except Exception as e:
        print(f"Error opening {filename}: {e}")

# Create a DataFrame with the gathered information
observations_df = pd.DataFrame(dataset_info)
print(observations_df)

# Save the DataFrame to a CSV file
csv_filename = "datasets_info_observations.csv"
observations_df.to_csv(csv_filename, index=False)

print(f"CSV file '{csv_filename}' created.")


1 days 00:00:00
1 days 00:00:00
1 days 00:00:00
             Dataset    Period           Start Time             End Time  \
0  AEMET-5KM-regular  20240319  01/01/1951 00:00:00  31/12/2022 00:00:00   
1    CHELSA-W5E5v1.0  20240319  01/01/2000 00:00:00  01/01/2000 00:00:00   
2    CHELSA-W5E5v1.0  20240319  01/01/2000 00:00:00  01/01/2000 00:00:00   
3        PTI-grid-v0  20240319  01/01/1961 00:00:00  31/12/2022 00:00:00   
4        PTI-grid-v0  20240319  01/01/1961 00:00:00  31/12/2022 00:00:00   

  Temporal Resolution     Resolution  \
0                 Day  0.05° x 0.05°   
1             Unknown  0.01° x 0.01°   
2             Unknown  0.01° x 0.01°   
3                 Day  0.02° x 0.02°   
4                 Day  0.02° x 0.03°   

                                      Data Variables         Type  
0  lon_bnds (lon, bnds) ; lat_bnds (lat, bnds) ; ...  Observation  
1  time_fx () ; orog (time, lat, lon) ; pr (time,...  Observation  
2  time_fx () ; orog (time, lat, lon) ; pr (time,.

In [14]:
# Base URL for the 'reanalysis' directory
base_url = "https://data.meteo.unican.es/thredds/dodsC/PTI-clima/reanalysis/"

# List of files to process (updated to valid filenames)
files = [
    "REANALYSIS_ERA5-Land_Canarias_day_20240319.ncml",
    "REANALYSIS_ERA5-Land_Iberia_day_20240319.ncml"
]


# List to store the dataset information
dataset_info = []

# Iterate over the files
for filename in files:
    file_url = urljoin(base_url, filename)
    
    try:
        # Open the dataset with xarray
        ds = xr.open_dataset(file_url)
        
        # Extract the period from the filename
        period = extract_period(filename)
        
        # Extract the dataset name
        dataset_name = extract_dataset_name(filename)
        
        # Assuming the first variable is the main one for time
        variable = list(ds.data_vars.keys())[0]  # Get the first variable
        
        # Get the time (start and end)
        time = ds.time
        start_time = pd.to_datetime(time.values[0]).strftime("%d/%m/%Y %H:%M:%S")  # Convert to desired format
        end_time = pd.to_datetime(time.values[-1]).strftime("%d/%m/%Y %H:%M:%S")   # Last time value
                
        # Calculate temporal resolution
        if len(time) > 1:
            temporal_diff = pd.to_timedelta(time.values[1] - time.values[0])
            if temporal_diff == pd.Timedelta(days=1):
                temporal_resolution = "Day"
            elif temporal_diff == pd.Timedelta(hours=1):
                temporal_resolution = "Hour"
            else:
                temporal_resolution = f"{temporal_diff}"
        else:
            temporal_resolution = "Unknown"
            
        # Get the resolution from the lat and lon coordinates (if available)
        lat = ds.coords.get('lat', None)
        lon = ds.coords.get('lon', None)
        if lat is not None and lon is not None:
            resolution = f"{abs(lat[1] - lat[0]):.2f}° x {abs(lon[1] - lon[0]):.2f}°"
        else:
            resolution = "Unknown"
        
        # Gather data variables information
        data_vars_info = []
        for var_name, var_data in ds.data_vars.items():
            dims = var_data.dims  # Get the dimensions
            data_vars_info.append(f"{var_name} ({', '.join(dims)})")
        
        # Add the dataset information to the list
        dataset_info.append({
            "Dataset": dataset_name,
            "Period": period,
            "Resolution": resolution,
            "Start Time": start_time,
            "End Time": end_time,
            "Data Variables": "; ".join(data_vars_info),
            "Type": "Reanalysis"  # Added "Reanalysis" as the type
        })
        
    except Exception as e:
        print(f"Error opening {filename}: {e}")

# Create a DataFrame with the gathered information
reanalysis_df = pd.DataFrame(dataset_info)

# Print the DataFrame for review
print(reanalysis_df)

# Save the DataFrame to a CSV file
csv_filename = "datasets_info_reanalysis.csv"
reanalysis_df.to_csv(csv_filename, index=False)

print(f"CSV file '{csv_filename}' created.")


Unknown
Unknown
     Dataset    Period     Resolution           Start Time  \
0  ERA5-Land  20240319  0.10° x 0.10°  01/01/1950 00:00:00   
1  ERA5-Land  20240319  0.10° x 0.10°  01/02/1950 00:00:00   

              End Time                                     Data Variables  \
0  31/12/2022 00:00:00  d2m (time, latitude, longitude); sfcwind (time...   
1  31/12/2022 00:00:00  d2m (time, latitude, longitude); sfcwind (time...   

         Type  
0  Reanalysis  
1  Reanalysis  
CSV file 'datasets_info_reanalysis.csv' created.


In [4]:
# Concatenate the two DataFrames
combined_df = pd.concat([observations_df, reanalysis_df], ignore_index=True)

# Save the combined DataFrame to a CSV file
csv_filename = "datasets_info.csv"
combined_df.to_csv(csv_filename, index=False)

print(f"CSV file '{csv_filename}' created with combined data.")

CSV file 'datasets_info.csv' created with combined data.
