# Data preparation

## Table of Contents
- Data preparation
- Read the CORA data (.nc)
- Read the ERA5 data (.nc)
- Combine CORA and ERA5
- Change the date col format

## Read the CORA data (.nc)

In [20]:
import xarray as xr

# Open the CORA NetCDF file as an xarray Dataset
cora_path = 'cora_8518750.nc'
cora_data = xr.open_dataset(cora_path)

In [21]:
# pandas is first needed in this cell, so import it here: without this the
# cell raises NameError on a fresh kernel (Restart & Run All).
import pandas as pd

# Parse the CORA 'date' values with the "%Y-%m-%d %H:%M" layout into timestamps.
# NOTE(review): presumably required so 'date' aligns with the ERA5 time axis
# when the two datasets are merged — confirm against the raw CORA file.
cora_data["date"] = pd.to_datetime(cora_data["date"].values, format="%Y-%m-%d %H:%M")

# Show the dataset so the converted 'date' values can be inspected
cora_data

## Read the ERA5 data (.nc)

In [22]:
# Open the ERA5 NetCDF file as an xarray Dataset
era5_path = 'era5_data.nc'
era5_data = xr.open_dataset(era5_path)

In [23]:
# Remove the sea surface temperature variable from the ERA5 dataset
era5_data = era5_data.drop_vars(['sea_surface_temperature'])

In [24]:
# Rename ERA5's 'time' to 'date' (the name the CORA dataset uses)
era5_data = era5_data.rename(time="date")

In [25]:
# Display the ERA5 dataset to inspect its variables and coordinates
era5_data

## Combine CORA and ERA5

In [27]:
# Merge CORA and ERA5 into one dataset; the inner join keeps only the
# index values that both datasets share.
datasets_to_merge = [cora_data, era5_data]
combined_data = xr.merge(datasets_to_merge, join='inner')

# Show the merged result
combined_data

In [28]:
# Write the merged dataset using the same notebook-relative location that the
# following step loads it from, instead of an absolute home-directory path
# that only exists on one machine.
combined_data.to_netcdf('../../data/time_series/combined-8518750.nc')

## Change the date col format

In [29]:
import xarray as xr
import pandas as pd

# Input (merged CORA + ERA5) and output locations for this step
file_path = "../../data/time_series/combined-8518750.nc"
output_file = "../../data/time_series/8518750.nc"

# Open the merged NetCDF file
ds = xr.open_dataset(file_path)

# Parse the 'date' values into datetime64 timestamps
raw_dates = ds["date"].values
ds["date"] = pd.to_datetime(raw_dates, format="%Y-%m-%d %H:%M")

# Optional: rename 'date' to 'time' if a downstream consumer requires it
# ds = ds.rename({"date": "time"})

# Sanity check: show the first 10 timestamps and the overall time range
date_coord = ds["date"]
print(date_coord.values[:10])
print("Dataset time range:", date_coord.min().values, "to", date_coord.max().values)

# Persist the converted dataset and report where it was written
ds.to_netcdf(output_file)
print(f"Fixed dataset saved to: {output_file}")


['2022-01-01T00:00:00.000000000' '2022-01-01T01:00:00.000000000'
 '2022-01-01T02:00:00.000000000' '2022-01-01T03:00:00.000000000'
 '2022-01-01T04:00:00.000000000' '2022-01-01T05:00:00.000000000'
 '2022-01-01T06:00:00.000000000' '2022-01-01T07:00:00.000000000'
 '2022-01-01T08:00:00.000000000' '2022-01-01T09:00:00.000000000']
Dataset time range: 2022-01-01T00:00:00.000000000 to 2022-12-31T23:00:00.000000000
Fixed dataset saved to: ../../data/time_series/8518750.nc
