In [1]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Notebook-wide matplotlib defaults: 8x6 figures and no axes grid.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})

In [2]:
def is_gzip_file(filepath):
    """Report whether the file at ``filepath`` starts with the gzip magic bytes.

    The file is opened in binary mode and only its first two bytes are read,
    so the check is based on content rather than on the file extension.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        True if the first two bytes are ``1f 8b``; False otherwise (including
        for files shorter than two bytes).
    """
    gzip_magic = b'\x1f\x8b'
    with open(filepath, 'rb') as handle:
        header = handle.read(len(gzip_magic))
    return header == gzip_magic

In [3]:
# Input CSV location, given relative to the notebook's working directory.
# It is checked for gzip magic bytes before being read, so the file may be
# either plain or gzip-compressed.
csv_path = '../../dataset/era5-land_timeseries.csv'

In [4]:
# Pick the compression mode from the file's content: force gzip when the magic
# bytes are present, otherwise keep pandas' default ('infer') behaviour.
compression = 'gzip' if is_gzip_file(csv_path) else 'infer'
df = pd.read_csv(csv_path, compression=compression)

# Report the number of loaded rows and preview the first five.
row_count = len(df)
print(f"Total rows: {row_count}")
df.head(5)

Total rows: 31920


Unnamed: 0,datetime,t2m,fal,slhf,ssr,str,sshf,ssrd,strd,u10,v10,sp,tp
0,2021-12-31 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-12-31 02:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-12-31 03:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-12-31 04:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-12-31 05:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Normalise the raw timestamp text: cast to str and trim stray quotes/spaces
# from both ends.
raw_stamps = df['datetime'].astype(str).str.strip("'\" ")

# Parse with an explicit format; values that do not match become NaT
# (errors='coerce') instead of raising.
parsed_stamps = pd.to_datetime(raw_stamps, format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Snap every timestamp to the nearest 15-minute boundary.
df['datetime'] = parsed_stamps.dt.round('15min')
df

Unnamed: 0,datetime,t2m,fal,slhf,ssr,str,sshf,ssrd,strd,u10,v10,sp,tp
0,2021-12-31 01:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
1,2021-12-31 02:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
2,2021-12-31 03:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
3,2021-12-31 04:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
4,2021-12-31 05:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31915,2025-10-25 20:00:00,294.741998,0.159019,-7.385697e+06,1.227919e+07,-3.784283e+06,-1.037121e+06,1.460396e+07,2.811597e+07,0.003645,-1.045682,96241.351599,0.001025
31916,2025-10-25 21:00:00,294.396998,0.159019,-7.379942e+06,1.227919e+07,-3.905559e+06,-1.031178e+06,1.460396e+07,2.949001e+07,-0.090333,-1.096130,96222.583028,0.001025
31917,2025-10-25 22:00:00,294.101350,0.159019,-7.373336e+06,1.227919e+07,-4.039812e+06,-1.024890e+06,1.460396e+07,3.084760e+07,-0.130253,-1.115518,96220.809647,0.001025
31918,2025-10-25 23:00:00,293.943467,0.159019,-7.367421e+06,1.227919e+07,-4.162693e+06,-1.018864e+06,1.460396e+07,3.221136e+07,-0.195488,-1.068894,96258.294931,0.001025


In [7]:
# Order the rows by the 'datetime' column (ascending).
df = df.sort_values(by='datetime')
df

Unnamed: 0,datetime,t2m,fal,slhf,ssr,str,sshf,ssrd,strd,u10,v10,sp,tp
0,2021-12-31 01:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
1,2021-12-31 02:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
2,2021-12-31 03:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
3,2021-12-31 04:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
4,2021-12-31 05:00:00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31915,2025-10-25 20:00:00,294.741998,0.159019,-7.385697e+06,1.227919e+07,-3.784283e+06,-1.037121e+06,1.460396e+07,2.811597e+07,0.003645,-1.045682,96241.351599,0.001025
31916,2025-10-25 21:00:00,294.396998,0.159019,-7.379942e+06,1.227919e+07,-3.905559e+06,-1.031178e+06,1.460396e+07,2.949001e+07,-0.090333,-1.096130,96222.583028,0.001025
31917,2025-10-25 22:00:00,294.101350,0.159019,-7.373336e+06,1.227919e+07,-4.039812e+06,-1.024890e+06,1.460396e+07,3.084760e+07,-0.130253,-1.115518,96220.809647,0.001025
31918,2025-10-25 23:00:00,293.943467,0.159019,-7.367421e+06,1.227919e+07,-4.162693e+06,-1.018864e+06,1.460396e+07,3.221136e+07,-0.195488,-1.068894,96258.294931,0.001025


In [8]:
# Earliest and latest timestamps present in the 'datetime' column.
start_time, end_time = df['datetime'].min(), df['datetime'].max()

print(f"device: {start_time} - {end_time}")

device: 2021-12-31 01:00:00 - 2025-10-26 00:00:00


In [10]:
# Show the frame transposed: one row per column, one column per record.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31910,31911,31912,31913,31914,31915,31916,31917,31918,31919
datetime,2021-12-31 01:00:00,2021-12-31 02:00:00,2021-12-31 03:00:00,2021-12-31 04:00:00,2021-12-31 05:00:00,2021-12-31 06:00:00,2021-12-31 07:00:00,2021-12-31 08:00:00,2021-12-31 09:00:00,2021-12-31 10:00:00,...,2025-10-25 15:00:00,2025-10-25 16:00:00,2025-10-25 17:00:00,2025-10-25 18:00:00,2025-10-25 19:00:00,2025-10-25 20:00:00,2025-10-25 21:00:00,2025-10-25 22:00:00,2025-10-25 23:00:00,2025-10-26 00:00:00
t2m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,295.703623,295.253835,294.855697,294.480986,295.199734,294.741998,294.396998,294.10135,293.943467,0.0
fal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.159019,0.159019,0.159019,0.159019,0.159019,0.159019,0.159019,0.159019,0.159019,0.0
slhf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7413658.521177,-7408387.771577,-7402576.861465,-7396250.38162,-7391031.214844,-7385697.475614,-7379942.018116,-7373335.835958,-7367421.0816,0.0
ssr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12279194.788693,12279194.788693,12279194.788693,12279194.788693,12279194.788693,12279194.788693,12279194.788693,12279194.788693,12279194.788693,0.0
str,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-3101781.316222,-3272844.602013,-3421688.983474,-3567678.476628,-3670153.85113,-3784283.048851,-3905559.018234,-4039811.878657,-4162693.303686,0.0
sshf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1070970.148573,-1062199.415227,-1055238.031764,-1049218.882685,-1042951.64998,-1037121.256948,-1031178.386159,-1024890.224633,-1018864.278903,0.0
ssrd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14603957.696469,14603957.696469,14603957.696469,14603957.696469,14603957.696469,14603957.696469,14603957.696469,14603958.07608,14603958.07608,0.0
strd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21293373.583586,22638113.311807,23992600.348166,25343417.359654,26733667.192138,28115971.059026,29490007.068683,30847598.836896,32211356.993854,0.0
u10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.23241,0.063991,0.111625,0.11725,0.138139,0.003645,-0.090333,-0.130253,-0.195488,0.0


In [11]:
# Collapse rows that share a timestamp by averaging every non-'datetime'
# column, then turn 'datetime' back into a regular column.
value_columns = [name for name in df.columns if name != 'datetime']
df_clean = df.pivot_table(index='datetime', values=value_columns, aggfunc='mean').reset_index()

In [12]:
# Compare shapes before and after the per-timestamp averaging, then preview
# the averaged frame. One print call, one item per line.
print(
    f"Original shape df1: {df.shape}",
    f"After deduplication df1: {df_clean.shape}",
    "\nFirst few rows:",
    df_clean.head(),
    sep="\n",
)

Original shape df1: (31920, 13)
After deduplication df1: (31920, 13)

First few rows:
             datetime  fal  slhf   sp  sshf  ssr  ssrd  str  strd  t2m   tp  \
0 2021-12-31 01:00:00  0.0   0.0  0.0   0.0  0.0   0.0  0.0   0.0  0.0  0.0   
1 2021-12-31 02:00:00  0.0   0.0  0.0   0.0  0.0   0.0  0.0   0.0  0.0  0.0   
2 2021-12-31 03:00:00  0.0   0.0  0.0   0.0  0.0   0.0  0.0   0.0  0.0  0.0   
3 2021-12-31 04:00:00  0.0   0.0  0.0   0.0  0.0   0.0  0.0   0.0  0.0  0.0   
4 2021-12-31 05:00:00  0.0   0.0  0.0   0.0  0.0   0.0  0.0   0.0  0.0  0.0   

   u10  v10  
0  0.0  0.0  
1  0.0  0.0  
2  0.0  0.0  
3  0.0  0.0  
4  0.0  0.0  


In [14]:
# Hourly timestamps running from the first to the last observed timestamp.
complete_time_series = pd.date_range(start_time, end_time, freq='1h')
count = len(complete_time_series)
print(f"Number of rows: {count}")

Number of rows: 33480


In [15]:
# Left-join the observations onto the complete hourly grid so that hours with
# no record appear as NaN rows instead of being absent.
#
# Join the de-duplicated frame (`df_clean`, one averaged row per timestamp)
# rather than the raw `df`: any duplicate timestamps in `df` would be
# multiplied by the merge and break the one-row-per-hour grid.
# pivot_table orders its value columns alphabetically, so reindex to the
# original column order to keep the exported layout unchanged.
complete_df = pd.DataFrame({'datetime': complete_time_series})
merged_df = pd.merge(
    complete_df,
    df_clean.reindex(columns=df.columns),
    on='datetime',
    how='left',
    validate='one_to_one',  # fail loudly if either side repeats a timestamp
)

In [17]:
# Write the merged frame to CSV without the index column.
merged_df.to_csv(
    path_or_buf="../../dataset/datasetera5-land_timeseries.csv",
    index=False,
)