In [1]:
import os
from osgeo import gdal
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm

## **Extract AWS data**
* Extract all values
* Filter always-null pixels

In [2]:
import os
from datetime import datetime, timedelta

def extract_timestamp_from_filename(filename):
    """
    Parse the timestamp embedded in a file name such as 'AWS_YYYYMMDDHHMMSS.tif'.

    Only the text before the first '.' is considered, and within it the part
    after the last '_' is read as a "%Y%m%d%H%M%S" timestamp.

    Returns:
    - datetime on success.
    - None when the name has no '_' before the first '.', when the trailing
      part is not a valid timestamp, or when any other error occurs
      (e.g. `filename` is not a string).
    """
    try:
        # Everything up to the first dot (drops ".tif" and any further suffixes).
        stem, _, _ = filename.partition('.')
        # Split once from the right: the timestamp is whatever follows the last '_'.
        _, underscore, stamp = stem.rpartition('_')
        if not underscore:
            return None
        return datetime.strptime(stamp, "%Y%m%d%H%M%S")
    except Exception:
        # Best-effort parser: any failure simply means "no timestamp".
        return None

def check_missing_hours(root_dir):
    """
    Find the hourly timestamps that have no .tif file in April and October of
    2019 and 2020.

    The layout walked is ``root_dir/<year>/<month>/<day>/*.tif``. Each .tif file
    name is parsed with `extract_timestamp_from_filename`; names that cannot be
    parsed are ignored. A month whose directory does not exist is reported and
    skipped (none of its hours are counted as missing).

    Parameters:
    - root_dir: root directory containing the per-year folders.

    Returns:
    - set of datetime: every expected hourly timestamp without a matching file,
      or None when nothing is missing.
    """
    missing_hours = set()

    for year in ['2019', '2020']:
        for month in ['04', '10']:  # only April and October are checked
            month_path = os.path.join(root_dir, year, month)
            if not os.path.isdir(month_path):
                print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y th∆∞ m·ª•c th√°ng: {year}/{month}")
                continue

            # Expected hours come from the real calendar length of the month
            # (April has 30 days, October has 31), not from a fixed 30 days.
            month_start = datetime(int(year), int(month), 1)
            # Adding 32 days always lands in the following month; snapping to
            # day 1 gives the first instant after this month.
            next_month_start = (month_start + timedelta(days=32)).replace(day=1)
            expected_hours = set()
            current_hour = month_start
            while current_hour < next_month_start:
                expected_hours.add(current_hour)
                current_hour += timedelta(hours=1)

            available_timestamps = set()

            for day in sorted(os.listdir(month_path)):
                day_path = os.path.join(month_path, day)
                if not os.path.isdir(day_path):
                    continue

                for file in sorted(os.listdir(day_path)):
                    if file.endswith(".tif"):
                        timestamp = extract_timestamp_from_filename(file)
                        if timestamp:
                            available_timestamps.add(timestamp)

            # Whatever was expected but not found is missing for this month.
            missing_hours.update(expected_hours - available_timestamps)

    if missing_hours:
        print(f"‚ö†Ô∏è T·ªïng s·ªë timestamp b·ªã thi·∫øu: {len(missing_hours)}")
    else:
        print("‚úÖ D·ªØ li·ªáu ƒë·∫ßy ƒë·ªß, kh√¥ng c√≥ gi·ªù n√†o b·ªã thi·∫øu!")

    # Kept for backward compatibility: callers receive None, not an empty set.
    return missing_hours if missing_hours else None


In [3]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from osgeo import gdal
from datetime import datetime, timedelta

def _hourly_timestamps_in_month(year, month):
    """Return the set of hourly datetimes covering the whole calendar month."""
    month_start = datetime(year, month, 1)
    # Adding 32 days always lands in the following month; snap back to day 1.
    next_month_start = (month_start + timedelta(days=32)).replace(day=1)
    n_hours = (next_month_start - month_start).days * 24
    return {month_start + timedelta(hours=i) for i in range(n_hours)}


def _build_coordinate_mask(shape, coordinates):
    """Return a boolean array of `shape` that is True only at the (row, col) pairs in `coordinates`."""
    mask = np.zeros(shape, dtype=bool)
    n_rows, n_cols = shape
    for r, c in coordinates:
        # Pairs outside the raster (or non-integral) can never match a pixel.
        if 0 <= r < n_rows and 0 <= c < n_cols and r == int(r) and c == int(c):
            mask[int(r), int(c)] = True
    return mask


def extract_data(root_dir, option=1, coordinates=None):
    """
    Read band 1 of every GeoTIFF under ``root_dir/<year>/<month>/<day>/`` for
    2019 and 2020 and return the values in long format, adding NaN rows for the
    hours of each month that have no file.

    Parameters:
    - root_dir: root directory of one variable; its last path component is used
      as the name of the value column.
    - option:
        + 1 - keep every pixel of the raster
        + 2 - keep only the pixels whose (row, col) is in `coordinates`
    - coordinates: collection of (row, col) pairs, used when option == 2.
      If it is empty or None, every pixel is kept even with option == 2.

    Returns:
    - dict mapping "YYYY-MM" to a DataFrame with columns
      ["datetime", "row", "col", <variable name>]. For every pixel there is one
      row per timestamp found in the files plus one NaN row per hour of the
      calendar month that had no file.

    Notes:
    - Files whose name does not end in "_YYYYMMDDHHMMSS" (before the first '.')
      are skipped with a message.
    - Sub-folders of a year that are not a month number (1-12) are skipped.
    """
    monthly_data = {}
    # normpath drops a trailing separator, so the last component is never empty.
    param_name = os.path.basename(os.path.normpath(root_dir))
    # raster shape -> (row indices, col indices, [(row, col), ...]) of the kept
    # pixels; built once per shape instead of once per file.
    selections = {}

    for year in ['2019', '2020']:
        year_path = os.path.join(root_dir, year)
        if not os.path.isdir(year_path):
            continue

        for month in sorted(os.listdir(year_path)):
            month_path = os.path.join(year_path, month)
            if not os.path.isdir(month_path):
                continue
            # Anything that is not a month number cannot be turned into a
            # calendar month below, so it is ignored.
            if not (month.isdigit() and 1 <= int(month) <= 12):
                continue

            available_timestamps = set()
            pixel_values = {}  # (row, col) -> {timestamp: value}

            for day in tqdm(sorted(os.listdir(month_path)), desc=f"ƒêang x·ª≠ l√Ω {year}/{month}"):
                day_path = os.path.join(month_path, day)
                if not os.path.isdir(day_path):
                    continue

                for file in sorted(os.listdir(day_path)):
                    if not file.endswith(".tif"):
                        continue
                    file_path = os.path.join(day_path, file)

                    # The timestamp is the text after the last '_' of the stem.
                    name_parts = file.split('.')[0].split('_')
                    try:
                        dt = (datetime.strptime(name_parts[-1], "%Y%m%d%H%M%S")
                              if len(name_parts) >= 2 else None)
                    except ValueError:
                        dt = None
                    if dt is None:
                        # Without a timestamp the values cannot be placed on
                        # the time axis, so the file is left out.
                        print(f"Skipping file without a parsable timestamp: {file_path}")
                        continue

                    dataset = gdal.Open(file_path)
                    if dataset is None:
                        print(f"‚ö†Ô∏è Kh√¥ng m·ªü ƒë∆∞·ª£c file: {file_path}")
                        continue

                    band = dataset.GetRasterBand(1)
                    data = band.ReadAsArray()

                    available_timestamps.add(dt)

                    selection = selections.get(data.shape)
                    if selection is None:
                        if option == 2 and coordinates:
                            keep = _build_coordinate_mask(data.shape, coordinates)
                        else:
                            keep = np.ones(data.shape, dtype=bool)
                        # nonzero() lists the kept pixels in row-major order.
                        row_idx, col_idx = np.nonzero(keep)
                        selection = (row_idx, col_idx,
                                     list(zip(row_idx.tolist(), col_idx.tolist())))
                        selections[data.shape] = selection
                    row_idx, col_idx, pixel_keys = selection

                    for pixel, v in zip(pixel_keys, data[row_idx, col_idx].tolist()):
                        pixel_values.setdefault(pixel, {})[dt] = v

            # Hours of the calendar month for which no file was read.
            missing_timestamps = sorted(
                _hourly_timestamps_in_month(int(year), int(month)) - available_timestamps
            )

            # Give every pixel a NaN entry for each missing hour.
            for series in pixel_values.values():
                for ts in missing_timestamps:
                    series[ts] = np.nan

            # Flatten to one row per (pixel, timestamp).
            data_list = [
                [ts, r, c, v]
                for (r, c), series in pixel_values.items()
                for ts, v in series.items()
            ]

            month_key = f"{year}-{month}"
            df = pd.DataFrame(data_list, columns=["datetime", "row", "col", param_name])
            df["datetime"] = pd.to_datetime(df["datetime"])
            df["row"] = df["row"].astype(int)
            df["col"] = df["col"].astype(int)
            df[param_name] = df[param_name].astype(float)

            monthly_data[month_key] = df

    return monthly_data


**Kiểm tra số lượng ảnh từng tháng xem có giống nhau**

In [4]:
import os

def count_tif_files(root_dir):
    """
    Count the .tif files found in each month folder under `root_dir`.

    Only the years 2019 and 2020 are scanned, and only files located inside a
    day sub-folder of a month (``<year>/<month>/<day>/*.tif``) are counted.

    Returns:
    - Dictionary { "YYYY-MM": number of .tif files }
    """
    counts_by_month = {}

    for year in ('2019', '2020'):
        year_dir = os.path.join(root_dir, year)
        if os.path.isdir(year_dir):
            for month in sorted(os.listdir(year_dir)):
                month_dir = os.path.join(year_dir, month)
                if not os.path.isdir(month_dir):
                    continue

                total = 0
                for entry in os.listdir(month_dir):
                    day_dir = os.path.join(month_dir, entry)
                    # Files sitting directly in the month folder are not counted.
                    if os.path.isdir(day_dir):
                        total += len(
                            [name for name in os.listdir(day_dir) if name.endswith(".tif")]
                        )

                counts_by_month[f"{year}-{month}"] = total

    return counts_by_month


# Root directory that contains the AWS data
root_directory = "/kaggle/input/rainfall-forecast/DATA_SV/Precipitation/AWS"

# Count the .tif files per month and print one "YYYY-MM: N files" line for each
tif_file_counts = count_tif_files(root_directory)
for month, count in tif_file_counts.items():
    print(f"{month}: {count} files")


2019-04: 720 files
2019-10: 627 files
2020-04: 718 files
2020-10: 742 files


Không giống nhau => có các mốc thời gian bị thiếu, tuy nhiên số lượng không đáng kể


In [5]:
# AWS raw data: extract_data is called with its defaults (option=1), so every
# pixel is kept; the result is a dict of DataFrames keyed by "YYYY-MM".
aws_raw_data = extract_data('/kaggle/input/rainfall-forecast/DATA_SV/Precipitation/AWS')

# Display which months were extracted
aws_raw_data.keys()

ƒêang x·ª≠ l√Ω 2019/04: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:27<00:00,  1.09it/s]
ƒêang x·ª≠ l√Ω 2019/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31/31 [00:23<00:00,  1.33it/s]
ƒêang x·ª≠ l√Ω 2020/04: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:26<00:00,  1.12it/s]
ƒêang x·ª≠ l√Ω 2020/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31/31 [00:27<00:00,  1.14it/s]


dict_keys(['2019-04', '2019-10', '2020-04', '2020-10'])

In [6]:
def get_valid_pixels(result):
    """
    Collect the pixels (row, col) that have at least one valid value in any month.

    A value counts as valid when it is neither missing (NaN) nor -inf.

    Parameters:
    - result (dict): dictionary of per-month DataFrames (output of `extract_data`),
      each with "row" and "col" columns and the measured variable as last column.

    Returns:
    - set: the valid (row, col) coordinates; empty when `result` is empty.
    """
    if not result:
        return set()

    # The variable name is read from the last column of the first DataFrame
    # (U250, AWS, ...) and reused for all the others.
    first_frame = next(iter(result.values()))
    param_name = first_frame.columns[-1]

    valid_pixels = set()

    for df in tqdm(result.values()):
        measurements = df[param_name]
        # Same rule as a per-row check: keep rows that are neither -inf nor NaN.
        usable = (measurements != -np.inf) & ~measurements.isna()
        valid_pixels.update(
            zip(df.loc[usable, "row"].tolist(), df.loc[usable, "col"].tolist())
        )

    return valid_pixels


In [7]:
# Pixels (row, col) with at least one AWS value that is neither NaN nor -inf
aws_valid_pixels = get_valid_pixels(aws_raw_data)

# Display how many such pixels there are
len(aws_valid_pixels)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:25<00:00,  6.47s/it]


334

In [8]:
# Extract the AWS rasters again, this time keeping only the pixels listed in
# aws_valid_pixels (option=2 restricts the output to the given coordinates).
filtered_aws_data = extract_data('/kaggle/input/rainfall-forecast/DATA_SV/Precipitation/AWS', option = 2, coordinates=aws_valid_pixels)

ƒêang x·ª≠ l√Ω 2019/04: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:05<00:00,  5.02it/s]
ƒêang x·ª≠ l√Ω 2019/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31/31 [00:05<00:00,  5.70it/s]
ƒêang x·ª≠ l√Ω 2020/04: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:05<00:00,  5.02it/s]
ƒêang x·ª≠ l√Ω 2020/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31/31 [00:05<00:00,  5.74it/s]


In [9]:
# Print a summary of each monthly DataFrame. DataFrame.info() writes the
# summary itself and returns None, so it is called directly; wrapping it in
# print() would add a stray "None" line after every month.
for key, value in filtered_aws_data.items():
    print(key)
    value.info()

2019-04
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240480 entries, 0 to 240479
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   datetime  240480 non-null  datetime64[ns]
 1   row       240480 non-null  int64         
 2   col       240480 non-null  int64         
 3   AWS       240480 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 7.3 MB
None
2019-10
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248496 entries, 0 to 248495
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   datetime  248496 non-null  datetime64[ns]
 1   row       248496 non-null  int64         
 2   col       248496 non-null  int64         
 3   AWS       209418 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 7.6 MB
None
2020-04
<class 'pandas.core.frame.DataFrame'>
RangeI

## Extract ERA5 data
* Extract ERA5 features corresponding to AWS pixels
* Merge into 1 dataframe and save

In [None]:
import os

# Directory that contains the ERA5 data
era5_root = "/kaggle/input/rainfall-forecast/DATA_SV/ERA5"

# Dictionary holding, for each ERA5 sub-folder, the output of extract_data
# (itself a dict of per-month DataFrames)
era5_data_dict = {}

# Walk through every sub-folder of the ERA5 directory, in sorted order
for subdir in sorted(os.listdir(era5_root)):
    subdir_path = os.path.join(era5_root, subdir)
    
    if os.path.isdir(subdir_path):  # only directories are processed
        print(f"üìÇ ƒêang x·ª≠ l√Ω folder con: {subdir}")
        # Keep only the pixels in aws_valid_pixels (defined by an earlier cell)
        era5_data_dict[subdir] = extract_data(subdir_path, option=2, coordinates=aws_valid_pixels)


In [12]:
# NOTE(review): this cell relies on filtered_aws_data and era5_data_dict built
# by earlier cells — run those first.
output_dir = "/kaggle/working/merged_data"

# Create the output directory (no error if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Merge and save each month separately
for month in filtered_aws_data.keys():
    df = filtered_aws_data[month]  # AWS DataFrame of this month

    # Left-join every ERA5 variable that has data for the same month, matching
    # on timestamp and pixel position; how="left" keeps every AWS row.
    for var_name, era5_dict in era5_data_dict.items():
        if month in era5_dict:
            df = df.merge(era5_dict[month], on=["datetime", "row", "col"], how="left", suffixes=("", f"_{var_name}"))

    # Save this month's merged DataFrame as CSV (without the index column)
    output_path = os.path.join(output_dir, f"merged_{month}.csv")
    df.to_csv(output_path, index=False)
    print(f"‚úÖ ƒê√£ l∆∞u {output_path}")

print("üéØ Ho√†n t·∫•t qu√° tr√¨nh g·ªôp v√† l∆∞u!")


‚úÖ ƒê√£ l∆∞u /kaggle/working/merged_data/merged_2019-04.csv
‚úÖ ƒê√£ l∆∞u /kaggle/working/merged_data/merged_2019-10.csv
‚úÖ ƒê√£ l∆∞u /kaggle/working/merged_data/merged_2020-04.csv
‚úÖ ƒê√£ l∆∞u /kaggle/working/merged_data/merged_2020-10.csv
üéØ Ho√†n t·∫•t qu√° tr√¨nh g·ªôp v√† l∆∞u!
