# I. Split CSV file

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import TimeSeriesSplit

# --- Split configuration ----------------------------------------------------

# Output root for the generated files, and input directories keyed by
# dataset name.
output_path = "/kaggle/working/"
csv_path = {
    "merged": "/kaggle/input/rain-precipitation-station/filter_merged_data",
}

# Periods to process; each value is used both in the input file name and as
# the name of the output sub-folder.
time_range = [
    "2019-04",
    "2019-10",
    "2020-04",
    "2020-10",
]

# Number of TimeSeriesSplit folds.
n_splits = 5

# Rows per point and how they are divided (presumably hourly readings over
# 30 days, the last 6 days held out for testing — TODO confirm the units).
timestamps_test = 24 * 6
timestamps_total = 24 * 30
timestamps_trainval = timestamps_total - timestamps_test

# Split every (period, dataset) CSV into per-fold train/val/test files.
#
# Layout assumption: each input file stores its points back to back, with
# `timestamps_total` consecutive rows per point — TODO confirm against the
# producer of the CSVs; only the total row count is checked here.
#
# For every point, the first `timestamps_trainval` rows are split by
# TimeSeriesSplit into `n_splits` (train, val) pairs; the remaining rows form
# the test set, which is the same in every fold.
#
# Output: <output_path>/<period>/fold_<k>/<key>_{train,val,test}.csv


def _take_point_rows(frame, point_offsets, local_idx):
    """Return the rows at `local_idx` of every point, stacked point by point.

    `point_offsets` holds the first row position of each point in `frame` and
    `local_idx` holds row positions relative to the start of a point.  The
    result has a fresh 0..n-1 index.
    """
    # Broadcasting yields a (n_points, len(local_idx)) grid of absolute
    # positions; ravel() flattens it point-major, i.e. the same row order as
    # slicing each point separately and concatenating the pieces.
    positions = (point_offsets[:, None] + np.asarray(local_idx)[None, :]).ravel()
    return frame.iloc[positions].reset_index(drop=True)


# Every point has the same number of train/val rows, so the within-point fold
# indices are computed once and reused for all points and all files.
tscv = TimeSeriesSplit(n_splits=n_splits)
folds_indices = list(tscv.split(np.arange(timestamps_trainval)))  # (train_idx, val_idx) pairs
test_idx = np.arange(timestamps_trainval, timestamps_total)

for time in time_range:
    cur_time_path = os.path.join(output_path, time)
    os.makedirs(cur_time_path, exist_ok=True)

    for key, path in csv_path.items():
        file_name = f"{key}_{time}.csv"
        path_name = os.path.join(path, file_name)
        all_data = pd.read_csv(path_name)

        # Each point contributes exactly `timestamps_total` rows.
        n_points, leftover_rows = divmod(all_data.shape[0], timestamps_total)
        print(f"Processing {key} dataset for {time}: {n_points} points found.")
        if leftover_rows:
            # Trailing rows that do not fill a whole point are ignored; say so
            # instead of dropping them silently.
            print(
                f"Warning: {path_name} has {leftover_rows} trailing rows that do not "
                f"form a complete point; they are ignored."
            )
        if n_points == 0:
            # Nothing to split (and nothing to concatenate) for this file.
            print(f"Warning: {path_name} contains no complete point; skipping.")
            continue

        point_offsets = np.arange(n_points) * timestamps_total

        # The test rows do not depend on the fold, so build them once.
        fold_test = _take_point_rows(all_data, point_offsets, test_idx)

        for i, (train_idx, val_idx) in enumerate(folds_indices):
            fold_train = _take_point_rows(all_data, point_offsets, train_idx)
            fold_val = _take_point_rows(all_data, point_offsets, val_idx)

            fold_folder = os.path.join(cur_time_path, f"fold_{i+1}")
            os.makedirs(fold_folder, exist_ok=True)

            train_file = os.path.join(fold_folder, f"{key}_train.csv")
            val_file = os.path.join(fold_folder, f"{key}_val.csv")
            test_file = os.path.join(fold_folder, f"{key}_test.csv")

            fold_train.to_csv(train_file, index=False)
            fold_val.to_csv(val_file, index=False)
            fold_test.to_csv(test_file, index=False)

            print(f"Fold {i+1}: train {fold_train.shape[0]} rows, val {fold_val.shape[0]} rows, test {fold_test.shape[0]} rows")

Processing merged dataset for 2019-04: 334 points found.
Fold 1: train 32064 rows, val 32064 rows, test 48096 rows
Fold 2: train 64128 rows, val 32064 rows, test 48096 rows
Fold 3: train 96192 rows, val 32064 rows, test 48096 rows
Fold 4: train 128256 rows, val 32064 rows, test 48096 rows
Fold 5: train 160320 rows, val 32064 rows, test 48096 rows
Processing merged dataset for 2019-10: 345 points found.
Fold 1: train 33120 rows, val 33120 rows, test 49680 rows
Fold 2: train 66240 rows, val 33120 rows, test 49680 rows
Fold 3: train 99360 rows, val 33120 rows, test 49680 rows
Fold 4: train 132480 rows, val 33120 rows, test 49680 rows
Fold 5: train 165600 rows, val 33120 rows, test 49680 rows
Processing merged dataset for 2020-04: 334 points found.
Fold 1: train 32064 rows, val 32064 rows, test 48096 rows
Fold 2: train 64128 rows, val 32064 rows, test 48096 rows
Fold 3: train 96192 rows, val 32064 rows, test 48096 rows
Fold 4: train 128256 rows, val 32064 rows, test 48096 rows
Fold 5: trai