In [None]:
import pandas as pd
import numpy as np
import os

In [2]:
'''''''''

Function: train_test_split(df, test_size=0.2, random_state=42, split_by_k=True)

Inputs:

df: The input dataframe containing the data to be split.
test_size: The proportion of the data to be allocated for testing. It is typically expressed as a decimal value between 0 and 1.
random_state: An optional parameter that sets the random seed for reproducibility. It ensures the same data split is obtained when the function is executed multiple times with the same random seed.
split_by_k: A boolean flag indicating whether the data should be split based on the clear-sky index criteria.

Outputs:

Training dataframe: The dataframe containing the data for training.
Testing dataframe: The dataframe containing the data for testing.

When split_by_k is True, the function averages the clear-sky index (k) over each (date, site_name) group and samples the test groups separately from three sets based on that daily mean:

Set i) contains groups whose mean clear-sky index is greater than 0.6.
Set ii) contains groups whose mean clear-sky index is greater than 0.3 and less than or equal to 0.6.
Set iii) contains groups whose mean clear-sky index is less than or equal to 0.3.

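Every row belonging to a sampled (date, site_name) group goes to the testing set and all remaining rows go to the training set; test_size is the fraction of groups sampled from each set (or from all groups when split_by_k is False).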

'''''''''


def train_test_split(df, test_size=0.2, random_state=42, split_by_k = True, k_bounds=(0.3, 0.6)):
    """Split `df` into train and test frames, holding out whole (date, site) groups.

    Rows are grouped by the calendar date of their index and by `site_name`,
    and the mean of `k` is computed for each group. A fraction `test_size` of
    the groups is sampled as the test set; every row of a sampled group goes
    to the test frame and every other row goes to the training frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex (``df.index.date`` is used) and the columns
        ``'site_name'`` and ``'k'``. The frame is not modified.
    test_size : float
        Fraction of groups to sample for testing (passed to
        ``DataFrame.sample(frac=...)``).
    random_state : int
        Seed passed to every ``sample`` call so the split is reproducible.
    split_by_k : bool
        If True, groups are first divided into three bands by their mean k
        (``k <= low``, ``low < k <= high``, ``k > high``) and `test_size` of
        each band is sampled. If False, groups are sampled from all groups
        at once.
    k_bounds : tuple of (float, float)
        The ``(low, high)`` band boundaries used when `split_by_k` is True.

    Returns
    -------
    (dg_train, dg_test) : tuple of pandas.DataFrame
        Copies of the training rows and the test rows, each in the row order
        of `df` and with the index and columns of `df`.

    Notes
    -----
    Groups whose mean k is NaN fall into no band, so with `split_by_k=True`
    they always end up in the training frame.
    """
    group_cols = ['date', 'site_name']

    # One (date, site_name) key per row, held in a side frame so the caller's
    # `df` never gains a helper column.
    row_keys = pd.DataFrame({
        'date': df.index.date,
        'site_name': df['site_name'].to_numpy(),
    })

    # Mean k per group. Only the numeric `k` column is averaged, so no
    # object-dtype column ever reaches `mean()`. Groups come out sorted by
    # (date, site_name), which fixes the positions `sample` draws from.
    daily_k = (
        row_keys.assign(k=df['k'].to_numpy())
        .groupby(group_cols)['k']
        .mean()
        .reset_index()
    )

    if split_by_k:
        low, high = k_bounds
        mean_k = daily_k['k']
        strata = [
            daily_k[mean_k <= low],
            daily_k[(mean_k > low) & (mean_k <= high)],
            daily_k[mean_k > high],
        ]
    else:
        strata = [daily_k]

    # Sample each stratum with the same seed and collect the chosen group labels.
    picked = [
        label
        for stratum in strata
        for label in stratum.sample(frac=test_size, random_state=random_state).index
    ]
    test_keys = daily_k.loc[picked, group_cols]

    # Flag rows whose group was sampled. The sampled keys are unique, so the
    # left merge keeps exactly one output row per input row, in input order.
    flagged = row_keys.merge(test_keys, on=group_cols, how='left', indicator=True)
    in_test = (flagged['_merge'] == 'both').to_numpy()

    # Select by mask rather than by de-duplicating a concatenation: rows that
    # happen to share identical column values are all kept.
    dg_train = df[~in_test].copy()
    dg_test = df[in_test].copy()

    return dg_train, dg_test

In [3]:
# Folder holding the training CSV files. The default is the original
# machine-specific location; set the TRAIN_FOLDER environment variable to
# point somewhere else without editing the notebook.
TRAIN_FOLDER = os.environ.get(
    'TRAIN_FOLDER',
    'C:\\Users\\Tee\\senior_project\\src\\code\\DataAndResult\\training_data',
)
# Backward-compatible alias: other cells refer to the original (misspelled) name.
TRIAN_FOLDER = TRAIN_FOLDER
# File name of the cloud-mask dataset inside the training folder.
CM_DATA = 'DATASET_cloudmask.csv'

In [9]:
# Load the cloud-mask dataset; 'Datetime' is parsed as dates and used as the index.
df = pd.read_csv(
    os.path.join(TRIAN_FOLDER, CM_DATA),
    index_col='Datetime',
    parse_dates=['Datetime'],
)

In [10]:
# Split into training and testing frames; with split_by_k=True the test
# groups are sampled within each clear-sky-index band.
dg_train, dg_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    split_by_k=True,
)

In [6]:
# Recombine the two partitions into one frame: training rows first, then test rows.
proc = pd.concat(
    [dg_train, dg_test],
)

In [7]:
# Display the recombined frame (the notebook renders a truncated preview of it).
proc

Unnamed: 0_level_0,site_name,I,Iclr,k,k_bar,HR,CI0,CI1,CI_1,CI_2,...,CI_16,CI_17,CI_18,CI_19,CI_20,CI_21,CI_22,CI_23,CI_24,CI_25
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-07 08:00:00+07:00,ISL001,219.9951,172.570155,1.274815,1.032743,8,0.023529,0.023529,0.015686,0.019608,...,0.015686,0.015686,0.019608,0.035294,0.035294,0.015686,0.011765,0.019608,0.027451,0.039216
2022-02-07 08:30:00+07:00,ISL001,335.4066,288.475645,1.162686,1.032743,8,0.011765,0.023529,0.015686,0.015686,...,0.015686,0.015686,0.011765,0.011765,0.011765,0.015686,0.015686,0.011765,0.011765,0.015686
2022-02-07 09:00:00+07:00,ISL001,451.1106,401.491557,1.123587,1.032743,9,0.007843,0.011765,0.007843,0.011765,...,0.007843,0.007843,0.007843,0.011765,0.011765,0.007843,0.007843,0.003922,0.007843,0.011765
2022-02-07 09:30:00+07:00,ISL001,541.9034,506.299490,1.070322,1.032743,9,0.007843,0.007843,0.015686,0.011765,...,0.007843,0.007843,0.007843,0.019608,0.019608,0.007843,0.007843,0.007843,0.011765,0.019608
2022-02-07 10:00:00+07:00,ISL001,607.9678,599.632417,1.013901,1.032743,10,0.027451,0.007843,0.000000,0.011765,...,0.000000,0.000000,0.019608,0.031373,0.023529,0.000000,0.000000,0.015686,0.023529,0.011765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-02 14:00:00+07:00,ISL056,635.0856,744.146530,0.853442,0.904870,14,0.078431,0.082353,0.074510,0.074510,...,0.062745,0.050980,0.062745,0.066667,0.074510,0.058824,0.039216,0.078431,0.078431,0.070588
2023-02-02 14:30:00+07:00,ISL056,578.7385,683.623192,0.846575,0.904870,14,0.082353,0.078431,0.082353,0.074510,...,0.082353,0.058824,0.054902,0.086275,0.094118,0.078431,0.043137,0.082353,0.113725,0.094118
2023-02-02 15:00:00+07:00,ISL056,508.4064,607.338102,0.837106,0.904870,15,0.074510,0.082353,0.086275,0.070588,...,0.066667,0.054902,0.054902,0.066667,0.074510,0.066667,0.047059,0.070588,0.078431,0.070588
2023-02-02 15:30:00+07:00,ISL056,420.3034,516.971570,0.813011,0.904870,15,0.066667,0.074510,0.066667,0.062745,...,0.062745,0.047059,0.047059,0.058824,0.066667,0.058824,0.039216,0.066667,0.078431,0.066667


In [8]:
# Uncomment to export the recombined frame (training rows followed by test rows) to CSV:
# proc.to_csv('separated_DATASET_cloudmask.csv')