In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
'''''''''

Function: train_test_split(df, test_size=0.2, random_state=42, split_by_k=True)

Inputs:

df: The input dataframe containing the data to be split.
test_size: The proportion of the data to be allocated for testing. It is typically expressed as a decimal value between 0 and 1.
random_state: An optional parameter that sets the random seed for reproducibility. It ensures the same data split is obtained when the function is executed multiple times with the same random seed.
split_by_k: A boolean flag indicating whether the data should be split based on the clear-sky index criteria.

Outputs:

Training dataframe: The dataframe containing the data for training.
Testing dataframe: The dataframe containing the data for testing.

The function performs the data split by randomly sampling whole site-days (all rows of one site_name on one calendar date). Each site-day is assigned to one of three sets based on its mean clear-sky index (k):

Set i) contains site-days whose mean clear-sky index (k) is greater than 0.6.
Set ii) contains site-days whose mean clear-sky index is greater than 0.3 and less than or equal to 0.6.
Set iii) contains site-days whose mean clear-sky index is less than or equal to 0.3.

The function then splits the shuffled data into training and testing sets according to the specified test size.

'''''''''


def train_test_split(df, test_size=0.2, random_state=42, split_by_k = True):
    """Split a datetime-indexed dataframe into train/test sets by whole site-days.

    A "site-day" is every row sharing one calendar date (taken from the
    index) and one ``site_name``. Site-days are never divided between the two
    outputs: a fraction ``test_size`` of them is sampled for testing and all
    remaining rows go to training.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Must expose ``df.index.date`` (i.e. a DatetimeIndex) and
        contain the columns ``site_name`` and ``k`` (clear-sky index).
        The frame is not modified.
    test_size : float, default 0.2
        Fraction of site-days sampled for the test set (passed to
        ``DataFrame.sample(frac=...)``).
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` for reproducibility.
    split_by_k : bool, default True
        If True, site-days are stratified by their mean ``k`` into
        ``k <= 0.3``, ``0.3 < k <= 0.6`` and ``k > 0.6`` and ``test_size`` is
        sampled from each stratum separately. Site-days whose mean ``k`` is
        NaN belong to no stratum and therefore always end up in training.
        If False, site-days are sampled without stratification.

    Returns
    -------
    (dg_train, dg_test) : tuple of pandas.DataFrame
        Row-wise partition of ``df`` (original row order, columns and index
        preserved); ``len(dg_train) + len(dg_test) == len(df)``.

    Raises
    ------
    KeyError
        If ``site_name`` or ``k`` is missing from ``df``.
    AttributeError
        If the index has no ``.date`` accessor.
    """
    # Upper bounds (inclusive) of the low and medium clear-sky-index strata.
    k_low_max = 0.3
    k_med_max = 0.6

    row_dates = df.index.date
    row_sites = df['site_name'].to_numpy()

    # One row per (date, site_name) holding that site-day's mean k. Only `k`
    # is aggregated: taking the mean of a frame that also carries an
    # object-dtype date column raises TypeError on pandas >= 2.0.
    site_days = (
        pd.DataFrame({'date': row_dates, 'site_name': row_sites, 'k': df['k'].to_numpy()})
        .groupby(['date', 'site_name'], as_index=False)['k']
        .mean()
    )

    if split_by_k:
        mean_k = site_days['k']
        strata = [
            site_days[mean_k <= k_low_max],
            site_days[(mean_k > k_low_max) & (mean_k <= k_med_max)],
            site_days[mean_k > k_med_max],
        ]
        # The same seed is reused for every stratum, as before.
        test_days = pd.concat(
            [stratum.sample(frac=test_size, random_state=random_state) for stratum in strata]
        )
    else:
        test_days = site_days.sample(frac=test_size, random_state=random_state)

    # Partition rows with a boolean mask on (date, site_name). The previous
    # concat + drop_duplicates(keep=False) anti-join compared column values
    # only (the index is ignored), so training rows that happened to share
    # identical values were silently discarded.
    test_keys = pd.MultiIndex.from_frame(test_days[['date', 'site_name']])
    row_keys = pd.MultiIndex.from_arrays([row_dates, row_sites], names=['date', 'site_name'])
    in_test = row_keys.isin(test_keys)

    dg_test = df.loc[in_test].copy()
    dg_train = df.loc[~in_test].copy()

    return dg_train, dg_test

In [None]:
# Folder that holds the training CSV files. Set the SOLARMAP_TRAIN_FOLDER
# environment variable to run the notebook on another machine; when it is
# unset the original hard-coded location is used.
# NOTE: the variable name is misspelled ("TRIAN") but is kept because the
# cells below reference it under this name.
TRIAN_FOLDER = os.environ.get(
    'SOLARMAP_TRAIN_FOLDER',
    'C:\\Users\\Tee\\senior_project\\src\\SolarMap\\DataAndResult\\training_data',
)

In [3]:
# Produces 'separated_DATASET_cloudmask.csv':
# the cloud-mask dataset re-ordered as training rows followed by test rows.

CM_DATA = 'DATASET_cloudmask.csv'
df = pd.read_csv(
    os.path.join(TRIAN_FOLDER, CM_DATA),
    index_col='Datetime',
    parse_dates=['Datetime'],
)
dg_train, dg_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    split_by_k=True,
)
proc = pd.concat((dg_train, dg_test))
proc.to_csv('separated_DATASET_cloudmask.csv')  # written to the current working directory

In [8]:
# Produces 'separated_DATASET_ch1_overview.csv':
# the ch1-overview dataset re-ordered as training rows followed by test rows.

OV_DATA = 'DATASET_ch1_overview.csv'
df = pd.read_csv(
    os.path.join(TRIAN_FOLDER, OV_DATA),
    index_col='Datetime',
    parse_dates=['Datetime'],
)
dg_train, dg_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    split_by_k=True,
)
proc = pd.concat((dg_train, dg_test))
proc.to_csv('separated_DATASET_ch1_overview.csv')  # written to the current working directory