In [25]:
import pandas as pd
import yaml
import os
import numpy as np
import logging


logging.basicConfig(level=logging.INFO)

## Load the parameters and file paths from param.yaml
with open("/Users/urkarsh.kulshrestha/Documents/AI_environment/work_env/appletree_end_to_end_forecasting/params.yaml", "r") as file:
    params = yaml.safe_load(file)['preprocess']

'''This function will load input data and convert netload into rolling lag values'''
def preprocess_(input_path):
    input_data=pd.read_csv(input_path)
    required_cols = {'time', '_measurement', 'actualNetLoad', 'temperature_2m'}
    missing = required_cols - set(input_data.columns)
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    input_data = input_data.sort_values(by='time')
    input_data.rename(columns={'actualNetLoad': 'netload', 'temperature_2m': 'temperature'}, inplace=True)
    logging.info("Length of original data: %d", len(input_data))
    netload = input_data['netload'].tolist()
    temperature = input_data['temperature'].tolist()
    return (input_data)

In [26]:
def create_training_features(input_df, lag_time):
    input_df['time'] = input_df['time'].astype(str)
    input_df['Datetime'] = pd.to_datetime(input_df['time'])
    input_df['Hour'] = input_df['Datetime'].dt.hour
    input_df['Day_of_Week'] = input_df['Datetime'].dt.day
    input_df['Month'] = input_df['Datetime'].dt.month
    input_df['DayOfYear'] = input_df['Datetime'].dt.year
    for i in range(1, lag_time + 1):
        input_df[f"netload_lag{i}"] = input_df['netload'].shift(i)
    lag_cols = [f"netload_lag{i}" for i in range(1, lag_time + 1)]
    return input_df.dropna(subset=lag_cols)

In [None]:
if __name__ == "__main__":
    input_data= preprocess_(params['input_path'])
    processed_dataframe = create_training_features(input_data, params['lag_time'])
    print(processed_dataframe.head())
    processed_dataframe.to_csv(params['output_path'])
    print("Preprocessing complete. Processed data saved to:", params['output_path'])
    logging.info("Preprocessing complete. Processed data saved to: %s", params['output_path'])


INFO:root:Length of original data: 7337


                        time _measurement   netload  temperature  \
5  2025-03-14 17:00:00+00:00      FncPvSc  6.738630         5.30   
6  2025-03-14 18:00:00+00:00      FncPvSc  6.833518         4.75   
7  2025-03-14 19:00:00+00:00      FncPvSc  6.845242         4.30   
8  2025-03-14 20:00:00+00:00      FncPvSc  7.225632         3.85   
9  2025-03-14 21:00:00+00:00      FncPvSc  6.710588         3.25   

                   Datetime  Hour  Day_of_Week  Month  DayOfYear  \
5 2025-03-14 17:00:00+00:00    17           14      3       2025   
6 2025-03-14 18:00:00+00:00    18           14      3       2025   
7 2025-03-14 19:00:00+00:00    19           14      3       2025   
8 2025-03-14 20:00:00+00:00    20           14      3       2025   
9 2025-03-14 21:00:00+00:00    21           14      3       2025   

   netload_lag1  netload_lag2  netload_lag3  netload_lag4  netload_lag5  
5      6.938568      6.962737      8.218262     22.756666     31.005559  
6      6.738630      6.938568     