In [6]:
import pandas as pd
import numpy as np
import os

# All data locations are resolved against the current working directory, so
# the resulting paths depend on where the kernel/process was started.
BASE_DIR = os.getcwd()
# Directory holding the cleaned CSV files; main() passes it as the input dir.
CLEANED_DIR = os.path.join(BASE_DIR, "../data/cleaned")
# Directory receiving the preprocessed CSV files; main() passes it as the output dir.
PREPROCESSED_DIR = os.path.join(BASE_DIR, "../data/preprocessed")


In [7]:
def preprocess_data(file_path):
    """
    Load a cleaned CSV file, impute missing values and min-max scale numbers.

    Steps, in order:
    - Split columns into numerical (numpy number dtypes) and categorical
      (everything else) based on the dtypes pandas inferred while reading.
    - Fill missing numerical values with the column median.
    - Fill missing categorical values with the column mode (first mode when
      there are ties). A column with no mode (e.g. entirely NaN) is left
      untouched and a warning is printed.
    - Min-max scale every numerical column to the [0, 1] range. A constant
      column (max == min) is mapped to 0.0 instead of being divided by zero,
      which would otherwise turn the whole column into NaN.

    Categorical columns are NOT encoded; they are returned as (filled) values.

    Parameters:
        file_path (str): Path to the cleaned, comma-separated CSV file.

    Returns:
        pd.DataFrame: Preprocessed DataFrame.
    """
    df = pd.read_csv(file_path, delimiter=",")

    numerical_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns

    print(f"🔹 Processing {file_path}...")
    print(f"Numerical columns: {list(numerical_cols)}")
    print(f"Categorical columns: {list(categorical_cols)}\n")

    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())

    for col in categorical_cols:
        # Compute the mode once; it is empty when the column has no non-null values.
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes[0])
        else:
            print(f"⚠️ Warning: No mode found for {col}, leaving NaN values.")

    for col in numerical_cols:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        if col_range == 0:
            # Constant column: (x - min) / 0 would yield NaN for every row.
            df[col] = 0.0
        else:
            # An all-NaN column has a NaN range and lands here, staying NaN.
            df[col] = (df[col] - col_min) / col_range

    return df

In [8]:
def process_datasets(input_dir, output_dir):
    """
    Preprocess every ``.csv`` file found directly inside ``input_dir``.

    The output directory is created if needed. Each CSV is run through
    ``preprocess_data`` and written to ``output_dir`` under the name
    ``preprocessed_<original file name>``. A failure on one file is reported
    via ``print`` and does not stop the remaining files from being processed.

    Parameters:
        input_dir (str): Directory containing the cleaned CSV files.
        output_dir (str): Directory where preprocessed CSV files are written.
    """
    os.makedirs(output_dir, exist_ok=True)

    for entry in os.listdir(input_dir):
        # Anything that is not a CSV file is ignored.
        if not entry.endswith(".csv"):
            continue

        source_path = os.path.join(input_dir, entry)
        try:
            result = preprocess_data(source_path)
            target_path = os.path.join(output_dir, f"preprocessed_{entry}")
            result.to_csv(target_path, index=False, sep=",")
            print(f"✅ Saved preprocessed file: {target_path}\n")
        except Exception as e:
            # Best-effort batch run: report the problem and move on to the next file.
            print(f"❌ Error processing {entry}: {e}\n")

In [9]:
def main():
    """Preprocess all files from CLEANED_DIR and write them to PREPROCESSED_DIR."""
    process_datasets(input_dir=CLEANED_DIR, output_dir=PREPROCESSED_DIR)


if __name__ == "__main__":
    main()

🔹 Processing c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Actual_consumption_202301010000_202503050000_Quarterhour.csv...
Numerical columns: []
Categorical columns: ['Start date', 'End date', 'Total (grid load) [MWh] Original resolutions', 'Residual load [MWh] Original resolutions', 'Hydro pumped storage [MWh] Original resolutions']

✅ Saved preprocessed file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/preprocessed\preprocessed_cleaned_Actual_consumption_202301010000_202503050000_Quarterhour.csv



  df = pd.read_csv(file_path, delimiter=",")


🔹 Processing c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Actual_generation_202301010000_202503050000_Quarterhour.csv...
Numerical columns: []
Categorical columns: ['Start date', 'End date', 'Biomass [MWh] Original resolutions', 'Hydropower [MWh] Original resolutions', 'Wind offshore [MWh] Original resolutions', 'Wind onshore [MWh] Original resolutions', 'Photovoltaics [MWh] Original resolutions', 'Other renewable [MWh] Original resolutions', 'Nuclear [MWh] Original resolutions', 'Lignite [MWh] Original resolutions', 'Hard coal [MWh] Original resolutions', 'Fossil gas [MWh] Original resolutions', 'Hydro pumped storage [MWh] Original resolutions', 'Other conventional [MWh] Original resolutions']

✅ Saved preprocessed file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/preprocessed\preprocessed_cleaned_Actual_generation_202301010000_202503050000

  df = pd.read_csv(file_path, delimiter=",")


🔹 Processing c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Cross-border_physical_flows_202301010000_202503050000_Quarterhour.csv...
Numerical columns: []
Categorical columns: ['Start date', 'End date', 'Net export [MWh] Original resolutions', 'Netherlands (export) [MWh] Original resolutions', 'Netherlands (import) [MWh] Original resolutions', 'Switzerland (export) [MWh] Original resolutions', 'Switzerland (import) [MWh] Original resolutions', 'Denmark (export) [MWh] Original resolutions', 'Denmark (import) [MWh] Original resolutions', 'Czech Republic (export) [MWh] Original resolutions', 'Czech Republic (import) [MWh] Original resolutions', 'Luxembourg (export) [MWh] Original resolutions', 'Luxembourg (import) [MWh] Original resolutions', 'Sweden (export) [MWh] Original resolutions', 'Sweden (import) [MWh] Original resolutions', 'Austria (export) [MWh] Original resolutions', 'Austria (import) [MWh] Original

  df = pd.read_csv(file_path, delimiter=",")


🔹 Processing c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Scheduled_commercial_exchanges_202301010000_202503050000_Quarterhour.csv...
Numerical columns: ['Switzerland (export) [MWh] Original resolutions', 'Czech Republic (export) [MWh] Original resolutions', 'Czech Republic (import) [MWh] Original resolutions', 'Luxembourg (export) [MWh] Original resolutions', 'Luxembourg (import) [MWh] Original resolutions', 'Sweden (export) [MWh] Original resolutions', 'Sweden (import) [MWh] Original resolutions', 'Austria (import) [MWh] Original resolutions', 'Poland (export) [MWh] Original resolutions', 'Poland (import) [MWh] Original resolutions', 'Norway (export) [MWh] Original resolutions', 'Norway (import) [MWh] Original resolutions', 'Belgium (export) [MWh] Original resolutions', 'Belgium (import) [MWh] Original resolutions']
Categorical columns: ['Start date', 'End date', 'Net export [MWh] Original resolutions', 