In [34]:
import pandas as pd
import os

BASE_DIR = os.getcwd()
RAW_DIR = os.path.join(BASE_DIR, "../data/raw")
CLEANED_DIR = os.path.join(BASE_DIR, "../data/cleaned")


In [35]:
def clean_data(file_path, datetime_cols=None, delimiter=";"):
    df = pd.read_csv(file_path, delimiter=delimiter, low_memory=False)
    
    if datetime_cols is None:
        datetime_cols = [col for col in df.columns if "date" in col.lower() or "time" in col.lower()]
    
    for col in datetime_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    
    # missing_dates = df[datetime_cols].isna().sum()
    # print(f"⏳ Missing datetime values before cleaning:\n{missing_dates}")
    
    if datetime_cols:
        df = df.dropna(subset=datetime_cols, how="all")
    
    if df.empty:
        print(f"⚠️ Warning: Cleaned dataset is empty after processing {file_path}!")
    
    if datetime_cols and not df.empty:
        df = df.sort_values(by=datetime_cols[0]).reset_index(drop=True)
    
    return df

In [36]:


def clean_datasets(original_folder=RAW_DIR, cleaned_folder=CLEANED_DIR):
    os.makedirs(cleaned_folder, exist_ok=True)
    
    for file_name in os.listdir(original_folder):
        if file_name.endswith(".csv"):
            original_file_path = os.path.join(original_folder, file_name)
            print(f"🔄 Cleaning {file_name}...")
            
            cleaned_df = clean_data(original_file_path)
            
            if not cleaned_df.empty:
                cleaned_file_path = os.path.join(cleaned_folder, f"cleaned_{file_name}")
                cleaned_df.to_csv(cleaned_file_path, index=False, sep=",")
                print(f"✅ Saved cleaned file: {cleaned_file_path}\n")
            else:
                print(f"⚠️ Skipped saving {file_name} because the cleaned data is empty.\n")
    
    print("🎉 All datasets cleaned and saved successfully!")

In [37]:
def main():
    clean_datasets()

if __name__ == "__main__":
    main()

🔄 Cleaning Actual_consumption_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Actual_consumption_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Actual_generation_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Actual_generation_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Automatic_Frequency_Restoration_Reserve_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Automatic_Frequency_Restoration_Reserve_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Balancing_energy_202301010000_202503050000_Quarterhour_Month.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Balancing_energy_202301010000_202503050000_Quarterhour_Month.csv

🔄 Cleaning Costs_of_TSOs__without_costs_of_DSOs__202301010000_202503050000_Month.csv...
✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Costs_of_TSOs__without_costs_of_DSOs__202301010000_202503050000_Month.csv

🔄 Cleaning Cross-border_physical_flows_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Cross-border_physical_flows_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Day-ahead_prices_202301010000_202503050000_Hour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Day-ahead_prices_202301010000_202503050000_Hour.csv

🔄 Cleaning Exported_balancing_services_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Exported_balancing_services_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Forecasted_consumption_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Forecasted_consumption_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Forecasted_generation_Day-Ahead_202301010000_202503050000_Hour_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Forecasted_generation_Day-Ahead_202301010000_202503050000_Hour_Quarterhour.csv

🔄 Cleaning Frequency_Containment_Reserve_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Frequency_Containment_Reserve_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Generation_Forecast_Intraday_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Generation_Forecast_Intraday_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Imported_balancing_services_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Imported_balancing_services_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Installed_generation_capacity_202301010000_202503050000_Year.csv...
✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Installed_generation_capacity_202301010000_202503050000_Year.csv

🔄 Cleaning Manual_Frequency_Restoration_Reserve_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Manual_Frequency_Restoration_Reserve_202301010000_202503050000_Quarterhour.csv

🔄 Cleaning Scheduled_commercial_exchanges_202301010000_202503050000_Quarterhour.csv...


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


✅ Saved cleaned file: c:\Users\Elite\Documents\ocean\PowerCast The Electricity Price Forecasting Challenge\PowerCast_v2\src\../data/cleaned\cleaned_Scheduled_commercial_exchanges_202301010000_202503050000_Quarterhour.csv

🎉 All datasets cleaned and saved successfully!
