In [2]:
!pip3 install --quiet matplotlib numpy pandas scikit-learn seaborn

# Data Cleaning

In [3]:
# Basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [4]:
# Run from the repository root: when no 'data' entry is visible from the
# current working directory, move one level up to the parent directory.

if not os.path.exists('data'):
    current_path = os.getcwd()
    # os.pardir is the platform's parent-directory token ('..')
    root_path = os.path.abspath(os.path.join(current_path, os.pardir))
    os.chdir(root_path)
    print(f'Changed working directory to: {os.getcwd()}')

Changed working directory to: /Users/thiagonarcizo/Code/EI-ST4/Forecast-of-Electricity-Consumption


In [5]:
# Raw CSV inputs. Two files need non-default parsing options:
#   - acorn_details.csv is read as ISO-8859-1
#   - temperatures.csv is read with ';' as separator and ',' as decimal mark
acorn_details = pd.read_csv(
    'data/00_raw/acorn_details.csv',
    encoding='ISO-8859-1',
)
temperatures = pd.read_csv(
    'data/00_raw/temperatures.csv',
    sep=';',
    decimal=',',
    encoding='utf-8',
)
uk_bank_holidays = pd.read_csv('data/00_raw/uk_bank_holidays.csv')
weather_daily = pd.read_csv('data/00_raw/weather_daily_darksky.csv')
weather_hourly = pd.read_csv('data/00_raw/weather_hourly_darksky.csv')

# Processed group-4 tables, stored as parquet
group_4_daily_predict = pd.read_parquet(
    'data/02_processed/parquet/group_4_daily_predict.parquet'
)
group_4_half_hourly_predict = pd.read_parquet(
    'data/02_processed/parquet/group_4_half_hourly_predict.parquet'
)
group_4_daily = pd.read_parquet(
    'data/02_processed/parquet/group_4_daily.parquet'
)
group_4_half_hourly = pd.read_parquet(
    'data/02_processed/parquet/group_4_half_hourly.parquet'
)

In [6]:
# Parse the timestamp columns that were loaded from CSV.
# format='mixed' lets pandas infer the format of each value individually.
temperatures['DateTime'] = pd.to_datetime(
    temperatures['DateTime'],
    format='mixed',
)
uk_bank_holidays['Bank holidays'] = pd.to_datetime(
    uk_bank_holidays['Bank holidays'],
    format='mixed',
)

# Every column of weather_daily that is converted to datetime below
datetime_columns = [
    'temperatureMaxTime',
    'temperatureMinTime',
    'apparentTemperatureMinTime',
    'apparentTemperatureHighTime',
    'time',
    'sunsetTime',
    'sunriseTime',
    'temperatureHighTime',
    'uvIndexTime',
    'temperatureLowTime',
    'apparentTemperatureMaxTime',
    'apparentTemperatureLowTime',
]

# Convert the listed columns one at a time, overwriting each in weather_daily
for col in datetime_columns:
    weather_daily[col] = pd.to_datetime(weather_daily[col])

In [10]:
# Function to check for nulls and NaNs
def check_nulls_and_nans(df):
    """
    Count the missing values in each column of a DataFrame.

    ``DataFrame.isnull`` is an alias of ``DataFrame.isna``, so a separate
    "null" count and "NaN" count can never differ. The frame is therefore
    scanned once, and the second Series is an independent copy of the first.
    The two-value return is kept so callers that unpack ``nulls, nans``
    keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Two equal, independent Series indexed by column name, each holding
        the number of missing entries in that column.
    """
    missing_counts = df.isna().sum()
    # Return a copy so that mutating one result cannot affect the other
    return missing_counts, missing_counts.copy()

# Checking Null values and NaN in the dataframes
# nulls_nans_info maps each dataframe name to the (nulls, nans) pair
# returned by check_nulls_and_nans.
nulls_nans_info = {}
df_names = ['acorn_details', 'temperatures', 'uk_bank_holidays', 'weather_daily', 'weather_hourly',
           'group_4_daily_predict', 'group_4_half_hourly_predict', 'group_4_daily', 'group_4_half_hourly']

# Build the list of frames here, in the same order as df_names, so this cell
# does not rely on a `dfs` variable left over from another (unseen) cell and
# still works after Restart Kernel -> Run All.
dfs = [acorn_details, temperatures, uk_bank_holidays, weather_daily, weather_hourly,
       group_4_daily_predict, group_4_half_hourly_predict, group_4_daily, group_4_half_hourly]
assert len(dfs) == len(df_names), "df_names and dfs must list the same dataframes"

# Check each dataframe and store results under its name
for df_name, df in zip(df_names, dfs):
    nulls_nans_info[df_name] = check_nulls_and_nans(df)

# Display results and identify which dataframes are OK (no nulls/NaNs)
print("NULL AND NaN ANALYSIS SUMMARY")
print("=" * 50)

ok_dataframes = []
problematic_dataframes = []

for df_name, (nulls, nans) in nulls_nans_info.items():
    total_nulls = nulls.sum()
    total_nans = nans.sum()

    print(f"\n{df_name.upper()}:")
    print(f"  Total nulls: {total_nulls}")
    print(f"  Total NaNs: {total_nans}")

    if total_nulls > 0 or total_nans > 0:
        print("  HAS ISSUES")
        problematic_dataframes.append(df_name)
        # Show which columns have issues (only columns with a non-zero count)
        if total_nulls > 0:
            print(f"  Columns with nulls: {nulls[nulls > 0].to_dict()}")
        if total_nans > 0:
            print(f"  Columns with NaNs: {nans[nans > 0].to_dict()}")
    else:
        print("  OK - No nulls or NaNs")
        ok_dataframes.append(df_name)

print("\n" + "=" * 50)
print("SUMMARY:")
print(f"OK dataframes ({len(ok_dataframes)}): {ok_dataframes}")
print(f"Problematic dataframes ({len(problematic_dataframes)}): {problematic_dataframes}")

NULL AND NaN ANALYSIS SUMMARY

ACORN_DETAILS:
  Total nulls: 1
  Total NaNs: 1
  HAS ISSUES
  Columns with nulls: {'REFERENCE': 1}
  Columns with NaNs: {'REFERENCE': 1}

TEMPERATURES:
  Total nulls: 252
  Total NaNs: 252
  HAS ISSUES
  Columns with nulls: {'Temperature': 252}
  Columns with NaNs: {'Temperature': 252}

UK_BANK_HOLIDAYS:
  Total nulls: 0
  Total NaNs: 0
  OK - No nulls or NaNs

WEATHER_DAILY:
  Total nulls: 3
  Total NaNs: 3
  HAS ISSUES
  Columns with nulls: {'cloudCover': 1, 'uvIndex': 1, 'uvIndexTime': 1}
  Columns with NaNs: {'cloudCover': 1, 'uvIndex': 1, 'uvIndexTime': 1}

WEATHER_HOURLY:
  Total nulls: 13
  Total NaNs: 13
  HAS ISSUES
  Columns with nulls: {'pressure': 13}
  Columns with NaNs: {'pressure': 13}

GROUP_4_DAILY_PREDICT:
  Total nulls: 96
  Total NaNs: 96
  HAS ISSUES
  Columns with nulls: {'Conso_kWh_predict': 96}
  Columns with NaNs: {'Conso_kWh_predict': 96}

GROUP_4_HALF_HOURLY_PREDICT:
  Total nulls: 288
  Total NaNs: 288
  HAS ISSUES
  Columns w