In [18]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from glob import glob
from typing import Optional

import pandas as pd

In [20]:
%autoreload 2

from src.data_cleaning import expand_to_full_length, remove_drops, remove_peaks

General definitions

In [21]:
# Get definitions
# Runs the shared definitions notebook. START, END, TZ, MIN_TIMESTEP and GEN_TYPES,
# which are used further down but never assigned in this notebook, presumably come
# from there — TODO confirm.
%run "00-definitions.ipynb"

# Passed to expand_to_full_length as max_gap in clean_data.
# NOTE(review): presumably the longest gap (in time steps) that still gets filled —
# confirm the unit against src.data_cleaning.
MAX_GAP_TO_FILL = 4

# Clean raw data

Fill in the gaps in the raw generation data by interpolating.

In [22]:
def clean_data(category: str, technology: Optional[str] = None):
    """
    Clean one raw data set and write the result and its statistics to disk.

    Reads the ENTSO-E file for the given category/technology, adds the columns
    of every matching file found in ``../data/raw/others``, passes each column
    through ``expand_to_full_length``, ``remove_drops`` and ``remove_peaks``,
    and writes the cleaned data and the collected statistics as CSV files to
    ``../data/intermediate``.

    Args:
        category: category of data, one of 'generation', 'load', etc.
        technology: optional technology name (e.g. 'Solar'). When given, it is
            appended to the raw and intermediate file names.

    Returns:
        None. The results are written to disk.
    """

    # The four file names differ only in how the optional technology is appended.
    raw_suffix = f"_{technology}" if technology else ""
    out_suffix = f" {technology}" if technology else ""

    entso_e_raw_filename = f"../data/raw/ENTSO-E_TP/ENTSO-E_TP_{category}{raw_suffix}.csv"
    other_raw_filename_pattern = f"../data/raw/others/*_{category}{raw_suffix}.csv"
    intermediate_filename = f"../data/intermediate/{category.capitalize()}{out_suffix}.csv"
    statistics_filename = f"../data/intermediate/Statistics {category}{out_suffix}.csv"

    # load ENTSO-E data
    print(f"Reading file {entso_e_raw_filename}...")
    raw_data = pd.read_csv(entso_e_raw_filename,
                           index_col=0, parse_dates=True)

    # Load data from other sources and merge.
    # glob() returns paths in arbitrary order; sorting makes it deterministic
    # which source wins when two files provide the same column.
    for other_src_path in sorted(glob(other_raw_filename_pattern)):
        print(f"Reading file {other_src_path}...")
        other_source = pd.read_csv(other_src_path,
                                   index_col=0, parse_dates=True)
        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that exists in old and new versions alike.
        for col, ts in other_source.items():
            # A column with the same name replaces the ENTSO-E one.
            # NOTE(review): the assignment aligns on raw_data's index, so
            # timestamps present only in the other source are dropped here —
            # confirm this is intended.
            raw_data[col] = ts

    # The same frame is handed to every cleaning step; presumably each step
    # records its per-column statistics in it — TODO confirm in src.data_cleaning.
    stats = pd.DataFrame()
    cleaned_data = (raw_data.apply(expand_to_full_length, stats=stats,
                                   start=START, end=END, tzone=TZ,
                                   max_gap=MAX_GAP_TO_FILL, min_timestep=MIN_TIMESTEP)
                            .apply(remove_drops, stats_df=stats)
                            .apply(remove_peaks, stats_df=stats)
                    )

    # Write to disk
    cleaned_data.to_csv(intermediate_filename)
    print(f'Wrote {intermediate_filename}')
    stats.to_csv(statistics_filename)
    print(f'Wrote {statistics_filename}')

In [23]:
# Clean the raw 'generation' data once per technology listed in GEN_TYPES.
# GEN_TYPES is not assigned in this notebook; presumably it is provided by
# 00-definitions.ipynb — TODO confirm.
for tech in GEN_TYPES:
    clean_data('generation', tech)

Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_generation_Solar.csv...
Reading file ../data/raw/others\SVK_generation_Solar.csv...
Filling gaps for ES. . .
Filling gaps for FR. . .
Filling gaps for GB. . .
Filling gaps for IT. . .
Filling gaps for NL. . .
Filling gaps for SE. . .


  mns = a.mean(axis=axis, keepdims=True)
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = um.true_divide(


Wrote ../data/intermediate/Generation Solar.csv
Wrote ../data/intermediate/Statistics generation Solar.csv
Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_generation_Wind Onshore.csv...
Reading file ../data/raw/others\Energiateollisuus_generation_Wind Onshore.csv...
Reading file ../data/raw/others\ERCOT_generation_Wind Onshore.csv...
Reading file ../data/raw/others\NVE_generation_Wind Onshore.csv...
Reading file ../data/raw/others\REN_generation_Wind Onshore.csv...
Reading file ../data/raw/others\SVK_generation_Wind Onshore.csv...
Filling gaps for ES. . .
Filling gaps for FI. . .
Filling gaps for FR. . .
Filling gaps for GB. . .
Filling gaps for GB-NIR. . .
Filling gaps for IE. . .
Filling gaps for IT. . .
Filling gaps for NL. . .
Filling gaps for PT. . .
Filling gaps for SE. . .
Filling gaps for US-TX. . .
Wrote ../data/intermediate/Generation Wind Onshore.csv
Wrote ../data/intermediate/Statistics generation Wind Onshore.csv
Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_generation_Win

  return (a - mns) / sstd


Wrote ../data/intermediate/Generation Wind Offshore.csv
Wrote ../data/intermediate/Statistics generation Wind Offshore.csv


Fill in the gaps in the raw load and load forecast data by interpolating.

In [24]:
# Clean the 'load' and 'load_forecast' categories; no technology is passed,
# so the file names carry no technology suffix.
clean_data('load')
clean_data('load_forecast')

Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_load.csv...
Filling gaps for DE. . .
Filling gaps for ES. . .
Filling gaps for FI. . .
Filling gaps for FR. . .
Filling gaps for GB. . .
Filling gaps for GB-NIR. . .
Filling gaps for IE. . .
Filling gaps for NO. . .
Wrote ../data/intermediate/Load.csv
Wrote ../data/intermediate/Statistics load.csv
Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_load_forecast.csv...
Filling gaps for DE. . .
Filling gaps for GB-NIR. . .
Filling gaps for IE. . .
Filling gaps for NO. . .
Wrote ../data/intermediate/Load_forecast.csv
Wrote ../data/intermediate/Statistics load_forecast.csv


In [25]:
# Clean the raw 'forecast' data once per technology listed in GEN_TYPES.
# GEN_TYPES is not assigned in this notebook; presumably it is provided by
# 00-definitions.ipynb — TODO confirm.
for tech in GEN_TYPES:
    clean_data('forecast', tech)

Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_forecast_Solar.csv...
Filling gaps for DE. . .
Filling gaps for FR. . .
Filling gaps for NL. . .
Filling gaps for NO. . .
Filling gaps for PT. . .
Wrote ../data/intermediate/Forecast Solar.csv
Wrote ../data/intermediate/Statistics forecast Solar.csv
Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_forecast_Wind Onshore.csv...
Filling gaps for DE. . .
Filling gaps for FI. . .
Filling gaps for FR. . .
Filling gaps for GB-NIR. . .
Filling gaps for IE. . .
Filling gaps for NL. . .
Filling gaps for NO. . .
Filling gaps for PT. . .
Filling gaps for SE. . .
Wrote ../data/intermediate/Forecast Wind Onshore.csv
Wrote ../data/intermediate/Statistics forecast Wind Onshore.csv
Reading file ../data/raw/ENTSO-E_TP/ENTSO-E_TP_forecast_Wind Offshore.csv...
Filling gaps for NL. . .
Filling gaps for NO. . .
Wrote ../data/intermediate/Forecast Wind Offshore.csv
Wrote ../data/intermediate/Statistics forecast Wind Offshore.csv
