# Missing Values Imputation
This notebook loads the preprocessed data and imputes the missing values for each station.

## 0 - Setup

### 0.1 - Imports
Load the necessary dependencies.

In [43]:
from ydata.connectors import LocalConnector, GCSConnector
from ydata.utils.formats import read_json
from ydata.quality.impute.timeseries import TSMissingImputer

### 0.2 - Auxiliary Functions
The auxiliary functions are custom-designed utilities developed for the REE use case.

In [23]:
from utils import preprocess_data
from imputation import (get_cold_start_meters, get_proxy_data,
                        resample_station_data, data_boundaries, load_factors)

## 1 - Load Data
The training data comprises the preprocessed readings up to August 2021.

In [24]:
# Create the connector used to read the input files.
# Note: this is a LocalConnector, not the Google Cloud Storage one.
connector = LocalConnector()

# Read the train data
data = connector.read_file('train_allmeters.csv')

In [25]:
# Load the factors from the JSON file; they are handed to the imputer
# through the `add_factors` argument of `fit` further down.
add_factors = load_factors('df_factors_2018_2021.json')

## 2 - Data Processing

### 2.1 - Data Wrangling
Parse the data into the correct types and with the right format.

In [26]:
# Preprocess data to be ready for imputation.
# NOTE(review): `data` is rebound to the preprocessed result, so re-running this
# cell without reloading feeds already-preprocessed data back into preprocess_data.
data = preprocess_data(data)

### 2.2 - Cold Start
Training on cold-start meters (i.e. meters without any observed values) should be done separately from the rest of the meters.

In [27]:
# Get a list of cold-start meters from the preprocessed data.
cold_start_meters = get_cold_start_meters(data)
# Bare expression so the notebook displays the list as the cell output.
cold_start_meters

['aysha1', 'gode1']

In [28]:
# Proxy stations for each cold-start meter: the key is the cold-start meter,
# the value lists the stations whose data stands in for it.
proxy_stations = dict(
    aysha1=['diredawa1', 'tuluguled1'],
    gode1=['seladingay', 'ziway'],
)

In [29]:
# Collect the proxy data for the cold-start meters from the stations in `proxy_stations`.
# The result is consumed as a mapping (`.items()`) in the data-boundaries step.
proxy_data = get_proxy_data(proxy_stations, data)

In [30]:
# Keep only the rows whose station is one of the cold-start meters.
cold_start_mask = data['station'].isin(cold_start_meters)
cold_start_data = data[cold_start_mask]

### 2.3 - Data Boundaries

In [31]:
# Run data_boundaries (with replace_na=True) over every dataframe used as
# proxy data for cold start, keeping the same keys.
proxy_data = {
    name: data_boundaries(frame, replace_na=True)
    for name, frame in proxy_data.items()
}

## 3 - Imputer
The TSMissingImputer is responsible for imputing the missing values in time series.
- Learns the temporal dynamics from the observed values
- Supports multiple entities with the `partition_by` parameter
- Follows the usual scikit-learn method interfaces (e.g. fit, transform)

### 3.1 - Train the TSMissing Imputer

In [32]:
# Instantiate the imputer with its default arguments (it is fitted in the next cell)
imputer = TSMissingImputer()

In [33]:
# Fit the imputer on the cold-start data, partitioned by station.
# Only 'speed' is passed as a numerical column; the proxy data and the
# additional factors are supplied alongside.
imputer.fit(
    cold_start_data,
    partition_by='station',
    num_cols=['speed'],
    proxy_data=proxy_data,
    add_factors=add_factors,
)

TSMissingImputer()

### 3.2 - Impute for Full Year
Construct a full year of data, on an hourly basis, for devices with observed readings. For each hour, the average wind speed/wind direction is calculated and used as the ground truth for the observed readings.

In [34]:
# Create a DataFrame of a whole year for the cold-start meters
# (resample_station_data is called here with its default time range).
whole_year = resample_station_data(cold_start_data)

In [35]:
# Apply the fitted imputer to the whole-year data to fill in its missing values.
# The validation cell later asserts that no missing value remains in the result.
reconstructed = imputer.transform(whole_year)

### 3.3 - Impute for Holdout
Construct a full month of holdout data, on an hourly basis, for devices with observed readings.

In [39]:
# Load the holdout readings and run them through the same preprocessing as the train data.
holdout = preprocess_data(connector.read_file('holdout_allmeters.csv'))

# Keep only the cold-start meters.
holdout = holdout[holdout['station'].isin(cold_start_meters)]

# Resample over the holdout window, then apply the missing values imputation
# to reconstruct the holdout period.
whole_holdout = resample_station_data(
    holdout,
    start_ts='2021-09-01',
    end_ts='2021-10-01',
)
holdout_reconstructed = imputer.transform(whole_holdout)

### 3.3 - Data Validation

In [40]:
# After reconstruction, no value should be missing.
# Count the missing cells of each reconstructed dataset and require zero.
missing_in_train = reconstructed.isna().sum().sum()
assert missing_in_train == 0, "The reconstructed dataset contains missing values after reconstruction."

missing_in_holdout = holdout_reconstructed.isna().sum().sum()
assert missing_in_holdout == 0, "The reconstructed dataset of holdout contains missing values after reconstruction."

### 3.4 - Data PostProcessing
The imputation of time series is applicable to any type of numerical data and is therefore agnostic to the physical boundaries of wind measurements. To guarantee adequacy for wind speed and direction, we enforce that wind speed cannot be negative and that wind direction lies within the valid angular range (between 0 and 360 degrees).

In [41]:
# Postprocess the training data.
# data_boundaries is called without replace_na here (unlike the proxy-data call),
# so the helper's default for that argument applies.
postprocessed = data_boundaries(data=reconstructed)

# Postprocess the holdout data (same call, same defaults)
postprocessed_holdout = data_boundaries(data=holdout_reconstructed)

## 4 - Store Data
After the data is fully reconstructed, store it in cloud storage.

In [47]:
# Load the credentials
credentials = read_json('gcs_credentials.json')

# Create the connector for Google Cloud Storage.
# It gets its own name so that `connector` keeps pointing at the LocalConnector
# used by the read cells; rebinding it here made those cells depend on
# execution order.
gcs_connector = GCSConnector('ydatasynthetic', gcs_credentials=credentials)

# Both artifacts are written under the same output prefix.
output_prefix = 'gs://pipelines_artifacts/wind_measurements_pipeline/outputs'

# Store the whole year reconstructed
gcs_connector.write_file(
    data=postprocessed,
    path=f'{output_prefix}/whole_year_coldstart_reconstructed.csv',
    index=True,
)

# Store the holdout
gcs_connector.write_file(
    data=postprocessed_holdout,
    path=f'{output_prefix}/holdout_coldstart_reconstructed.csv',
    index=True,
)

OSError: Timed out trying to connect to gateway://gateway.dask.svc.cluster.local:80/dask.b8671cbf08b747008d39d02c4dff82f1 after 30 s