# Factors
This notebook calculates the factors used to add seasonality patterns to reconstructed data.

Factors are multiplicative scalars applied to a numerical feature, one per group-by value (e.g. the ratio of the average monthly windspeed to the annual average windspeed).

## 0 - Setup

### 0.1 - Imports
Load the necessary dependencies.

In [1]:
%%capture
from ydata.connectors import GCSConnector
from ydata.utils.formats import read_json
from typing import List
from pandas import concat

### 0.2 - Auxiliary Functions
The auxiliary functions are custom-designed utilities developed for the use case.

In [2]:
from factors import save_json

## 1 - Load Data

In [3]:
# Load the Google Cloud Storage credentials from the 'gcs_credentials.json' file
credentials = read_json('gcs_credentials.json')

# Create the connector for Google Cloud Storage; it is passed further down to
# get_monthly_avg_per_year, which uses it to read the yearly CSV files.
# NOTE(review): 'ydatasynthetic' is presumably the GCP project identifier — confirm.
connector = GCSConnector('ydatasynthetic', gcs_credentials=credentials)

## 2 - Calculate Factors
Calculate the monthly windspeed factors (monthly average divided by the annual average) for meters in the relevant provinces, per year and averaged across years.

In [4]:
# Years to process: one CSV of monthly/yearly averages is read per year.
# Used as the default value of `years` in get_monthly_avg_per_year.
YEARS = [2018, 2019, 2020, 2021]

In [5]:
# Internal IDs of the stations located in the same provinces as the original data;
# used to filter the yearly data (indexed by 'name_station') before computing factors.

meter_ids = read_json('meter_ids.json')

In [6]:
def get_monthly_avg_per_year(connector: GCSConnector, meter_ids: List, years=YEARS):
    """Compute, per year, the mean ratio of each monthly average to the annual average.

    For every year in ``years`` the file ``df_for_factors_<year>.csv`` is read
    through ``connector``, indexed by ``name_station``, restricted to the
    stations listed in ``meter_ids`` and stripped of rows with missing values.
    Each monthly column is divided by the ``yearly`` column of the same row,
    and the resulting ratios are averaged over the remaining stations.

    Args:
        connector: Connector used to read the yearly CSV files. It must expose
            ``read_file(path, assume_missing=True)`` returning an object with
            a ``to_pandas()`` method.
        meter_ids: Station identifiers (values of ``name_station``) to keep.
        years: Years to process, one CSV per year. Defaults to ``YEARS``.

    Returns:
        A DataFrame indexed by month number (1-12) with one column per year
        plus an ``avg`` column holding the row-wise mean across those years.

    Raises:
        ValueError: If ``years`` is empty.
    """
    # Month columns as named in the CSV files, in calendar order.
    month_names = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
                   'august', 'september', 'october', 'november', 'december']

    # Map each month name to its 1-based calendar index (january -> 1, ...).
    month_numbers = {name: number for number, name in enumerate(month_names, start=1)}

    years = list(years)
    if not years:
        # concat() would fail with an opaque "No objects to concatenate" error.
        raise ValueError("`years` must contain at least one year.")

    yearly_factors = []  # one Series of monthly factors per processed year

    for year in years:
        filepath = f'gs://pipelines_artifacts/wind_measurements_pipeline/data/df_for_factors_{year}.csv'

        # Read the monthly and yearly averages of every station for this year.
        averages = connector.read_file(filepath, assume_missing=True).to_pandas().set_index('name_station')

        # Keep only the stations in the same provinces, then drop incomplete rows.
        averages = averages[averages.index.isin(meter_ids)].dropna()

        # Row-wise ratio of each monthly average over the annual average. A single
        # vectorized division builds a new frame instead of assigning columns one
        # by one into a filtered frame (which can trigger SettingWithCopyWarning).
        ratios = averages[month_names].div(averages['yearly'], axis=0)

        # Average the ratios over the stations: one factor per month.
        yearly_factors.append(ratios.mean())

    # Aggregate data: months as rows (by number), years as columns.
    agg_data = concat(yearly_factors, axis=1)
    agg_data.columns = years
    agg_data = agg_data.rename(index=month_numbers)
    agg_data['avg'] = agg_data.mean(axis=1)  # mean factor across all years
    return agg_data

In [7]:
# Monthly factors for each of the default YEARS, plus their cross-year mean in the 'avg' column
factors = get_monthly_avg_per_year(connector=connector, meter_ids=meter_ids)

## 3 - Store Data
Save the factors as a JSON file containing one windspeed factor per month.

In [8]:
# Store the cross-year average factor of each month (keyed by month number) under 'windspeed'.
# NOTE(review): the file name hard-codes the 2018-2021 range; keep it in sync with YEARS if that changes.
save_json({'windspeed': factors['avg'].to_dict()}, 'df_factors_2018_2021.json')