In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import plotly.express as px

from var import DATA_IN, DATA_OUT, START_DATE, END_DATE, FORECAST_HOURS_IN_ADVANCE
from src.io import read_time_series
from src.preprocess import (
    resample_time_series,
    get_categories,
    get_solar_position,
    preprocess_ionosonde_data,
)

## TIDs catalog

In [2]:
# Load the TID catalog through the project reader; the column names are
# supplied explicitly, in the order expected for the CSV columns.
df_tid = read_time_series(
    Path(DATA_IN) / 'TID_catalog.csv',
    column_names=[
        'duration', 'period', 'amplitude', 'spectral_contribution',
        'velocity', 'azimuth', 'quality_index', 'datetime',
    ],
)

In [3]:
# Resample the catalog keeping the maximum of every bin
# (the `_30` suffix suggests 30-minute bins — TODO confirm in src.preprocess).
df_tid_30 = resample_time_series(
    df_tid,
    aggregation_function='max',
)

Given the duration of a TID event, we repeat the information describing it at every 30-minute step for as long as the event lasted.

In [4]:
# Number of 30-minute steps spanned by each event.
# NOTE(review): the `* 60 / 30` conversion implies `duration` is expressed in hours — confirm.
df_tid_30['total_periods'] = (df_tid_30['duration'] * 60 / 30).round()

# Build one small frame per event and concatenate them once at the end:
# calling `pd.concat` inside the loop copies the accumulated frame at every
# iteration (quadratic cost) and starts from an empty, dtype-less DataFrame.
event_frames = []
for _, row in df_tid_30.dropna().reset_index().iterrows():
    periods = int(row['total_periods'])
    if periods < 1:
        # Events rounding to zero steps contribute no rows.
        continue

    # '30min' replaces the '30T' alias, which is deprecated since pandas 2.2.
    datetimes = pd.date_range(start=row['datetime'], periods=periods, freq='30min')

    # Every column after the leading 'datetime' is a scalar that gets
    # broadcast over all the steps of the event.
    event_frames.append(
        pd.DataFrame({'datetime': datetimes, **row.iloc[1:].to_dict()})
    )

df_tid_30_ = pd.concat(event_frames, ignore_index=True)

# NOTE(review): events overlapping in time produce repeated timestamps here;
# the duplicate check on `df_j` further down is what catches them.
df_tid_30_ = df_tid_30_.set_index('datetime').drop(columns='total_periods')

## Ionosonde Data

### Athens

In [26]:
# Athens (station code AT138): median of the continuous quantities,
# maximum of the warning level, restricted to the study period.
df_athens_30 = preprocess_ionosonde_data(
    station_name='AT138',
    aggregation_function=dict(
        spectral_contribution='median',
        velocity='median',
        azimuth='median',
        local_warning_level='max',
    ),
)
df_athens_30 = df_athens_30.loc[START_DATE:END_DATE]

### Fairford

In [27]:
# Fairford (station code FF051): median of the continuous quantities,
# maximum of the warning level, restricted to the study period.
df_fairford_30 = preprocess_ionosonde_data(
    station_name='FF051',
    aggregation_function=dict(
        spectral_contribution='median',
        velocity='median',
        azimuth='median',
        local_warning_level='max',
    ),
)
df_fairford_30 = df_fairford_30.loc[START_DATE:END_DATE]

### Juliusruh

In [28]:
# Juliusruh (station code JR055): median of the continuous quantities,
# maximum of the warning level, restricted to the study period.
df_juliusruh_30 = preprocess_ionosonde_data(
    station_name='JR055',
    aggregation_function=dict(
        spectral_contribution='median',
        velocity='median',
        azimuth='median',
        local_warning_level='max',
    ),
)
df_juliusruh_30 = df_juliusruh_30.loc[START_DATE:END_DATE]

### Pruhonice

In [30]:
# Pruhonice (station code PQ052): median of the continuous quantities,
# maximum of the warning level, restricted to the study period.
df_pruhonice_30 = preprocess_ionosonde_data(
    station_name='PQ052',
    aggregation_function=dict(
        spectral_contribution='median',
        velocity='median',
        azimuth='median',
        local_warning_level='max',
    ),
)
df_pruhonice_30 = df_pruhonice_30.loc[START_DATE:END_DATE]

### Rome

In [31]:
# Rome (station code RO041): median of the continuous quantities,
# maximum of the warning level, restricted to the study period.
df_rome_30 = preprocess_ionosonde_data(
    station_name='RO041',
    aggregation_function=dict(
        spectral_contribution='median',
        velocity='median',
        azimuth='median',
        local_warning_level='max',
    ),
)
df_rome_30 = df_rome_30.loc[START_DATE:END_DATE]

### San Vito

In [32]:
# San Vito (station code VT139): median of the continuous quantities,
# maximum of the warning level, restricted to the study period.
df_svito_30 = preprocess_ionosonde_data(
    station_name='VT139',
    aggregation_function=dict(
        spectral_contribution='median',
        velocity='median',
        azimuth='median',
        local_warning_level='max',
    ),
)
df_svito_30 = df_svito_30.loc[START_DATE:END_DATE]

### All ionosondes

In [33]:
# Left-join every station onto the Athens time index, one station at a time
# and in the same order as before.
# NOTE(review): if the station frames share column names, the default
# `_x`/`_y` suffixes will collide after a few merges — confirm that
# `preprocess_ionosonde_data` returns station-specific column names.
df_ionosondes_30 = df_athens_30
for station_frame_ in (
    df_fairford_30,
    df_juliusruh_30,
    df_pruhonice_30,
    df_rome_30,
    df_svito_30,
):
    df_ionosondes_30 = df_ionosondes_30.merge(
        station_frame_,
        left_index=True,
        right_index=True,
        how='left',
    )

## L1 data

In [None]:
# Solar-wind data: only four columns of the CSV are read (positions 0, 6, 8, 11)
# and named datetime / bz / vx / rho, then the study period is selected.
df_l1 = read_time_series(
    Path(DATA_IN) / 'SolarWind_Projected_Merged.csv',
    column_names=['datetime', 'bz', 'vx', 'rho'],
    usecols=[0, 6, 8, 11],
)
df_l1 = df_l1.loc[START_DATE:END_DATE]

In [None]:
# Resample the solar-wind series using the median of every bin.
df_l1_30 = resample_time_series(
    df_l1,
    aggregation_function='median',
)

## HF-EU index

In [None]:
# HF-EU index, restricted to the study period.
df_hf = read_time_series(
    Path(DATA_IN) / 'HF_EU_IDX.csv',
    column_names=['datetime', 'hf'],
)
df_hf = df_hf.loc[START_DATE:END_DATE]

In [None]:
# Resample with the mean of every bin and keep two decimals of the index.
df_hf_30 = resample_time_series(
    df_hf,
    aggregation_function='mean',
).assign(hf=lambda frame_: frame_['hf'].round(2))

## SMR (SuperMAG partial ring current index)

In [None]:
# SMR index, restricted to the study period.
df_smr = read_time_series(
    Path(DATA_IN) / 'SMR.csv',
    column_names=['datetime', 'smr'],
)
df_smr = df_smr.loc[START_DATE:END_DATE]

In [None]:
# Resample the SMR series using the median of every bin.
df_smr_30 = resample_time_series(
    df_smr,
    aggregation_function='median',
)

## HP-30

In [None]:
# Hp30 index: only CSV columns 0 and 2 are read, named hp_30 / datetime,
# then the study period is selected.
df_hp = read_time_series(
    Path(DATA_IN) / 'Hp30_ap30_IDX.csv',
    column_names=['hp_30', 'datetime'],
    usecols=[0, 2],
)
df_hp = df_hp.loc[START_DATE:END_DATE]

In [None]:
# Resample the Hp30 series using the median of every bin.
df_hp_30 = resample_time_series(
    df_hp,
    aggregation_function='median',
)

## Solar Data

In [None]:
# Solar data; the dates in the file are parsed with the "%d-%b-%Y" format.
# NOTE(review): 'n_sunsposts' looks like a typo for 'n_sunspots'; the spelling
# is kept because later cells reference the column under this exact name.
df_solar = read_time_series(
    Path(DATA_IN) / 'solar_data.csv',
    column_names=['day_of_rotation', 'n_sunsposts', 'f_107_adj', 'date'],
    datetime_format="%d-%b-%Y",
)
df_solar = df_solar.loc[START_DATE:END_DATE]

## Auroral Electrojet

In [None]:
# Auroral electrojet indices (IL, IU, IE), restricted to the study period.
df_ejet = read_time_series(
    Path(DATA_IN) / 'ImageIDX.csv',
    column_names=['il', 'iu', 'ie', 'datetime'],
)
df_ejet = df_ejet.loc[START_DATE:END_DATE]

In [None]:
# Resample with the median of every bin and keep one decimal of IE.
df_ejet_30 = resample_time_series(
    df_ejet,
    aggregation_function='median',
).assign(ie=lambda frame_: frame_['ie'].round(1))

We detected **anomalous IE values**

The reported values appear to be affected by instrumental **offsets** that have a time window of 1 or more consecutive days

We remove the offset by subtracting, over each affected interval (listed in `ill_dates`), half the difference between the jump at the start of the interval and the jump at its end, i.e. the average size of the two jumps.

In [None]:
# Step-to-step differences of IE, used to measure the jumps at the interval edges.
df_ejet_30['ie_diff'] = df_ejet_30['ie'].diff()

# [first affected day, first clean day] for every interval with an IE offset.
ill_dates = [
    ['2015-05-31', '2015-06-01'],
    ['2015-11-27', '2015-12-01'],
    ['2015-12-04', '2015-12-07'],
    ['2018-08-29', '2018-08-31'],
    ['2018-09-01', '2018-09-08'],
    ['2019-12-27', '2020-01-01'],
]

df_ejet_30['ie_fix'] = df_ejet_30['ie']
for start_, end_ in ill_dates:
    # First difference of the start day (jump into the offset) and of the
    # end day (jump out of it).
    jump_in_ = df_ejet_30.loc[start_, 'ie_diff'].values[0]
    jump_out_ = df_ejet_30.loc[end_, 'ie_diff'].values[0]
    offset_ = np.round((jump_in_ - jump_out_) / 2, 1)

    # The correction stops one 30-minute step before the end day begins.
    last_step_ = pd.to_datetime(end_) - pd.Timedelta(minutes=30)
    df_ejet_30.loc[start_:last_step_, 'ie_fix'] -= offset_

# The corrected IE is not allowed to go below zero.
df_ejet_30['ie_fix'] = df_ejet_30['ie_fix'].clip(lower=0)
df_ejet_30 = df_ejet_30.drop(columns=['ie_diff'])

Similarly for IU and IL

In [None]:
# Step-to-step differences of IU, used to measure the jumps at the interval edges.
df_ejet_30['iu_diff'] = df_ejet_30['iu'].diff()

# [first affected day, first clean day] for every interval with an IU offset.
ill_dates = [
    ['2015-05-31', '2015-06-01'],
    ['2015-11-27', '2015-12-01'],
    ['2015-12-04', '2015-12-07'],
    ['2019-12-27', '2020-01-01'],
]

df_ejet_30['iu_fix'] = df_ejet_30['iu']
for start_, end_ in ill_dates:
    # First difference of the start day (jump into the offset) and of the
    # end day (jump out of it).
    jump_in_ = df_ejet_30.loc[start_, 'iu_diff'].values[0]
    jump_out_ = df_ejet_30.loc[end_, 'iu_diff'].values[0]
    offset_ = np.round((jump_in_ - jump_out_) / 2, 1)

    # The correction stops one 30-minute step before the end day begins.
    last_step_ = pd.to_datetime(end_) - pd.Timedelta(minutes=30)
    df_ejet_30.loc[start_:last_step_, 'iu_fix'] -= offset_

# Unlike IE, the corrected IU is not clipped.
df_ejet_30 = df_ejet_30.drop(columns=['iu_diff'])

In [None]:
# Step-to-step differences of IL, used to measure the jumps at the interval edges.
df_ejet_30['il_diff'] = df_ejet_30['il'].diff()

# [first affected day, first clean day] for every interval with an IL offset.
ill_dates = [
    ['2018-08-29', '2018-08-31'],
    ['2018-09-01', '2018-09-08'],
]

df_ejet_30['il_fix'] = df_ejet_30['il']
for start_, end_ in ill_dates:
    # First difference of the start day (jump into the offset) and of the
    # end day (jump out of it).
    jump_in_ = df_ejet_30.loc[start_, 'il_diff'].values[0]
    jump_out_ = df_ejet_30.loc[end_, 'il_diff'].values[0]
    offset_ = np.round((jump_in_ - jump_out_) / 2, 1)

    # The correction stops one 30-minute step before the end day begins.
    last_step_ = pd.to_datetime(end_) - pd.Timedelta(minutes=30)
    df_ejet_30.loc[start_:last_step_, 'il_fix'] -= offset_

# Unlike IE, the corrected IL is not clipped.
df_ejet_30 = df_ejet_30.drop(columns=['il_diff'])

We also evaluate the imbalance between incoming (IL) and outgoing (IU) currents, namely $\rm{IO := IL + IU}$

In [None]:
# IO is the sum of the offset-corrected IL and IU.
df_ejet_30['io_fix'] = df_ejet_30['il_fix'].add(df_ejet_30['iu_fix'])

Finally, we discretise IE, IL and IU into categories according to their variation

In [None]:
# Length of the window handed to `get_categories`, in hours ...
hours = 6
# ... and in 30-minute steps (two steps per hour).
time_steps = hours * 2

In [None]:
# Category labels of the corrected IE over the `time_steps` window.
_, labels = get_categories(df_ejet_30['ie_fix'], window=time_steps)

# A leading 0 is prepended to the labels before storing them
# (presumably `get_categories` yields one label fewer than there are rows — TODO confirm).
df_ejet_30['ie_variation'] = np.insert(arr=labels, obj=0, values=0, axis=0)

In [None]:
# Category labels of the corrected IU over the `time_steps` window.
_, labels = get_categories(df_ejet_30['iu_fix'], window=time_steps)

# A leading 0 is prepended to the labels before storing them
# (presumably `get_categories` yields one label fewer than there are rows — TODO confirm).
df_ejet_30['iu_variation'] = np.insert(arr=labels, obj=0, values=0, axis=0)

In [None]:
# Category labels of the corrected IL over the `time_steps` window.
_, labels = get_categories(df_ejet_30['il_fix'], window=time_steps)

# A leading 0 is prepended to the labels before storing them
# (presumably `get_categories` yields one label fewer than there are rows — TODO confirm).
df_ejet_30['il_variation'] = np.insert(arr=labels, obj=0, values=0, axis=0)

Here we construct IE, IL and IU moving averages with several rolling windows (3, 6, 12 and 24 hours)

In [None]:
# n. of periods needed to get 3, 6, 12, 24 hours
periods = [2 * per_ for per_ in [3, 6, 12, 24]]

for per_ in periods:
    # IE moving average
    df_ejet_30[f'ie_mav_{per_/2:.0f}h'] = df_ejet_30['ie_fix'].rolling(
        window=int(per_)
    ).mean().round(1)
    
    # IU moving average
    df_ejet_30[f'iu_mav_{per_/2:.0f}h'] = df_ejet_30['iu_fix'].rolling(
        window=int(per_)
    ).mean().round(1)
    
    # IL moving average
    df_ejet_30[f'il_mav_{per_/2:.0f}h'] = df_ejet_30['il_fix'].rolling(
        window=int(per_)
    ).mean().round(1)
    
df_ejet_30 = df_ejet_30.drop(columns=['ie','il','iu']).dropna()

## Dataset creation

In [None]:
# Solar data are provided on a daily basis only, so they are aligned to the
# 30-minute index *before* merging: every timestamp receives the most recent
# daily record at or before it. Merging the daily frame directly attaches
# values only to rows whose timestamp equals the daily stamp; if that row has
# been removed (by the `.dropna()` on `df_ejet_30` or by the inner merges
# below), a later forward-fill would silently propagate the previous day's
# values over the whole day.
df_solar_30 = (
    df_solar[['n_sunsposts','f_107_adj']]
    .sort_index()  # `reindex(method=...)` requires a monotonic index
    .reindex(df_ejet_30.index, method='ffill')
)

# Electrojet features are the backbone of the dataset. TID quality, HF and
# solar data are optional (left joins); Hp30, SMR and L1 data are required
# (inner joins, the `merge` default, here made explicit).
df_j = df_ejet_30.merge(
    df_tid_30_['quality_index'],
    how='left',
    left_index=True,
    right_index=True,
).merge(
    df_hf_30,
    how='left',
    left_index=True,
    right_index=True,
).merge(
    df_solar_30,
    how='left',
    left_index=True,
    right_index=True,
).merge(
    df_hp_30,
    how='inner',
    left_index=True,
    right_index=True,
).merge(
    df_smr_30,
    how='inner',
    left_index=True,
    right_index=True,
).merge(
    df_l1_30,
    how='inner',
    left_index=True,
    right_index=True,
)

# Days whose solar record is itself missing still inherit the last known value
df_j[['n_sunsposts','f_107_adj']] = df_j[['n_sunsposts','f_107_adj']].ffill()

# Include month of the year and hour of the day (UTC)
# df_j['month'] = df_j.index.to_series().dt.month.astype(str).str.zfill(2)
# df_j['hour'] = df_j.index.to_series().dt.hour.astype(str).str.zfill(2)

# Solar zenith angle, computed by the project helper for every timestamp
# and rounded to one decimal
df_j['solar_zenith_angle'] = get_solar_position(
    df_j.index, columns='zenith', altitude=0,
).round(1)

In [None]:
# Every timestamp must appear exactly once in the joined dataset.
assert not df_j.reset_index().duplicated('datetime').any()

In [None]:
# px.line(
#     df_j.loc['2020-04','solar_zenith_angle']
# )

In [None]:
## df_tid_30 = df_tid_30.reset_index()
## df_tid_30['month'] = df_tid_30["datetime"].dt.month.astype(str).str.zfill(2)
## df_tid_30 = df_tid_30.set_index('datetime')
## 
## df_tid_30["occurrences"] = np.where(
##     df_tid_30["quality_index"].notna(),
##     1,
##     0,
## )

In [None]:
## df_j['solar_zenith_angle_cat'] = pd.qcut(
##     df_j['solar_zenith_angle'],
##     q=[0, 0.25, 0.5, 0.75, 1.0],
##     labels=["low", "m-l", "m-u", "upp"],
## )

In [None]:
## df_occ = df_tid_30[["occurrences", "month"]].merge(
##     df_j['solar_zenith_angle_cat'],
##     how="left",
##     left_index=True,
##     right_index=True,
## ).groupby(
##     ["month", "solar_zenith_angle_cat"],
##     as_index=False,
##     observed=False,
## )["occurrences"].sum()
## 
## df_occ["occurrences_norm"] = df_occ["occurrences"].div(df_occ["occurrences"].max())

In [None]:
## df_j = df_j.reset_index()
## df_j['month'] = df_j["datetime"].dt.month.astype(str).str.zfill(2)
## df_j = df_j.set_index('datetime')

In [None]:
## df_j = df_j.reset_index().merge(
##     df_occ,
##     how='left',
##     on=['month','solar_zenith_angle_cat'],
## ).drop(
##     columns=['solar_zenith_angle_cat','month','occurrences']
## ).set_index('datetime')

**Construct** the actual **targets**: one boolean column per forecast horizon (3, 6 and 12 hours), set to `True` whenever a **TID event** is reported **within that timeframe**

In [None]:
# Forecast horizons, in hours.
HOURS = [3, 6, 12]

# The look-ahead below counts rows, so it equals a time span only on a
# gap-free 30-minute index. `df_j` may have lost rows (the `.dropna()` on the
# electrojet frame and the inner merges), hence the flags are computed on a
# complete 30-minute grid and then mapped back onto the rows of `df_j`.
# Requires a unique `df_j` index (checked by the duplicate assertion above).
grid_ = pd.date_range(df_j.index.min(), df_j.index.max(), freq='30min')
assert df_j.index.isin(grid_).all(), 'df_j index is not aligned to a 30-minute grid'

# Timestamps missing from `df_j` become NaN, i.e. "no event reported".
quality_on_grid_ = df_j['quality_index'].reindex(grid_)

for hr_ in HOURS:
    steps = 2 * hr_

    # Rolling sum over [t - steps, t], shifted back by `steps`: the flag at
    # time t is True when a TID with positive quality index is reported
    # anywhere in [t, t + hr_ hours]. The trailing rows, whose look-ahead
    # window runs past the end of the data, are set to False.
    tid_ahead_ = quality_on_grid_.rolling(
        window=steps+1, min_periods=1
    ).sum().gt(0).shift(-steps, fill_value=False)

    df_j[f'tid_within_{hr_}h'] = tid_ahead_.reindex(df_j.index)

In [None]:
# Class imbalance
# Class imbalance: share (in %) of each value of the selected target
df_j[f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'].value_counts(normalize=True).mul(100)

In [None]:
# Good-quality events account for ~70% of the TID catalog
# Share (in %, one decimal) of catalog events with quality_index >= 0.5;
# the original author reports it as ~70% of the TID catalog
np.round(
    100 * df_tid['quality_index'].ge(0.5).sum() / len(df_tid),
    1,
)

In [None]:
# (linear) correlations between variables
# Pairwise (linear) correlations between variables; coefficients whose
# absolute value exceeds 0.4 are shown in blue, all the others in black
df_j.corr().style.map(
    lambda val: 'color: blue' if abs(val) > 0.4 else 'color: black'
)

In [None]:
# Time series of the IO imbalance over the year 2020
px.line(df_j.loc['2020', 'io_fix'])

In [None]:
# px.histogram(
#     df_j['ie_fix'].div(df_j['ie_mav_6h']).sample(frac=.6)
# )

In [None]:
# px.histogram(
#     df_j['ie_fix'].div(df_j['ie_mav_3h']).sample(frac=.6)
# )

### Note to self!

If a feature is uncorrelated with the target, that does not imply that a **non-linear** model (*e.g.* CatBoost) could not make good use of it

Pearson correlation only takes into account *linear* correlation between variables; there might be non-linear (polynomial, logarithmic etc.) relationships between variables. Since the Pearson correlations are low, it seems that the relationships in the dataset (if any) might be non-linear and complex

$$\left( \frac{\rm{IE_0}}{\rm{IE_{3h}}} \right) ^ {\rm{HF}}$$

In [None]:
# Correlation between (IE / 3h moving average of IE) ** HF and the selected target
df_j['ie_fix'].div(
    df_j['ie_mav_3h']
).pow(
    df_j['hf']
).corr(
    df_j[f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h']
)

In [None]:
# Distribution of (IE / 3h moving average of IE) ** HF on a 60% random sample.
# The sample is seeded so that the figure is identical on every run.
px.histogram(
    (
        df_j['ie_fix'].div(
            df_j['ie_mav_3h']
        ) ** df_j['hf']
    ).sample(frac=.6, random_state=42)
)

$$\left( \frac{\rm{IE_0 - IE_{3h}}}{\rm{IE_{3h}}} \right) ^ {\rm{HF}}$$

In [None]:
# Correlation between ((IE - 3h moving average) / 3h moving average) ** HF
# and the selected target.
# NOTE(review): the base is negative whenever IE is below its moving average,
# and a negative base raised to a non-integer power yields NaN — confirm this
# is acceptable for the comparison.
df_j['ie_fix'].sub(
    df_j['ie_mav_3h']
).div(
    df_j['ie_mav_3h']
).pow(
    df_j['hf']
).corr(
    df_j[f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h']
)

In [None]:
# Distribution of ((IE - 3h moving average) / 3h moving average) ** HF on a
# 60% random sample. The sample is seeded so that the figure is identical on
# every run.
px.histogram(
    (
        (df_j['ie_fix'] - df_j['ie_mav_3h']).div(
            df_j['ie_mav_3h']
        ) ** df_j['hf']
    ).sample(frac=.6, random_state=42)
)

In [None]:
# df_tid.loc['2014-02-03']

In [None]:
# df_j['solar_zenith_angle'].apply(np.real).describe()

## Dump

In [None]:
# The sunspot number and the raw TID quality index are left out of the
# exported dataset, which is written as a pickle in the output folder.
df_export_ = df_j.drop(columns=['n_sunsposts', 'quality_index'])
df_export_.to_pickle(Path(DATA_OUT) / 'df_dataset.pickle')