In [None]:
from datetime import datetime, timedelta, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

from src.io import (
    get_techtide_hf,
    get_techtide_ionosondes,
    get_gfz_f107,
    get_gfz_hp30,
    get_fmi_iu_ie,
)
from src.preprocess import (
    resample_time_series,
    get_categories,
    get_solar_position,
)

In [None]:
# Query window shared by the data requests below: the last 6 hours up to "now" in UTC.
# `datetime.utcnow()` is deprecated since Python 3.12, so an aware UTC timestamp is
# taken instead; the formatted strings are identical because the format has no tz field.
window_stop = datetime.now(timezone.utc)
window_start = window_stop - timedelta(hours=6)

# The bounds are passed on as plain 'YYYY-MM-DD HH:MM:SS' strings. Separate names are
# used for the datetime objects so that each constant only ever holds one type.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
STOP_UTC_NOW = window_stop.strftime(TIMESTAMP_FORMAT)
START_UTC = window_start.strftime(TIMESTAMP_FORMAT)

# TechTIDE

#### HF-INT + 2-h moving average

In [None]:
# Raw HF series for the query window.
df_hf = get_techtide_hf(start=START_UTC, stop=STOP_UTC_NOW)

# Resample with the mean as aggregation, then keep two decimals.
hf_resampled = resample_time_series(df_hf, aggregation_function='mean')
df_hf_30 = hf_resampled.round(2)

In [None]:
# Rolling-window length in rows: 2 rows per hour times 2 hours
# (assumes df_hf_30 is sampled every 30 minutes - TODO confirm).
period = 2 * 2

# Column name encodes the window length in hours, i.e. 'hf_mav_2h'.
mav_column = f'hf_mav_{period/2:.0f}h'
hf_rolling = df_hf_30['hf'].rolling(window=int(period))
df_hf_30[mav_column] = hf_rolling.mean().round(2)

#### Ionosondes

In [None]:
# Ionosonde station codes requested from TechTIDE.
iono_stations = ['AT138', 'FF051', 'JR055', 'PQ052', 'RO041', 'VT139']

df_iono = get_techtide_ionosondes(START_UTC, STOP_UTC_NOW, iono_list=iono_stations)

# Resample with the median as aggregation, then keep two decimals.
iono_resampled = resample_time_series(df_iono, aggregation_function='median')
df_iono_30 = iono_resampled.round(2)

# Solar Zenith Angle (pvlib)

In [None]:
# Zenith angle at the most recent HF timestamp; the value is displayed as the cell output.
latest_timestamp = df_hf_30.index[-1]
get_solar_position(latest_timestamp, columns='zenith', altitude=0).round(1)

# GFZ

#### Hp-30

In [None]:
# NOTE(review): the start is a fixed date, unlike the other requests which use START_UTC
# - confirm whether the longer history is intentional.
hp30_start = '2024-10-15 23:00:00'
df_hp_30 = get_gfz_hp30(start=hp30_start, stop=STOP_UTC_NOW)

#### F10.7 (adj)

In [None]:
# The result is only displayed as the cell output; it is not bound to any name.
# NOTE(review): the dataset-creation cell reads df_solar['f_107_adj'], but no visible
# cell assigns df_solar - presumably this result should be stored; confirm.
get_gfz_f107()

# NOAA

#### L1 data (propagated)

The available history is limited; it is better to rely on the internal DB

In [None]:
cols = ['propagated_time_tag', 'density', 'bz', 'vx']

# The payload is read as a table whose first row carries the column names.
l1_raw = pd.read_json(
    'https://services.swpc.noaa.gov/products/geospace/propagated-solar-wind-1-hour.json',
    convert_dates=False,
)
l1_raw.columns = l1_raw.iloc[0]

# Drop the header row and keep only the columns of interest.
df_l1 = l1_raw.iloc[1:][cols].reset_index(drop=True)

# Time tags become datetimes, every other column becomes numeric.
for col_ in cols:
    parser = pd.to_datetime if 'time_' in col_ else pd.to_numeric
    df_l1[col_] = parser(df_l1[col_])

df_l1 = df_l1.rename(
    columns={'propagated_time_tag': 'datetime', 'density': 'rho'}
)

# Keep only rows whose propagated time tag is strictly before the end of the window.
before_stop = df_l1['datetime'].lt(STOP_UTC_NOW)
df_l1 = df_l1[before_stop].set_index('datetime')

In [None]:
# Resample the propagated L1 series, aggregating with the median.
df_l1_30 = resample_time_series(df_l1, aggregation_function='median')

# FMI

#### IU & IL

In [None]:
# Fetch the FMI series; called without arguments, so no explicit time window is passed.
df_fmi = get_fmi_iu_ie()

In [None]:
# Resample with the median as aggregation, then keep two decimals.
fmi_resampled = resample_time_series(df_fmi, aggregation_function='median')
df_fmi_30 = fmi_resampled.round(2)

# Newell (coupling) - TODO

In [None]:
# NOTE(review): read_time_series, DATA_IN, START_DATE and END_DATE are not imported or
# defined in the visible cells - this cell cannot run on a fresh kernel until they are.
newell_csv = Path(DATA_IN, 'newell.csv')
newell_full = read_time_series(newell_csv, column_names=['datetime', 'newell'])

# Restrict to the requested date range (label-based slice on the index).
df_newell = newell_full.loc[START_DATE:END_DATE]

In [None]:
# Resample the Newell coupling series, aggregating with the median.
df_newell_30 = resample_time_series(df_newell, aggregation_function='median')

# SMR (SuperMAG partial ring current index) - TODO

In [None]:
# NOTE(review): read_time_series, DATA_IN, START_DATE and END_DATE are not imported or
# defined in the visible cells - this cell cannot run on a fresh kernel until they are.
smr_csv = Path(DATA_IN, 'SMR.csv')
smr_full = read_time_series(smr_csv, column_names=['datetime', 'smr'])

# Restrict to the requested date range (label-based slice on the index).
df_smr = smr_full.loc[START_DATE:END_DATE]

In [None]:
# Resample the SMR series, aggregating with the median.
df_smr_30 = resample_time_series(df_smr, aggregation_function='median')

# Auroral Electrojet - TODO

We discretise IE, IL and IU into categories according to their variation

In [None]:
# Window used for the variation categories: 6 hours, at 2 time steps per hour.
hours = 6
time_steps = hours * 2

In [None]:
# NOTE(review): df_ejet_30 is never created in the visible cells - confirm its source.
ie_series = df_ejet_30['ie_fix']

# Only the second returned value (the labels) is used.
_, labels = get_categories(ie_series, window=time_steps, zero_phase=False)

# A 0 is prepended to the labels before they are stored as a column.
df_ejet_30['ie_variation'] = np.insert(labels, 0, 0, axis=0)

In [None]:
# NOTE(review): df_ejet_30 is never created in the visible cells - confirm its source.
iu_series = df_ejet_30['iu_fix']

# Only the second returned value (the labels) is used.
_, labels = get_categories(iu_series, window=time_steps, zero_phase=False)

# A 0 is prepended to the labels before they are stored as a column.
df_ejet_30['iu_variation'] = np.insert(labels, 0, 0, axis=0)

In [None]:
# NOTE(review): df_ejet_30 is never created in the visible cells - confirm its source.
il_series = df_ejet_30['il_fix']

# Only the second returned value (the labels) is used.
_, labels = get_categories(il_series, window=time_steps, zero_phase=False)

# A 0 is prepended to the labels before they are stored as a column.
df_ejet_30['il_variation'] = np.insert(labels, 0, 0, axis=0)

Here we construct IE, IL and IU moving averages with several rolling windows (3, 6, 12 and 24 hours)

In [None]:
# Rolling-window lengths in rows: 2 rows per hour for 3, 6, 12 and 24 hours.
periods = [2 * per_ for per_ in [3, 6, 12, 24]]

for per_ in periods:
    # Suffix encodes the window length in hours, e.g. '3h'.
    hours_label = f'{per_/2:.0f}h'

    # Same moving average for each index; the tuple order fixes the column order.
    for index_ in ('ie', 'iu', 'il'):
        df_ejet_30[f'{index_}_mav_{hours_label}'] = (
            df_ejet_30[f'{index_}_fix']
            .rolling(window=int(per_))
            .mean()
            .round(1)
        )

# Remove the ie/il/iu columns, then every row that still holds a NaN.
df_ejet_30 = df_ejet_30.drop(columns=['ie', 'il', 'iu']).dropna()

## Dataset creation

In [None]:
# Every merge below aligns the two sides on their index.
on_index = {'left_index': True, 'right_index': True}

# NOTE(review): df_ejet_30, df_tid_30_, df_solar and df_ionosondes_30 are never assigned
# in the visible cells (the ionosonde frame built earlier is named df_iono_30) - confirm
# the intended inputs before running this cell.
df_j = df_ejet_30

# Left merges: rows of the left frame are kept even without a match on the right.
for optional_ in (df_tid_30_['quality_index'], df_hf_30, df_solar['f_107_adj']):
    df_j = df_j.merge(optional_, how='left', **on_index)

# Merges with the default `how` (inner): only index values present on both sides survive.
for required_ in (df_hp_30, df_smr_30, df_l1_30, df_newell_30, df_ionosondes_30):
    df_j = df_j.merge(required_, **on_index)

# Solar data need to be repeated, since they're provided on a daily basis only
df_j['f_107_adj'] = df_j['f_107_adj'].ffill()

# Solar zenith angle for every timestamp of the merged index, rounded to one decimal
zenith = get_solar_position(df_j.index, columns='zenith', altitude=0)
df_j['solar_zenith_angle'] = zenith.round(1)

In [None]:
# Sanity check: no timestamp may appear more than once after the merges.
n_duplicated = df_j.reset_index().duplicated('datetime').sum()
assert n_duplicated == 0

Construct the actual **target**: a binary column which is set to 1 whenever a **TID event** is reported **within a 3-hour timeframe**

In [None]:
# Forecast horizon in rows: 2 rows per hour (assumes 30-minute sampling - TODO confirm).
# NOTE(review): FORECAST_HOURS_IN_ADVANCE is not defined in the visible cells.
steps = 2 * FORECAST_HOURS_IN_ADVANCE
target_column = f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'

# True wherever the trailing window of `steps + 1` rows has a positive quality_index sum;
# min_periods=1 lets the first, shorter windows produce a value as well.
event_in_window = df_j['quality_index'].rolling(
    window=steps + 1, min_periods=1
).sum().gt(0)

# Shifting back by `steps` turns the trailing window into a look-ahead one: row t now
# covers rows t..t+steps. The last `steps` rows have no such window and are set to False.
# The boolean flag is cast explicitly to 0/1 integers: `.replace({True: 1, False: 0})`
# depends on implicit downcasting that pandas 2.2 deprecates.
df_j[target_column] = event_in_window.shift(
    -steps, fill_value=False
).astype('int64')

In [None]:
# df_j['solar_zenith_angle'].apply(np.real).describe()

## Dump

In [None]:
# NOTE(review): DATA_OUT is not defined in the visible cells - confirm where it is set.
dataset_path = Path(DATA_OUT, 'df_dataset.pickle')

# quality_index is dropped before the frame is written to disk.
df_j.drop(columns=['quality_index']).to_pickle(dataset_path)