In [None]:
from datetime import datetime, timedelta, timezone

import pandas as pd
import numpy as np
import plotly.express as px

from src.backend.io import (
    get_techtide_hf,
    get_techtide_ionosondes,
    get_gfz_f107,
    get_gfz_hp30,
    get_noaa_l1,
    get_noaa_dst,
    get_fmi_iu_ie,
)
from src.backend.preprocess import (
    resample_time_series,
    get_categories,
    get_solar_position,
    get_moving_avg,
)

In [None]:
# Query window: the 6 hours ending "now", with both ends formatted as
# "YYYY-MM-DD HH:MM:SS" strings in UTC.
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
LOOKBACK = timedelta(hours=6)

# datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
# formats to the same wall-clock string.
_stop_utc = datetime.now(timezone.utc)

# The public names are bound once, directly as strings, instead of first
# holding a datetime and then being rebound to a str.
STOP_UTC_NOW = _stop_utc.strftime(TIME_FORMAT)
START_UTC = (_stop_utc - LOOKBACK).strftime(TIME_FORMAT)

# TechTIDE

#### HF-INT + 2-h moving average

In [None]:
# HF series for the query window, fetched through the backend TechTIDE helper.
df_hf = get_techtide_hf(
    start=START_UTC,
    stop=STOP_UTC_NOW,
)

# Resample with mean aggregation and keep two decimals.
# NOTE(review): the `_30` suffix presumably means a 30-minute grid — confirm
# against resample_time_series' default frequency.
df_hf_30 = resample_time_series(
    df_hf,
    aggregation_function='mean',
).round(2)

In [None]:
# Moving average of the 'hf' column with a window list of [2]; the section
# title calls this the 2-h moving average, so the window unit is presumably
# hours — TODO confirm in src.backend.preprocess.get_moving_avg.
# The result rebinds df_hf_30.
df_hf_30 = get_moving_avg(df_hf_30, ['hf'], [2])

#### Ionosondes

In [None]:
# Ionosonde station codes requested from TechTIDE.
iono_stations = ['AT138', 'FF051', 'JR055', 'PQ052', 'RO041', 'VT139']

# Raw ionosonde data over the same query window as the HF data.
df_iono = get_techtide_ionosondes(START_UTC, STOP_UTC_NOW, iono_list=iono_stations)

# Resample with median aggregation and keep two decimals.
df_iono_30 = resample_time_series(df_iono, aggregation_function='median').round(2)

# Solar Zenith Angle (pvlib)

In [None]:
# Zenith value at the most recent timestamp of the resampled HF frame,
# rounded to one decimal (shown as the cell output).
get_solar_position(df_hf_30.index[-1], columns='zenith', altitude=0).round(1)

# GFZ

#### Hp-30

In [None]:
# Hp30 index from GFZ via the backend helper, called without time bounds.
# It is merged into the dataset as-is (no resample step in this notebook).
df_hp_30 = get_gfz_hp30()

#### F10.7 (adj)

In [None]:
# Most recent non-missing F10.7 record (shown as the cell output).
get_gfz_f107().dropna().iloc[-1:]

# NOAA

#### L1 data + Newell coupling, defined by
$$v^{4/3} \left(B_y^2 + B_z^2\right)^{1/3} \sin^{8/3}\left(\frac{1}{2}\arctan\left(\left| \frac{B_y}{B_z} \right|\right)\right)$$

In [None]:
# df_l1 = get_noaa_l1(end_propagated_datetime=STOP_UTC_NOW)

In [None]:
import requests

# Distances used for the ballistic propagation below.
# NOTE(review): presumably kilometres (Earth–L1 and Earth–bow-shock nose),
# matching a solar-wind speed in km/s — TODO confirm units.
L1_DIST = 1_500_000
BSN_DIST = 90_000

# NOAA SWPC real-time solar wind products (6-hour windows).
SWPC_MAG_URL = 'https://services.swpc.noaa.gov/products/solar-wind/mag-6-hour.json'
SWPC_PLASMA_URL = 'https://services.swpc.noaa.gov/products/solar-wind/plasma-6-hour.json'
# Upper bound (seconds) on each HTTP request so the notebook cannot hang forever.
SWPC_TIMEOUT_S = 30


def _fetch_swpc_table(url: str) -> pd.DataFrame:
    """Download one SWPC JSON product (header row followed by data rows) as a DataFrame.

    Raises:
        RuntimeError: on network failure, non-2xx status, invalid JSON or an
            empty payload. The original exception is chained as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=SWPC_TIMEOUT_S)
        response.raise_for_status()
        payload = response.json()  # parsed once, reused for header and rows
    except (requests.RequestException, ValueError) as exc:
        # `response` may not exist here (e.g. connection error), so the message
        # is built from the exception itself rather than from the response.
        raise RuntimeError(
            f'Error in retrieving solar wind data from {url}: {exc}'
        ) from exc

    if not isinstance(payload, list) or not payload:
        raise RuntimeError(
            f'Error in retrieving solar wind data from {url}: empty or unexpected payload'
        )

    return pd.DataFrame(payload[1:], columns=payload[0])


def real_time_solar_wind(end_propagated_datetime: str) -> pd.DataFrame:
    """Fetch SWPC real-time solar wind data and time-shift it from L1.

    The magnetometer and plasma products are outer-joined on ``time_tag``,
    converted to numeric, and each sample is delayed by
    ``(L1_DIST - BSN_DIST) / speed`` seconds. A Newell-style coupling value is
    computed per sample and rounded to one decimal.

    Args:
        end_propagated_datetime: upper bound (exclusive) on the propagated
            arrival time; anything ``pd.Timestamp`` can parse.

    Returns:
        DataFrame indexed by propagated ``datetime`` with columns
        ``by``, ``bz``, ``density``, ``speed`` and ``newell``. Rows whose
        arrival time cannot be computed (missing speed) are not returned.

    Raises:
        RuntimeError: if either SWPC product cannot be retrieved or parsed.
    """
    df_mag = _fetch_swpc_table(SWPC_MAG_URL)
    df_plasma = _fetch_swpc_table(SWPC_PLASMA_URL)

    df = df_mag.merge(df_plasma, on='time_tag', how='outer')
    df.index = pd.Index(pd.to_datetime(df.pop('time_tag')), name='datetime_measure')
    # Values arrive as strings; convert every column, then expose the
    # measurement time as a regular column for the arithmetic below.
    df = df.apply(pd.to_numeric).reset_index()

    # bx_gsm / by_gsm / ... -> bx / by / ...
    df.columns = df.columns.str.removesuffix('_gsm')

    # Travel time from L1 to the bow-shock nose at the measured speed.
    df['seconds_to_arrive'] = np.round((L1_DIST - BSN_DIST) / df['speed'])
    df['datetime'] = df['datetime_measure'] + pd.to_timedelta(df['seconds_to_arrive'], unit='s')

    # (by^2 + bz^2)^(1/3) equals B_T^(2/3) with B_T = sqrt(by^2 + bz^2).
    # NOTE(review): arctan(|by/bz|) confines the angle to [0, pi/2], so the sign
    # of bz does not enter; the usual IMF clock angle is arctan2(by, bz).
    # Kept as-is because the notebook text defines it this way — confirm intent.
    df["newell"] = (
        df["speed"] ** (4 / 3)
        * (df["by"] ** 2 + df["bz"] ** 2) ** (1 / 3)
        * (np.sin(np.arctan((df["by"].div(df["bz"]).abs())) / 2) ** (8 / 3))
    ).round(1)

    cutoff = pd.Timestamp(end_propagated_datetime)
    arrived_before_cutoff = df['datetime'].lt(cutoff)  # NaT compares False and is dropped

    return df[arrived_before_cutoff].drop(
        columns=['datetime_measure', 'seconds_to_arrive', 'lon', 'lat', 'temperature', 'bx', 'bt']
    ).set_index('datetime')

In [None]:
# L1 solar wind from the notebook-local fetcher; only samples whose propagated
# arrival time is earlier than STOP_UTC_NOW are kept.
df_l1 = real_time_solar_wind(end_propagated_datetime=STOP_UTC_NOW)

In [None]:
# Resample the propagated L1 frame with median aggregation (no rounding here).
df_l1_30 = resample_time_series(df_l1, aggregation_function='median')

#### Dst

In [None]:
# Dst from NOAA via the backend helper, bounded above by the end of the query window.
df_dst = get_noaa_dst(end_datetime=STOP_UTC_NOW)

In [None]:
# Resample with median aggregation, then forward-fill the gaps the resampling
# leaves between consecutive Dst values.
df_dst_30 = resample_time_series(
    df_dst,
    aggregation_function='median',
).ffill()

# FMI

#### IU & IE + moving avgs + variations

In [None]:
# FMI columns that receive moving averages and variation labels in the cells below.
fmi_cols = ['ie', 'iu']

In [None]:
# FMI IU/IE series via the backend helper, called without explicit time bounds.
df_fmi = get_fmi_iu_ie()

In [None]:
# Resample with median aggregation and keep two decimals.
df_fmi_30 = resample_time_series(
    df_fmi,
    aggregation_function='median',
).round(2)

In [None]:
# Moving averages of the FMI columns for two windows, [3, 12].
# NOTE(review): window units are not visible here (presumably hours, as in the
# HF section) — confirm in src.backend.preprocess.get_moving_avg.
# The result rebinds df_fmi_30.
df_fmi_30 = get_moving_avg(df_fmi_30, fmi_cols, [3, 12])

In [None]:
# The get_categories window is expressed as 2 * hours.
# NOTE(review): presumably two samples per hour on a 30-minute grid — confirm.
hours = 6
for col_ in fmi_cols:
    variation_col = f'{col_}_variation'

    # Only the second element returned by get_categories is used.
    _, labels = get_categories(df_fmi_30[col_], window=2*hours, zero_phase=False)

    # Prepend a single 0 so the label array gains one leading entry before
    # being stored as a column.
    padded_labels = np.insert(labels, 0, 0, axis=0)
    df_fmi_30[variation_col] = padded_labels

## Dataset creation

In [None]:
# Frames joined onto the HF frame, in this order. The L1 frame contributes
# everything except its 'by' and 'speed' columns.
frames_to_join = (
    df_iono_30,
    df_hp_30,
    df_l1_30.drop(columns=['by','speed']),
    df_dst_30,
    df_fmi_30,
)

# Successive outer joins on the index keep every timestamp present in any source.
df_j = df_hf_30
for frame in frames_to_join:
    df_j = df_j.merge(frame, how='outer', left_index=True, right_index=True)

# Dst and F10.7 are published at a coarser (hourly/daily) cadence, so their
# values are repeated: Dst is forward-filled across the merged index, and the
# latest non-missing F10.7 value is broadcast to every row.
df_j['dst'] = df_j['dst'].ffill()
latest_f107 = get_gfz_f107().dropna().tail(1)
df_j['f_107_adj'] = latest_f107.values[0,0]

# Solar zenith angle for every timestamp of the merged frame, one decimal.
df_j['solar_zenith_angle'] = get_solar_position(df_j.index, columns='zenith', altitude=0).round(1)

In [None]:
# Show the most recent row transposed, so each column is listed as a row.
df_j.tail(1).T

In [None]:
# Sanity check: no timestamp may appear more than once after the outer joins.
assert not df_j.reset_index().duplicated('datetime').any()

[Estimating Standard Performance Metrics for Binary Classification (NannyML)](https://nannyml.readthedocs.io/en/latest/tutorials/performance_estimation/binary_performance_estimation/standard_metric_estimation.html)