In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import cauchy, kstest, normaltest, shapiro

from var import DATA_IN, DATA_OUT, START_DATE, END_DATE
from src.io import read_time_series
from src.preprocess import resample_time_series

## TIDs catalog

In [None]:
# Column layout of the TID catalog file; the timestamp is the last column.
tid_columns = [
    'duration',
    'period',
    'amplitude',
    'spectral_contribution',
    'velocity',
    'azimuth',
    'quality_index',
    'datetime',
]

# NOTE(review): unlike the other sources, this frame is not sliced to
# START_DATE:END_DATE — confirm that is intentional.
df_tid = read_time_series(
    Path(DATA_IN) / 'TID_catalog.csv',
    column_names=tid_columns,
)

In [None]:
# Resample the TID catalog with the project helper, aggregating with 'max'.
# (presumably onto a 30-minute grid, judging by the `_30` suffix — TODO confirm)
df_tid_30 = resample_time_series(
    df_tid,
    aggregation_function='max',
)

## HF-EU index

In [None]:
# HF-EU index, restricted to the configured analysis period.
hf_path = Path(DATA_IN) / 'HF_EU_IDX.csv'
df_hf = read_time_series(
    hf_path,
    column_names=['datetime', 'hf'],
).loc[START_DATE:END_DATE]

In [None]:
# Resample the HF index with 'mean' aggregation, then round it to 2 decimals.
hf_resampled = resample_time_series(df_hf, aggregation_function='mean')
df_hf_30 = hf_resampled.assign(hf=hf_resampled['hf'].round(2))

## Solar Data

In [None]:
# NOTE(review): 'n_sunsposts' looks like a typo for 'n_sunspots'; it is kept
# as-is because the column name may be referenced elsewhere.
solar_columns = [
    'day_of_rotation',
    'n_sunsposts',
    'f_107_adj',
    'date',
]

# Solar data use a day-month-year date format (e.g. "01-Jan-2020") and are
# restricted to the configured analysis period.
df_solar = read_time_series(
    Path(DATA_IN) / 'solar_data.csv',
    column_names=solar_columns,
    datetime_format="%d-%b-%Y",
).loc[START_DATE:END_DATE]

## Auroral Electrojet

In [None]:
# Auroral electrojet index: only file columns 2 and 3 are read, named
# 'ie' and 'datetime', then restricted to the configured analysis period.
ejet_path = Path(DATA_IN) / 'ImageIDX.csv'
df_ejet = read_time_series(
    ejet_path,
    column_names=['ie', 'datetime'],
    usecols=[2, 3],
).loc[START_DATE:END_DATE]

In [None]:
# Resample the electrojet index with 'mean' aggregation, then round to 1 decimal.
ejet_resampled = resample_time_series(df_ejet, aggregation_function='mean')
df_ejet_30 = ejet_resampled.assign(ie=ejet_resampled['ie'].round(1))

## EDA

In [None]:
# px.histogram(df_ejet_30['ie'])

In [None]:
# px.histogram(df_tid['quality_index'])

In [None]:
# px.histogram(df_tid['duration'])

In [None]:
# px.bar(
#     data_frame=df_solar.groupby(
#         df_solar['date'].dt.year
#     )['f_107_adj'].mean().reset_index(),
#     x='date',
#     y='f_107_adj',
# )

In [None]:
# px.histogram(df_solar['f_107_adj'])

## Time series analysis

Distribuzione dei tempi fra un TID e il successivo

In [None]:
# Lag (in catalog rows) used for both differencing passes.
periods = 16

# Second-order lagged difference of the TID timestamps, expressed in hours.
#
# `dt.total_seconds()` already covers the whole timedelta (days included), so
# it must not be added to `24 * dt.days`: doing so counts the day component
# twice. The NaNs produced by the two `diff` calls are dropped once, at the
# end, so that the resulting series really is NaN-free.
#
# NOTE(review): the accompanying text speaks of the time between one TID and
# the next, whereas this is a second-order difference with lag 16 — confirm
# that this is the intended quantity.
second_order_diff = (
    df_tid.index.to_series()
    .diff(periods)
    .diff(periods)
    .dt.total_seconds()
    .div(3600)
    .dropna()
)

In [None]:
# Distribution of the lagged second-order differences of the TID timestamps.
px.histogram(second_order_diff)

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) of the series.
second_order_diff.describe()

In [None]:
# Shape of the distribution: (kurtosis, skewness).
shape_stats = (second_order_diff.kurtosis(), second_order_diff.skew())
shape_stats

In [None]:
# Shapiro-Wilk normality test on the NaN-free series.
# (`shapiro`, `normaltest` and `kstest` are imported in the top import cell.)
shapiro(second_order_diff.dropna())

In [None]:
# D'Agostino-Pearson omnibus normality test on the NaN-free series.
diff_without_nan = second_order_diff.dropna()
normaltest(diff_without_nan)

In [None]:
# Kolmogorov-Smirnov test against a Cauchy-Lorentz distribution.
#
# Without `args`, `kstest(..., 'cauchy')` compares the sample with the
# *standard* Cauchy (loc=0, scale=1), which data expressed in hours have no
# reason to match. The location and scale are therefore fitted first, so the
# test is about the Cauchy family rather than one arbitrary member of it.
ks_sample = second_order_diff.dropna()
cauchy_loc, cauchy_scale = cauchy.fit(ks_sample)

# NOTE: the parameters are estimated from the very sample being tested, so the
# reported p-value is optimistic; treat it as indicative only.
ks_statistic, ks_p_value = kstest(
    ks_sample,
    'cauchy',
    args=(cauchy_loc, cauchy_scale),
)

print(f'KS Statistic: {ks_statistic}')
print(f'P-value: {ks_p_value}')

if ks_p_value < 0.05:
    print('La serie non segue la distribuzione di Cauchy-Lorentz')
else:
    print('La serie segue la distribuzione di Cauchy-Lorentz')

## Dataset creation

In [None]:
# Rolling-window lengths in rows: two rows per hour, for 1, 2, 3, 6, 12 and
# 24 hours (assumes `df_ejet_30` is sampled every 30 minutes — TODO confirm).
periods = [2 * n_hours for n_hours in (1, 2, 3, 6, 12, 24)]

for per_ in periods:
    # `per_/2` is a float, so the columns are named 'ie_mav_1.0h', 'ie_mav_2.0h', ...
    mav_column = f'ie_mav_{per_/2}h'
    moving_average = df_ejet_30['ie'].rolling(window=int(per_)).mean()
    df_ejet_30[mav_column] = moving_average.round(1)

# Drop the rows for which a rolling window is still incomplete.
# NOTE(review): this cell is not idempotent — each re-run recomputes the
# windows on the already-trimmed frame and drops further leading rows.
df_ejet_30 = df_ejet_30.dropna()

In [None]:
# Both joins are index-on-index left joins onto the electrojet frame, so its
# rows are all kept and the other sources are NaN where they have no match.
index_left_join = dict(how='left', left_index=True, right_index=True)

df_j = (
    df_ejet_30
    .merge(df_tid_30['quality_index'], **index_left_join)
    .merge(df_hf_30, **index_left_join)
)

In [None]:
# Calendar features taken from the index, stored as zero-padded 2-character
# strings ('01'..'12' for the month, '00'..'23' for the hour).
index_timestamps = df_j.index.to_series()
df_j['month'] = index_timestamps.dt.month.astype(str).str.zfill(2)
df_j['hour'] = index_timestamps.dt.hour.astype(str).str.zfill(2)

In [None]:
hours = 3
steps = 2 * hours  # two rows per hour

# The trailing window of `steps + 1` rows ending at row t covers rows
# [t - steps, t]; it flags whether the quality-index sum over them is positive.
tid_in_trailing_window = df_j['quality_index'].rolling(
    window=steps + 1,
    min_periods=1,
).sum() > 0

# Shifting by -steps turns the trailing window into a forward-looking one:
# row t now describes rows [t, t + steps]. The last `steps` rows have no
# complete look-ahead and are filled with False.
df_j[f'tid_within_{hours}h'] = tid_in_trailing_window.shift(
    periods=-steps,
    fill_value=False,
)

In [None]:
# Class balance of the target, in percent.
df_j['tid_within_3h'].value_counts(normalize=True).mul(100)

In [None]:
# Rows of the joined frame with quality_index >= 0.5, as a percentage of the
# number of rows in the raw TID catalog.
n_quality_rows = df_j['quality_index'].ge(0.5).sum()
np.round(100 * n_quality_rows / df_tid.shape[0], 1)

In [None]:
def color(val, threshold=0.4):
    """Return the CSS text colour used to highlight a single cell value.

    Parameters
    ----------
    val : float
        Value of one cell (e.g. a correlation coefficient).
    threshold : float, default 0.4
        Absolute value strictly above which the cell is highlighted.

    Returns
    -------
    str
        ``'color: blue'`` when ``abs(val) > threshold``, otherwise
        ``'color: black'``. NaN compares as False and therefore stays black.
    """
    # A dedicated local name avoids shadowing the function's own name.
    text_color = 'blue' if abs(val) > threshold else 'black'
    return f'color: {text_color}'

# Pairwise correlation matrix, with each cell coloured by `color`.
corr_matrix = df_j.corr()
corr_matrix.style.map(color)

In [None]:
# Random peek at the HF index next to the raw and smoothed electrojet index.
preview_columns = ['hf', 'ie', 'ie_mav_6.0h', 'ie_mav_24.0h']
df_j[preview_columns].sample(n=5)

In [None]:
# Electrojet index relative to its 6-hour moving average.
ie_over_mav_6h = df_j['ie'] / df_j['ie_mav_6.0h']
px.histogram(ie_over_mav_6h)

In [None]:
# Electrojet index relative to its 3-hour moving average.
ie_over_mav_3h = df_j['ie'] / df_j['ie_mav_3.0h']
px.histogram(ie_over_mav_3h)

### Note to self

If a feature is uncorrelated with the target, that does not imply that a **non-linear** model (*e.g.* CatBoost) could not make good use of that feature.

The Pearson correlation only captures *linear* relationships between variables; there may still be non-linear (polynomial, logarithmic, etc.) relationships between them. Since the Pearson correlations here are low, the relationships in the dataset (if any) are likely to be non-linear and complex.

In [None]:
# Correlation between the target and (ie / 3h moving average) raised to hf.
ie_ratio_pow_hf = df_j['ie'].div(df_j['ie_mav_3.0h']).pow(df_j['hf'])
ie_ratio_pow_hf.corr(df_j['tid_within_3h'])

In [None]:
# Distribution of (ie / 3h moving average) raised to hf.
ie_ratio_pow_hf_values = df_j['ie'].div(df_j['ie_mav_3.0h']).pow(df_j['hf'])
px.histogram(ie_ratio_pow_hf_values)

In [None]:
# Correlation between the target and the relative deviation of ie from its
# 3-hour moving average, raised to hf.
# NOTE(review): the base is negative whenever ie is below its moving average;
# a negative base with a non-integer exponent gives NaN in float arithmetic —
# confirm that excluding those rows is intended.
ie_mav_3h = df_j['ie_mav_3.0h']
ie_reldev_pow_hf = df_j['ie'].sub(ie_mav_3h).div(ie_mav_3h).pow(df_j['hf'])
ie_reldev_pow_hf.corr(df_j['tid_within_3h'])

In [None]:
# Distribution of the relative deviation of ie from its 3-hour moving
# average, raised to hf.
# NOTE(review): a negative base with a non-integer exponent gives NaN in float
# arithmetic, so rows with ie below its moving average may be missing from the
# plot — confirm that this is intended.
mav_3h = df_j['ie_mav_3.0h']
ie_reldev_pow_hf_values = df_j['ie'].sub(mav_3h).div(mav_3h).pow(df_j['hf'])
px.histogram(ie_reldev_pow_hf_values)

In [None]:
# df_j[df_j['tid_within_3h'].eq(True)].head(7)