In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import cauchy, kstest, normaltest, shapiro

from var import DATA_IN, DATA_OUT, START_DATE, END_DATE

## TIDs catalog

In [None]:
# Load the TID catalogue. header=0 together with `names` discards the file's
# own header row and uses the explicit column names below instead.
# The 'datetime' text column is parsed with a fixed day-month-year format.
df_tid = pd.read_csv(
    Path(DATA_IN) / 'TID_catalog.csv',
    header=0,
    names=[
        'duration', 'period', 'amplitude', 'spectral_contribution',
        'velocity', 'azimuth', 'quality_index', 'datetime',
    ],
).assign(
    datetime=lambda frame: pd.to_datetime(
        frame['datetime'], format='%d-%b-%Y %H:%M:%S'
    )
)

## Solar Data

In [None]:
# Load the solar data, parse the 'date' column and keep only the rows
# strictly after START_DATE (rows equal to START_DATE are excluded by `gt`).
# NOTE(review): 'n_sunsposts' looks like a typo for 'n_sunspots'; the name is
# kept as-is because other cells may reference the column by this spelling.
df_solar = (
    pd.read_csv(
        Path(DATA_IN) / 'solar_data.csv',
        header=0,
        names=['day_of_rotation', 'n_sunsposts', 'f_107_adj', 'date'],
    )
    .assign(
        date=lambda frame: pd.to_datetime(frame['date'], format='%d-%b-%Y')
    )
    .loc[lambda frame: frame['date'].gt(START_DATE)]
    .reset_index(drop=True)
)

## Auroral Electrojet

In [None]:
# Load the electrojet index file: only the columns at positions 2 and 3 are
# read and they are renamed to 'ie' and 'datetime'. The timestamp is parsed
# and only rows strictly after START_DATE are kept.
df_ejet = (
    pd.read_csv(
        Path(DATA_IN) / 'ImageIDX.csv',
        usecols=[2, 3],
        header=0,
        names=['ie', 'datetime'],
    )
    .assign(
        datetime=lambda frame: pd.to_datetime(
            frame['datetime'], format='%d-%b-%Y %H:%M:%S'
        )
    )
    .loc[lambda frame: frame['datetime'].gt(START_DATE)]
    .reset_index(drop=True)
)

In [None]:
# Average the electrojet index over 30-minute bins.
# '30min' is used instead of the deprecated 'T' offset alias; both denote
# the same 30-minute frequency.
df_ejet_30 = df_ejet.resample('30min', on='datetime').mean().reset_index()

## EDA

In [None]:
# Why do we cap at 1500?
# NOTE(review): 0.99327 is presumably the quantile level at which 'ie' is
# about 1500 - confirm and document where the threshold comes from.
df_ejet_30['ie'].quantile(q=0.99327)

In [None]:
# Histogram of the 30-minute mean 'ie' values.
px.histogram(data_frame=df_ejet_30['ie'])

In [None]:
# Histogram of the 'quality_index' column of the TID catalogue.
px.histogram(data_frame=df_tid['quality_index'])

In [None]:
# Histogram of the 'duration' column of the TID catalogue.
px.histogram(data_frame=df_tid['duration'])

In [None]:
# Bar chart of the mean 'f_107_adj' per calendar year. Grouping by the year
# of 'date' yields an index named 'date', which reset_index() turns back
# into the 'date' column used on the x axis.
px.bar(
    data_frame=(
        df_solar['f_107_adj']
        .groupby(df_solar['date'].dt.year)
        .mean()
        .reset_index()
    ),
    x='date',
    y='f_107_adj',
)

In [None]:
# Histogram of the 'f_107_adj' column of the solar data.
px.histogram(data_frame=df_solar['f_107_adj'])

## Time series analysis

Distribuzione dei tempi fra un TID e il successivo

In [None]:
# Lag (in catalogue rows) used for both differencing steps.
periods = 16

# Second-order lagged difference of the TID timestamps, expressed in hours.
# total_seconds() already covers the whole timedelta (days included), so it
# is converted to hours directly; adding 24 * .dt.days on top of it would
# count the days twice. The leading rows, for which the lagged difference is
# undefined, are dropped.
second_order_diff = (
    df_tid['datetime']
    .diff(periods)
    .diff(periods)
    .dt.total_seconds()
    .div(3600)
    .dropna()
)

In [None]:
# Histogram of the second-order differences.
px.histogram(data_frame=second_order_diff)

In [None]:
# Summary statistics of the second-order differences.
second_order_diff.describe()

In [None]:
# Kurtosis and skewness of the second-order differences, shown as a tuple.
(second_order_diff.kurt(), second_order_diff.skew())

In [None]:
# Shapiro-Wilk normality test on the second-order differences; missing
# values are removed first.
shapiro(second_order_diff.dropna())

In [None]:
# Normality test (scipy.stats.normaltest) on the non-missing differences.
normaltest(second_order_diff.dropna(), axis=0)

In [None]:
# Kolmogorov-Smirnov goodness-of-fit test against a Cauchy distribution.
# kstest(sample, 'cauchy') alone compares against the *standard* Cauchy
# (loc=0, scale=1), which a series measured in hours cannot match, so the
# location and scale are estimated from the sample and passed through `args`.
# NOTE(review): estimating the parameters from the same sample makes the
# nominal p-value optimistic - treat it as indicative only.
ks_sample = second_order_diff.dropna()
cauchy_loc, cauchy_scale = cauchy.fit(ks_sample)
ks_statistic, ks_p_value = kstest(
    ks_sample, 'cauchy', args=(cauchy_loc, cauchy_scale)
)

print(f'KS Statistic: {ks_statistic}')
print(f'P-value: {ks_p_value}')

# Messages (Italian): "the series does not follow / follows the
# Cauchy-Lorentz distribution", decided at the 5% significance level.
if ks_p_value < 0.05:
    print('La serie non segue la distribuzione di Cauchy-Lorentz')
else:
    print('La serie segue la distribuzione di Cauchy-Lorentz')

## Dataset creation

In [None]:
# Aggregate the TID catalogue into 30-minute bins, keeping the column-wise
# maximum of each bin; bins without any TID end up as NaN rows.
# '30min' is used instead of the deprecated 'T' offset alias; both denote
# the same 30-minute frequency.
df_tid_30 = df_tid.resample('30min', on='datetime').max().reset_index()

In [None]:
# Percentage of non-missing values per column of the 30-minute TID frame.
df_tid_30.notna().sum().div(len(df_tid_30)).mul(100)

In [None]:
# Rolling-window sizes in rows: twice each of 1, 2, 3, 6, 12 and 24, i.e.
# two rows of the 30-minute frame per hour.
periods = [per_ * 2 for per_ in (1, 2, 3, 6, 12, 24)]

for per_ in periods:
    # per_ / 2 is a float, so the new columns are named 'ie_mav_1.0h',
    # 'ie_mav_2.0h', ... Each holds the rolling mean of 'ie' over `per_`
    # rows, rounded to one decimal.
    df_ejet_30[f'ie_mav_{per_/2}h'] = (
        df_ejet_30['ie'].rolling(window=per_).mean().round(1)
    )

In [None]:
# Drop every row that contains at least one missing value and renumber the
# index from zero.
df_ejet_30 = df_ejet_30.dropna(how='any').reset_index(drop=True)

In [None]:
# Left-join the TID 'quality_index' onto the electrojet frame by timestamp:
# every electrojet row is kept, and rows without a matching TID bin get NaN.
df_j = pd.merge(
    df_ejet_30,
    df_tid_30.loc[:, ['datetime', 'quality_index']],
    on='datetime',
    how='left',
)

In [None]:
hours = 3
steps = 2 * hours  # two rows per hour, i.e. `hours` expressed in rows

# NOTE(review): shift/rolling work on row positions, so both labels assume
# consecutive rows of df_j are exactly 30 minutes apart; rows were removed
# with dropna() earlier - confirm there are no gaps.

# True when a TID is recorded exactly `steps` rows ahead.
df_j[f'tid_in_{hours}h'] = df_j['quality_index'].shift(-steps).notna()

# True when at least one TID is recorded in the current row or in any of the
# next `steps` rows. Presence flags (notna) are counted rather than summing
# the quality_index values, so a TID whose quality_index is 0 is still
# detected, consistently with the `tid_in_*` label above. The last `steps`
# rows have no complete look-ahead window and are set to False.
df_j[f'tid_within_{hours}h'] = (
    df_j['quality_index']
    .notna()
    .astype(int)
    .rolling(window=steps + 1, min_periods=1)
    .sum()
    .gt(0)
    .shift(-steps, fill_value=False)
)

In [None]:
# Percentage of True/False values of the look-ahead label. The column name
# is built from `hours` so it keeps matching the label cell if the horizon
# is changed.
df_j[f'tid_within_{hours}h'].value_counts(normalize=True).mul(100)

In [None]:
# Number of rows with quality_index >= 0.5, divided by 760.
# NOTE(review): 760 is an unexplained hard-coded denominator - confirm what
# it represents and derive it from the data if possible.
(df_j['quality_index'] >= 0.5).sum() / 760

In [None]:
def color(val):
    """Return the CSS text colour for one table cell.

    Values whose absolute value exceeds 0.4 are rendered blue; everything
    else (including NaN, for which the comparison is false) is black.
    """
    if abs(val) > 0.4:
        return 'color: blue'
    return 'color: black'

# Pearson correlation matrix of df_j, with each cell coloured by `color`.
# NOTE(review): df_j still carries the 'datetime' column at this point;
# confirm that including it in the correlation is intended.
df_j.corr(method='pearson').style.map(color)

In [None]:
# Column dtypes and non-null counts of the joined frame.
df_j.info()