In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import plotly.express as px

from var import DATA_IN, DATA_OUT, START_DATE, END_DATE

## TIDs catalog

In [None]:
# Column names applied to the TID catalog; the file's own header row (row 0)
# is discarded in favour of these.
tid_columns = [
    'duration',
    'period',
    'amplitude',
    'spectral_contribution',
    'velocity',
    'azimuth',
    'quality_index',
    'datetime',
]
tid_catalog_path = Path(DATA_IN, 'TID_catalog.csv')

df_tid = pd.read_csv(tid_catalog_path, header=0, names=tid_columns)

# Parse the textual timestamps (day-abbreviated month-year plus time) into datetimes.
df_tid = df_tid.assign(
    datetime=pd.to_datetime(df_tid['datetime'], format='%d-%b-%Y %H:%M:%S'),
)

In [None]:
# Quick sanity check: display three randomly chosen rows of the TID catalog.
df_tid.sample(3)

## Solar Data

In [None]:
# Column names applied to the solar data file (its own header row is replaced).
# NOTE(review): 'n_sunsposts' looks like a typo for 'n_sunspots'; it is kept
# unchanged here because other cells may reference the column by this name.
solar_columns = ['day_of_rotation', 'n_sunsposts', 'f_107_adj', 'date']

# Load, parse the day-level dates, keep only rows strictly after START_DATE,
# and renumber the index from zero.
df_solar = (
    pd.read_csv(Path(DATA_IN, 'solar_data.csv'), header=0, names=solar_columns)
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d-%b-%Y'))
    .loc[lambda frame: frame['date'].gt(START_DATE)]
    .reset_index(drop=True)
)

## Auroral Electrojet

In [None]:
# Read only the columns at positions 2 and 3 of the electrojet index file and
# name them 'ie' and 'datetime' (the file's own header row is replaced).
ejet_path = Path(DATA_IN, 'ImageIDX.csv')
df_ejet = pd.read_csv(ejet_path, usecols=[2, 3], header=0, names=['ie', 'datetime'])

# Parse timestamps, then keep only readings strictly after START_DATE.
df_ejet['datetime'] = pd.to_datetime(df_ejet['datetime'], format='%d-%b-%Y %H:%M:%S')
after_start = df_ejet['datetime'] > START_DATE
df_ejet = df_ejet.loc[after_start].reset_index(drop=True)

In [None]:
# Average the electrojet readings into 30-minute bins keyed on `datetime`.
# '30min' is used instead of the '30T' alias, which recent pandas releases
# deprecate; both denote the same 30-minute frequency.
df_ejet_30 = df_ejet.resample('30min', on='datetime').mean().reset_index()

## EDA

In [None]:
# Why do we cap at 1500?
# NOTE(review): 0.99327 is presumably the quantile whose value lands near
# ie = 1500 — confirm against the displayed output.
df_ejet_30['ie'].quantile(.99327)

In [None]:
# Distribution of the 30-minute mean `ie` values.
px.histogram(df_ejet_30['ie'])

In [None]:
# Distribution of the `quality_index` column of the TID catalog.
px.histogram(df_tid['quality_index'])

In [None]:
# Distribution of the `duration` column of the TID catalog.
px.histogram(df_tid['duration'])

In [None]:
# Mean `f_107_adj` per calendar year; after reset_index the year lives in a
# column that is still called 'date' (the name of the grouping key).
f107_by_year = (
    df_solar
    .groupby(df_solar['date'].dt.year)['f_107_adj']
    .mean()
    .reset_index()
)

px.bar(data_frame=f107_by_year, x='date', y='f_107_adj')

In [None]:
# Distribution of the `f_107_adj` values after the START_DATE filter.
px.histogram(df_solar['f_107_adj'])

## Dataset creation

In [None]:
# Collapse the TID catalog into 30-minute bins keyed on `datetime`, keeping the
# per-column maximum within each bin.
# '30min' is used instead of the '30T' alias, which recent pandas releases
# deprecate; both denote the same 30-minute frequency.
df_tid_30 = df_tid.resample('30min', on='datetime').max().reset_index()

In [None]:
# Percentage of non-null entries in each column of the 30-minute TID frame.
df_tid_30.notna().sum().div(df_tid_30.shape[0]).mul(100)

In [None]:
# Kept in place in case later cells rely on `timedelta` being in scope.
from datetime import timedelta

# Build the target frame: one row per 30-minute step, labelled 1 when a TID is
# recorded at that timestamp or within the following 3 hours, otherwise 0.
# NOTE(review): the grid bounds are hard-coded although START_DATE / END_DATE
# are imported from `var` — confirm whether they should be used here.
# '30min' replaces the deprecated '30T' alias (same frequency).
date = pd.date_range(start='2014-01-01 00:00:00', end='2022-12-31 23:30:00', freq='30min')
df_y = pd.DataFrame({'dt': date})

# Harmless if the column is already datetime; parses it if it is still text.
TID_times = pd.to_datetime(df_tid['datetime'], format="%d-%b-%Y %H:%M:%S")

# A grid timestamp g is positive iff some TID time t satisfies t - 3h <= g <= t.
# It is enough to look at the first TID at or after g: if that one is more than
# 3 hours away, every later one is too. This replaces a Python loop that rebuilt
# a full-length boolean mask over the grid for every single TID.
tid_sorted = np.sort(TID_times.dropna().to_numpy())
grid = df_y['dt'].to_numpy()
labels = np.zeros(grid.size, dtype=np.int64)
if tid_sorted.size:
    next_idx = np.searchsorted(tid_sorted, grid, side='left')
    has_next = next_idx < tid_sorted.size
    # Clamp the index so grid points after the last TID can still be indexed;
    # `has_next` masks those positions out again.
    next_tid = tid_sorted[np.minimum(next_idx, tid_sorted.size - 1)]
    within_window = (next_tid - grid) <= np.timedelta64(3, 'h')
    labels = (has_next & within_window).astype(np.int64)
df_y['TID_in_3hours'] = labels

# Share of positive rows, in percent. Computed from a sum so that it does not
# raise KeyError when one of the two classes is absent.
n_positive = df_y['TID_in_3hours'].sum()
print(n_positive / len(df_y) * 100)

In [None]:
# Class balance of the target: relative frequency of each label value.
df_y['TID_in_3hours'].value_counts(normalize=True)