In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.manifold import TSNE

import plotly.express as px
import matplotlib.pyplot as plt

from var import DATA_OUT, MODEL_CATB, IMAGE_OUT, FORECAST_HOURS_IN_ADVANCE

In [None]:
import umap

In [None]:
# Load the prepared dataset.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by
# this project's own pipeline.
df = pd.read_pickle(Path(DATA_OUT, 'df_dataset.pickle'))

# Keep only the four input features plus the target column. The explicit
# .copy() detaches the subset from the loaded frame, so the column assignment
# below writes to an independent object instead of raising
# SettingWithCopyWarning.
df = df[
    [
        'iu_mav_6h',
        'hf',
        'solar_zenith_angle',
        'hp_30',
        f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h',
    ]
].copy()

# Map True/False in the target column to 1/0.
df[f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'] = df[f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'].replace(
    {True: 1, False: 0}
)

In [None]:
# Work on a 50% random subsample. random_state pins the draw so that
# Restart & Run All analyses the same rows (and produces the same embeddings
# and clusters downstream) every time.
df_ = df.sample(frac=0.5, random_state=42).copy()

# Missing 'hf' values are replaced with 0.
df_['hf'] = df_['hf'].fillna(0)

In [None]:
# Feature matrix: every column of the subsample except the target.
X = df_.drop(columns=[f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'])

# Target vector: the tid_within_<N>h column on its own.
y = df_.loc[:, f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'].copy()

In [None]:
# Min-max scale the feature columns; the result is a plain NumPy array that
# the embedding and clustering cells below consume.
X_sc = MinMaxScaler().fit_transform(X.to_numpy())

## UMAP

In [None]:
n_comps = 2

# Project the scaled features with UMAP. The estimator is bound to
# `umap_model` rather than `umap` so the imported module is not shadowed and
# this cell can be re-run without an AttributeError.
# NOTE(review): no random_state is set, so the layout may differ between
# runs; per the umap-learn docs, seeding it disables parallel execution --
# confirm which trade-off is wanted.
umap_model = umap.UMAP(
    n_components=n_comps,
    n_neighbors=15,
    min_dist=0.1,
    n_jobs=-1,
    metric='euclidean',
)
umap_projections = umap_model.fit_transform(X_sc)

# Scatter the embedding, coloured by the target label.
if n_comps == 3:
    fig = px.scatter_3d(
        x=umap_projections[:,0],
        y=umap_projections[:,1],
        z=umap_projections[:,2],
        color=y,
        color_continuous_scale=px.colors.qualitative.Plotly
    )
elif n_comps == 2:
    fig = px.scatter(
        x=umap_projections[:,0],
        y=umap_projections[:,1],
        color=y,
    )
else:
    # Fail loudly instead of leaving `fig` undefined or stale from another cell.
    raise ValueError(f'n_comps must be 2 or 3, got {n_comps}')

In [None]:
# Render whichever figure is currently bound to `fig`.
fig.show()

## t-SNE

In [None]:
n_comps = 2

# Embed the scaled features with t-SNE (seeded via random_state=42).
tsne = TSNE(n_components=n_comps, perplexity=60, metric='euclidean', random_state=42)
projections = tsne.fit_transform(X_sc)

# Scatter the embedding, coloured by the target label; 2-D or 3-D depending
# on n_comps.
if n_comps == 2:
    fig = px.scatter(x=projections[:, 0], y=projections[:, 1], color=y)
elif n_comps == 3:
    fig = px.scatter_3d(
        x=projections[:, 0],
        y=projections[:, 1],
        z=projections[:, 2],
        color=y,
        color_continuous_scale=px.colors.qualitative.Plotly,
    )

In [None]:
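# Optional export: uncomment to save the t-SNE figure as a standalone HTML
# file under IMAGE_OUT. Left disabled so a plain re-run does not overwrite it.
# fig.write_html(
#     Path(IMAGE_OUT, f'tSNE_labels_{n_comps}_comps.html')
# )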

## H-DBSCAN

In [None]:
min_samples = 20

# Fit HDBSCAN on the scaled features and read the per-row cluster labels
# from the fitted estimator.
cls_alg = HDBSCAN(min_samples=min_samples)
cls_alg.fit(X_sc)
labels = cls_alg.labels_

In [None]:
# Share of rows assigned to each cluster label (fractions sum to 1).
pd.DataFrame(labels).value_counts(normalize=True)

In [None]:
# Re-use the 2-D t-SNE embedding, this time coloured by HDBSCAN cluster label.
fig = px.scatter(
    x=projections[:,0],
    y=projections[:,1],
    color=labels,
)

# `autosize` is a layout attribute. Passed to fig.show() it is not a renderer
# option and is silently dropped, so it is set here where it takes effect.
fig.update_layout(
    template='simple_white',
    autosize=False,
)
fig.show()