# Notebook para "jugar" con los pasos de xai-LRP antes de dejar los definitivos
> No mergear con master...

### Enlace video de Eamonn de Motifs & Matrix Profiling
- Video: [youtube - Eamonn](https://www.youtube.com/watch?v=BYjOp2NoDdc)
- Slides: [dropbox](https://www.dropbox.com/s/i38eyidz1qo9pi3/Motifs.pptx?dl=0)
- Video [youtube-Stumpy](https://www.youtube.com/watch?v=T9_z7EpA8QM)
- Documentation: [STUMPY docs](https://stumpy.readthedocs.io/)

In [None]:
#Weight & Biases
import wandb

#Yaml
from yaml import load, FullLoader

#Embeddings
from dvats.all import *
from tsai.data.preparation import prepare_forecasting_data
from tsai.data.validation import get_forecasting_splits
from fastcore.all import *

#Dimensionality reduction
from tsai.imports import *

#Clustering
import hdbscan
import os
import seaborn as sns


In [None]:
check_memory_usage = True

In [None]:
import utils.config as cfg_

In [None]:
config = cfg_.get_artifact_config_xai_lrp(False)

In [None]:
cfg_.show_attrdict(config)

In [None]:
api = wandb.Api()

In [None]:
import os

# The run is named after the notebook stem; the full notebook path is
# exported through the WANDB_NOTEBOOK_NAME environment variable.
name = "05-xai-lrp_"
runname = name
path = os.path.expanduser("~/work/nbs_pipeline/")
os.environ["WANDB_NOTEBOOK_NAME"] = f"{path}{name}.ipynb"

In [None]:
# Open the W&B run for this notebook (resume='allow' lets an existing run
# with the same identity be continued). When config.use_wandb is false the
# run is created in 'disabled' mode, under the fallback project 'work-nbs'
# and with anonymous='must'.
run_dr = wandb.init(
    entity           = config.wandb_entity,
    project          = config.wandb_project if config.use_wandb else 'work-nbs', 
    group            = config.wandb_group,
    allow_val_change = config.allow_val_change, 
    job_type         = config.job_type, 
    mode             = 'online' if config.use_wandb else 'disabled',
    anonymous        = 'never' if config.use_wandb else 'must',
    config           =  config,
    resume           = 'allow',
    name = runname
)
# Handle to the run configuration; passed on to the dataset and projection helpers below.
config_dr = wandb.config # Object for storing hyperparameters

In [None]:
df, df_config, enc_artifact, enc_learner = get_dataset(config, run_dr, config_dr, True)

In [None]:
# Optional preview of the raw time series; switch the flag to True to draw it.
show_time_series_flag = False
if show_time_series_flag:
    # One colour sampled near the dark end of viridis is used for the plot
    # (passing colormap=cmap to df.plot is the alternative).
    cmap = matplotlib.colormaps.get_cmap('viridis')
    fig, ax = plt.subplots(1, figsize=(15, 5))
    df.plot(color=cmap(0.05), ax=ax)
    plt.tight_layout()
    plt.legend()
    plt.show()

In [None]:
# Build the encoder input from the dataframe and time how long it takes.
# 'w' is read from the encoder artifact metadata and passed as fcst_history
# (presumably the sliding-window length — confirm against the encoder notebook).
w = enc_artifact.metadata['w']
t_start = time.time()
# The second return value of prepare_forecasting_data is not needed here.
enc_input, _ = prepare_forecasting_data(df, fcst_history = w)
t_end = time.time()
t = t_end - t_start
# "SW" presumably stands for sliding windows; timestamps are raw time.time() values.
print("SW start | " , t_start, " | end ", t_end, "total (secs): ", t)
print(enc_input.shape)

In [None]:
# Read the stride and batch size stored alongside the dataset configuration.
stride = df_config['stride']
batch_size = df_config['batch_size']

##### Toy check: hard-coded override of the configured stride ####
# NOTE(review): this discards df_config['stride']; remove before this
# notebook's steps are promoted to the definitive pipeline.
stride = 5
####

In [None]:
print(stride)
print(batch_size)
print(enc_input.shape)
print(enc_artifact.name)

In [None]:
# Derive the chunk size handed to the embedding step, as a multiple of 32.
# NOTE(review): min(a, chunk_max / a) can never exceed sqrt(chunk_max)
# (~3162 for chunk_max = 10000000), so N always resolves to 3200 and
# chunk_size to 102400 whatever the input shape is — confirm whether a
# different formula was intended.
chunk_max = 10000000
shape = enc_input.shape
window_elems = shape[1] * shape[2]
chunk_size_ = min(window_elems, chunk_max / window_elems)
N = max(3200, np.floor(chunk_size_ / 32))
chunk_size = 32 * N
chunk_size

In [None]:
# Compute the encoder embeddings for every input window and time the call.
t_start = time.time()
embs = get_enc_embs_set_stride_set_batch_size(
    enc_input, 
    enc_learner, 
    stride     = stride,
    batch_size = batch_size,
    cpu        = config.cpu_flag,
    to_numpy   = True, 
    print_flag = False,
    time_flag  = True,
    chunk_size = chunk_size
)
t_end = time.time()
t = t_end-t_start
# "GE" presumably stands for get embeddings; timestamps are raw time.time() values.
print("GE start | " , t_start, " | end ", t_end, "total (secs): ", t)

In [None]:
embs.shape

In [None]:
# Ensure there are no NaNs: drop every embedding row that contains at least one.
# (Macu's attempt. The commented-out cell below is the original, but it fails
# because of NaNs with the sunspot dataset.)
rows_with_nan = np.isnan(embs).any(axis=1)
embs_no_nan = embs[np.logical_not(rows_with_nan)]
embs_no_nan.shape

In [None]:
prjs = get_prjs(embs_no_nan, config_dr, config, False)

In [None]:
# Define HDBSCAN parameters
hdbscan_kwargs = {
    'min_cluster_size' : 7, #100, #100,
    'min_samples' : 3,
    'cluster_selection_epsilon' : 0.0001,
}
metric_kwargs = {
    'metric' : 'euclidean' #'jaccard'
}

In [None]:
# Fit HDBSCAN on the projections, then tally how many points carry each label.
clusters = hdbscan.HDBSCAN(**hdbscan_kwargs, **metric_kwargs)
clusters.fit(prjs)
clusters_labels = clusters.labels_
list(Counter(clusters_labels).items())

In [None]:
score = cluster_score(prjs, clusters_labels, True)

In [None]:
# Testing artifact structure 
test_eq_type(type(clusters_labels), np.ndarray)
test_eq(clusters_labels.size, prjs.shape[0])

In [None]:
# Create and log 'clusters_labels' artifact
clusters_ar = ReferenceArtifact(obj=clusters_labels, name='clusters_labels')
clusters_ar.metadata, clusters_ar.manifest.entries.values()

In [None]:
# Create clusters using HDBSCAN
# NOTE(review): this cell repeats the earlier HDBSCAN fit verbatim (same
# kwargs, same prjs), so the clustering is computed twice — confirm whether
# the refit is intentional or the cell can be dropped.
clusters = hdbscan.HDBSCAN(**hdbscan_kwargs, **metric_kwargs).fit(prjs)
clusters_labels = clusters.labels_
list(Counter(clusters_labels).items())

## Simple anomaly detector: dynamic plot for determining whether a time-series window is anomalous

In [None]:
# Compute anomaly scores from the projections and the fitted HDBSCAN labels.
# Earlier variant kept for reference: detector(prjs_umap, clusters_labels)
anomaly_scores = detector(prjs, clusters.labels_)

In [None]:
plot_anomaly_scores_distribution(anomaly_scores)

In [None]:
# Summary statistics of the anomaly scores; mean and std are kept in
# variables, the rest is only printed.
anomaly_scores_mean = np.mean(anomaly_scores)
anomaly_scores_std = np.std(anomaly_scores)

print(anomaly_scores.shape)
print("min ", np.min(anomaly_scores))
print("max ", np.max(anomaly_scores))
print("media ", anomaly_scores_mean)
print("std ", anomaly_scores_std)

In [None]:
# Threshold = 90th percentile of the outlier scores of the fitted HDBSCAN model.
# NOTE(review): the threshold comes from clusters.outlier_scores_, while the
# interactive plot below receives anomaly_scores from detector() — confirm
# both are on the same scale.
threshold = pd.Series(clusters.outlier_scores_).quantile(0.9)

In [None]:
import ipywidgets as widgets

In [None]:
fig_size = (7,7)
plot_clusters_with_anomalies_interactive_plot(threshold, prjs, clusters_labels, anomaly_scores, fig_size)

#### Using quartiles for the scores

In [None]:
def detect_anomalies_using_iqr(scores, factor=1.5):
    """Flag outliers in *scores* using the interquartile-range (IQR) rule.

    Args:
        scores: Numeric scores (a NumPy array in this notebook).
        factor: Multiple of the IQR added beyond the first and third
            quartiles to place the fences. Defaults to 1.5, the value that
            was previously hard-coded.

    Returns:
        A tuple ``(anomalies, lower_bound, upper_bound)``. ``anomalies`` is a
        boolean mask that is True where a score lies strictly below
        ``lower_bound`` or strictly above ``upper_bound``.
    """
    # Both quartiles in a single pass over the data.
    q1, q3 = np.percentile(scores, [25, 75])
    iqr = q3 - q1

    # Fences: anything strictly outside [lower_bound, upper_bound] is anomalous.
    lower_bound = q1 - (factor * iqr)
    upper_bound = q3 + (factor * iqr)

    anomalies = (scores < lower_bound) | (scores > upper_bound)

    return anomalies, lower_bound, upper_bound

In [None]:
anomalies, lower_bound, upper_bound = detect_anomalies_using_iqr(anomaly_scores)

print("Lower Bound for Anomalies:", lower_bound)
print("Upper Bound for Anomalies:", upper_bound)

In [None]:
def plot_data_with_anomalies(data, anomalies):
    """Scatter the first two columns of *data*, overlaying the anomalies in red.

    All rows are drawn first in blue; the rows selected by *anomalies* are
    then drawn on top in red, and the figure is shown immediately.
    """
    layers = (
        (slice(None), 'blue', 'Normal Data'),
        (anomalies, 'red', 'Anomalies'),
    )
    for rows, colour, label in layers:
        plt.scatter(data[rows, 0], data[rows, 1], color=colour, label=label)

    plt.title('Data with Anomalies highlighted')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

In [None]:
plot_data_with_anomalies(prjs, anomalies)