In [None]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import scipy as sp

import torch
import pygsp
import optuna
import joblib
import gc
import argparse
import os
import matplotlib
import pickle
import glob

from matplotlib.ticker import ScalarFormatter, StrMethodFormatter, FormatStrFormatter, FuncFormatter
from matplotlib.animation import FuncAnimation

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from optuna.samplers import TPESampler, BruteForceSampler
from torch.nn import Linear
from torch_geometric.nn.models import GraphUNet
from torch_geometric.nn import GCNConv, Sequential
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx, grid

import sensors.utils.fault_detection as fd
import sensors.utils.analysis as ana

from importlib import reload
# Re-import the analysis helpers so that edits made to the module on disk are
# picked up without restarting the kernel.
ana = reload(ana)

from pyprojroot import here
ROOT_DIR = str(here())
# Directory holding the interim parquet files read and written below.
# The location can be overridden with the DARIO_DATA_DIR environment variable so
# the notebook is not tied to a single machine; when the variable is unset (or
# empty) the original path is used. Joining with '' guarantees a trailing
# separator, which the `data_dir + filename` concatenations in later cells need.
data_dir = os.path.join(
    os.environ.get('DARIO_DATA_DIR') or '/Users/vitorro/Repositories/dario/data/interim/',
    '',
)

# Global figure typography.
matplotlib.rcParams.update({'font.size': 20, 'font.family': 'Times New Roman'})

### ONSET DETECTION

In [None]:
dataset = 'df_Gjerdrum_D1L2B'


print(dataset)
df = pd.read_parquet(data_dir+f"{dataset}_filtered.parq")

# --- Feature extraction ---
# np.gradient of `smoothed`, evaluated independently inside every `pid` group
# (presumably rows are ordered in time within a pid -- TODO confirm).
df['grad'] = df.groupby('pid').smoothed.transform(np.gradient)
# df['grad2'] = df.groupby('pid').grad.transform(np.gradient)
df['grad_abs'] = df.grad.abs()

print('grads done')

# NOTE(review): center_column is defined in sensors.utils.analysis and not visible
# here; from the arguments it appears to derive a 'centered' column from
# 'smoothed' using 'grad_abs' -- confirm against the helper.
df = ana.center_column(df, 'smoothed', 'grad_abs', 'centered')
# Position (within its pid group) of the largest |gradient|, repeated on every
# row of that group.
df['grad_idmax'] = df.groupby('pid', as_index=False)['grad_abs'].transform(lambda x: np.argmax(x))

print('centering done')


# --- Anomaly detection (pixels with large gradient) ---
threshold = 0.65

# Keep every pid that has at least one sample whose |gradient| exceeds the
# threshold; `df_list` holds all rows belonging to those pids.
id_list = df.query('grad_abs>@threshold').pid.unique()
df_list = df[df.pid.isin(id_list)]
print(f'pixels with grad>threshold: {len(id_list)}')

# pd.concat([]) would fail with an uninformative "No objects to concatenate";
# fail early with a message that says what actually happened.
if len(id_list) == 0:
    raise ValueError(
        f'No pixel in {dataset!r} has grad_abs above threshold={threshold}; '
        'onset detection has nothing to process.'
    )

# Group the pre-filtered rows once instead of re-scanning the whole frame for
# every pid. Iterating over `id_list` (not over the groups) keeps the output in
# the same pid order as before.
rows_by_pid = df_list.groupby('pid', sort=False)
onset_frames = [
    ana.get_df_onset(rows_by_pid.get_group(pid), threshold=threshold, clustering_length=120)
    for pid in id_list
]

# reset_index() turns the concatenated frames' index into a regular column.
df_regions = pd.concat(onset_frames).reset_index()

display(df_regions.head())
# Distribution of the smallest onset_case found per pid.
display(df_regions.groupby('pid', as_index=False).onset_case.min().onset_case.value_counts())

In [None]:
# Persist the onset table; the clustering section reads this file back.
onset_path = data_dir+f'{dataset}_onset.parq'
df_regions.to_parquet(onset_path)

-----------------------

### CLUSTERING

In [None]:
# Reload the filtered signals and the saved onset table from disk, so the
# clustering section can start without re-running onset detection.
dataset = 'df_Gjerdrum_D1L2B'
filtered_path = data_dir+f"{dataset}_filtered.parq"
onset_path = data_dir+f'{dataset}_onset.parq'

df_orig = pd.read_parquet(filtered_path)
df_regions = pd.read_parquet(onset_path)
df_regions.head()

In [None]:
n_clusters = 4

# Only rows whose onset_case is 1 or 2 take part in the clustering; the copy
# keeps df_regions itself untouched.
onset_subset = df_regions.query('onset_case==1 or onset_case==2').copy()

# NOTE(review): ana.tskmeans is defined outside this notebook; from the keyword
# names it presumably clusters the 'centered' series with a DTW metric and
# stores the labels in 'dtw_clusters' -- confirm against the helper. No random
# seed is passed here, so results may vary between runs -- TODO confirm.
df_cluster, y_pred = ana.tskmeans(
    onset_subset,
    cluster_by='centered',
    cluster_to='dtw_clusters',
    metric='dtw',
    n_clusters=n_clusters,
    n_init=5,
)

cluster_path = data_dir+f'{dataset}_cluster.parq'
df_cluster.to_parquet(cluster_path)