# Explain SMAVRA Predictions


In [None]:
import os
# Step up one directory so the relative "data/..." paths and the "src" package
# used further down resolve (presumably the notebook sits one level below the
# project root — confirm).
os.chdir("../")
from mlflow.tracking import MlflowClient

In [None]:
# Run identifier; used below as the sub-directory name under
# data/output/explain/latent and data/output/score.
RUN_ID = "4d8ddb41e7f340c182a6a62699502d9f"

In [None]:
import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path

# Collect the latent parquet files written for this run and stack them into a
# single frame.
latent_dir = os.path.join("data/output/explain/latent", RUN_ID)
# Keep only files whose first eight name characters, read as an integer, are
# greater than 20201000 (presumably a YYYYMMDD date prefix — confirm).
relevant_files = [
    path
    for path in Path(latent_dir).iterdir()
    if int(path.name[:8]) > 20201000
]
latents = []
for p in relevant_files:
    # `table` and `p` stay bound to the last file read once the loop finishes.
    table = pq.read_table(p)
    latents.append(table.to_pandas())
df = pd.concat(latents, axis=0)

In [None]:
# Make sure the loss column is floating point before it is scaled.
df["epoch_loss"] = df["epoch_loss"].astype("float")

In [None]:
import hdbscan
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# Clustering features: the 16 latent columns plus the epoch loss.
cols = [*(f"latent_{i}" for i in range(16)), "epoch_loss"]
# NOTE(review): train_df is the same object as df, so the write-back below
# also overwrites the feature columns of df.
train_df = df

# Scale the feature columns and store the scaled values in place.
scaler = RobustScaler()
scaled = scaler.fit_transform(train_df[cols].values)
train_df.loc[:, cols] = scaled

# Cluster the scaled features; the fitted clusterer is reused by later cells.
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, gen_min_span_tree=True)
clusterer.fit(train_df[cols])

In [None]:
import plotly.express as px

# One row per clustered sample: assigned label and membership probability.
classification = pd.DataFrame(
    {"labels": clusterer.labels_, "probs": clusterer.probabilities_}
)

# Mirror both columns onto the training frame.
for column in ("labels", "probs"):
    train_df[column] = classification[column].values

In [None]:
import numpy as np

# Project the scaled clustering features onto three principal components.
# Only the feature columns in `cols` are fitted; the cluster label/probability
# columns and any other non-feature columns of train_df are kept out of the
# projection.
pca = PCA(n_components=3)
components = pca.fit_transform(train_df.loc[:, cols])
# create df for visualization
pca_columns = [f"PC{i+1}" for i in range(3)]
# No reset_index() on the component frame: it already has a fresh positional
# index, and resetting it would add a second, redundant "index" column.
components = pd.DataFrame(components, columns=pca_columns)
components = pd.concat(
    [train_df.reset_index(), components], axis=1)
total_var = pca.explained_variance_ratio_.sum() * 100
# Label keys must match the names passed as `dimensions` to take effect.
labels = {column: f"PC {i+1}" for i, column in enumerate(pca_columns)}
# The points are coloured by cluster label, so title the legend accordingly.
labels['color'] = 'cluster'
# fit latent
pca_fig = px.scatter_matrix(
    components,
    color=train_df.labels.astype("str"),
    dimensions=pca_columns,
    labels=labels,
    title=f'Run: {RUN_ID}; Total Explained Variance: {total_var:.2f}%',
    hover_name="file_name",
    hover_data=["epoch_loss", "epoch"]
)

In [None]:
# Display the scatter-matrix figure built in the previous cell.
pca_fig

In [None]:
import plotly.express as px
# Cluster label and membership probability per sample. The probability column
# is named "probs", the spelling used for train_df["probs"].
classification = pd.DataFrame({"labels": clusterer.labels_, "probs": clusterer.probabilities_})

In [None]:
# Rows per cluster label (count of non-null values in each remaining column).
classification.groupby("labels").count()

In [None]:
# Build one row of summary features per epoch from the per-sample squared
# errors of every scored file under data/output/score/<RUN_ID>.
score_dir = os.path.join("data/output/score/", RUN_ID)
se_cols = ["mask_press_se", "resp_flow_se", "delivered_volum_se"]
score = []
for p in Path(score_dir).iterdir():
    df = pq.read_table(
        p,
        columns=["epoch_id", "epoch_mse", *se_cols],
    ).to_pandas()

    # Rolling windows of 150 rows, computed separately inside each epoch.
    # The columns are selected with an explicit list rather than an implicit
    # tuple subscript.
    rolling = df.groupby("epoch_id")[se_cols].rolling(150)
    df = pd.concat(
        [
            rolling.mean().add_prefix("rmean_"),
            rolling.std().add_prefix("rstd_"),
            rolling.min().add_prefix("rmin_"),
            rolling.max().add_prefix("rmax_"),
        ],
        axis=1,
    ).dropna()  # drops the rows where the window is not yet full

    # Collapse the rolling statistics to one row per epoch. Each aggregate
    # gets its own prefix so the concatenated frame has unique column labels,
    # and the minimum is really computed with min().
    per_epoch = df.groupby(level="epoch_id")
    df = pd.concat(
        [
            per_epoch.mean().add_prefix("mean_"),
            per_epoch.std().add_prefix("std_"),
            per_epoch.min().add_prefix("min_"),
            per_epoch.max().add_prefix("max_"),
        ],
        axis=1,
    ).dropna()
    score.append(df)
df = pd.concat(score, axis=0)

In [None]:
# Preview the first rows with the index moved back into a regular column.
df.reset_index().head()

In [None]:
import hdbscan
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# Feature frame: every column of df, with the index discarded.
train_df = df.reset_index(drop=True)

# Scale all columns and write the result back over the frame.
scaled = RobustScaler().fit_transform(train_df.to_numpy())
train_df.iloc[:, :] = scaled

# Cluster the scaled rows; later cells read labels_ and probabilities_.
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, gen_min_span_tree=True)
clusterer.fit(train_df)

In [None]:
import plotly.express as px

# Label and membership probability per clustered row.
classification = pd.DataFrame(
    {"labels": clusterer.labels_, "probs": clusterer.probabilities_}
)

# Store the same two columns on the training frame.
for column in ("labels", "probs"):
    train_df[column] = classification[column].values

In [None]:
import numpy as np

pca = PCA(n_components=3)
# Keep only rows with a non-negative cluster label (HDBSCAN uses -1 for noise).
train_df = train_df[train_df["labels"].values > -1]
# Fit on the feature columns only; the cluster label and probability columns
# added after clustering are excluded from the projection.
components = pca.fit_transform(train_df.drop(columns=["labels", "probs"]))
# create df for visualization
pca_columns = [f"PC{i+1}" for i in range(3)]
# No reset_index() on the component frame: it already has a fresh positional
# index, and resetting it would add a second, redundant "index" column.
components = pd.DataFrame(components, columns=pca_columns)
components = pd.concat(
    [train_df.reset_index(), components], axis=1)
total_var = pca.explained_variance_ratio_.sum() * 100
# Label keys must match the names passed as `dimensions` to take effect.
labels = {column: f"PC {i+1}" for i, column in enumerate(pca_columns)}
# The points are coloured by cluster label, so title the legend accordingly.
labels['color'] = 'cluster'
# fit latent
px.scatter_matrix(
    components,
    color=train_df.labels.astype("str"),
    dimensions=pca_columns,
    labels=labels,
    title=f'Run: {RUN_ID}; Total Explained Variance: {total_var:.2f}%'
)

In [None]:
# Show the label mapping that was passed to the scatter matrix.
labels

In [None]:
# Imported here so the cell also runs in a top-to-bottom execution.
from scipy import stats

# Trimmed mean and standard deviation of one column, ignoring values outside
# [0, median + 4 * 95th percentile].
# NOTE(review): the feature frame assembled by the scoring loop has only
# derived statistic columns, so this lookup presumably expects a frame that
# still holds the raw "resp_flow_se" column — confirm.
col_val = df["resp_flow_se"].values
p = np.percentile(col_val, 95)
upper_limit = np.median(col_val) + (4 * p)
trimmed_mean = stats.tmean(col_val, limits=[0, upper_limit])
trimmed_std = stats.tstd(col_val, limits=[0, upper_limit])
# Show both results; a bare tmean() call in mid-cell would be discarded.
trimmed_mean, trimmed_std

In [None]:
# Imported here so the cell also runs in a top-to-bottom execution.
from scipy import stats

# Trimmed mean and standard deviation for every column of df, in column order.
means = []
stds = []
for c in df.columns:
    col_val = df[c].values
    # Upper trimming bound: median plus four times the 95th percentile.
    p = np.percentile(col_val, 95)
    upper_limit = np.median(col_val) + (4 * p)
    limits = [0, upper_limit]

    means.append(stats.tmean(col_val, limits=limits))
    stds.append(stats.tstd(col_val, limits=limits))


In [None]:
# Show the list of trimmed means collected in the previous cell.
means

In [None]:
# Load delivered_volum and epoch_mse from every scored file of the run and
# stack the rows that pass the value filter.
score_dir = os.path.join("data/output/score/", RUN_ID)
wanted_columns = ["delivered_volum", "epoch_mse"]
score = []
for p in Path(score_dir).iterdir():
    frame = pq.read_table(p, columns=wanted_columns).to_pandas()
    # Keep rows above -32760 (presumably a sentinel near the int16 minimum
    # marking invalid samples — confirm).
    is_valid = frame["delivered_volum"] > -32760
    score.append(frame.loc[is_valid, :])
df = pd.concat(score, axis=0)

In [None]:
# Display the concatenated frame.
df

In [None]:
# Raw array of the delivered_volum column.
df.delivered_volum.values

In [None]:
import plotly.express as px
# Histogram of the delivered_volum column of df.
px.histogram(df["delivered_volum"].values)

In [None]:
import numpy as np
# 99.9th percentile of the delivered_volum column of df.
np.percentile(df["delivered_volum"].values, 99.9)

In [None]:
# `table` is the pyarrow Table left bound by the latent-loading loop; it has
# to be converted to pandas before groupby/agg/reset_index can be used.
# NOTE(review): confirm that this table actually carries the error columns
# aggregated here.
table.to_pandas().groupby("epoch_id").agg({"epoch_mse":"mean", "mask_press_se": "mean", "resp_flow_se": "mean", "delivered_volum_se": "mean"}).reset_index()

In [None]:
# NOTE(review): scratch arithmetic with unexplained constants — presumably
# epochs x samples per epoch; confirm or remove.
49318*750

In [None]:
# Rebind RUN_ID to a different run; every later cell uses this id.
RUN_ID = "4377a3ad68e84162827255bc1a0b7e40"
mlflow_client = MlflowClient()
# get run to be explained
data = mlflow_client.get_run(RUN_ID).data

# Display the data object of the fetched run.
data

In [None]:
# Look up the id of the experiment named "SMAVRA".
mlflow_client.get_experiment_by_name("SMAVRA").experiment_id

In [None]:
from src.visualization import visualize as viz

In [None]:
# Note: this rebinds `pca`, which earlier cells used for the fitted sklearn
# PCA object; from here on `pca` and `tsne` are the two objects returned by
# viz.plot_latent.
pca, tsne = viz.plot_latent(run_id=RUN_ID)

In [None]:
# Render the PCA figure at 1000 x 1000.
pca.update_layout(width=1000, height=1000)

In [None]:
# Render the t-SNE figure at 1000 x 1000.
tsne.update_layout(width=1000, height=1000)

In [None]:
# Session identifier and epoch number inspected in the cells below.
SESSION = "20200930_120001"
EPOCH = 361

In [None]:
# Attention data for the selected run, session and epoch.
attention = viz.epoch_attention(run_id=RUN_ID, session=SESSION, epoch_nr=EPOCH)

In [None]:
import plotly.express as px
# Image plot of the second element of `attention`.
fig = px.imshow(attention[1])
# NOTE(review): print() writes the figure's text representation instead of
# rendering it — confirm this is intended.
print(fig)

In [None]:
from pathlib import Path
# Directory holding the scored parquet files of the current run.
scored_path = Path("data", "output", "score", RUN_ID)


In [None]:
import pandas as pd
import pyarrow.parquet as pq

# Scored parquet file of the selected session, loaded as a pandas frame.
session_file = os.path.join(scored_path, f"{SESSION}_0_HRD.edf.parquet")
df = pq.read_table(session_file).to_pandas()


In [None]:
from scipy import stats

# Standardise epoch_mse around a 5 % trimmed mean, scaled by the standard
# deviation of the values lying inside [0, 10].
epoch_mse = df["epoch_mse"].values
center = stats.trim_mean(epoch_mse, 0.05)
spread = stats.tstd(epoch_mse, limits=[0,10])
z_scores = (epoch_mse - center) / spread

In [None]:
import numpy as np


In [None]:
# Histogram of the z-scores below 4.
px.histogram(z_scores[z_scores < 4])

In [None]:
# Signal plot for the selected session, sized 1000 x 1000 and titled with
# the session and epoch.
ts_plot = viz.plot_signals(session=SESSION, df=df)
ts_plot.update_layout(width=1000, height=1000, title=f"Session {SESSION}; Epoch {EPOCH}")

ts_plot