In [70]:
import pandas as pd
import numpy as np

from pathlib import Path
import yaml
from typing import List
from itertools import product

from src.data.utils import create_output_path
from src import paths

In [71]:
# Locations of the experiment configuration file and of the processed
# fault-detection outputs, both resolved through the project's `paths` helper.
params_path: Path = paths.config_dir("params.yaml")
fault_detection_dir: Path = paths.data_processed_dir("fault_detection")

# Read the experiment parameters from the YAML configuration.
with open(params_path, "r") as params_file:
    params = yaml.safe_load(params_file)

# Parameter grids taken from the configuration.
N_values: List[int] = params["N_values"]
k_values: List[float] = params["k_values"]
th_values: List[float] = params["voting_thresholds"]

# Band whose name is embedded in the data file names read further down.
selected_band: str = params["selected_band"]

## Prediction dates metrics

In [111]:
# Per-pixel ground-truth table for the selected band, indexed by (ID, IDpix).
fault_detection_metadata_filename = f"fault_detection_metadata_{selected_band}.csv"
fault_detection_metadata_path = fault_detection_dir / fault_detection_metadata_filename

pixel_true_values_df = pd.read_csv(
    fault_detection_metadata_path, index_col=["ID", "IDpix"])

# Collapse the pixels to one row per ID by taking the minimum of each column.
truth_columns = [
    "change_type",
    "change_start",
    "last_non_change_date",
    "vegetation_type",
    "label",
]
poly_true_values_df = pixel_true_values_df.groupby("ID")[truth_columns].min()

# Parse both date columns so they can be compared with, and subtracted from,
# the detection dates in the cells below.
for date_column in ("change_start", "last_non_change_date"):
    poly_true_values_df[date_column] = pd.to_datetime(
        poly_true_values_df[date_column])

In [74]:
# The cells below analyse one parameter combination only: the first element of
# the (N, k, th) grid. Select it explicitly instead of looping over the whole
# grid and breaking after the first iteration, and fail with a clear message
# if the grid is empty (the loop would silently leave `poly_pred_df` undefined).
first_combination = next(product(N_values, k_values, th_values), None)
if first_combination is None:
    raise ValueError(
        "N_values, k_values and voting_thresholds must all be non-empty")
N, k, th = first_combination

# Pixel-level predictions for the selected N and k, indexed by (ID, IDpix).
filename = f"predictions_N={N}_k={k}_" + selected_band + ".csv"
pix_pred_path = paths.data_processed_dir("pixel_predictions", filename)

pix_pred = pd.read_csv(pix_pred_path, index_col=["ID", "IDpix"])
pix_pred["event_date"] = pd.to_datetime(pix_pred["event_date"])

pixels_by_id = pix_pred.groupby("ID")

# Voting: an ID is predicted 1.0 when the mean of its pixel predictions
# reaches the threshold `th`, and 0.0 otherwise.
poly_pred = pixels_by_id["prediction"].mean().ge(th).astype(float)

# Detection date of an ID = latest event date among its pixels.
poly_pred_detection_dates = pixels_by_id["event_date"].max()

poly_pred_df = pd.DataFrame(
    {"prediction": poly_pred, "event_date": poly_pred_detection_dates})

# NOTE: the polygon-level predictions are kept in memory only; nothing is
# written to disk by this cell.

## True positives dates metrics

In [102]:
# True positives: IDs whose ground-truth label is 1 and whose prediction
# matches that label.
true_positive_mask = (
    (poly_true_values_df["label"] == poly_pred_df["prediction"])
    & (poly_true_values_df["label"] == 1)
)
true_positives_indices = poly_true_values_df[true_positive_mask].index

In [123]:
# For the true positives, count how often the detection date falls strictly
# after the last non-change date (True) versus on or before it (False).
(poly_pred_df.loc[true_positives_indices, "event_date"]
 > poly_true_values_df.loc[true_positives_indices, "last_non_change_date"]
 ).value_counts()

True     130
False      6
Name: count, dtype: int64

In [114]:
# Time from the last non-change date to the detection date, per true-positive
# ID. A negative value means the detection date precedes the last non-change
# date.
time_deltas_non_change_to_detection = (
    poly_pred_df.loc[true_positives_indices, "event_date"]
    - poly_true_values_df.loc[true_positives_indices, "last_non_change_date"]
)

In [162]:
# Show the ten smallest deltas (most negative first).
time_deltas_non_change_to_detection.sort_values().iloc[:10]

ID
409   -1634 days
371   -1570 days
223     -75 days
207     -62 days
142     -62 days
164     -18 days
159      16 days
175      21 days
185      25 days
162      29 days
dtype: timedelta64[ns]

In [135]:
# Absolute gap between the ground-truth change start and the detection date,
# per true-positive ID.
time_deltas_detenction_to_change_start = (
    poly_true_values_df.loc[true_positives_indices, "change_start"]
    - poly_pred_df.loc[true_positives_indices, "event_date"]
).abs()

In [140]:
# Summary statistics of the absolute gap between change start and detection
# date over the true positives.
time_deltas_detenction_to_change_start.describe()

count                           136
mean     50 days 06:31:45.882352941
std      71 days 23:28:21.638449179
min                 1 days 00:00:00
25%                14 days 18:00:00
50%                21 days 00:00:00
75%                48 days 00:00:00
max               307 days 00:00:00
dtype: object

## False positives metrics

In [141]:
# False positives: IDs whose ground-truth label is 0 but whose prediction
# differs from that label.
false_positive_mask = (
    (poly_true_values_df["label"] == 0)
    & (poly_true_values_df["label"] != poly_pred_df["prediction"])
)
false_positives_indices = poly_true_values_df[false_positive_mask].index

In [152]:
# Distribution of the detection dates assigned to the false positives.
poly_pred_df.loc[false_positives_indices, "event_date"].describe()

count                     63
mean     2022-09-26 08:00:00
min      2022-02-27 00:00:00
25%      2022-08-07 00:00:00
50%      2022-09-25 00:00:00
75%      2022-11-30 12:00:00
max      2022-12-11 00:00:00
Name: event_date, dtype: object