# アンサンブルの方法を試す

In [1]:
import sys, os
sys.path.append(os.pardir)

In [2]:
from pathlib import Path
import numpy as np
import polars as pl
import os
from hydra import initialize, compose

with initialize(config_path="../run/conf", version_base=None):
    cfg = compose("cv_train", overrides=["exp_name=exp054_zero_periodicity"])

In [4]:
from src.utils.metrics import event_detection_ap
from src.utils.periodicity import get_periodicity_dict
from src.utils.common import trace
periodicity_dict = get_periodicity_dict(cfg)

In [None]:
train_df = pl.read_parquet(Path(cfg.dir.data_dir) / "train_series.parquet")
train_df = train_df.with_columns(
            pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
        )

In [None]:
event_df = pl.read_csv(Path(cfg.dir.data_dir) / "train_events.csv").drop_nulls()
event_df = event_df.with_columns(
    pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
)

In [None]:
## pred1
pred1_df = pl.read_parquet("valid_preds.parquet")
pred1_df.describe()

In [16]:
## pred2
pred2_df = pl.read_parquet("../output/cv_inference/exp054_zero_periodicity/single/train_pred.parquet")
pred2_df = pred2_df.with_columns(
    [pl.col(col).cast(pl.Float32) for col in ["pred_sleep", "pred_onset", "pred_wakeup"]]
)
pred2_df.describe()

describe,pred_sleep,pred_onset,pred_wakeup
str,f64,f64,f64
"""count""",127946340.0,127946340.0,127946340.0
"""null_count""",0.0,0.0,0.0
"""mean""",0.243365,0.004108,0.004029
"""std""",0.406467,0.04015,0.04027
"""min""",5e-06,0.0,1.1921e-07
"""25%""",0.000562,6e-05,6.6e-05
"""50%""",0.003811,0.000201,0.000185
"""75%""",0.344727,0.000685,0.000567
"""max""",0.998047,0.950684,0.96875


In [17]:
# 結合
pred_all_df = pl.concat([train_df, pred1_df, pred2_df], how="horizontal")
pred_all_df.head()

series_id,step,timestamp,anglez,enmo,row_id,prediction_onset,prediction_wakeup,pred_sleep,pred_onset,pred_wakeup
str,u32,"datetime[μs, UTC]",f32,f32,i64,f32,f32,f32,f32,f32
"""038441c925bb""",0,2018-08-14 19:30:00 UTC,2.6367,0.0217,0,3e-06,0.001149,0.012238,0.001269,0.001904
"""038441c925bb""",1,2018-08-14 19:30:05 UTC,2.6368,0.0215,1,4e-06,0.001166,0.009682,0.001046,0.001482
"""038441c925bb""",2,2018-08-14 19:30:10 UTC,2.637,0.0216,2,6.6307e-07,0.000871,0.004574,0.0006,0.000636
"""038441c925bb""",3,2018-08-14 19:30:15 UTC,2.6368,0.0213,3,2e-06,0.001289,0.002068,0.000464,0.000228
"""038441c925bb""",4,2018-08-14 19:30:20 UTC,2.6368,0.0215,4,1.9404e-07,0.001127,0.002161,0.000639,0.000257


In [9]:
from scipy.signal import find_peaks

def make_submission(
    preds_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    daily_score_offset: float = 1.0,
    prefix:str ="prediction"
) -> pl.DataFrame:

    event_dfs = []

    for series_id, series_df in tqdm(preds_df.group_by("series_id"), desc="find peaks", leave=False, total=len(preds_df["series_id"].unique())):
        for event in ["onset", "wakeup"]:
            event_preds = series_df[f"{prefix}_{event}"].to_numpy().copy()
            event_preds *= 1 - periodicity_dict[series_id][: len(event_preds)]
            steps = find_peaks(event_preds, height=height, distance=distance)[0]
            event_dfs.append(
                series_df.filter(pl.col("step").is_in(steps))
                .with_columns(pl.lit(event).alias("event"))
                .rename({f"{prefix}_{event}": "score"})
                .select(["series_id", "step", "timestamp", "event", "score"])
            )
    submission_df = (
        pl.concat(event_dfs)
        .with_columns(pl.col("timestamp").dt.offset_by("2h").dt.date().alias("date"))
        .with_columns(
            pl.col("score") / (pl.col("score").sum().over(["series_id", "event", "date"]) + daily_score_offset)
        )
        .sort(["series_id", "step"])
        .with_columns(pl.arange(0, pl.count()).alias("row_id"))
        .select(["row_id", "series_id", "step", "event", "score"])
    )
    return submission_df

In [10]:
from tqdm.auto import tqdm
def score_ternary_search_distance(
    val_event_df: pl.DataFrame, pred_df, score_th: float = 0.005, end_diff: int=2, prefix="prediction"
) -> [float, float]:
    """
    post_process_for_seg のパラメータdistanceを ternary searchで探索する
    """
    l = 5
    r = 150
    cnt = 0
    best_score = 0.0
    best_distance = 0

    for cnt in tqdm(range(5)):
        if r - l < 1:
            break
        m1 = int(l + (r - l) / 3)
        m2 = int(r - (r - l) / 3)
        score1 = event_detection_ap(
            val_event_df.to_pandas(),
            make_submission(
                pred_df,
                height = score_th,
                distance = m1,
                periodicity_dict = periodicity_dict,
                prefix=prefix,
            ).to_pandas()
        )
        score2 = event_detection_ap(
            val_event_df.to_pandas(),
            make_submission(
                pred_df,
                height = score_th,
                distance = m2,
                periodicity_dict = periodicity_dict,
                prefix=prefix,
            ).to_pandas(),
        )

        if score1 >= score2:
            r = m2
            best_score = score1
            best_distance = m1

        else:
            l = m1
            best_score = score2
            best_distance = m2

        tqdm.write(f"score1(m1): {score1:.5f}({m1:.5f}), score2(m2): {score2:.5f}({m2:.5f}), l: {l:.5f}, r: {r:.5f}")

        if abs(m2 - m1) <= end_diff:
            break

    return best_score, best_distance


## pred1のスコア

In [101]:
score_ternary_search_distance(event_df, pred_all_df, score_th=0.001, end_diff=5, prefix="prediction")

  0%|          | 0/5 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.79688(53.00000), score2(m2): 0.80317(101.00000), l: 53.00000, r: 150.00000


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.80295(85.00000), score2(m2): 0.80294(117.00000), l: 53.00000, r: 117.00000


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.80246(74.00000), score2(m2): 0.80313(95.00000), l: 74.00000, r: 117.00000


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.80309(88.00000), score2(m2): 0.80316(102.00000), l: 88.00000, r: 117.00000


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.80316(97.00000), score2(m2): 0.80318(107.00000), l: 97.00000, r: 117.00000


(0.8031793680267056, 107)

In [11]:
with trace('make submission'):
    sub_df1 = make_submission(
        pred_all_df,
        periodicity_dict= periodicity_dict,
        height = 0.001,
        distance = 107,
        daily_score_offset = 1.0,
    ) 

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[12.8GB(+0.7GB):12.2sec] make submission 


In [12]:
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df1.to_pandas(),
)
score

0.8031793680267056

## pred2 のスコア


In [None]:
"""

score1(m1): 0.77834(53.00000), score2(m2): 0.78543(101.00000), l: 53.00000, r: 150.00000
score1(m1): 0.78468(85.00000), score2(m2): 0.78462(117.00000), l: 53.00000, r: 117.00000
score1(m1): 0.78330(74.00000), score2(m2): 0.78528(95.00000), l: 74.00000, r: 117.00000
score1(m1): 0.78497(88.00000), score2(m2): 0.78541(102.00000), l: 88.00000, r: 117.00000
"""
score_ternary_search_distance(event_df, pred_all_df, score_th=0.001, end_diff=5, prefix="pred")

In [18]:
with trace('make submission'):
    sub_df2 = make_submission(
        pred_all_df,
        periodicity_dict= periodicity_dict,
        height = 0.001,
        distance = 101,
        daily_score_offset = 1.0,
        prefix="pred"
    ) 

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[14.3GB(+0.5GB):13.1sec] make submission 


In [19]:
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df2.to_pandas(),
)
score

0.7854299785343979

## 加重平均

In [None]:
weights = [[1 -(0.1*i), 0.1 * i]  for i in range(1, 9)]
events = ['onset', 'wakeup']

for w1, w2 in weights[2:]:
    print(w1, w2)

    tmp_df = pred_all_df.with_columns(
        [(w1*pl.col(f"prediction_{event}") + w2*pl.col(f"pred_{event}")).alias(f'weighted_{event}') for event in events]
    )

    best_score, best_dist = score_ternary_search_distance(event_df, tmp_df, score_th=0.001, end_diff=5, prefix="weighted")
    print(best_score, best_dist)
    print()

0.7 0.30000000000000004


  0%|          | 0/5 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.80712(53.00000), score2(m2): 0.81124(101.00000), l: 53.00000, r: 150.00000


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.81071(85.00000), score2(m2): 0.81042(117.00000), l: 53.00000, r: 117.00000


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

score1(m1): 0.81045(74.00000), score2(m2): 0.81084(95.00000), l: 74.00000, r: 117.00000


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

## 加重平均後にsmooth 

In [None]:
from scipy.ndimage import gaussian_filter

events = ['onset', 'wakeup']

w1, w2  = [0.7, 0.3]
sigma = 1
print(w1, w2)

tmp_df = pred_all_df.with_columns(
    [(w1*pl.col(f"prediction_{event}") + w2*pl.col(f"pred_{event}")).alias(f'weighted_{event}') for event in events]
)
tmp_df = tmp_df.with_columns(
    [pl.Series(name=f'weighted_{event}', values=gaussian_filter(tmp_df.select(f'weighted_{event}').to_numpy(), sigma)) for event in events]
)

best_score, best_dist = score_ternary_search_distance(event_df, tmp_df, score_th=0.001, end_diff=5, prefix="weighted")
print(best_score, best_dist)
print()

## そのまま結合

In [20]:
sub_df_concat = pl.concat([sub_df1, sub_df2])

In [21]:
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df_concat.to_pandas(),
)
score

0.6591321622533195

## WBF

In [153]:
import warnings
import numpy as np
from tqdm.auto import tqdm

def prefilter_line_segments(boxes, scores, labels, weights, thr):
    # Create dict with boxes stored by its label
    new_boxes = dict()

    for t in range(len(boxes)):

        if len(boxes[t]) != len(scores[t]):
            print('Error. Length of boxes arrays not equal to length of scores array: {} != {}'.format(len(boxes[t]), len(scores[t])))
            exit()

        if len(boxes[t]) != len(labels[t]):
            print('Error. Length of boxes arrays not equal to length of labels array: {} != {}'.format(len(boxes[t]), len(labels[t])))
            exit()

        for j in range(len(boxes[t])):
            score = scores[t][j]
            if score < thr:
                continue
            label = labels[t][j]
            box_part = boxes[t][j]
            x1 = float(box_part[0])
            x2 = float(box_part[1])

             # Box data checks
            if x2 < x1:
                warnings.warn('X2 < X1 value in box. Swap them.')
                x1, x2 = x2, x1
            """
            if x1 < 0:
                warnings.warn('X1 < 0 in box. Set it to 0.')
                x1 = 0
            if x1 > 1:
                warnings.warn('X1 > 1 in box. Set it to 1. Check that you normalize boxes in [0, 1] range.')
                x1 = 1
            if x2 < 0:
                warnings.warn('X2 < 0 in box. Set it to 0.')
                x2 = 0
            if x2 > 1:
                warnings.warn('X2 > 1 in box. Set it to 1. Check that you normalize boxes in [0, 1] range.')
                x2 = 1
            """
            if (x2 - x1) == 0.0:
                warnings.warn("Zero length line segment skipped: {}.".format(box_part))
                continue

            # [label, score, weight, model index, x1, y1, x2, y2]
            b = [label, float(score) * weights[t], weights[t], t, x1, x2]
            if label not in new_boxes:
                new_boxes[label] = []
            new_boxes[label].append(b)

    # Sort each list in dict by score and transform it to numpy array
    for k in new_boxes:
        current_boxes = np.array(new_boxes[k])
        new_boxes[k] = current_boxes[current_boxes[:, 1].argsort()[::-1]]

    return new_boxes


def get_weighted_box(boxes, conf_type='avg'):
    """
    Create weighted box for set of boxes
    :param boxes: set of boxes to fuse
    :param conf_type: type of confidence one of 'avg' or 'max'
    :return: weighted box (label, score, weight, index, x1, y1, x2, y2)
    """

    box = np.zeros(6, dtype=np.float32)
    conf = 0
    conf_list = []
    w = 0
    for b in boxes:
        box[4:] += (b[1] * b[4:])
        conf += b[1]
        conf_list.append(b[1])
        w += b[2]
    box[0] = boxes[0][0]
    if conf_type == 'avg':
        box[1] = conf / len(boxes)
    elif conf_type == 'max':
        box[1] = np.array(conf_list).max()
    elif conf_type in ['box_and_model_avg', 'absent_model_aware_avg']:
        box[1] = conf / len(boxes)
    box[2] = w
    box[3] = -1 # model index field is retained for consistensy but is not used.
    box[4:] /= conf
    return box


def find_matching_line_segment_quickly(
        boxes_list,
        new_box,
        match_iou
):
    """
        Reimplementation of find_matching_box with numpy instead of loops. Gives significant speed up for larger arrays
        (~100x). This was previously the bottleneck since the function is called for every entry in the array.
    """
    def bb_iou_array(boxes, new_box):
        # bb interesection over union
        xA = np.maximum(boxes[:, 0], new_box[0])
        xB = np.minimum(boxes[:, 1], new_box[1])

        interSeg = np.maximum(xB - xA, 0)

        # compute the area of both the prediction and ground-truth rectangles
        lsAArea = (boxes[:, 1] - boxes[:, 0])
        lsBArea = (new_box[1] - new_box[0])

        iou = interSeg / (lsAArea + lsBArea - interSeg)

        return iou

    if boxes_list.shape[0] == 0:
        return -1, match_iou
    
    boxes = boxes_list

    ious = bb_iou_array(boxes[:, 4:], new_box[4:])

    ious[boxes[:, 0] != new_box[0]] = -1

    best_idx = np.argmax(ious)
    best_iou = ious[best_idx]

    if best_iou <= match_iou:
        best_iou = match_iou
        best_idx = -1

    return best_idx, best_iou


def weighted_boxes_fusion_1d(
        boxes_list,
        scores_list,
        labels_list,
        weights=None,
        iou_thr=0.55,
        skip_box_thr=0.0,
        conf_type='avg',
        allows_overflow=False
):
    '''
    :param boxes_list: list of line segments predictions from each model, each box is 2 numbers.
    It has 3 dimensions (models_number, model_preds, 2)
    Order of line segments: x1, x2. We expect float normalized coordinates [0; 1]
    :param scores_list: list of scores for each model
    :param labels_list: list of labels for each model
    :param weights: list of weights for each model. Default: None, which means weight == 1 for each model
    :param iou_thr: IoU value for line segments to be a match
    :param skip_box_thr: exclude line segments with score lower than this variable
    :param conf_type: how to calculate confidence in weighted line segments. 'avg': average value, 'max': maximum value, 'box_and_model_avg': box and model wise hybrid weighted average, 'absent_model_aware_avg': weighted average that takes into account the absent model.
    :param allows_overflow: false if we want confidence score not exceed 1.0

    :return: boxes: line segments coordinates (Order of boxes: x1, x2).
    :return: scores: confidence scores
    :return: labels: boxes labels
    '''

    if weights is None:
        weights = np.ones(len(boxes_list))
    if len(weights) != len(boxes_list):
        print('Warning: incorrect number of weights {}. Must be: {}. Set weights equal to 1.'.format(len(weights), len(boxes_list)))
        weights = np.ones(len(boxes_list))
    weights = np.array(weights)

    if conf_type not in ['avg', 'max', 'box_and_model_avg', 'absent_model_aware_avg']:
        print('Unknown conf_type: {}. Must be "avg", "max" or "box_and_model_avg", or "absent_model_aware_avg"'.format(conf_type))
        exit()

    filtered_boxes = prefilter_line_segments(
        boxes_list,
        scores_list,
        labels_list,
        weights,
        skip_box_thr
    )
    if len(filtered_boxes) == 0:
        return np.zeros((0, 2)), np.zeros((0,)), np.zeros((0,))

    overall_boxes = []
    for label in filtered_boxes:
        boxes = filtered_boxes[label]
        new_boxes = []
        weighted_boxes = np.empty((0, 6))

        # Clusterize boxes
        for j in range(0, len(boxes)):
            index, best_iou = find_matching_line_segment_quickly(weighted_boxes, boxes[j], iou_thr)

            if index != -1:
                new_boxes[index].append(boxes[j])
                weighted_boxes[index] = get_weighted_box(new_boxes[index], conf_type)
            else:
                new_boxes.append([boxes[j].copy()])
                weighted_boxes = np.vstack((weighted_boxes, boxes[j].copy()))

        # Rescale confidence based on number of models and boxes
        for i in range(len(new_boxes)):
            clustered_boxes = np.array(new_boxes[i])
            if conf_type == 'box_and_model_avg':
                # weighted average for boxes
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / weighted_boxes[i, 2]
                # identify unique model index by model index column
                _, idx = np.unique(clustered_boxes[:, 3], return_index=True)
                # rescale by unique model weights
                weighted_boxes[i, 1] = weighted_boxes[i, 1] *  clustered_boxes[idx, 2].sum() / weights.sum()
            elif conf_type == 'absent_model_aware_avg':
                # get unique model index in the cluster
                models = np.unique(clustered_boxes[:, 3]).astype(int)
                # create a mask to get unused model weights
                mask = np.ones(len(weights), dtype=bool)
                mask[models] = False
                # absent model aware weighted average
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / (weighted_boxes[i, 2] + weights[mask].sum())
            elif conf_type == 'max':
                weighted_boxes[i, 1] = weighted_boxes[i, 1] / weights.max()
            elif not allows_overflow:
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * min(len(weights), len(clustered_boxes)) / weights.sum()
            else:
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / weights.sum()
        overall_boxes.append(weighted_boxes)
    overall_boxes = np.concatenate(overall_boxes, axis=0)
    overall_boxes = overall_boxes[overall_boxes[:, 1].argsort()[::-1]]
    boxes = overall_boxes[:, 4:]
    scores = overall_boxes[:, 1]
    labels = overall_boxes[:, 0]
    return boxes, scores, labels

In [138]:
dfs = [sub_df1, sub_df2]

In [139]:
# series2max_step = dict(pred_all_df.group_by('series_id').agg(pl.col('step').max()).iter_rows())

In [140]:
sub_df_concat = pl.concat(dfs)
unique_values = sub_df_concat.with_columns(
    (pl.col('series_id')+'_'+ pl.col('event')).alias('id_event')
).select('id_event').unique().sort('id_event')
unique_values = unique_values.with_columns(pl.arange(0, unique_values.height).alias('label'))

id_event2label = dict(unique_values.iter_rows())
label2id_event = dict(zip(id_event2label.values(), id_event2label.keys()))

In [154]:
def get_boxes_for_ensemble(df, tolerance):
    _df = df
    steps = _df.get_column('step').to_list()
    boxes = [[step-tolerance, step+tolerance] for step in steps]
    scores = _df.get_column('score').to_list()
    labels = _df.get_column('label').to_list()

    return boxes, scores, labels


def wbf_from_dfs(dfs: list[pl.DataFrame], 
                 iou_thr=0.50,
                 weights=None,
                 conf_type='avg',
                 tolerance=12,
):

    boxes_list, scores_list, labels_list = [], [], []
    for df in dfs:
        _df = df.with_columns(
            (pl.col('series_id')+'_'+ pl.col('event')).alias('id_event')
        ).with_columns(pl.col('id_event').map_dict(id_event2label).alias('label'))
        boxes, scores, labels = get_boxes_for_ensemble(_df, tolerance)
        boxes_list.append(boxes)
        scores_list.append(scores)
        labels_list.append(labels)

    boxes, scores, labels = weighted_boxes_fusion_1d(
            boxes_list,
            scores_list,
            labels_list,
            weights=weights,
            iou_thr=iou_thr,
            skip_box_thr=0.0,
            conf_type=conf_type,
            allows_overflow=False
    )

    ensemble_df = pl.DataFrame(
        [
        pl.Series("step", boxes.mean(axis=1), dtype=pl.UInt32),
        pl.Series("score", scores, dtype=pl.Float32),
        pl.Series("labels", labels, dtype=pl.UInt32),
        ]
    ).with_columns(
        pl.col("labels").map_dict(label2id_event).alias('id_event')
    ).with_columns(
        [
            pl.col("id_event")
            .str.split_exact("_", 1)
            .struct.rename_fields(["series_id", "event"])
            .alias("fields"),
        ]
    ).unnest("fields").select(['series_id', 'step', 'event', 'score']).sort(["series_id", "step"])

    return ensemble_df


In [150]:
ensemble_df = wbf_from_dfs(dfs, weights=[0.7, 0.3])
display(ensemble_df.head())

score = event_detection_ap(
    event_df.to_pandas(),
    ensemble_df.to_pandas(),
)
score

weights [0.7 0.3]
weights, : [0.7 0.3]


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001187
"""038441c925bb""",34,"""onset""",0.000554
"""038441c925bb""",214,"""onset""",0.00077
"""038441c925bb""",389,"""onset""",0.000483
"""038441c925bb""",455,"""wakeup""",0.000879


0.8002527397203432

In [151]:
weights_list = [[1 -(0.1*i), 0.1 * i]  for i in range(1, 9)]

for weights in weights_list:
    print(weights)
    ensemble_df = wbf_from_dfs(dfs, tolerance=36, weights=weights)
    display(ensemble_df.head())

    score, table = event_detection_ap(
        event_df.to_pandas(),
        ensemble_df.to_pandas(),
        with_table=True
    )
    print(score)
    display(table)

[0.9, 0.1]
weights [0.9 0.1]
weights, : [0.9 0.1]


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001526
"""038441c925bb""",34,"""onset""",0.000185
"""038441c925bb""",214,"""onset""",0.000257
"""038441c925bb""",389,"""onset""",0.000161
"""038441c925bb""",455,"""wakeup""",0.000293


0.8075297155997794


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.38648,"[0.01922757740185126, 0.019221316446322672, 0....","[0.6286787726988102, 0.6284700480066792, 0.628..."
onset,36,0.713866,"[0.02773699329715927, 0.027730786663176913, 0....","[0.9069087873095387, 0.9067000626174077, 0.906..."
onset,60,0.797851,"[0.03010533035429301, 0.030099138839060575, 0....","[0.9843456480901691, 0.9841369233980379, 0.984..."
onset,90,0.839408,"[0.03028407277369933, 0.030277882399504626, 0....","[0.9901899394698392, 0.9899812147777082, 0.989..."
onset,120,0.856253,"[0.030309607405043088, 0.030303417193853776, 0...","[0.9910248382383636, 0.9908161135462326, 0.990..."
onset,150,0.867243,"[0.03034790935205873, 0.0303417193853775, 0.03...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.876263,"[0.03036067666773061, 0.030354486782552075, 0....","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.887089,"[0.029785131822502375, 0.029779064604686357, 0...","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.892478,"[0.02979138526189404, 0.02978531808318377, 0.0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.898594,"[0.03235861357092411, 0.032352042375471124, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


[0.8, 0.2]
weights [0.8 0.2]
weights, : [0.8 0.2]


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.808776315185245


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.390341,"[0.01927226300670284, 0.019266002336433684, 0....","[0.6301398455437278, 0.6299311208515967, 0.629..."
onset,36,0.717332,"[0.02771145866581551, 0.02770525186882776, 0.0...","[0.9060738885410144, 0.9058651638488833, 0.905..."
onset,60,0.800268,"[0.03010533035429301, 0.030099138839060575, 0....","[0.9843456480901691, 0.9841369233980379, 0.984..."
onset,90,0.842313,"[0.03028407277369933, 0.030277882399504626, 0....","[0.9901899394698392, 0.9899812147777082, 0.989..."
onset,120,0.859293,"[0.030309607405043088, 0.030303417193853776, 0...","[0.9910248382383636, 0.9908161135462326, 0.990..."
onset,150,0.869915,"[0.03034790935205873, 0.0303417193853775, 0.03...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.879025,"[0.03036067666773061, 0.030354486782552075, 0....","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.889764,"[0.029785131822502375, 0.029779064604686357, 0...","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.895192,"[0.02979138526189404, 0.02978531808318377, 0.0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.901347,"[0.03235861357092411, 0.032352042375471124, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


[0.7, 0.30000000000000004]
weights [0.7 0.3]
weights, : [0.7 0.3]


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001187
"""038441c925bb""",34,"""onset""",0.000554
"""038441c925bb""",214,"""onset""",0.00077
"""038441c925bb""",389,"""onset""",0.000483
"""038441c925bb""",455,"""wakeup""",0.000879


0.8080941518230034


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.393864,"[0.019361634216406, 0.019355374116655708, 0.01...","[0.6330619912335629, 0.6328532665414318, 0.632..."
onset,36,0.717926,"[0.02773699329715927, 0.027730786663176913, 0....","[0.9069087873095387, 0.9067000626174077, 0.906..."
onset,60,0.800513,"[0.03011171401212895, 0.030105522537647863, 0....","[0.9845543727823002, 0.9843456480901691, 0.984..."
onset,90,0.842233,"[0.03028407277369933, 0.030277882399504626, 0....","[0.9901899394698392, 0.9899812147777082, 0.989..."
onset,120,0.860259,"[0.030309607405043088, 0.030303417193853776, 0...","[0.9910248382383636, 0.9908161135462326, 0.990..."
onset,150,0.870619,"[0.03034790935205873, 0.0303417193853775, 0.03...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.879409,"[0.03036067666773061, 0.030354486782552075, 0....","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.890358,"[0.029785131822502375, 0.029779064604686357, 0...","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.895702,"[0.02979138526189404, 0.02978531808318377, 0.0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.901942,"[0.03235861357092411, 0.032352042375471124, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


[0.6, 0.4]
weights [0.6 0.4]
weights, : [0.6 0.4]


KeyboardInterrupt: 

In [155]:
weights = [0.8, 0.2]
for tolerance in [6, 12, 24, 36, 60]:
    print(tolerance)
    ensemble_df = wbf_from_dfs(dfs, tolerance=tolerance, weights=weights)
    display(ensemble_df.head())

    score, table = event_detection_ap(
        event_df.to_pandas(),
        ensemble_df.to_pandas(),
        with_table=True
    )
    print(score)
    display(table)

6


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.7950897090146934


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.395354,"[0.018883685907240513, 0.01887791419343832, 0....","[0.6700062617407639, 0.6697975370486329, 0.669..."
onset,36,0.698834,"[0.025825352377814904, 0.025819621500467682, 0...","[0.9163013984554372, 0.9160926737633062, 0.916..."
onset,60,0.778081,"[0.02778431418688378, 0.027778594833722577, 0....","[0.9858067209350866, 0.9855979962429555, 0.985..."
onset,90,0.821687,"[0.027913735087182624, 0.027908016495379058, 0...","[0.9903986641619703, 0.9901899394698392, 0.990..."
onset,120,0.84233,"[0.02793726615996423, 0.027931547706589326, 0....","[0.9912335629304947, 0.9910248382383636, 0.991..."
onset,150,0.855288,"[0.027966680000941244, 0.027960961720602164, 0...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.865638,"[0.027978445537332048, 0.027972727326207297, 0...","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.878141,"[0.02748476594958914, 0.027479154043682737, 0....","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.884639,"[0.027490536423229617, 0.02748492455062177, 0....","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.892462,"[0.029693468683205274, 0.02968742210479087, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


12


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.8044393001363817


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.39801,"[0.01895383569295029, 0.01894787967094679, 0.0...","[0.651638488833229, 0.6514297641410979, 0.6514..."
onset,36,0.712344,"[0.02652444206998713, 0.02651853200983517, 0.0...","[0.9119181799206846, 0.9117094552285535, 0.911..."
onset,60,0.792987,"[0.02867359576483159, 0.028667698752390492, 0....","[0.9858067209350866, 0.9855979962429555, 0.985..."
onset,90,0.835097,"[0.028807158988804975, 0.028801262787238564, 0...","[0.9903986641619703, 0.9901899394698392, 0.990..."
onset,120,0.853184,"[0.02883144321134559, 0.02882554715721094, 0.0...","[0.9912335629304947, 0.9910248382383636, 0.991..."
onset,150,0.864107,"[0.02886179848952136, 0.02885590261967641, 0.0...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.874048,"[0.028873940600791667, 0.0288680448046626, 0.0...","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.885588,"[0.028348996803818752, 0.02834321357998238, 0....","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.891394,"[0.028354948724205863, 0.028349165535795064, 0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.898075,"[0.03069599051741909, 0.03068974624913838, 0.0...","[0.9945731580045919, 0.9943644333124608, 0.994..."


24


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.808452696751842


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.393726,"[0.019206017341294505, 0.019199859360088403, 0...","[0.638488833228971, 0.6382801085368399, 0.6382..."
onset,36,0.717104,"[0.02731787559724498, 0.02731176854688834, 0.0...","[0.9081611354623252, 0.9079524107701941, 0.907..."
onset,60,0.799422,"[0.029634652452079135, 0.02962855994776232, 0....","[0.9851805468586934, 0.9849718221665623, 0.984..."
onset,90,0.841315,"[0.029785337125564283, 0.02977924556733136, 0....","[0.9901899394698392, 0.9899812147777082, 0.989..."
onset,120,0.858594,"[0.029816729765873688, 0.029810638404741574, 0...","[0.9912335629304947, 0.9910248382383636, 0.991..."
onset,150,0.869096,"[0.029848122406183095, 0.02984203124215179, 0....","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.87816,"[0.029860679462306857, 0.029854588377115878, 0...","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.889124,"[0.029302474376484196, 0.029296502506998064, 0...","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.894626,"[0.02930862648111919, 0.02930265464948168, 0.0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.900833,"[0.03180016283818956, 0.031793701323402805, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


36


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.808776315185245


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.390341,"[0.01927226300670284, 0.019266002336433684, 0....","[0.6301398455437278, 0.6299311208515967, 0.629..."
onset,36,0.717332,"[0.02771145866581551, 0.02770525186882776, 0.0...","[0.9060738885410144, 0.9058651638488833, 0.905..."
onset,60,0.800268,"[0.03010533035429301, 0.030099138839060575, 0....","[0.9843456480901691, 0.9841369233980379, 0.984..."
onset,90,0.842313,"[0.03028407277369933, 0.030277882399504626, 0....","[0.9901899394698392, 0.9899812147777082, 0.989..."
onset,120,0.859293,"[0.030309607405043088, 0.030303417193853776, 0...","[0.9910248382383636, 0.9908161135462326, 0.990..."
onset,150,0.869915,"[0.03034790935205873, 0.0303417193853775, 0.03...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.879025,"[0.03036067666773061, 0.030354486782552075, 0....","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.889764,"[0.029785131822502375, 0.029779064604686357, 0...","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.895192,"[0.02979138526189404, 0.02978531808318377, 0.0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.901347,"[0.03235861357092411, 0.032352042375471124, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


60


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.8091060856139093


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.385958,"[0.01935643435474213, 0.019350051743327063, 0....","[0.6207472343978293, 0.6205385097056982, 0.620..."
onset,36,0.717435,"[0.02814948842779412, 0.028143163046803303, 0....","[0.9027342934669171, 0.9025255687747861, 0.902..."
onset,60,0.801373,"[0.030668298143760903, 0.030661989156681398, 0...","[0.9835107493216447, 0.9833020246295137, 0.983..."
onset,90,0.843275,"[0.030876571815365392, 0.03087026418385478, 0....","[0.9901899394698392, 0.9899812147777082, 0.989..."
onset,120,0.860254,"[0.03090260602431595, 0.030896298562251453, 0....","[0.9910248382383636, 0.9908161135462326, 0.990..."
onset,150,0.870893,"[0.030941657337741794, 0.030935350129846464, 0...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.87992,"[0.030954674442217073, 0.0309483673190448, 0.0...","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.890504,"[0.030358269648741498, 0.03035208933533896, 0....","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.895793,"[0.030364643419400483, 0.030358463146623153, 0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.902014,"[0.03302125418396269, 0.033014553014553014, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


In [156]:
weights = [0.8, 0.2]
for tolerance in [90, 120]:
    print(tolerance)
    ensemble_df = wbf_from_dfs(dfs, tolerance=tolerance, weights=weights)
    display(ensemble_df.head())

    score, table = event_detection_ap(
        event_df.to_pandas(),
        ensemble_df.to_pandas(),
        with_table=True
    )
    print(score)
    display(table)

90


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.8092895565996421


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.381647,"[0.019418053434357457, 0.019411570247933885, 0...","[0.6130244207889793, 0.6128156960968483, 0.612..."
onset,36,0.715269,"[0.028436175628590885, 0.028429752066115702, 0...","[0.8977249008557713, 0.8975161761636402, 0.897..."
onset,60,0.803127,"[0.03114028998155384, 0.03113388429752066, 0.0...","[0.9830932999373826, 0.9828845752452515, 0.982..."
onset,90,0.844532,"[0.03135847035722078, 0.03135206611570248, 0.0...","[0.9899812147777082, 0.9897724900855771, 0.989..."
onset,120,0.861551,"[0.03139152798989759, 0.03138512396694215, 0.0...","[0.9910248382383636, 0.9908161135462326, 0.990..."
onset,150,0.872031,"[0.03143119714910976, 0.03142479338842975, 0.0...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.880889,"[0.03144442020218048, 0.03143801652892562, 0.0...","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.891401,"[0.030831871468057975, 0.03082559780427493, 0....","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.896655,"[0.030838344672229307, 0.030832071050348908, 0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.902844,"[0.03356532029698088, 0.033558512549221264, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


120


series_id,step,event,score
str,u32,str,f32
"""038441c925bb""",5,"""wakeup""",0.001356
"""038441c925bb""",34,"""onset""",0.000369
"""038441c925bb""",214,"""onset""",0.000513
"""038441c925bb""",389,"""onset""",0.000322
"""038441c925bb""",455,"""wakeup""",0.000586


0.8092316640511025


Unnamed: 0_level_0,Unnamed: 1_level_0,ap,precision,recall
event,tolerance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
onset,12,0.378718,"[0.01942862484059848, 0.01942207800878634, 0.0...","[0.6073888541014402, 0.6071801294093091, 0.607..."
onset,36,0.710866,"[0.02851534594302272, 0.028508859779139792, 0....","[0.8914631600918389, 0.8912544353997078, 0.891..."
onset,60,0.803809,"[0.03141294841065837, 0.03140648159275728, 0.0...","[0.9820496764767273, 0.9818409517845961, 0.981..."
onset,90,0.845904,"[0.03166665553916103, 0.031660190415147754, 0....","[0.9899812147777082, 0.9897724900855771, 0.989..."
onset,120,0.862783,"[0.031700038056069274, 0.03169357315493597, 0....","[0.9910248382383636, 0.9908161135462326, 0.990..."
onset,150,0.873195,"[0.03174009707635917, 0.03173363244268183, 0.0...","[0.99227718639115, 0.992068461699019, 0.992068..."
onset,180,0.881959,"[0.03175345008312247, 0.031746985538597124, 0....","[0.9926946357754123, 0.9924859110832811, 0.992..."
onset,240,0.892418,"[0.031130515486826882, 0.03112418300653595, 0....","[0.9941557086203298, 0.9939469839281987, 0.993..."
onset,300,0.897554,"[0.031137051391820967, 0.031130718954248366, 0...","[0.9943644333124608, 0.9941557086203298, 0.994..."
onset,360,0.903652,"[0.033906385643331864, 0.03389951114684807, 0....","[0.9945731580045919, 0.9943644333124608, 0.994..."


In [None]:

def post_process_find_peaks_2dim(
    series2preds: dict[np.ndarray],
    score_th: float = 0.01,
    distance: int = 5000,
    periodicity_dict: dict[np.ndarray] | None = None,
) -> pl.DataFrame:
    """make submission dataframe for segmentation task

    Args:
        series2preds (dict[np.ndarray]): series_id を key に 2d の予測結果を持つ辞書
        score_th (float, optional): threshold for score. Defaults to 0.5.
        distance (int, optional): distance for peaks. Defaults to 5000.
        periodicity_dict (dict[np.ndarray], optional): series_id を key に periodicity の 1d の予測結果を持つ辞書. 値は 0 or 1 の np.ndarray. Defaults to None.

    Returns:
        pl.DataFrame: submission dataframe
    """
    LOGGER.info("is periodicity_dict None? : {}".format(periodicity_dict is None))

    records = []
    for series_id in series2preds.keys():
        this_series_preds = series2preds[series_id][:, :]
        if periodicity_dict is not None:
            this_series_preds = this_series_preds[: len(periodicity_dict[series_id]), :]
            this_series_preds *= 1 - periodicity_dict[series_id][:, None]  # periodicity があるところは0にする

        for i, event_name in enumerate(["onset", "wakeup"]):
            this_event_preds = this_series_preds[:, i]
            steps = find_peaks(this_event_preds, height=score_th, distance=distance)[0]
            scores = this_event_preds[steps]
            for step, score in zip(steps, scores):
                records.append(
                    {
                        "series_id": series_id,
                        "step": step,
                        "event": event_name,
                        "score": score,
                    }
                )

    if len(records) == 0:  # 一つも予測がない場合はdummyを入れる
        records.append(
            {
                "series_id": series_id,
                "step": 0,
                "event": "onset",
                "score": 0,
            }
        )

    sub_df = pl.DataFrame(records).sort(by=["series_id", "step"])
    row_ids = pl.Series(name="row_id", values=np.arange(len(sub_df)))
    sub_df = sub_df.with_columns(row_ids).select(["row_id", "series_id", "step", "event", "score"])
    return sub_df

