In [1]:
import math
import sqlite3
from contextlib import closing

import pandas as pd
from joblib import dump, load
from tqdm import tqdm

### Model Metrics

In [3]:
# sqlite3's connection context manager only commits or rolls back; wrapping the
# connection in `closing` makes sure it is actually released after the query.
with closing(sqlite3.connect('./data/data.db')) as con:

    # Static SQL: confusion-matrix counts of the raw predictions (`pred`)
    # against the pseudo labels, restricted to rows with no failure_id.
    # A row is treated as a labelled outlier when `pseudo_label` is not null.
    # TP/FP are computed by summing `pred`, so they assume `pred` is 0/1
    # -- TODO confirm.
    sql_str = """
        with
        cte as (
            select p.*, t.pseudo_label, t.pseudo_label_lps, t.failure_id
            from prophet_results_f2_f3_agg as p
                left join train_data as t
                    on t.ts = p.ds
        )
        select
            count(*) as n_rows
            ,sum(pseudo_label) as n_outliers
            ,sum(case when pseudo_label is null then 1 else 0 end) as n_inliers
            
            ,sum(pred) as total_outliers
            ,sum(case when pred = 0 then 1 else 0 end) as total_inliers
            ,sum(case when pseudo_label is not null then pred else 0 end) as TP
            ,sum(case when pseudo_label is null then pred else 0 end) as FP
            ,sum(case when pseudo_label is null and pred = 0 then 1 else 0 end) as TN
            ,sum(case when pseudo_label is not null and pred = 0 then 1 else 0 end) as FN

        from cte
        where failure_id is null
    """

    # Execute: the result is a single-row frame, one column per count
    df = pd.read_sql(sql_str, con=con)

In [4]:
# Show the single-row count summary as one column, floats without decimals
whole_numbers = '{:.0f}'.format
with pd.option_context('display.float_format', whole_numbers):
    display(df.T)

Unnamed: 0,0
n_rows,4368986
n_outliers,504006
n_inliers,3864980
total_outliers,134998
total_inliers,4233988
TP,13841
FP,121157
TN,3743823
FN,490165


In [5]:
# Calculate model metrics from the single-row count summary held in `df`
results = df.to_dict(orient='index')[0]

tp, fp, tn, fn = results['TP'], results['FP'], results['TN'], results['FN']
n_outliers = results['n_outliers']
n_inliers = results['n_inliers']

# Each ratio falls back to NaN when its denominator is zero (for example when
# the model flags nothing, so tp + fp == 0) instead of aborting the cell.
n_total = tp + fp + tn + fn
accuracy = (tp + tn) / n_total if n_total else math.nan
precision = tp / (tp + fp) if (tp + fp) else math.nan
recall = tp / (tp + fn) if (tp + fn) else math.nan
f1 = (2*precision*recall)/(precision + recall) if (precision + recall) else math.nan

# fp / n_inliers and tp / n_outliers: flagged share of each labelled group
percent_inliers = fp / n_inliers if n_inliers else math.nan
percent_outliers = tp / n_outliers if n_outliers else math.nan

metrics = ['puv_f2_f3', accuracy, precision, recall, f1, percent_inliers, percent_outliers, tp, fp, tn, fn]

In [9]:
# One-row summary table built from `metrics`; float columns are rendered as
# percentages with two decimals.
metric_columns = [
    'model', 'accuracy', 'precision', 'recall', 'f1',
    '% inliers', '% outliers', 'tp', 'fp', 'tn', 'fn',
]
results_df = pd.DataFrame(data=[metrics], columns=metric_columns)

as_percent = '{:.2%}'.format
with pd.option_context('display.float_format', as_percent):
    ranked = results_df.sort_values(by='precision', ascending=False)
    display(ranked)

Unnamed: 0,model,accuracy,precision,recall,f1,% inliers,% outliers,tp,fp,tn,fn
0,puv_f2_f3,86.01%,10.25%,2.75%,4.33%,3.13%,2.75%,13841,121157,3743823,490165


### Event Durations

In [14]:
# `closing` releases the connection when the cell ends; the inner `con`
# context manager keeps the original commit-on-success / rollback-on-error.
with closing(sqlite3.connect('./data/data.db')) as con, con:
    cur = con.cursor()

    # Rebuild the events table from scratch so the cell can be re-run
    cur.execute('drop table if exists outlier_events_f2_f3')
    cur.execute("""
        create table outlier_events_f2_f3 (
            model text,
            event_id int,
            start_ts timestamp,
            end_ts timestamp,
            event_duration_in_secs int
        )
    """)

    # Hand-entered reference events (pseudo-label windows and failures).
    # NOTE(review): both pseudo_label rows carry the same duration (504006),
    # and neither that value nor the 195483 of failure 3 equals
    # end_ts - start_ts (590400 s and 238680 s respectively) -- confirm where
    # these durations come from.
    cur.execute("""
        insert into outlier_events_f2_f3 (model, event_id, start_ts, end_ts, event_duration_in_secs)
        values  
            ('pseudo_label', 2, '2022-03-16 06:00:00', '2022-03-23 02:00:00', 504006), 
            ('failure', 2, '2022-03-23 14:54:00', '2022-03-23 15:24:00', 1800),                
            ('pseudo_label', 3, '2022-05-23 06:00:00', '2022-05-30 02:00:00', 504006), 
            ('failure', 3, '2022-05-30 12:00:00', '2022-06-02 06:18:00', 195483)
    """)

    model = 'puv_f2_f3'

    # Event detection: `prev_pred` is the previous row's `pred` (0 before the
    # first row); a running count of the rows where `pred` differs from
    # `prev_pred` gives every run of equal predictions its own event_id. Only
    # runs with pred = 1 are stored.
    # The duration is the number of rows in the run, which equals seconds only
    # if the table holds one row per second -- TODO confirm.
    # The model name is bound as a parameter instead of being formatted into
    # the SQL text.
    cur.execute("""
        with 
        lagged as (
            select
                ds as ts
                ,pred
                ,lag(pred, 1, 0) over (order by ds asc) as prev_pred
            from prophet_results_f2_f3_agg
        ),
        flagged as (
            select
                ts
                ,pred
                ,sum(case when pred != prev_pred then 1 else 0 end) over (order by ts rows unbounded preceding) as event_id
            from lagged
        )
        insert into outlier_events_f2_f3 (model, event_id, start_ts, end_ts, event_duration_in_secs)
        select
            ? as model
            ,event_id
            ,min(ts) as start_ts
            ,max(ts) as end_ts
            ,count(*) as event_duration_in_secs
        from flagged
        where pred = 1
        group by event_id
        order by start_ts asc
    """, (model,))

In [15]:
# Taken from: https://stackoverflow.com/questions/2298339/standard-deviation-for-sqlite
class StdevFunc:
    """Aggregate computing a running sample standard deviation.

    Exposes the ``step`` / ``finalize`` pair expected by
    ``sqlite3.Connection.create_aggregate``. State is updated one value at a
    time, so no values are stored.
    """

    def __init__(self):
        # M: running mean, S: running sum of squared deviations,
        # k: one more than the number of values accepted so far
        self.M = 0.0
        self.S = 0.0
        self.k = 1

    def step(self, value):
        """Fold one value into the running state; ``None`` is skipped."""
        if value is not None:
            delta = value - self.M
            self.M = self.M + delta / self.k
            # Second factor uses the mean *after* the update above
            self.S = self.S + delta * (value - self.M)
            self.k += 1

    def finalize(self):
        """Return the sample standard deviation, or ``None`` below two values."""
        n_values = self.k - 1
        if n_values < 2:
            return None
        variance = self.S / (n_values - 1)
        return math.sqrt(variance)


# sqlite3's connection context manager only commits or rolls back; wrapping the
# connection in `closing` makes sure it is actually released after the query.
with closing(sqlite3.connect('./data/data.db')) as con:
    # SQLite has no built-in stdev; register the Python aggregate for this
    # connection under the name used in the query below.
    con.create_aggregate("stdev", 1, StdevFunc)

    # Per-model event counts by duration bucket plus duration statistics.
    # Buckets are <=1 min, <=5 min, 5-60 min, 1-8 h and >8 h; the <=1 min
    # bucket is a subset of the <=5 min bucket, so the bucket counts add up
    # to n_events only when n_extra_short_events is left out.
    events = pd.read_sql("""
        select
            model
            ,count(*) as n_events
            ,sum(case when event_duration_in_secs <= 60 then 1 else 0 end) as n_extra_short_events
            ,sum(case when event_duration_in_secs <= 60*5 then 1 else 0 end) as n_short_events
            ,sum(case when event_duration_in_secs > 60*5  and event_duration_in_secs <= 60*60 then 1 else 0 end) as n_medium_events
            ,sum(case when event_duration_in_secs > 60*60 and event_duration_in_secs <= 60*60*8 then 1 else 0 end) as n_long_events
            ,sum(case when event_duration_in_secs > 60*60*8 then 1 else 0 end) as n_extra_long_events
            ,min(event_duration_in_secs) as min_event_duration
            ,max(event_duration_in_secs) as max_event_duration
            ,avg(event_duration_in_secs) as avg_event_duration
            ,stdev(event_duration_in_secs) as std_event_duration
                            
        from outlier_events_f2_f3
        group by model
        order by n_events desc
    """, con=con)

In [16]:
# Per-model event-duration summary, floats shown without decimals
no_decimals = '{:.0f}'.format
with pd.option_context('display.float_format', no_decimals):
    display(events)

Unnamed: 0,model,n_events,n_extra_short_events,n_short_events,n_medium_events,n_long_events,n_extra_long_events,min_event_duration,max_event_duration,avg_event_duration,std_event_duration
0,puv_f2_f3,12591,12413,12584,6,1,0,1,7126,11,67
1,pseudo_label,2,0,0,0,0,2,504006,504006,504006,0
2,failure,2,0,0,1,0,1,1800,195483,98642,136955


### Model Metrics (Removing Short Events)

In [17]:
# Add the helper columns used by the short-event filter.
# `alter table ... add column` raises "duplicate column name" when the column
# is already there, so only the missing columns are added; this keeps the cell
# safe to re-run. `closing` releases the connection, the inner `con` commits.
with closing(sqlite3.connect('./data/data.db')) as con, con:
    cur = con.cursor()

    # The second field of every `pragma table_info` row is the column name
    existing_columns = {
        row[1]
        for row in cur.execute('pragma table_info(prophet_results_f2_f3_agg)')
    }

    for column in ('event_id', 'pred_filtered'):
        if column not in existing_columns:
            cur.execute(f"""
                alter table prophet_results_f2_f3_agg
                add column {column} int;
            """)

In [19]:
%%time
with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    cur.execute(f"""
        with 
        lagged as (
            select
                ds as ts
                ,pred
                ,lag(pred, 1, 0) over (order by ds asc) as prev_pred
            from prophet_results_f2_f3_agg
        ),
        flagged as (
            select
                ts
                ,pred
                ,sum(case when pred != prev_pred then 1 else 0 end) over (order by ts rows unbounded preceding) as event_id
            from lagged
        )
        update prophet_results_f2_f3_agg as r
        set event_id = f.event_id
        from flagged as f
        where f.ts = r.ds
    """)

    cur.execute(f"""
        update prophet_results_f2_f3_agg
        set pred_filtered = null
    """)

    cur.execute(f"""
        update prophet_results_f2_f3_agg
        set pred_filtered = pred
        where event_id in (
            select event_id
            from outlier_events_f2_f3
            where 
                model = 'puv_f2_f3'
                and event_duration_in_secs > 60*5
        )
    """)

    cur.execute(f"""
        update prophet_results_f2_f3_agg
        set pred_filtered = 0
        where pred_filtered is null
    """)

CPU times: user 13.9 s, sys: 2.96 s, total: 16.8 s
Wall time: 17.2 s


In [21]:
# sqlite3's connection context manager only commits or rolls back; wrapping the
# connection in `closing` makes sure it is actually released after the query.
with closing(sqlite3.connect('./data/data.db')) as con:

    # Static SQL: confusion-matrix counts of the filtered predictions
    # (`pred_filtered`) against the pseudo labels, restricted to rows with no
    # failure_id. A row is treated as a labelled outlier when `pseudo_label`
    # is not null. TP/FP are computed by summing `pred_filtered`, so they
    # assume it is 0/1 -- TODO confirm.
    sql_str = """
        with
        cte as (
            select p.*, t.pseudo_label, t.pseudo_label_lps, t.failure_id
            from prophet_results_f2_f3_agg as p
                left join train_data as t
                    on t.ts = p.ds
        )
        select
            count(*) as n_rows
            ,sum(pseudo_label) as n_outliers
            ,sum(case when pseudo_label is null then 1 else 0 end) as n_inliers
            
            ,sum(pred_filtered) as total_outliers
            ,sum(case when pred_filtered = 0 then 1 else 0 end) as total_inliers
            ,sum(case when pseudo_label is not null then pred_filtered else 0 end) as TP
            ,sum(case when pseudo_label is null then pred_filtered else 0 end) as FP
            ,sum(case when pseudo_label is null and pred_filtered = 0 then 1 else 0 end) as TN
            ,sum(case when pseudo_label is not null and pred_filtered = 0 then 1 else 0 end) as FN

        from cte
        where failure_id is null
    """

    # Execute: the result is a single-row frame, one column per count
    df = pd.read_sql(sql_str, con=con)

In [22]:
# Show the single-row count summary as one column, floats without decimals
whole_numbers = '{:.0f}'.format
with pd.option_context('display.float_format', whole_numbers):
    display(df.T)

Unnamed: 0,0
n_rows,4368986
n_outliers,504006
n_inliers,3864980
total_outliers,2759
total_inliers,4366227
TP,449
FP,2310
TN,3862670
FN,503557


In [23]:
# Calculate model metrics from the single-row count summary held in `df`
results = df.to_dict(orient='index')[0]

tp, fp, tn, fn = results['TP'], results['FP'], results['TN'], results['FN']
n_outliers = results['n_outliers']
n_inliers = results['n_inliers']

# Each ratio falls back to NaN when its denominator is zero (for example when
# filtering leaves no flagged rows, so tp + fp == 0) instead of aborting the
# cell.
n_total = tp + fp + tn + fn
accuracy = (tp + tn) / n_total if n_total else math.nan
precision = tp / (tp + fp) if (tp + fp) else math.nan
recall = tp / (tp + fn) if (tp + fn) else math.nan
f1 = (2*precision*recall)/(precision + recall) if (precision + recall) else math.nan

# fp / n_inliers and tp / n_outliers: flagged share of each labelled group
percent_inliers = fp / n_inliers if n_inliers else math.nan
percent_outliers = tp / n_outliers if n_outliers else math.nan

metrics = ['puv_f2_f3', accuracy, precision, recall, f1, percent_inliers, percent_outliers, tp, fp, tn, fn]

In [25]:
# One-row summary table built from `metrics`; float columns are rendered as
# percentages with two decimals.
metric_columns = [
    'model', 'accuracy', 'precision', 'recall', 'f1',
    '% inliers', '% outliers', 'tp', 'fp', 'tn', 'fn',
]
results_df = pd.DataFrame(data=[metrics], columns=metric_columns)

as_percent = '{:.2%}'.format
with pd.option_context('display.float_format', as_percent):
    ranked = results_df.sort_values(by='precision', ascending=False)
    display(ranked)

Unnamed: 0,model,accuracy,precision,recall,f1,% inliers,% outliers,tp,fp,tn,fn
0,puv_f2_f3,88.42%,16.27%,0.09%,0.18%,0.06%,0.09%,449,2310,3862670,503557


### LPS Alarm Analysis


In [26]:
# sqlite3's connection context manager only commits or rolls back; wrapping the
# connection in `closing` makes sure it is actually released after the query.
with closing(sqlite3.connect('./data/data.db')) as con:

    # Static SQL: confusion-matrix counts of the raw predictions (`pred`)
    # against the LPS pseudo labels, restricted to rows with no failure_id.
    # A row is treated as a labelled outlier when `pseudo_label_lps` is not
    # null. TP/FP are computed by summing `pred`, so they assume `pred` is 0/1
    # -- TODO confirm.
    sql_str = """
        with
        cte as (
            select p.*, t.pseudo_label, t.pseudo_label_lps, t.failure_id
            from prophet_results_f2_f3_agg as p
                left join train_data as t
                    on t.ts = p.ds
        )
        select
            count(*) as n_rows
            ,sum(pseudo_label_lps) as n_outliers
            ,sum(case when pseudo_label_lps is null then 1 else 0 end) as n_inliers
            
            ,sum(pred) as total_outliers
            ,sum(case when pred = 0 then 1 else 0 end) as total_inliers
            ,sum(case when pseudo_label_lps is not null then pred else 0 end) as TP
            ,sum(case when pseudo_label_lps is null then pred else 0 end) as FP
            ,sum(case when pseudo_label_lps is null and pred = 0 then 1 else 0 end) as TN
            ,sum(case when pseudo_label_lps is not null and pred = 0 then 1 else 0 end) as FN

        from cte
        where failure_id is null
    """

    # Execute: the result is a single-row frame, one column per count
    df = pd.read_sql(sql_str, con=con)

In [27]:
# Show the single-row count summary as one column, floats without decimals
whole_numbers = '{:.0f}'.format
with pd.option_context('display.float_format', whole_numbers):
    display(df.T)

Unnamed: 0,0
n_rows,4368986
n_outliers,723426
n_inliers,3645560
total_outliers,134998
total_inliers,4233988
TP,18213
FP,116785
TN,3528775
FN,705213


In [28]:
# Calculate model metrics from the single-row count summary held in `df`
results = df.to_dict(orient='index')[0]

tp, fp, tn, fn = results['TP'], results['FP'], results['TN'], results['FN']
n_outliers = results['n_outliers']
n_inliers = results['n_inliers']

# Each ratio falls back to NaN when its denominator is zero (for example when
# the model flags nothing, so tp + fp == 0) instead of aborting the cell.
n_total = tp + fp + tn + fn
accuracy = (tp + tn) / n_total if n_total else math.nan
precision = tp / (tp + fp) if (tp + fp) else math.nan
recall = tp / (tp + fn) if (tp + fn) else math.nan
f1 = (2*precision*recall)/(precision + recall) if (precision + recall) else math.nan

# fp / n_inliers and tp / n_outliers: flagged share of each labelled group
percent_inliers = fp / n_inliers if n_inliers else math.nan
percent_outliers = tp / n_outliers if n_outliers else math.nan

metrics = ['puv_f2_f3', accuracy, precision, recall, f1, percent_inliers, percent_outliers, tp, fp, tn, fn]

In [30]:
# One-row summary table built from `metrics`; float columns are rendered as
# percentages with two decimals.
metric_columns = [
    'model', 'accuracy', 'precision', 'recall', 'f1',
    '% inliers', '% outliers', 'tp', 'fp', 'tn', 'fn',
]
results_df = pd.DataFrame(data=[metrics], columns=metric_columns)

as_percent = '{:.2%}'.format
with pd.option_context('display.float_format', as_percent):
    ranked = results_df.sort_values(by='precision', ascending=False)
    display(ranked)

Unnamed: 0,model,accuracy,precision,recall,f1,% inliers,% outliers,tp,fp,tn,fn
0,puv_f2_f3,81.19%,13.49%,2.52%,4.24%,3.20%,2.52%,18213,116785,3528775,705213
