# Outlier Detection Modeling

In [1]:
import sqlite3
import pandas as pd
import warnings
from tqdm import tqdm
from joblib import dump, load
from pyod.models.ecod import ECOD
from pyod.models.dif import DIF
from pyod.models.abod import ABOD
from pyod.models.copod import COPOD

from prophet import Prophet

In [2]:
# Training window: every train_data row dated on or before 2022-03-02, oldest first.
train_query = "select * from train_data where date(ts) <= '2022-03-02' order by ts asc"

with sqlite3.connect('./data/data.db') as con:
    # parse_dates turns the stored ts text into pandas datetimes.
    df = pd.read_sql(train_query, con=con, parse_dates=['ts'])

print(df.shape)

(4383806, 24)


In [3]:
# Column groups of the training frame. The Prophet and pyOD cells below model
# `analog_cols` (one Prophet model per signal) and feed `analog_cols + digital_cols`
# to the multivariate detectors; `gps_cols` is listed for completeness.
analog_cols = [
    'tp2', 'tp3', 'h1', 'dv_pressure',
    'reservoirs', 'oil_temperature', 'flowmeter', 'motor_current',
]

digital_cols = [
    'comp', 'dv_electric', 'towers', 'mpg',
    'lps', 'pressure_switch', 'oil_level', 'caudal_impulses',
]

gps_cols = ['gps_long', 'gps_lat', 'gps_speed', 'gps_quality']

In [4]:
# (Re)create the table that every pyOD prediction cell appends to.
# Running this cell discards any previously stored pyOD predictions.
with sqlite3.connect('./data/data.db') as con:
    con.execute('drop table if exists pyod_results;')

    con.execute('''
        create table pyod_results (
            ts timestamp,
            model text,
            p_normal real,
            p_outlier real,
            confidence real,
            pred int
        )
    ''')

## Prophet-Based Models

### Prophet - Univariate

In [None]:
%%time
for col in analog_cols:
    model = Prophet(
        changepoint_prior_scale=0.5,
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False
    )

    model.add_seasonality(name='30min', period=60*30, fourier_order=5)
    model.add_seasonality(name='daily', period=72000, fourier_order=5)

    model.fit(df.loc[:, ['ts', col]].rename(columns={'ts': 'ds', col: 'y'}))

    dump(model, f'models/prophet_{col}.joblib')

In [None]:
chunk_size = 1_000_000

# Columns written to prophet_results, in table order.
sql_cols = [
    'ds',
    'col',
    'yhat',
    'yhat_lower',
    'yhat_upper',
    'yhat_lower_expanded',
    'yhat_upper_expanded',
    'pred'
]

with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()
    cur.execute('drop table if exists prophet_results;')
    cur.execute('''
        create table prophet_results (
            ds timestamp,
            col text,
            yhat real,
            yhat_lower real,
            yhat_upper real,
            yhat_lower_expanded real,
            yhat_upper_expanded real,
            pred integer
        );
    ''')

    for col in tqdm(analog_cols):
        model = load(f'models/prophet_{col}.joblib')

        # Score the frame in slices of chunk_size rows.
        for start in range(0, len(df), chunk_size):
            observed = df.iloc[start:start + chunk_size].loc[:, ['ts', col]]

            forecast = model.predict(observed.rename(columns={'ts': 'ds'}))
            # Bring the observed value back next to its forecast.
            pred_df = forecast.merge(observed, left_on='ds', right_on='ts')

            # Half of the forecast interval after widening it by 10%,
            # centred on yhat rather than on the interval midpoint.
            half_width = (pred_df['yhat_upper'] - pred_df['yhat_lower']) * 1.1 / 2
            upper = pred_df['yhat'] + half_width
            lower = pred_df['yhat'] - half_width

            # 1 when the observation falls outside the widened band.
            outside = (pred_df[col] > upper) | (pred_df[col] < lower)

            pred_df = pred_df.assign(
                col=col,
                yhat_upper_expanded=upper,
                yhat_lower_expanded=lower,
                pred=outside.astype(int),
            )

            pred_df[sql_cols].to_sql(
                name='prophet_results',
                con=con,
                if_exists='append',
                index=False,
                chunksize=999,
                method='multi'
            )


100%|██████████| 8/8 [1:29:13<00:00, 669.23s/it] 


### Prophet Results Aggregation - Univariate

In [None]:
# A timestamp is flagged when at least this many per-signal models flag it.
n_signals = 5

# Derived once so the DDL, the pivot and the row total agree on column names.
pred_cols = [f'{col}_pred' for col in analog_cols]

# Post process results
with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    cur.execute('drop table if exists prophet_results_agg;')

    create_sql = '''create table if not exists prophet_results_agg (ds timestamp, {cols}, {agg_cols});'''.format(
        cols=', '.join(f'{name} integer' for name in pred_cols),
        agg_cols='total_sum integer, pred integer'
    )
    cur.execute(create_sql)

    # Pivot the long table to one row per ds with one flag column per analog signal.
    # Rows whose `col` is not an analog signal add 0 to every column.
    pivot_sql = '''
        insert into prophet_results_agg (ds, {cols})
        select ds, {sum_cols}
        from prophet_results
        group by ds
    '''.format(
        cols=', '.join(pred_cols),
        sum_cols=', '.join(
            f"sum(case when col='{col}' then pred else 0 end) as {col}_pred" for col in analog_cols
        )
    )
    cur.execute(pivot_sql)

    total_sql = '''update prophet_results_agg set total_sum = {total_sum_col}'''.format(
        total_sum_col=' + '.join(pred_cols)
    )
    cur.execute(total_sql)

    cur.execute(f'''update prophet_results_agg set pred = case when total_sum >= {n_signals} then 1 else 0 end''')

### Prophet - Multivariate

In [None]:
# Multivariate Prophet model: predicts `reservoirs` using every other analog
# and digital column as an extra regressor.
model = Prophet(
    changepoint_prior_scale=0.5,
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False
)

# Prophet measures custom seasonality periods in DAYS, not seconds.
# The previous values (60*30 and 72000) therefore described 1800-day and
# 72000-day cycles instead of the intended 30-minute and daily ones.
model.add_seasonality(name='30min', period=30 / (24 * 60), fourier_order=5)
model.add_seasonality(name='daily', period=1, fourier_order=5)

for col in analog_cols + digital_cols:
    if col != 'reservoirs':
        model.add_regressor(col)

# Prophet expects the timestamp in `ds` and the target in `y`.
model.fit(df.rename(columns={'ts': 'ds', 'reservoirs': 'y'}))

dump(model, f'models/prophet_reservoirs_mv.joblib')

20:38:05 - cmdstanpy - INFO - Chain [1] start processing
21:24:40 - cmdstanpy - INFO - Chain [1] done processing


['models/prophet_reservoirs_mv.joblib']

In [None]:
chunk_size = 1_000_000

# Columns written to prophet_results, in table order.
sql_cols = [
    'ds',
    'col',
    'yhat',
    'yhat_lower',
    'yhat_upper',
    'yhat_lower_expanded',
    'yhat_upper_expanded',
    'pred'
]

with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()
    # Remove rows from an earlier run of this cell so a re-run does not duplicate them.
    cur.execute("delete from prophet_results where col = 'reservoirs_mv'")

    model = load(f'models/prophet_reservoirs_mv.joblib')

    # Score the frame in slices of chunk_size rows.
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]

        pred_df = (
            model
                # The full chunk goes in because the model needs its regressor columns.
                .predict(chunk.rename(columns={'ts': 'ds'}))
                # Only the observed target is needed afterwards. Merging just
                # ['ts', 'reservoirs'] (as the univariate cell does) avoids copying
                # every other column and avoids suffixing any name that the
                # forecast frame happens to share with `chunk`.
                .merge(chunk.loc[:, ['ts', 'reservoirs']], left_on='ds', right_on='ts')
        )

        pred_df['col'] = 'reservoirs_mv'
        # Widen the forecast interval by 10% and centre it on yhat.
        pred_df['range'] = pred_df['yhat_upper'] - pred_df['yhat_lower']
        pred_df['range_expanded'] = pred_df['range'] * 1.1
        pred_df['yhat_upper_expanded'] = pred_df['yhat'] + pred_df['range_expanded'] / 2
        pred_df['yhat_lower_expanded'] = pred_df['yhat'] - pred_df['range_expanded'] / 2

        # 1 when the observation falls outside the widened band.
        pred_df['pred'] = (
            (pred_df['reservoirs'] > pred_df['yhat_upper_expanded']) |
            (pred_df['reservoirs'] < pred_df['yhat_lower_expanded'])
        ).astype(int)

        pred_df[sql_cols].to_sql(
            name='prophet_results',
            con=con,
            if_exists='append',
            index=False,
            chunksize=999,
            method='multi'
        )


### Prophet - Multivariate + Multiple Signals

In [7]:
%%time
for col in tqdm(analog_cols):
    model = Prophet(
        changepoint_prior_scale=0.5,
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False
    )

    model.add_seasonality(name='30min', period=60*30, fourier_order=5)
    model.add_seasonality(name='daily', period=72000, fourier_order=5)

    # For every univariate model, use all other columns as additional regressors
    for regressor in analog_cols + digital_cols:
        if regressor != col:
            model.add_regressor(regressor)

    model.fit(df.rename(columns={'ts': 'ds', col: 'y'}))

    dump(model, f'models/prophet_mv_multi_{col}.joblib')

  0%|          | 0/8 [00:00<?, ?it/s]12:01:49 - cmdstanpy - INFO - Chain [1] start processing
12:09:30 - cmdstanpy - INFO - Chain [1] done processing
 12%|█▎        | 1/8 [10:23<1:12:46, 623.80s/it]12:12:09 - cmdstanpy - INFO - Chain [1] start processing
12:25:46 - cmdstanpy - INFO - Chain [1] done processing
 25%|██▌       | 2/8 [26:40<1:23:08, 831.37s/it]12:28:25 - cmdstanpy - INFO - Chain [1] start processing
12:44:38 - cmdstanpy - INFO - Chain [1] done processing
 38%|███▊      | 3/8 [45:31<1:20:41, 968.35s/it]12:47:17 - cmdstanpy - INFO - Chain [1] start processing
12:52:30 - cmdstanpy - INFO - Chain [1] done processing
 50%|█████     | 4/8 [53:24<51:30, 772.62s/it]  12:55:09 - cmdstanpy - INFO - Chain [1] start processing
13:41:34 - cmdstanpy - INFO - Chain [1] done processing
 62%|██████▎   | 5/8 [1:42:27<1:17:46, 1555.39s/it]13:44:13 - cmdstanpy - INFO - Chain [1] start processing
14:45:34 - cmdstanpy - INFO - Chain [1] done processing
 75%|███████▌  | 6/8 [2:46:28<1:17:44, 233

CPU times: user 20min 12s, sys: 55.7 s, total: 21min 8s
Wall time: 3h 23min 33s





In [9]:
chunk_size = 1_000_000

# Columns written to prophet_results_mv_multi, in table order.
sql_cols = [
    'ds',
    'col',
    'yhat',
    'yhat_lower',
    'yhat_upper',
    'yhat_lower_expanded',
    'yhat_upper_expanded',
    'pred'
]

with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()
    # Start from an empty table so the cell can be re-run safely.
    cur.execute('drop table if exists prophet_results_mv_multi;')
    cur.execute('''
        create table prophet_results_mv_multi (
            ds timestamp,
            col text,
            yhat real,
            yhat_lower real,
            yhat_upper real,
            yhat_lower_expanded real,
            yhat_upper_expanded real,
            pred integer
        );
    ''')

    for col in tqdm(analog_cols):
        model = load(f'models/prophet_mv_multi_{col}.joblib')

        # Score the frame in slices of chunk_size rows.
        for start in range(0, len(df), chunk_size):
            chunk = df.iloc[start:start + chunk_size]

            pred_df = (
                model
                    # The full chunk goes in because the model needs its regressor columns.
                    .predict(chunk.rename(columns={'ts': 'ds'}))
                    # Only the observed target is needed afterwards. Merging just
                    # ['ts', col] (as the univariate cell does) avoids copying every
                    # other column and avoids suffixing any name that the forecast
                    # frame happens to share with `chunk`.
                    .merge(chunk.loc[:, ['ts', col]], left_on='ds', right_on='ts')
            )

            pred_df['col'] = col
            # Widen the forecast interval by 10% and centre it on yhat.
            pred_df['range'] = pred_df['yhat_upper'] - pred_df['yhat_lower']
            pred_df['range_expanded'] = pred_df['range'] * 1.1
            pred_df['yhat_upper_expanded'] = pred_df['yhat'] + pred_df['range_expanded'] / 2
            pred_df['yhat_lower_expanded'] = pred_df['yhat'] - pred_df['range_expanded'] / 2

            # 1 when the observation falls outside the widened band.
            pred_df['pred'] = (
                (pred_df[col] > pred_df['yhat_upper_expanded']) |
                (pred_df[col] < pred_df['yhat_lower_expanded'])
            ).astype(int)

            pred_df[sql_cols].to_sql(
                name='prophet_results_mv_multi',
                con=con,
                if_exists='append',
                index=False,
                chunksize=999,
                method='multi'
            )


100%|██████████| 8/8 [1:05:31<00:00, 491.41s/it]


### Prophet Results Aggregation - Multivariate + Multiple Signals

In [10]:
# A timestamp is flagged when at least this many per-signal models flag it.
n_signals = 5

# Derived once so the DDL, the pivot and the row total agree on column names.
pred_cols = [f'{col}_pred' for col in analog_cols]

# Post process results
with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    cur.execute('drop table if exists prophet_results_mv_multi_agg;')

    create_sql = '''create table if not exists prophet_results_mv_multi_agg (ds timestamp, {cols}, {agg_cols});'''.format(
        cols=', '.join(f'{name} integer' for name in pred_cols),
        agg_cols='total_sum integer, pred integer'
    )
    cur.execute(create_sql)

    # Pivot the long table to one row per ds with one flag column per analog signal.
    pivot_sql = '''
        insert into prophet_results_mv_multi_agg (ds, {cols})
        select ds, {sum_cols}
        from prophet_results_mv_multi
        group by ds
    '''.format(
        cols=', '.join(pred_cols),
        sum_cols=', '.join(
            f"sum(case when col='{col}' then pred else 0 end) as {col}_pred" for col in analog_cols
        )
    )
    cur.execute(pivot_sql)

    total_sql = '''update prophet_results_mv_multi_agg set total_sum = {total_sum_col}'''.format(
        total_sum_col=' + '.join(pred_cols)
    )
    cur.execute(total_sql)

    cur.execute(f'''update prophet_results_mv_multi_agg set pred = case when total_sum >= {n_signals} then 1 else 0 end''')

## pyOD Models

### ECOD

In [12]:
%%time
m_ecod = ECOD(contamination=0.02, n_jobs=8)
m_ecod.fit(df.loc[:, analog_cols + digital_cols])

dump(m_ecod, 'models/ecod.joblib')

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    3.9s remaining:   11.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.0s finished


CPU times: user 1.94 s, sys: 5.15 s, total: 7.09 s
Wall time: 10.1 s


['models/ecod.joblib']

In [7]:
%%time
name = 'ecod'

m = load(f'models/{name}.joblib')

chunk_size = 1_000_000

for start in range(0, len(df), chunk_size):
    chunk = df.iloc[start:start + chunk_size]
        
    results_df = chunk.loc[:, ['ts']].copy(deep=True)

    print('Predicting..')
    results_df[['p_normal', 'p_outlier']], results_df['confidence'] = m.predict_proba(chunk.loc[:, analog_cols + digital_cols], return_confidence=True)
    results_df['pred'] = results_df['p_outlier'].round().astype(int)
    results_df['model'] = name

    print('Uploading..')
    with sqlite3.connect('./data/data.db') as con:
        results_df.to_sql(
            name='pyod_results',
            con=con, 
            if_exists='append', 
            index=False,
            chunksize=999,
            method='multi'
        )

Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    5.1s remaining:   15.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    8.5s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    3.2s remaining:    9.5s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.9s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.3s remaining:   13.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    7.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.8s remaining:    8.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.5s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.5s remaining:   13.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    8.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.8s remaining:    8.5s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.8s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.4s remaining:   13.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    7.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.8s remaining:    8.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.6s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    3.9s remaining:   11.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.3s remaining:    6.9s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    4.6s finished


Uploading..
CPU times: user 2h 3min 43s, sys: 13min 26s, total: 2h 17min 10s
Wall time: 2h 17min 45s


### ABOD

In [10]:
%%time
m_abod = ABOD(contamination=0.02, n_neighbors=20, method='fast')
m_abod.fit(df.loc[:, analog_cols + digital_cols])

dump(m_abod, 'models/abod.joblib')

CPU times: user 14h 34min 53s, sys: 2min 44s, total: 14h 37min 38s
Wall time: 2h 18min 29s


['models/abod.joblib']

In [9]:
%%time
name = 'abod'

m = load(f'models/{name}.joblib')

chunk_size = 1_000_000

for start in range(0, len(df), chunk_size):
    chunk = df.iloc[start:start + chunk_size]
        
    results_df = chunk.loc[:, ['ts']].copy(deep=True)

    print('Predicting..')
    results_df[['p_normal', 'p_outlier']], results_df['confidence'] = m.predict_proba(chunk.loc[:, analog_cols + digital_cols], return_confidence=True)
    results_df['pred'] = results_df['p_outlier'].round().astype(int)
    results_df['model'] = name

    print('Uploading..')
    with sqlite3.connect('./data/data.db') as con:
        results_df.to_sql(
            name='pyod_results',
            con=con, 
            if_exists='append', 
            index=False,
            chunksize=999,
            method='multi'
        )

Predicting..
Uploading..
Predicting..
Uploading..
Predicting..
Uploading..
Predicting..
Uploading..
Predicting..
Uploading..
CPU times: user 3h 45min 16s, sys: 12min 18s, total: 3h 57min 34s
Wall time: 3h 57min 55s


### COPOD

In [11]:
%%time
m_copod = COPOD(contamination=0.02, n_jobs=8)
m_copod.fit(df.loc[:, analog_cols + digital_cols])

dump(m_copod, 'models/copod.joblib')

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    3.5s remaining:   10.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.2s finished


CPU times: user 1.88 s, sys: 5.17 s, total: 7.05 s
Wall time: 10.2 s


['models/copod.joblib']

In [8]:
%%time
name = 'copod'

m = load(f'models/{name}.joblib')

chunk_size = 1_000_000

for start in range(0, len(df), chunk_size):
    chunk = df.iloc[start:start + chunk_size]
        
    results_df = chunk.loc[:, ['ts']].copy(deep=True)

    print('Predicting..')
    results_df[['p_normal', 'p_outlier']], results_df['confidence'] = m.predict_proba(chunk.loc[:, analog_cols + digital_cols], return_confidence=True)
    results_df['pred'] = results_df['p_outlier'].round().astype(int)
    results_df['model'] = name

    print('Uploading..')
    with sqlite3.connect('./data/data.db') as con:
        results_df.to_sql(
            name='pyod_results',
            con=con, 
            if_exists='append', 
            index=False,
            chunksize=999,
            method='multi'
        )

Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.6s remaining:   13.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.8s remaining:    8.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.7s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.1s remaining:   12.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.5s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.7s remaining:    8.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    4.7s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.1s remaining:   12.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    7.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.8s remaining:    8.5s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.3s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.4s remaining:   13.1s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    7.5s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.9s remaining:    8.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.7s finished


Uploading..
Predicting..


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    3.8s remaining:   11.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.4s remaining:    7.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    4.8s finished


Uploading..
CPU times: user 2h 3min 11s, sys: 12min 25s, total: 2h 15min 37s
Wall time: 2h 16min 3s


### DIF

In [8]:
%%time
warnings.filterwarnings(
    "ignore",
    message="'pin_memory' argument is set as true but not supported on MPS",
    category=UserWarning,
    module="torch.utils.data.dataloader"
)

m_dif = DIF(contamination=0.02, random_state=2025)
m_dif.fit(df.loc[:, analog_cols + digital_cols])

dump(m_dif, 'models/dif.joblib')

CPU times: user 1h 9min 39s, sys: 33min 22s, total: 1h 43min 1s
Wall time: 1h 37min 55s


['models/dif.joblib']

In [9]:
%%time
name = 'dif'

m = load(f'models/{name}.joblib')

chunk_size = 1_000_000

for start in range(0, len(df), chunk_size):
    chunk = df.iloc[start:start + chunk_size]
        
    results_df = chunk.loc[:, ['ts']].copy(deep=True)

    print('Predicting..')
    results_df[['p_normal', 'p_outlier']], results_df['confidence'] = m.predict_proba(chunk.loc[:, analog_cols + digital_cols], return_confidence=True)
    results_df['pred'] = results_df['p_outlier'].round().astype(int)
    results_df['model'] = name

    print('Uploading..')
    with sqlite3.connect('./data/data.db') as con:
        results_df.to_sql(
            name='pyod_results',
            con=con, 
            if_exists='append', 
            index=False,
            chunksize=999,
            method='multi'
        )

Predicting..
Uploading..
Predicting..
Uploading..
Predicting..
Uploading..
Predicting..
Uploading..
Predicting..
Uploading..
CPU times: user 3h 49min 56s, sys: 55min 56s, total: 4h 45min 53s
Wall time: 4h 32min 3s


### pyOD Results Aggregation

In [None]:
# Detectors whose predictions are stored in pyod_results; these are the values
# the prediction cells write into its `model` column.
pyod_models = ['ecod', 'abod', 'copod', 'dif']

# A timestamp is flagged when at least this many detectors flag it.
# NOTE(review): this was 5, copied from the Prophet aggregation over 8 signals.
# With four detectors 5 can never be reached, so every pred would be 0.
# 3 (a strict majority) is a placeholder -- confirm the intended threshold.
n_signals = 3

# Derived once so the DDL, the pivot and the row total agree on column names.
pred_cols = [f'{model_name}_pred' for model_name in pyod_models]

# Post process results
with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    cur.execute('drop table if exists pyod_results_agg;')

    cur.execute('''create table if not exists pyod_results_agg (ds timestamp, {cols}, {agg_cols});'''.format(
        cols=', '.join(f'{name} integer' for name in pred_cols),
        agg_cols='total_sum integer, pred integer'
    ))

    # Fix: pivot pyod_results (keyed by ts / model), not prophet_results.
    # The timestamp is stored as `ds` so this table lines up with the
    # Prophet aggregation tables.
    cur.execute('''
        insert into pyod_results_agg (ds, {cols})
        select ts, {sum_cols}
        from pyod_results
        group by ts
    '''.format(
        cols=', '.join(pred_cols),
        sum_cols=', '.join(
            f"sum(case when model='{model_name}' then pred else 0 end) as {model_name}_pred"
            for model_name in pyod_models
        )
    ))

    cur.execute('''update pyod_results_agg set total_sum = {total_sum_col}'''.format(
        total_sum_col=' + '.join(pred_cols)
    ))

    cur.execute(f'''update pyod_results_agg set pred = case when total_sum >= {n_signals} then 1 else 0 end''')