In [1]:
import sqlite3
import pandas as pd
import warnings
from tqdm import tqdm
from joblib import dump, load
from prophet import Prophet

In [2]:
# Column-name groups for the `train_data` table.
# Only `analog_cols` is referenced by the cells visible in this notebook;
# `digital_cols` and `gps_cols` are defined here but not used below.
# NOTE(review): the grouping presumably mirrors the sensor type of each
# signal (continuous vs. on/off vs. GPS) -- confirm against the dataset docs.
analog_cols = [
    "tp2", "tp3", "h1", "dv_pressure",
    "reservoirs", "oil_temperature", "flowmeter", "motor_current",
]

digital_cols = [
    "comp", "dv_electric", "towers", "mpg",
    "lps", "pressure_switch", "oil_level", "caudal_impulses",
]

gps_cols = ["gps_long", "gps_lat", "gps_speed", "gps_quality"]

In [3]:
# Load the timestamp plus every analog column from `train_data`, keeping only
# rows whose calendar date is strictly after 2022-03-02, ordered by time.
cols = ', '.join(analog_cols)

# `with sqlite3.connect(...) as con` only scopes a transaction (commit/rollback);
# it does NOT close the connection. Close it explicitly so the file handle is
# released as soon as the read has finished instead of living with the kernel.
con = sqlite3.connect('./data/data.db')
try:
    df = pd.read_sql(
        "select ts, {cols} from train_data where date(ts) > '2022-03-02' order by ts asc".format(cols=cols),
        con=con,
        parse_dates=['ts']  # parse `ts` into datetime64 so it can be used as Prophet's `ds`
    )
finally:
    con.close()

print(df.shape)

(6389782, 9)


In [4]:
%%time
# Fit one Prophet model per analog column and persist it with joblib.
#
# Prophet measures a custom seasonality's `period` in DAYS. The previous
# values (60*30 and 72000) were therefore read as 1800 days and 72000 days,
# not as the 30-minute and daily cycles their names describe (and 72000 s
# would be 20 h, not 24 h, even if seconds were the unit).
# Models saved by earlier runs were fitted with those periods and must be
# retrained for this fix to take effect.
MINUTES_PER_DAY = 24 * 60
HALF_HOUR_IN_DAYS = 30 / MINUTES_PER_DAY  # 30 minutes expressed in days
ONE_DAY_IN_DAYS = 1

for col in tqdm(analog_cols):
    # Built-in seasonalities are switched off; the two components below are
    # registered explicitly instead.
    model = Prophet(
        changepoint_prior_scale=0.5,
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False
    )

    model.add_seasonality(name='30min', period=HALF_HOUR_IN_DAYS, fourier_order=5)
    model.add_seasonality(name='daily', period=ONE_DAY_IN_DAYS, fourier_order=5)

    # Prophet expects the timestamp column to be named `ds` and the target `y`.
    model.fit(df.loc[:, ['ts', col]].rename(columns={'ts': 'ds', col: 'y'}))

    # One file per signal; the prediction cell loads it back by the same name.
    dump(model, f'models/prophet_f2_f3_{col}.joblib')

  0%|          | 0/8 [00:00<?, ?it/s]20:39:52 - cmdstanpy - INFO - Chain [1] start processing
20:56:01 - cmdstanpy - INFO - Chain [1] done processing
 12%|█▎        | 1/8 [18:43<2:11:06, 1123.74s/it]20:58:36 - cmdstanpy - INFO - Chain [1] start processing
21:00:46 - cmdstanpy - INFO - Chain [1] done processing
 25%|██▌       | 2/8 [23:28<1:03:01, 630.26s/it] 21:03:21 - cmdstanpy - INFO - Chain [1] start processing
21:23:00 - cmdstanpy - INFO - Chain [1] done processing
 38%|███▊      | 3/8 [45:42<1:19:18, 951.68s/it]21:25:35 - cmdstanpy - INFO - Chain [1] start processing
21:27:42 - cmdstanpy - INFO - Chain [1] done processing
 50%|█████     | 4/8 [50:25<45:50, 687.53s/it]  21:30:18 - cmdstanpy - INFO - Chain [1] start processing
21:56:05 - cmdstanpy - INFO - Chain [1] done processing
 62%|██████▎   | 5/8 [1:18:48<52:41, 1053.70s/it]21:58:41 - cmdstanpy - INFO - Chain [1] start processing
22:27:42 - cmdstanpy - INFO - Chain [1] done processing
 75%|███████▌  | 6/8 [1:50:24<44:40, 1340.

CPU times: user 19min 52s, sys: 45.9 s, total: 20min 38s
Wall time: 2h 34min 12s





In [5]:
# Score every analog column with its saved Prophet model and write the
# per-timestamp results to the `prophet_results_f2_f3` table (recreated on
# each run). `df` is processed in slices of `chunk_size` rows.
chunk_size = 1_000_000

# Factor by which the width of Prophet's [yhat_lower, yhat_upper] interval is
# scaled (around yhat) before a point is flagged.
band_expansion = 1.1

# Columns written to the results table, in table order. Loop-invariant, so it
# is built once instead of once per chunk.
sql_cols = [
    'ds',
    'col',
    'yhat',
    'yhat_lower',
    'yhat_upper',
    'yhat_lower_expanded',
    'yhat_upper_expanded',
    'pred'
]

# The sqlite3 connection context manager only commits / rolls back; it does
# not close the connection, so that is done explicitly in `finally`.
con = sqlite3.connect('./data/data.db')
try:
    with con:  # commit on success, roll back if anything below raises
        cur = con.cursor()
        cur.execute('drop table if exists prophet_results_f2_f3;')
        cur.execute('''
            create table prophet_results_f2_f3 (
                ds timestamp,
                col text,
                yhat real,
                yhat_lower real,
                yhat_upper real,
                yhat_lower_expanded real,
                yhat_upper_expanded real,
                pred integer
            );
        ''')

        for col in tqdm(analog_cols):
            # joblib files are pickle-based: only load model files produced by
            # this project's own training cell.
            model = load(f'models/prophet_f2_f3_{col}.joblib')

            for start in range(0, len(df), chunk_size):
                chunk = df.iloc[start:start + chunk_size]

                # Predict, then join the observed values back on the timestamp.
                # NOTE(review): assumes `ts` is unique within a chunk; duplicated
                # timestamps would multiply rows in this merge -- TODO confirm.
                pred_df = (
                    model
                        .predict(chunk.loc[:, ['ts', col]].rename(columns={'ts': 'ds'}))
                        .merge(chunk.loc[:, ['ts', col]], left_on='ds', right_on='ts')
                )

                pred_df['col'] = col

                # Widen the interval symmetrically around yhat.
                pred_df['range'] = pred_df['yhat_upper'] - pred_df['yhat_lower']
                pred_df['range_expanded'] = pred_df['range'] * band_expansion
                pred_df['yhat_upper_expanded'] = pred_df['yhat'] + pred_df['range_expanded'] / 2
                pred_df['yhat_lower_expanded'] = pred_df['yhat'] - pred_df['range_expanded'] / 2

                # pred = 1 when the observed value falls outside the widened band.
                pred_df['pred'] = (
                    (pred_df[col] > pred_df['yhat_upper_expanded']) |
                    (pred_df[col] < pred_df['yhat_lower_expanded'])
                ).astype(int)

                pred_df[sql_cols].to_sql(
                    name='prophet_results_f2_f3',
                    con=con,
                    if_exists='append',
                    index=False,
                    chunksize=999,
                    method='multi'
                )
finally:
    con.close()


100%|██████████| 8/8 [1:26:27<00:00, 648.38s/it]


In [6]:
# Minimum number of analog signals that must be flagged at the same timestamp
# for the aggregated `pred` to be 1.
n_signals = 5

# Post process results: pivot the long per-signal table into one row per
# timestamp with a `<col>_pred` column per analog signal, then derive
# `total_sum` (number of flagged signals) and the final `pred`.
#
# The sqlite3 connection context manager only commits / rolls back; it does
# not close the connection, so that is done explicitly in `finally`.
con = sqlite3.connect('./data/data.db')
try:
    with con:  # commit on success, roll back if anything below raises
        cur = con.cursor()

        cur.execute('drop table if exists prophet_results_f2_f3_agg;')

        cur.execute('''create table if not exists prophet_results_f2_f3_agg (ds timestamp, {cols}, {agg_cols});'''.format(
            cols = ', '.join([f'{col}_pred integer' for col in analog_cols]),
            agg_cols = 'total_sum integer, pred integer'
        ))

        # Read from `prophet_results_f2_f3`, the table this notebook populates.
        # The query previously selected from `prophet_results`, which is not
        # created anywhere in this notebook.
        cur.execute('''
            insert into prophet_results_f2_f3_agg (ds, {cols})
            select ds, {sum_cols}
            from prophet_results_f2_f3
            group by ds
        '''.format(
            cols = ', '.join([f'{col}_pred' for col in analog_cols]),
            sum_cols = ', '.join([f"sum(case when col='{col}' then pred else 0 end) as {col}_pred" for col in analog_cols])
        ))

        # total_sum = how many signals were flagged at this timestamp.
        cur.execute('''update prophet_results_f2_f3_agg set total_sum = {total_sum_col}'''.format(
            total_sum_col = ' + '.join([f'{col}_pred' for col in analog_cols])
        ))

        # Final flag: 1 when at least `n_signals` signals were flagged.
        cur.execute(f'''update prophet_results_f2_f3_agg set pred = case when total_sum >= {n_signals} then 1 else 0 end''')
finally:
    con.close()