In [2]:
import pandas as pd
import sqlite3

In [None]:
%%time

# Load Data into SQLite
# Reads both raw training CSVs and writes them into the train_data table.
with sqlite3.connect('./data/data.db') as con:

    # Start from an empty table: every chunk below is appended, so without this
    # a second run of the cell would store every row twice. Dropping the table
    # also discards the index and extra columns created by the later cells, so
    # those cells have to be re-run after this one.
    con.execute('drop table if exists train_data')

    for csv_file in ['../raw-data/train_data_part1.csv', '../raw-data/train_data_part2.csv']:
        # chunksize makes read_csv return an iterator of 100,000-row frames
        # instead of loading the whole file at once.
        data = pd.read_csv(csv_file, parse_dates=['timestamp'], chunksize=100000)
        for chunk in data:
            chunk.to_sql('train_data', con, if_exists='append', index=False)

In [None]:
# Rename columns & add the label columns
# (failure_id, day_id, pseudo_label, pseudo_label_lps).
#
# The cell is safe to re-run: renames that are already in place and columns
# that already exist are skipped instead of raising.

# (name as loaded from the CSV, name used in train_data from here on)
config = [
    ('timestamp', 'ts'),
    ('TP2', 'tp2'),
    ('TP3', 'tp3'),
    ('H1', 'h1'),
    ('DV_pressure', 'dv_pressure'),
    ('Reservoirs', 'reservoirs'),
    ('Oil_temperature', 'oil_temperature'),
    ('Flowmeter', 'flowmeter'),
    ('Motor_current', 'motor_current'),
    ('COMP', 'comp'),
    ('DV_eletric', 'dv_electric'),
    ('Towers', 'towers'),
    ('MPG', 'mpg'),
    ('LPS', 'lps'),
    ('Pressure_switch', 'pressure_switch'),
    ('Oil_level', 'oil_level'),
    ('Caudal_impulses', 'caudal_impulses'),
    ('gpsLong', 'gps_long'),
    ('gpsLat', 'gps_lat'),
    ('gpsSpeed', 'gps_speed'),
    ('gpsQuality', 'gps_quality')
]

# (column name, declared SQLite type) for the columns filled in by later cells.
label_columns = [
    ('failure_id', 'integer'),
    ('day_id', 'integer'),
    ('pseudo_label', 'real'),
    ('pseudo_label_lps', 'real'),
]

with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    # pragma table_info returns one row per column; field 1 is the column name
    # exactly as stored, so the comparison below is case-sensitive on purpose
    # (most renames only change the case).
    present = {row[1] for row in cur.execute('pragma table_info(train_data)')}

    for old_name, new_name in config:
        if new_name in present:
            # Already renamed by an earlier (possibly interrupted) run.
            continue
        # A source column that is genuinely missing still raises here.
        cur.execute(f'alter table train_data rename column {old_name} to {new_name};')

    cur.execute('create index if not exists idx_ts on train_data(ts)')

    for column_name, column_type in label_columns:
        if column_name not in present:
            cur.execute(f'alter table train_data add column {column_name} {column_type}')

In [6]:
# failure_id logic
# Labels rows of train_data by timestamp window. Every window is half-open:
# start <= ts < end. Rows outside all windows keep NULL in both columns.

# (failure_id, window start, window end)
failure_windows = [
    (1, '2022-02-28 21:53:00', '2022-03-01 02:00:00'),
    (2, '2022-03-23 14:54:00', '2022-03-23 15:24:00'),
    (3, '2022-05-30 12:00:00', '2022-06-02 06:18:00'),
]

# (pseudo_label, window start, window end). Each window ends at 02:00 on the
# date on which the failure with the same number begins.
pseudo_label_windows = [
    (1, '2022-02-21 06:00:00', '2022-02-28 02:00:00'),
    (2, '2022-03-16 06:00:00', '2022-03-23 02:00:00'),
    (3, '2022-05-23 06:00:00', '2022-05-30 02:00:00'),
]

with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    # Failure ID
    cur.execute("update train_data set failure_id = null;")
    cur.executemany(
        "update train_data set failure_id = ? where ? <= ts and ts < ?;",
        failure_windows,
    )

    # Add a pseudo-label
    # Reset first, exactly like failure_id, so that editing a window above and
    # re-running the cell cannot leave labels from the previous window behind.
    cur.execute("update train_data set pseudo_label = null;")
    cur.executemany(
        "update train_data set pseudo_label = ? where ? <= ts and ts < ?;",
        pseudo_label_windows,
    )

In [None]:
# LPS Events
# Builds the lps_events table: one row per uninterrupted run of rows with
# lps = 1 in train_data (ordered by ts).
#
#   lagged  - pairs every row with the previous row's lps (0 for the first row).
#   flagged - event_id is the running count of rows whose lps differs from the
#             previous row, so it is constant within a run of equal lps values.
#   final   - keeps the lps = 1 runs and records their first/last ts and row count.
#
# NOTE(review): event_duration_in_secs is a row count; it equals seconds only
# if train_data holds one row per second - confirm against the raw data.
with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    # "create table ... as" fails if the table is already there, so clear the
    # result of any previous run first to keep the cell re-runnable.
    cur.execute("drop table if exists lps_events")

    cur.execute("""
    create table lps_events as
        with 
        lagged as (
            select
                ts
                ,lps
                ,lag(lps, 1, 0) over (order by ts asc) as prev_lps
            from train_data
        ),
        flagged as (
            select
                ts
                ,lps
                ,sum(case when lps != prev_lps then 1 else 0 end) over (order by ts rows unbounded preceding) as event_id
            from lagged
        )
        select
            event_id
            ,min(ts) as start_ts
            ,max(ts) as end_ts
            ,count(*) as event_duration_in_secs
        from flagged
        where lps = 1
        group by event_id
        order by start_ts asc
    """)

In [None]:
# lps_events pseudo-label
# For every LPS event longer than 5 minutes (60*5 rows in lps_events), marks
# the rows from 06:00 on the event's start date up to (not including) the
# event's start with pseudo_label_lps = 1.
with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()

    df = pd.read_sql('select * from lps_events where event_duration_in_secs > 60*5', con = con, parse_dates=['start_ts', 'end_ts'])

    # Reset
    cur.execute("""
        update train_data
        set pseudo_label_lps = null
    """)

    # Load LPS Events
    # One (window start, window end) pair per event. normalize() drops the whole
    # time-of-day, including any sub-second part, before the 6 hours are added.
    # str() of a Timestamp always includes the time, e.g. '2022-03-01 06:00:00'.
    # An event that starts before 06:00 yields an empty window and labels nothing.
    label_windows = [
        (str(start_ts.normalize() + pd.Timedelta(hours=6)), str(start_ts))
        for start_ts in df['start_ts']
    ]
    # Values are bound as parameters rather than formatted into the SQL text.
    cur.executemany("""
        update train_data
        set pseudo_label_lps = 1
        where
            ? <= ts and ts < ?
    """, label_windows)

    # Account for same-day events
    # A later event's window on the same date covers the rows of any earlier
    # event; rows where the switch is active (lps = 1) are never labelled.
    cur.execute("""
        update train_data
        set pseudo_label_lps = null
        where lps = 1
    """)

In [7]:
# day_id logic
# Two statements, run in order inside one connection:
#   1. segment - give every row a group value that only changes where
#      consecutive timestamps are not exactly one second apart;
#   2. renumber - replace those group values by 1, 2, 3, ... in ascending order.
# NOTE(review): presumably each uninterrupted run of rows is one operating
# day, hence the column name - confirm.

# Segment days
# prev_ts is the previous row's ts (for the first row: ts minus one second).
# group_id is the running sum of (seconds since previous row - 1), so it stays
# constant while rows are one second apart and grows at every gap.
segment_days_sql = """
        with 
        lagged as (
            select
                ts
                ,lag(ts, 1, datetime(ts, '-1 second')) over (order by ts asc) as prev_ts
            from train_data
        ),
        flagged as (
            select
                ts
                ,sum(strftime('%s', ts) - strftime('%s', prev_ts) - 1) over (order by ts rows unbounded preceding) as group_id
            from lagged
        )
        update train_data
        set day_id = (select group_id from flagged where flagged.ts = train_data.ts)
    """

# Renumber
# Maps each distinct day_id to its rank (row_number over the distinct values).
renumber_sql = """
        with 
        cte as (
            select distinct day_id from train_data
        ),
        renumbered as (
            select day_id, row_number() over (order by day_id) as new_day_id
            from cte
        )
        update train_data
        set day_id = (select new_day_id from renumbered where renumbered.day_id = train_data.day_id)
    """

with sqlite3.connect('./data/data.db') as con:
    cur = con.cursor()
    for statement in (segment_days_sql, renumber_sql):
        cur.execute(statement)