In [1]:
# filename: output.ipynb
# purpose: generate output data

# OHT noise, normal and outlier data generation

### Outlier detection method
- Outlier data will be detected and generated based on Moving Average and Moving Standard Deviation 

### Processing flow
- Read Raw table, ohtraw from duckdb filedb, which was created in previous step, parse notebook
- Create a Work table, work in memory duckdb with additional columns MVAVG_,MVSTD_,MVSIG_,FLAG columns.
- Calculate Moving Average, Standard Devidation
- Calculate Sigma value of the column data based on Moving Average and Standard Deviation
- Calcuate FLAG column value based on Sigma value
- Fetch the work table into work dataframe
- Split noise and normal dataframe from work dataframe based on FLAG value
- Create outlier dataframe based on normal dataframe
- Update outlier dataframe by applying outlier pattern 
- Save noise, normal, outler dataframe to duckdb tables for later graphing.
- Save noise, normal, outler csvfle
- Check csvfile size 

In [2]:
# packages
import time
import pathlib
import textwrap
import pandas as pd

import humanfriendly as human
import duckdb

import ohtconf as conf
import ohtcomm as comm

## Main

In [3]:
mainstart = time.time()

### Prepare in-memory work table base on in-file raw table 

In [4]:
# open in-memory db

con = duckdb.connect(database=":memory:")

In [5]:
# create in-memory work table

query = "DROP TABLE IF EXISTS work"
con.execute(query)

coldef = ""
for name, dtype in zip(conf.COLUMN_NAMES, conf.COLUMN_DBTYPES):
    if coldef:
        coldef += ", " + name + " " + dtype
    else:
        coldef += name + " " + dtype

for col in conf.COLUMN_GRAPH:
    coldef += ", " + conf.MVAVG + col + " " + "FLOAT"
    coldef += ", " + conf.MVSTD + col + " " + "FLOAT"
    coldef += ", " + conf.MVSIG + col + " " + "INTEGER"

coldef += ", " + f"{conf.COLUMN_FLAG}  INTEGER"

query = f"CREATE TABLE work ( {coldef} )"
# print(textwrap.fill(query, width=120))

con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [6]:
# attach in-file db, raw table was prepared in the previous step, parse

con.execute(f"ATTACH DATABASE '{conf.DBFILE}' AS filedb (READ_ONLY)")

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [7]:
# insert into in-memory work table from in-file raw table with additional Moving Avg,Std and Flag=0

coldef = ", ".join(conf.COLUMN_NAMES)

for col in conf.COLUMN_GRAPH:
    coldef += (
        ", "
        + f"AVG({col}) OVER (ORDER BY {conf.COLUMN_NAMES[0]} ROWS BETWEEN {conf.POINTS['MOVING']} PRECEDING AND CURRENT ROW) AS {conf.MVAVG}{col}"
    )
    coldef += (
        ", "
        + f"STDDEV({col}) OVER (ORDER BY {conf.COLUMN_NAMES[0]} ROWS BETWEEN {conf.POINTS['MOVING']} PRECEDING AND CURRENT ROW) AS {conf.MVSTD}{col}"
    )
    coldef += ", 0"  # mvsig_
coldef += ", 0"  # flag

query = f"INSERT INTO work SELECT {coldef} FROM filedb.{conf.TABNAME_RAW} ORDER BY {conf.COLUMN_NAMES[0]}"
# print(textwrap.fill(query, width=120))

con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [8]:
# detach filedb

con.execute("DETACH DATABASE filedb")

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [9]:
# Update NULL value after window function
query = f"SELECT * FROM work ORDER BY {conf.COLUMN_NAMES[0]}"
dfwork = con.execute(query).df()
dfwork.bfill(inplace=True)

# recreate work table base one work dataframe
con.execute("DROP TABLE IF EXISTS work")
con.execute("CREATE TABLE work AS SELECT * FROM dfwork")

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [10]:
# calculate Sigma value

coldef = ""
for col in conf.COLUMN_GRAPH:
    setdef = textwrap.dedent(f"""{conf.MVSIG}{col} = CASE 
                  WHEN {col} >= ({col} - 1 * {conf.MVSTD}{col}) AND {col} <= ({col} + 1 * {conf.MVSTD}{col})  then 1
                  WHEN {col} >= ({col} - 2 * {conf.MVSTD}{col}) AND {col} <= ({col} + 2 * {conf.MVSTD}{col})  then 2
                  WHEN {col} >= ({col} - 3 * {conf.MVSTD}{col}) AND {col} <= ({col} + 3 * {conf.MVSTD}{col})  then 3
                  WHEN {col} >= ({col} - 4 * {conf.MVSTD}{col}) AND {col} <= ({col} + 4 * {conf.MVSTD}{col})  then 4
                  WHEN {col} >= ({col} - 5 * {conf.MVSTD}{col}) AND {col} <= ({col} + 5 * {conf.MVSTD}{col})  then 5
                  ELSE 6
                  END""")
    if not coldef:
        coldef = setdef
    else:
        coldef += f", {setdef}"

query = f"UPDATE work SET {coldef}"
# print(textwrap.fill(query, width=120))

con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [11]:
# calculate Flag=1 based on configured sigma value

coldef = ""
for col in conf.COLUMN_GRAPH:
    if not coldef:
        coldef = f"{conf.MVSIG}{col} > {conf.SIGMA_NOISE}"
    else:
        coldef += f" OR {conf.MVSIG}{col} > {conf.SIGMA_NOISE}"

query = f"UPDATE work SET {conf.COLUMN_FLAG}=1 WHERE {coldef}"
print(textwrap.fill(query, width=120))

con.execute(query)

UPDATE work SET FLAG=1 WHERE MVSIG_TEM > 2 OR MVSIG_PM1 > 2 OR MVSIG_PM2_5 > 2 OR MVSIG_PM10 > 2 OR MVSIG_CO > 2 OR
MVSIG_NH3 > 2 OR MVSIG_CT1 > 2 OR MVSIG_CT2 > 2 OR MVSIG_CT3 > 2 OR MVSIG_CT4 > 2


<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

### Prepare noise, normal dataframe

In [12]:
# fetch table from work table based on Flag

dfnoise = con.execute(f"SELECT * FROM work WHERE {conf.COLUMN_FLAG}=1 ORDER BY {conf.COLUMN_NAMES[0]}").df()
dfnorm = con.execute(f"SELECT * FROM work WHERE {conf.COLUMN_FLAG}=0 ORDER BY {conf.COLUMN_NAMES[0]}").df()

# update datetm value
dfnoise[conf.COLUMN_NAMES[0]] = pd.date_range(start="2024-01-01", periods=len(dfnoise), freq="100ms")
dfnorm[conf.COLUMN_NAMES[0]] = pd.date_range(start="2024-01-01", periods=len(dfnorm), freq="100ms")

# round float type value
dfnoise = dfnoise.round(1)
dfnorm = dfnorm.round(1)

print(f"row count, noise={len(dfnoise)}, normal={len(dfnorm)}")

row count, noise=0, normal=6095021


In [13]:
# set float display format
pd.set_option("display.float_format", "{:.1f}".format)

In [14]:
dfnoise.head()

Unnamed: 0,DATETM,TEM,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4,MVAVG_TEM,MVSTD_TEM,MVSIG_TEM,MVAVG_PM1,MVSTD_PM1,MVSIG_PM1,MVAVG_PM2_5,MVSTD_PM2_5,MVSIG_PM2_5,MVAVG_PM10,MVSTD_PM10,MVSIG_PM10,MVAVG_CO,MVSTD_CO,MVSIG_CO,MVAVG_NH3,MVSTD_NH3,MVSIG_NH3,MVAVG_CT1,MVSTD_CT1,MVSIG_CT1,MVAVG_CT2,MVSTD_CT2,MVSIG_CT2,MVAVG_CT3,MVSTD_CT3,MVSIG_CT3,MVAVG_CT4,MVSTD_CT4,MVSIG_CT4,FLAG


In [15]:
dfnorm.head()

Unnamed: 0,DATETM,TEM,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4,MVAVG_TEM,MVSTD_TEM,MVSIG_TEM,MVAVG_PM1,MVSTD_PM1,MVSIG_PM1,MVAVG_PM2_5,MVSTD_PM2_5,MVSIG_PM2_5,MVAVG_PM10,MVSTD_PM10,MVSIG_PM10,MVAVG_CO,MVSTD_CO,MVSIG_CO,MVAVG_NH3,MVSTD_NH3,MVSIG_NH3,MVAVG_CT1,MVSTD_CT1,MVSIG_CT1,MVAVG_CT2,MVSTD_CT2,MVSIG_CT2,MVAVG_CT3,MVSTD_CT3,MVSIG_CT3,MVAVG_CT4,MVSTD_CT4,MVSIG_CT4,FLAG
0,2024-01-01 00:00:00.000,40.8,10,12,13,161,88,0.8,1.0,0.5,0.6,40.8,0.0,1,10.0,0.0,1,12.0,0.0,1,13.0,0.0,1,161.0,0.0,1,88.0,0.0,1,0.8,0.0,1,1.0,0.0,1,0.5,0.0,1,0.6,0.0,1,0
1,2024-01-01 00:00:00.100,40.8,10,12,13,161,88,0.8,1.0,0.5,0.6,40.8,0.0,1,10.0,0.0,1,12.0,0.0,1,13.0,0.0,1,161.0,0.0,1,88.0,0.0,1,0.8,0.0,1,1.0,0.0,1,0.5,0.0,1,0.6,0.0,1,0
2,2024-01-01 00:00:00.200,40.8,10,12,13,161,88,0.8,1.0,0.5,0.5,40.8,0.0,1,10.0,0.0,1,12.0,0.0,1,13.0,0.0,1,161.0,0.0,1,88.0,0.0,1,0.8,0.0,1,1.0,0.0,1,0.5,0.0,1,0.6,0.1,1,0
3,2024-01-01 00:00:00.300,40.8,10,12,13,161,88,0.9,1.0,0.5,0.6,40.8,0.0,1,10.0,0.0,1,12.0,0.0,1,13.0,0.0,1,161.0,0.0,1,88.0,0.0,1,0.8,0.0,1,1.0,0.0,1,0.5,0.0,1,0.6,0.1,1,0
4,2024-01-01 00:00:00.400,40.8,10,12,13,161,88,0.8,1.0,0.7,0.6,40.8,0.0,1,10.0,0.0,1,12.0,0.0,1,13.0,0.0,1,161.0,0.0,1,88.0,0.0,1,0.8,0.0,1,1.0,0.0,1,0.5,0.1,1,0.6,0.0,1,0


### Prepare outlier dataframe from normal dataframe

In [16]:
# choose dfoutl data in dfnorm
dfoutl = dfnorm.iloc[: round(len(dfnorm) * conf.OUTLIER_RATIO)].copy()  # normal:outlier = 3:1 (6:2)
dfoutl = dfoutl.sort_values(by=conf.COLUMN_NAMES[0])
dfoutl = dfoutl.reset_index(drop=True)

In [17]:
# Update outlier
_start = time.time()

dfoutl = comm.gen_outlier(dfoutl)

# round float type value
dfoutl = dfoutl.round(1)

_elapsed = time.time() - _start
print(f"get_outlier elapsed time: {human.format_timespan(_elapsed)}")
# 8 min 17 sec, 2_031_674 rows, INPUT_MAXSIZE=400MB, include DATETM on csvfile
# 14 min 16 sec, 3_280_186 rows, INPUT_MAXSIZE=650MB, exclude DATETM on csvfile

outlier count=1 / 2031674


outlier count=100001 / 2031674


outlier count=200001 / 2031674


outlier count=300001 / 2031674


outlier count=400001 / 2031674


outlier count=500001 / 2031674


outlier count=600001 / 2031674


outlier count=700001 / 2031674


outlier count=800001 / 2031674


outlier count=900001 / 2031674


outlier count=1000001 / 2031674


outlier count=1100001 / 2031674


outlier count=1200001 / 2031674


outlier count=1300001 / 2031674


outlier count=1400001 / 2031674


outlier count=1500001 / 2031674


outlier count=1600001 / 2031674


outlier count=1700001 / 2031674


outlier count=1800001 / 2031674


outlier count=1900001 / 2031674


outlier count=2000001 / 2031674


get_outlier elapsed time: 5 minutes and 0.52 seconds


In [18]:
# recreate work table base one dfoutl
con.execute("DROP TABLE IF EXISTS work")
con.execute("CREATE TABLE work AS SELECT * FROM dfoutl WHERE 1=0")

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [19]:
# insert into in-memory work table from dfoutl with calculated Moving Avg,Std and asis Flag value

coldef = ", ".join(conf.COLUMN_NAMES)

for col in conf.COLUMN_GRAPH:
    coldef += (
        ", "
        + f"AVG({col}) OVER (ORDER BY {conf.COLUMN_NAMES[0]} ROWS BETWEEN {conf.POINTS['MOVING']} PRECEDING AND CURRENT ROW) AS {conf.MVAVG}{col}"
    )
    coldef += (
        ", "
        + f"STDDEV({col}) OVER (ORDER BY {conf.COLUMN_NAMES[0]} ROWS BETWEEN {conf.POINTS['MOVING']} PRECEDING AND CURRENT ROW) AS {conf.MVSTD}{col}"
    )
    coldef += ", 0"  # mvsig_
coldef += ", 0"  # flag

query = f"INSERT INTO work SELECT {coldef} FROM dfoutl ORDER BY {conf.COLUMN_NAMES[0]}"
# print(textwrap.fill(query, width=120))

con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [20]:
# Update NULL value after window function
query = f"SELECT * FROM work ORDER BY {conf.COLUMN_NAMES[0]}"
dfoutl = con.execute(query).df()
dfoutl.bfill(inplace=True)

# recreate work table base one work dataframe
con.execute("DROP TABLE IF EXISTS work")
con.execute("CREATE TABLE work AS SELECT * FROM dfoutl")

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [21]:
# calculate Sigma value

coldef = ""
for col in conf.COLUMN_GRAPH:
    setdef = textwrap.dedent(f"""{conf.MVSIG}{col} = CASE 
                  WHEN {col} >= ({col} - 1 * {conf.MVSTD}{col}) AND {col} <= ({col} + 1 * {conf.MVSTD}{col})  then 1
                  WHEN {col} >= ({col} - 2 * {conf.MVSTD}{col}) AND {col} <= ({col} + 2 * {conf.MVSTD}{col})  then 2
                  WHEN {col} >= ({col} - 3 * {conf.MVSTD}{col}) AND {col} <= ({col} + 3 * {conf.MVSTD}{col})  then 3
                  WHEN {col} >= ({col} - 4 * {conf.MVSTD}{col}) AND {col} <= ({col} + 4 * {conf.MVSTD}{col})  then 4
                  WHEN {col} >= ({col} - 5 * {conf.MVSTD}{col}) AND {col} <= ({col} + 5 * {conf.MVSTD}{col})  then 5
                  ELSE 6
                  END""")
    if not coldef:
        coldef = setdef
    else:
        coldef += f", {setdef}"

query = f"UPDATE work SET {coldef}"
# print(textwrap.fill(query, width=120))

con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [22]:
# calculate Flag=1 based on configured sigma value

coldef = ""
for col in conf.COLUMN_GRAPH:
    if not coldef:
        coldef = f"{conf.MVSIG}{col} > {conf.SIGMA_NOISE}"
    else:
        coldef += f" OR {conf.MVSIG}{col} > {conf.SIGMA_NOISE}"

query = f"UPDATE work SET {conf.COLUMN_FLAG}=1 WHERE {coldef}"
print(textwrap.fill(query, width=120))

con.execute(query)

UPDATE work SET FLAG=1 WHERE MVSIG_TEM > 2 OR MVSIG_PM1 > 2 OR MVSIG_PM2_5 > 2 OR MVSIG_PM10 > 2 OR MVSIG_CO > 2 OR
MVSIG_NH3 > 2 OR MVSIG_CT1 > 2 OR MVSIG_CT2 > 2 OR MVSIG_CT3 > 2 OR MVSIG_CT4 > 2


<duckdb.duckdb.DuckDBPyConnection at 0x2811794bf30>

In [23]:
dfoutl = con.execute(f"SELECT * FROM work ORDER BY {conf.COLUMN_NAMES[0]}").df()

# round float type value
dfoutl = dfoutl.round(1)

In [24]:
print(f"row count, outlier={len(dfoutl)}, normal={len(dfnorm)}")

row count, outlier=2031674, normal=6095021


In [25]:
print(f"outlier noise count={len(dfoutl[dfoutl[conf.COLUMN_FLAG]==1])}")

outlier noise count=0


In [26]:
# outlier first pattern range data
dfoutl.iloc[conf.POINTS["MOVING"] : conf.POINTS["MOVING"] + conf.POINTS["PATTERN"]]

Unnamed: 0,DATETM,TEM,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4,MVAVG_TEM,MVSTD_TEM,MVSIG_TEM,MVAVG_PM1,MVSTD_PM1,MVSIG_PM1,MVAVG_PM2_5,MVSTD_PM2_5,MVSIG_PM2_5,MVAVG_PM10,MVSTD_PM10,MVSIG_PM10,MVAVG_CO,MVSTD_CO,MVSIG_CO,MVAVG_NH3,MVSTD_NH3,MVSIG_NH3,MVAVG_CT1,MVSTD_CT1,MVSIG_CT1,MVAVG_CT2,MVSTD_CT2,MVSIG_CT2,MVAVG_CT3,MVSTD_CT3,MVSIG_CT3,MVAVG_CT4,MVSTD_CT4,MVSIG_CT4,FLAG
600,2024-01-01 00:01:00.000,40.8,10,12,13,162,88,0.8,1.0,0.5,0.5,44.4,3.4,1,17.7,7.1,1,19.0,6.6,1,19.6,6.2,1,169.1,6.6,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.6,1,0
601,2024-01-01 00:01:00.100,41.5,13,17,16,166,91,1.0,1.1,0.7,0.6,44.4,3.4,1,17.7,7.1,1,19.0,6.6,1,19.6,6.2,1,169.1,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.6,1,0
602,2024-01-01 00:01:00.200,43.3,16,18,17,168,92,1.2,1.1,0.9,0.7,44.4,3.4,1,17.7,7.1,1,19.0,6.6,1,19.6,6.2,1,169.1,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.6,1,0
603,2024-01-01 00:01:00.300,44.2,16,18,20,169,93,1.2,1.3,1.0,0.9,44.4,3.4,1,17.8,7.1,1,19.0,6.6,1,19.6,6.2,1,169.1,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.6,1,0
604,2024-01-01 00:01:00.400,44.7,17,19,21,170,93,1.3,1.3,1.1,1.0,44.4,3.4,1,17.8,7.1,1,19.0,6.5,1,19.6,6.2,1,169.1,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.5,1,0
605,2024-01-01 00:01:00.500,45.1,18,20,22,170,94,1.3,1.4,1.1,1.1,44.4,3.4,1,17.8,7.1,1,19.0,6.5,1,19.6,6.2,1,169.1,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.5,1,0
606,2024-01-01 00:01:00.600,45.5,19,21,22,170,94,1.4,1.4,1.2,1.2,44.5,3.4,1,17.8,7.1,1,19.0,6.5,1,19.7,6.2,1,169.1,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.5,1,0
607,2024-01-01 00:01:00.700,45.7,21,22,23,171,95,1.4,1.5,1.2,1.3,44.5,3.4,1,17.8,7.1,1,19.0,6.5,1,19.7,6.2,1,169.1,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.5,1,0
608,2024-01-01 00:01:00.800,46.1,22,22,24,171,95,1.5,1.5,1.3,1.3,44.5,3.4,1,17.8,7.1,1,19.0,6.5,1,19.7,6.2,1,169.2,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.5,1,0
609,2024-01-01 00:01:00.900,46.3,22,23,24,172,95,1.5,1.5,1.3,1.3,44.5,3.4,1,17.8,7.1,1,19.1,6.5,1,19.7,6.2,1,169.2,6.5,1,92.7,4.4,1,1.3,0.4,1,1.4,0.4,1,1.1,0.5,1,1.1,0.5,1,0


In [27]:
# normal the first pattern range data
dfnorm.iloc[conf.POINTS["MOVING"] : conf.POINTS["MOVING"] + conf.POINTS["PATTERN"]]

Unnamed: 0,DATETM,TEM,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4,MVAVG_TEM,MVSTD_TEM,MVSIG_TEM,MVAVG_PM1,MVSTD_PM1,MVSIG_PM1,MVAVG_PM2_5,MVSTD_PM2_5,MVSIG_PM2_5,MVAVG_PM10,MVSTD_PM10,MVSIG_PM10,MVAVG_CO,MVSTD_CO,MVSIG_CO,MVAVG_NH3,MVSTD_NH3,MVSIG_NH3,MVAVG_CT1,MVSTD_CT1,MVSIG_CT1,MVAVG_CT2,MVSTD_CT2,MVSIG_CT2,MVAVG_CT3,MVSTD_CT3,MVSIG_CT3,MVAVG_CT4,MVSTD_CT4,MVSIG_CT4,FLAG
600,2024-01-01 00:01:00.000,40.8,10,12,13,161,89,0.8,1.0,0.5,0.5,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
601,2024-01-01 00:01:00.100,40.8,10,12,13,161,89,0.9,1.0,0.4,0.6,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
602,2024-01-01 00:01:00.200,40.8,10,12,13,161,89,0.8,1.0,0.7,0.6,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
603,2024-01-01 00:01:00.300,40.8,10,12,13,161,89,0.8,1.0,0.4,0.5,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
604,2024-01-01 00:01:00.400,40.8,10,12,13,161,89,0.8,1.1,0.6,0.6,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
605,2024-01-01 00:01:00.500,40.8,10,12,13,162,88,0.8,1.0,0.4,0.6,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
606,2024-01-01 00:01:00.600,40.8,10,12,13,162,88,0.8,1.1,0.4,0.6,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
607,2024-01-01 00:01:00.700,40.8,10,12,13,162,88,0.8,1.0,0.4,0.5,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
608,2024-01-01 00:01:00.800,40.8,10,12,13,162,89,0.8,1.0,0.5,0.6,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0
609,2024-01-01 00:01:00.900,40.8,10,12,13,162,89,0.8,0.9,0.4,0.6,40.8,0.0,1,10.3,0.9,1,12.1,0.3,1,13.2,0.6,1,162.2,3.9,1,88.2,1.3,1,0.8,0.1,1,1.0,0.1,1,0.5,0.1,1,0.5,0.1,1,0


### Save dataframe into duckdb table

In [28]:
# save noise
_start = time.time()

comm.save_dftab(dfnoise, conf.TABNAME_NOISE)

_elapsed = time.time() - _start
print(f"save db, noise elapsed time: {human.format_timespan(_elapsed)}")

save db, noise elapsed time: 0.05 seconds


In [29]:
# save normal
_start = time.time()

comm.save_dftab(dfnorm, conf.TABNAME_NORM)

_elapsed = time.time() - _start
print(f"save db, norm elapsed time: {human.format_timespan(_elapsed)}")

save db, norm elapsed time: 3.26 seconds


In [30]:
# save outlier
_start = time.time()

comm.save_dftab(dfoutl, conf.TABNAME_OUTL)

_elapsed = time.time() - _start
print(f"save db, outl elapsed time: {human.format_timespan(_elapsed)}")

save db, outl elapsed time: 1.96 seconds


In [31]:
# close in-memory db
con.close()

### Save dataframe into csvfile

In [32]:
# save noise
_start = time.time()

comm.save_csvfile(dfnoise[conf.COLUMN_NAMES], conf.FILENAME_NOISE, conf.DIROUT)  # only noise

_elapsed = time.time() - _start
print(f"save csvfile, noise elapsed time: {human.format_timespan(_elapsed)}")  # no files for conf.INPUT_MAXSIZE=400 MB

save_csvfile, dataframe has no rows to write, skip
save csvfile, noise elapsed time: 0 seconds


In [33]:
# save normal
_start = time.time()

comm.save_csvfile(dfnorm[conf.COLUMN_NAMES], conf.FILENAME_NORM, conf.DIROUT)

_elapsed = time.time() - _start
print(f"save csvfile, norm elapsed time: {human.format_timespan(_elapsed)}")
#  8 min. 52 sec, 170 files, 374 MB for conf.INPUT_MAXSIZE=400 MB, exclude DATETM
# 11 min. 49 sec, 274 files, 380 MB for conf.INPUT_MAXSIZE=650 MB, include DATETM

saved 1/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-001.csv


saved 11/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-011.csv


saved 21/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-021.csv


saved 31/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-031.csv


saved 41/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-041.csv


saved 51/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-051.csv


saved 61/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-061.csv


saved 71/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-071.csv


saved 81/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-081.csv


saved 91/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-091.csv


saved 101/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-101.csv


saved 111/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-111.csv


saved 121/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-121.csv


saved 131/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-131.csv


saved 141/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-141.csv


saved 151/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-151.csv


saved 161/170 csvfile=C:\projects\ohtdatafiles\dataout\ohtnorm\ohtnorm-161.csv


saved 170/170 files
save csvfile, norm elapsed time: 4 minutes and 36.64 seconds


In [34]:
# save outlier
_start = time.time()

comm.save_csvfile(dfoutl[conf.COLUMN_NAMES], conf.FILENAME_OUTL, conf.DIROUT)

_elapsed = time.time() - _start
print(f"save csvfile, outl elapsed time: {human.format_timespan(_elapsed)}")
# 2 min. 57 sec, 57 files 126 MB for conf.INPUT_MAXSIZE=400 MB
# 3 min. 58 sec. 72 files 128 MB for conf.INPUT_MAXSIZE=650 MB

saved 1/57 csvfile=C:\projects\ohtdatafiles\dataout\ohtoutl\ohtoutl-001.csv


saved 11/57 csvfile=C:\projects\ohtdatafiles\dataout\ohtoutl\ohtoutl-011.csv


saved 21/57 csvfile=C:\projects\ohtdatafiles\dataout\ohtoutl\ohtoutl-021.csv


saved 31/57 csvfile=C:\projects\ohtdatafiles\dataout\ohtoutl\ohtoutl-031.csv


saved 41/57 csvfile=C:\projects\ohtdatafiles\dataout\ohtoutl\ohtoutl-041.csv


saved 51/57 csvfile=C:\projects\ohtdatafiles\dataout\ohtoutl\ohtoutl-051.csv


saved 57/57 files
save csvfile, outl elapsed time: 1 minute and 32.36 seconds


### Check file size

In [35]:
# check file size

fileinfo = dict()  # count, size
for afile in [conf.FILENAME_NOISE, conf.FILENAME_NORM, conf.FILENAME_OUTL]:
    basename = pathlib.Path(afile).stem
    adir = str(pathlib.Path(conf.DIROUT) / basename)
    files = comm.get_multifiles_indir(adir, "*.csv")
    sizes = comm.get_multifiles_size(files)

    fileinfo[basename] = [len(files), sum(sizes)]

total_count, total_size = 0, 0
for basename, count_size in fileinfo.items():
    print(f"output {basename} files={count_size[0]}, size={human.format_size(count_size[1])}")
    total_count = total_count + count_size[0]
    total_size = total_size + count_size[1]

print(f"total files={total_count}, size={human.format_size(total_size)}")

output ohtnoise files=0, size=0 bytes
output ohtnorm files=170, size=374.3 MB
output ohtoutl files=57, size=126.64 MB
total files=227, size=500.94 MB


In [36]:
_elapsed = time.time() - mainstart
print(f"main elapsed time: {human.format_timespan(_elapsed)}")
# 3 min.  for conf.INPUT_MAXSIZE=400MB, when include DATETM

main elapsed time: 11 minutes and 49.54 seconds


## eof