In [1]:
# filename: parse.ipynb
# purpose: parse input dataset files and save to duckdb file

# OHT raw dataset parsing

## Dataset information
- raw dataset is multiple CSV files in a directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata"
- CSV filename patterns are "afpLog_YYYY-MM-DD_HHMMSS.csv". ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata\afpLog_2024-05-29_101339.csv" 
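- CSV files have no header line, are comma(,) delimited, and have 6 columns.
- CSV line layout: datetime_ms, temper, ecm1(electronic current measure 1), ecm2, ecm3, ecm4. ex) 2024-05-29 09:13:39:889,34.5,-0.1,1,0.5,0.6
- NOTE: the sample run shown in this notebook yields 11 columns (DATETM, TEMPER, PM1, PM2_5, PM10, CO, NH3, CT1-CT4), so the 6-column layout above may describe an older file format; please verify.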
- To speed up later runs, a duckdb file named parentdir-basename.duckdb is created in the parent directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\수집csv.duckdb"

## Duckdb information
- raw dataset is multiple CSV files in a directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata"
- To speed up later runs, a duckdb file named parentdir-basename.duckdb is created in the parent directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\수집csv.duckdb"
- The TABNAME_RAW table is created at the same time as the DB file.
- TABNAME_NORM, NOISE, OUTL will be created and used in this script

## Processing flow
- Parse the raw CSV files
- Save the result to the raw table in the duckdb file
- Display data info

In [2]:
# packages
import time
import pathlib
import pandas as pd

import humanfriendly as human

import ohtconf as conf
import ohtcomm as comm

## Main

In [3]:
# Wall-clock start of the whole run; used later to report the total elapsed time.
mainstart = time.time()

In [4]:
# When conf.DBFILE_RECREATE is set, remove the existing DB file first
# (presumably comm.remove_file deletes it from disk -- confirm in ohtcomm),
# so the loading cell below finds no DB file and re-parses the CSV files.
if conf.DBFILE_RECREATE:
    comm.remove_file(conf.DBFILE)

In [5]:
# Load the raw dataset into `dfraw`.
# - If the DB file already exists, the table conf.TABNAME_RAW is loaded through
#   comm.read_tabdf and no CSV file is touched.
# - Otherwise the CSV files in conf.DIRRAW are selected (skip / limit / max total
#   size), parsed, and the result is saved to conf.TABNAME_RAW.
_start = time.time()

# String annotation keeps this valid on older Python versions; the value is None
# only until one of the branches below assigns the dataframe.
dfraw: "pd.DataFrame | None" = None

if pathlib.Path(conf.DBFILE).exists():
    dfraw = comm.read_tabdf(conf.TABNAME_RAW)

    _elapsed = time.time() - _start
    print(f"Read elapsed time: {human.format_timespan(_elapsed)}")
else:
    csvfiles = comm.get_multifiles_indir(conf.DIRRAW, conf.FILENAME_PATTERN)
    if len(csvfiles) < 1:
        # FileNotFoundError is a subclass of Exception, so existing handlers still work.
        raise FileNotFoundError(f"dataset no csvfiles in the directory, {conf.DIRRAW}")

    print(f"all csvfile count={len(csvfiles)} in dir={conf.DIRRAW}")

    # skip files: drop the first SKIP_FILES entries, but only if some files remain
    if conf.SKIP_FILES is not None and conf.SKIP_FILES > 0:
        if len(csvfiles) > conf.SKIP_FILES:
            csvfiles = csvfiles[conf.SKIP_FILES :]
            print(f"skip files={conf.SKIP_FILES} applied")
        else:
            print(f"Ignore SKIP_FILES={conf.SKIP_FILES} as not enough files at dir={conf.DIRRAW}")

    # limit files: keep at most LIMIT_FILES entries
    if conf.LIMIT_FILES is not None and conf.LIMIT_FILES > 0:
        if len(csvfiles) > conf.LIMIT_FILES:
            csvfiles = csvfiles[: conf.LIMIT_FILES]
            print(f"limit files={conf.LIMIT_FILES} applied")

    csvsizes = comm.get_multifiles_size(csvfiles)

    # limit total input size: keep the longest leading run of files whose
    # cumulative size stays within INPUT_MAXSIZE. The first file is always kept,
    # even if it alone exceeds the limit, so the selection is never empty.
    if conf.INPUT_MAXSIZE is not None and conf.INPUT_MAXSIZE > 0:
        if sum(csvsizes) > conf.INPUT_MAXSIZE:
            keepcount, sumsize = 0, 0
            for fsize in csvsizes:
                if keepcount > 0 and sumsize + fsize > conf.INPUT_MAXSIZE:
                    break
                sumsize += fsize
                keepcount += 1
            csvfiles = csvfiles[:keepcount]
            csvsizes = list(csvsizes)[:keepcount]
            print(f"input maxsize={human.format_size(conf.INPUT_MAXSIZE)} applied")

    print(f"selected csvfile count={len(csvfiles)}, size={human.format_size(sum(csvsizes))}")
    print(f"selected first file={csvfiles[0]}, last file={csvfiles[-1]}")

    dfraw = comm.read_multifiles(files=csvfiles, logstep=conf.LOGSTEP_FILES, verbose=conf.VERBOSE)

    # save to db so that later runs take the fast "DB file exists" branch above
    comm.save_dftab(dfraw, conf.TABNAME_RAW)

    _elapsed = time.time() - _start
    print(f"Parse elapsed time: {human.format_timespan(_elapsed)}")

all csvfile count=3 in dir=.\sample\dataraw
selected csvfile count=3, size=7.53 MB
selected first file=sample\dataraw\afpLog_2024-07-29_094318.csv, last file=sample\dataraw\afpLog_2024-07-29_114318.csv
file reading 1 file=afpLog_2024-07-29_094318.csv
dataframe prepared with (rows,columns)=(108000, 11) in 3 files.
Parse elapsed time: 2.15 seconds


In [6]:
# data check: preview the first rows of the raw dataframe
dfraw.head()

Unnamed: 0,DATETM,TEMPER,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4
0,2024-07-29 09:43:18.696,40.799999,10,12,13,161,88,0.8,1.0,0.5,0.6
1,2024-07-29 09:43:18.781,40.799999,10,12,13,161,88,0.8,1.0,0.5,0.6
2,2024-07-29 09:43:18.883,40.799999,10,12,13,161,88,0.8,1.0,0.5,0.5
3,2024-07-29 09:43:18.989,40.799999,10,12,13,161,88,0.9,1.0,0.5,0.6
4,2024-07-29 09:43:19.094,40.799999,10,12,13,161,88,0.8,1.0,0.7,0.6


In [7]:
# data statistics: summary statistics for each column
dfraw.describe()

Unnamed: 0,DATETM,TEMPER,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4
count,108000,108000.0,108000.0,108000.0,108000.0,108000.0,108000.0,108000.0,108000.0,108000.0,108000.0
mean,2024-07-29 11:13:18.669019392,40.544147,11.003269,12.3525,13.732074,162.644713,88.483009,0.830381,1.003029,0.481338,0.546794
min,2024-07-29 09:43:18.696000,40.400002,9.0,11.0,12.0,134.0,74.0,0.6,0.8,0.3,0.3
25%,2024-07-29 10:28:18.694749952,40.5,10.0,12.0,13.0,161.0,88.0,0.8,1.0,0.4,0.5
50%,2024-07-29 11:13:18.669000192,40.5,10.0,12.0,13.0,163.0,89.0,0.8,1.0,0.5,0.5
75%,2024-07-29 11:58:18.644000,40.599998,13.0,13.0,15.0,164.0,89.0,0.9,1.0,0.5,0.6
max,2024-07-29 12:43:18.619000,40.900002,15.0,19.0,26.0,170.0,92.0,1.0,1.2,0.8,0.7
std,,0.07835,1.426086,0.516026,1.022888,2.466705,0.892518,0.054012,0.062317,0.065392,0.058604


In [8]:
# Total wall-clock time since `mainstart` was recorded.
_elapsed = time.time() - mainstart
print(
    f"main elapsed time: {human.format_timespan(_elapsed)}"
)  # reference timing from an earlier run: 1 min. 57 sec for 170 files, 420 MB input files when set conf.INPUT_MAXSIZE = 400 MB

main elapsed time: 2.23 seconds


## eof