In [1]:
# filename: arse.ipynb
# purpose: parse input dataset files and save to duckdb file

# OHT raw data parsing

## Dataset information
- raw dataset is multiple CSV files in a directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata"
- CSV filename patterns are "afpLog_YYYY-MM-DD_HHMMSS.csv". ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata\afpLog_2024-05-29_101339.csv" 
- CSV files have no header line, are comma (,) delimited, and have 6 columns
- CSV file line example: datatime_ms, tem, ecm1(electronic current measure 1), ecm2, ecm3, ecm4. ex) 2024-05-29 09:13:39:889,34.5,-0.1,1,0.5,0.6
- To speed up reuse, a duckdb file named after the parent directory (parentdir-basename.duckdb) will be created in that parent directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\수집csv.duckdb"

## Duckdb information
- raw dataset is multiple CSV files in a directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata"
- To speed up reuse, a duckdb file will be created with parentdir-basename.duckdb. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\수집csv.duckdb"
- TABNAME_RAW table is created by this notebook when the dbfile is created.
- TABNAME_NORM, NOISE, OUTL tables are not created in this notebook (presumably they are created and used by later processing scripts)

## Processing flow
- Parse raw CSV files
- Save to the raw table in the duckdb file
- Display data info

In [2]:
# packages
import time
import pathlib
import pandas as pd

import humanfriendly as human

import ohtconf as conf
import ohtcomm as comm

## Main

In [None]:
# wall-clock start of the whole notebook run; the final cell subtracts this to report total elapsed time
mainstart = time.time()

In [None]:
# When conf.DBFILE_RECREATE is truthy, remove the DB file before loading so that the
# load cell below takes its "DB file does not exist" branch and re-parses the CSV files.
# NOTE(review): presumably comm.remove_file deletes the file and tolerates a missing one — confirm.
if conf.DBFILE_RECREATE:
    comm.remove_file(conf.DBFILE)

In [None]:
# Load the raw dataset into `dfraw`.
#   - DB file exists : read the raw table back from the DB file (reuse path).
#   - otherwise      : select csv files (skip / limit / max-size), parse them,
#                      order + fill the frame, then save it to the raw table.
_start = time.time()

# reset first, so a failed run never leaves a stale frame from an earlier execution
dfraw: pd.DataFrame = None

if pathlib.Path(conf.DBFILE).exists():
    dfraw = comm.read_tabdf(conf.TABNAME_RAW)
    # ensure order by the first configured column; reassign instead of
    # inplace=True so the step reads as "new frame from inputs"
    dfraw = dfraw.sort_values(by=conf.COLUMN_NAMES[0]).reset_index(drop=True)

    _elapsed = time.time() - _start
    print(f"Read elapsed time: {human.format_timespan(_elapsed)}")
else:
    csvfiles = comm.get_multifiles_indir(conf.DIRRAW, conf.FILENAME_PATTERN)
    if len(csvfiles) < 1:
        # FileNotFoundError is still an Exception, so existing handlers keep working
        raise FileNotFoundError(f"dataset no csvfiles in the directory, {conf.DIRRAW}")

    print(f"all csvfile count={len(csvfiles)} in dir={conf.DIRRAW}")

    # skip files: drop the first SKIP_FILES entries, but only when at least one file remains
    if conf.SKIP_FILES is not None and conf.SKIP_FILES > 0:
        if len(csvfiles) > conf.SKIP_FILES:
            csvfiles = csvfiles[conf.SKIP_FILES :]
            print(f"skip files={conf.SKIP_FILES} applied")
        else:
            print(f"Ignore SKIP_FILES={conf.SKIP_FILES} as not enough files at dir={conf.DIRRAW}")

    # limit files: keep at most the first LIMIT_FILES entries
    if conf.LIMIT_FILES is not None and conf.LIMIT_FILES > 0 and len(csvfiles) > conf.LIMIT_FILES:
        csvfiles = csvfiles[: conf.LIMIT_FILES]
        print(f"limit files={conf.LIMIT_FILES} applied")

    csvsizes = comm.get_multifiles_size(csvfiles)

    # max input size: keep files in order until the running total passes INPUT_MAXSIZE.
    # The file that crosses the limit is kept, so the selected size can exceed
    # INPUT_MAXSIZE by up to one file (and at least one file is always selected).
    if conf.INPUT_MAXSIZE is not None and conf.INPUT_MAXSIZE > 0 and sum(csvsizes) > conf.INPUT_MAXSIZE:
        cumfiles, cumsizes, sumsize = [], [], 0
        for fname, fsize in zip(csvfiles, csvsizes):
            cumfiles.append(fname)
            cumsizes.append(fsize)
            sumsize += fsize
            if sumsize > conf.INPUT_MAXSIZE:
                # always reached: the total of all sizes is known to exceed the limit
                csvfiles = cumfiles
                csvsizes = cumsizes
                break
        print(f"input maxsize={human.format_size(conf.INPUT_MAXSIZE)} applied")

    print(f"selected csvfile count={len(csvfiles)}, size={human.format_size(sum(csvsizes))}")
    print(f"selected first file={csvfiles[0]}, last file={csvfiles[-1]}")

    dfraw = comm.read_multifiles(files=csvfiles, logstep=conf.LOGSTEP_FILES, verbose=conf.VERBOSE)

    # ensure order, non-null: sort by the first configured column, renumber the index,
    # then fill gaps backward first and forward second (forward fill covers trailing gaps)
    dfraw = (
        dfraw.sort_values(by=conf.COLUMN_NAMES[0])
        .reset_index(drop=True)
        .bfill()
        .ffill()
    )

    # save to db so the next run can take the reuse path above
    comm.save_dftab(dfraw, conf.TABNAME_RAW)

    _elapsed = time.time() - _start
    print(f"Parse elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# data check: preview the first rows of the raw dataframe
dfraw.head()

In [None]:
# data statistics: summary statistics of the raw dataframe
dfraw.describe()

In [None]:
# report the total wall-clock time since `mainstart` was set at the top of the notebook
_elapsed = time.time() - mainstart
print(f"main elapsed time: {human.format_timespan(_elapsed)}")
# reference timing: about 1 min. for 170 files (420 MB of input) with conf.INPUT_MAXSIZE = 400 MB

## eof