In [1]:
# filename: arse.ipynb
# purpose: parse input dataset files and save to duckdb file

# OHT raw data parsing

## Dataset information
- raw dataset is multiple CSV files in a directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata"
- CSV filename patterns are "afpLog_YYYY-MM-DD_HHMMSS.csv". ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata\afpLog_2024-05-29_101339.csv" 
- CSV file does not include head line, comma(,) delimited, 6 columns
- CSV file line example: datatime_ms, tem, ecm1(electronic current measure 1), ecm2, ecm3, ecm4. ex) 2024-05-29 09:13:39:889,34.5,-0.1,1,0.5,0.6
- For reusing performance, a duckdb file will be created with dir-basename.duckdb. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\수집csv.duckdb"

## Duckdb information
- raw dataset is multiple CSV files in a directory. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\newdata"
- For reusing performance, a duckdb file will be created with parentdir-basename.duckdb. ex) r"\\BlueServer\공동작업폴더\데이터바우처_OHT\수집csv\수집csv.duckdb"
- TABNAME_RAW table was created when the dbfile was created.
- TABNAME_NORM, NOISE, OUTL will be created and used in this script

## Processing flow
- Parse raws csvfiles
- Save to Raw table in-file duckdb
- Display data info

In [2]:
# packages
import time
import pathlib
import pandas as pd

import humanfriendly as human

import ohtconf as conf
import ohtcomm as comm

## Main

In [3]:
mainstart = time.time()

In [4]:
if conf.DBFILE_RECREATE:
    comm.remove_file(conf.DBFILE)

In [5]:
# read dataset files in a directory
_start = time.time()

dfraw: pd.DataFrame = None

if pathlib.Path(conf.DBFILE).exists():
    dfraw = comm.read_tabdf(conf.TABNAME_RAW)
    # ensure order
    dfraw.sort_values(by=conf.COLUMN_NAMES[0], inplace=True)
    dfraw.reset_index(drop=True, inplace=True)

    _elapsed = time.time() - _start
    print(f"Read elapsed time: {human.format_timespan(_elapsed)}")
else:
    csvfiles = comm.get_multifiles_indir(conf.DIRRAW, conf.FILENAME_PATTERN)
    if len(csvfiles) < 1:
        raise Exception(f"dataset no csvfiles in the directory, {conf.DIRRAW}")

    print(f"all csvfile count={len(csvfiles)} in dir={conf.DIRRAW}")

    # skip files
    if conf.SKIP_FILES is not None and conf.SKIP_FILES > 0:
        if len(csvfiles) > conf.SKIP_FILES:
            csvfiles = csvfiles[conf.SKIP_FILES :]
            print(f"skip files={conf.SKIP_FILES} applied")
        else:
            print(f"Ignore SKIP_FILES={conf.SKIP_FILES} as not enough files at dir={conf.DIRRAW}")

    # limit files
    if conf.LIMIT_FILES is not None and conf.LIMIT_FILES > 0:
        if len(csvfiles) > conf.LIMIT_FILES:
            csvfiles = csvfiles[: conf.LIMIT_FILES]
            print(f"limit files={conf.LIMIT_FILES} applied")

    csvsizes = comm.get_multifiles_size(csvfiles)

    if conf.INPUT_MAXSIZE is not None and conf.INPUT_MAXSIZE > 0:
        if sum(csvsizes) > conf.INPUT_MAXSIZE:
            cumfiles, cumsizes, sumsize = [], [], 0
            for fname, fsize in zip(csvfiles, csvsizes):
                cumfiles.append(fname)
                cumsizes.append(fsize)
                sumsize += fsize
                if sumsize > conf.INPUT_MAXSIZE:
                    csvfiles = cumfiles
                    csvsizes = cumsizes
                    break
            print(f"input maxsize={human.format_size(conf.INPUT_MAXSIZE)} applied")

    print(f"selected csvfile count={len(csvfiles)}, size={human.format_size(sum(csvsizes))}")
    print(f"selected first file={csvfiles[0]}, last file={csvfiles[-1]}")

    dfraw = comm.read_multifiles(files=csvfiles, logstep=conf.LOGSTEP_FILES, verbose=conf.VERBOSE)

    # ensure order, non-null
    dfraw.sort_values(by=conf.COLUMN_NAMES[0], inplace=True)
    dfraw.reset_index(drop=True, inplace=True)
    dfraw.bfill(inplace=True)
    dfraw.ffill(inplace=True)

    # save to db
    comm.save_dftab(dfraw, conf.TABNAME_RAW)

    _elapsed = time.time() - _start
    print(f"Parse elapsed time: {human.format_timespan(_elapsed)}")

all csvfile count=279 in dir=C:\projects\ohtdatafiles\dataraw
input maxsize=419.43 MB applied
selected csvfile count=170, size=420.81 MB
selected first file=C:\projects\ohtdatafiles\dataraw\afpLog_2024-07-29_094318.csv, last file=C:\projects\ohtdatafiles\dataraw\afpLog_2024-08-05_091706.csv
file reading 1 file=afpLog_2024-07-29_094318.csv


file reading 11 file=afpLog_2024-07-29_194318.csv


file reading 21 file=afpLog_2024-07-30_054318.csv


file reading 31 file=afpLog_2024-07-30_154318.csv


file reading 41 file=afpLog_2024-07-31_014318.csv


file reading 51 file=afpLog_2024-07-31_114318.csv


file reading 61 file=afpLog_2024-07-31_214318.csv


file reading 71 file=afpLog_2024-08-01_074318.csv


CSV error lines in file=afpLog_2024-08-01_084318.csv:
no=11174, line="                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

file reading 81 file=afpLog_2024-08-01_161706.csv


file reading 91 file=afpLog_2024-08-02_021706.csv


file reading 101 file=afpLog_2024-08-02_121706.csv


file reading 111 file=afpLog_2024-08-02_221706.csv


file reading 121 file=afpLog_2024-08-03_081706.csv


file reading 131 file=afpLog_2024-08-03_181706.csv


file reading 141 file=afpLog_2024-08-04_041706.csv


file reading 151 file=afpLog_2024-08-04_141706.csv


file reading 161 file=afpLog_2024-08-05_001706.csv


dataframe prepared with (rows,columns)=(6095021, 11) in 170 files.


Parse elapsed time: 1 minute and 20.04 seconds


In [6]:
# 데이터 체크
dfraw.head()

Unnamed: 0,DATETM,TEM,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4
0,2024-07-29 09:43:18.696,40.799999,10,12,13,161,88,0.8,1.0,0.5,0.6
1,2024-07-29 09:43:18.781,40.799999,10,12,13,161,88,0.8,1.0,0.5,0.6
2,2024-07-29 09:43:18.883,40.799999,10,12,13,161,88,0.8,1.0,0.5,0.5
3,2024-07-29 09:43:18.989,40.799999,10,12,13,161,88,0.9,1.0,0.5,0.6
4,2024-07-29 09:43:19.094,40.799999,10,12,13,161,88,0.8,1.0,0.7,0.6


In [7]:
# 데이터 통계
dfraw.describe()

Unnamed: 0,DATETM,TEM,PM1,PM2_5,PM10,CO,NH3,CT1,CT2,CT3,CT4
count,6095021,6095021.0,6095021.0,6095021.0,6095021.0,6095021.0,6095021.0,6095021.0,6095021.0,6095021.0,6095021.0
mean,2024-08-01 21:56:36.582684928,42.11648,9.383445,10.84684,12.31504,157.2084,98.24781,0.6282347,0.9729512,0.4428262,0.5762765
min,2024-07-29 09:43:18.696000,22.6,7.0,8.0,9.0,31.0,66.0,-0.5,0.2,0.2,0.3
25%,2024-07-31 04:02:54.219000064,40.8,8.0,9.0,11.0,141.0,88.0,0.3,0.9,0.4,0.5
50%,2024-08-01 21:37:40.710000128,41.7,9.0,11.0,12.0,158.0,93.0,0.6,1.0,0.4,0.6
75%,2024-08-03 15:57:22.609999872,43.5,10.0,12.0,13.0,170.0,110.0,0.9,1.0,0.5,0.6
max,2024-08-05 10:17:05.909000,45.5,27.0,29.0,42.0,253.0,140.0,3.0,1.5,0.8,1.0
std,,1.725541,1.61597,1.644981,1.463494,28.8416,12.3927,0.3341508,0.07151102,0.07401512,0.06755878


In [8]:
_elapsed = time.time() - mainstart
print(f"main elapsed time: {human.format_timespan(_elapsed)}")
# 1 min. for 170 files, 420 MB input files when set conf.INPUT_MAXSIZE = 400 MB

main elapsed time: 1 minute and 21.36 seconds


## eof