In [None]:
# filename: graph.ipynb
# purpose: data visualization and analysis

# OHT Data Visualization & Analysis

### Processing flow
- 데이터 체크 - column, type, value and volume
- 통계 분석 - statistics
- 추이 분석 - lineplot
- 분산 분석 - histogram, boxplot, violinplot
- 상관관계 분석 - heatmap, scatterplot

In [None]:
# packages
import time
import pathlib
import pandas as pd

import humanfriendly as human

import ohtconf as conf
import ohtcomm as comm
import ohtgraph as graph

## Main

In [None]:
# wall-clock start of the notebook run; the final cell subtracts it from time.time() to report total elapsed time
mainstart = time.time()

In [None]:
# show floats with a single decimal place in every pandas repr
pd.options.display.float_format = "{:.1f}".format

In [None]:
# start every run with an empty chart output directory:
# drop whatever a previous run left behind, then create the directory for the png files again
_chart_dir = pathlib.Path(conf.DIRCHART)
comm.remove_directory(conf.DIRCHART)
_chart_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# row slice applied to the dataframes before line charts are drawn
xs = conf.CHARTSLICE
# to override the configured slice for this notebook only, rebuild it here, e.g.
# xs = slice(conf.CHARTSLICE.start, conf.CHARTSLICE.stop, conf.CHARTSLICE.step)
print("chart dataframe slice=" + str(xs))

In [None]:
# default plot size
# conf.PLOTSIZE is reassigned here so that the graph functions pick the value up;
# edit the two elements below to change the size for this notebook
_plot_width, _plot_height = conf.PLOTSIZE[0], conf.PLOTSIZE[1]
conf.PLOTSIZE = [_plot_width, _plot_height]
print("chart plotsize=" + str(conf.PLOTSIZE))

### 데이터 체크

In [None]:
# read table data
# Load the three tables (conf.TABNAME_NOISE / TABNAME_NORM / TABNAME_OUTL), each sorted by the
# first configured column (conf.COLUMN_NAMES[0]) and re-indexed from 0.
# Chained calls replace the three copy-pasted inplace=True stanzas, so re-running the cell
# always rebuilds the frames from a fresh read.
_start = time.time()

dfnoise, dfnorm, dfoutl = (
    comm.read_tabdf(_tabname).sort_values(by=conf.COLUMN_NAMES[0]).reset_index(drop=True)
    for _tabname in (conf.TABNAME_NOISE, conf.TABNAME_NORM, conf.TABNAME_OUTL)
)

_elapsed = time.time() - _start
print(f"Read elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# number of rows loaded per table
_row_counts = {"noise": len(dfnoise), "norm": len(dfnorm), "outl": len(dfoutl)}
print("row count " + ", ".join(f"{_name}={_count}" for _name, _count in _row_counts.items()))

In [None]:
# null checking
# dfnorm.info()

In [None]:
# null checking
# dfoutl.info()

In [None]:
# preview the first five rows of the "normal" table
dfnorm.head(n=5)

In [None]:
# preview the first five rows of the "outlier" table
dfoutl.head(n=5)

### 데이터 통계 분석

In [None]:
# descriptive statistics of the "normal" table (pandas defaults: count, mean, std, min, quartiles, max)
# values are rendered with one decimal because of the display.float_format option set earlier
dfnorm.describe()

In [None]:
# descriptive statistics of the "outlier" table (pandas defaults: count, mean, std, min, quartiles, max)
# values are rendered with one decimal because of the display.float_format option set earlier
dfoutl.describe()

### 데이터 추이 분석

In [None]:
# line chart: outlier vs. normal rows of the chart slice, one call per column group
# each spec is (file-name tag, grid passed to the chart, columns to draw)
_line_specs = (
    ("tem", (1, 1), conf.COLUMN_TEM),
    ("pma", (1, 3), conf.COLUMN_PMA),
    ("coa", (1, 2), conf.COLUMN_COA),
    ("cta", (2, 2), conf.COLUMN_CTA),
)
for _tag, _grid, _columns in _line_specs:
    graph.linechart(
        dfs=[dfoutl[xs], dfnorm[xs]],
        labels=["outlier", "normal"],
        cols=_columns,
        grid=_grid,
        pngfile="outl-norm-line-" + _tag + ".png",
    )

In [None]:
# line chart - moving average
# Take one row every conf.POINTS["MOVING"] rows, beginning at the chart slice start, so that
# each chart shows at most _MVAVG_MAX_POINTS sampled rows.
_MVAVG_MAX_POINTS = 100
_mv_step = conf.POINTS["MOVING"]
# slice.start is None for slices such as slice(None, n); treat that as row 0
# instead of failing on `None + int`
_mv_start = conf.CHARTSLICE.start or 0
mvs = slice(_mv_start, min(len(dfoutl), _mv_start + _mv_step * _MVAVG_MAX_POINTS), _mv_step)

# moving-average column names are the raw column names prefixed with conf.MVAVG
mvavg_tem = [conf.MVAVG + col for col in conf.COLUMN_TEM]
mvavg_pma = [conf.MVAVG + col for col in conf.COLUMN_PMA]
mvavg_coa = [conf.MVAVG + col for col in conf.COLUMN_COA]
mvavg_cta = [conf.MVAVG + col for col in conf.COLUMN_CTA]

# [file-name tag, grid passed to the chart] for each column group, in the same order as the column lists
ggrids = [["tem", (1, 1)], ["pma", (1, 3)], ["coa", (1, 2)], ["cta", (2, 2)]]
for ggrid, cols in zip(ggrids, [mvavg_tem, mvavg_pma, mvavg_coa, mvavg_cta]):
    graph.linechart(
        dfs=[dfoutl[mvs], dfnorm[mvs]],
        labels=["outlier", "normal"],
        cols=cols,
        grid=ggrid[1],
        title="Moving Average by Line chart",
        pngfile=f"outl-norm-mvavg-line-{ggrid[0]}.png",
    )

### 데이터 분산 분석

In [None]:
# histogram chart: distribution of each column group, outlier vs. normal
# the normal frame is cut to the outlier frame's row count before it is passed in
# each spec is (file-name tag, grid passed to the chart, columns to draw)
_hist_specs = (
    ("tem", (1, 1), conf.COLUMN_TEM),
    ("pma", (1, 3), conf.COLUMN_PMA),
    ("coa", (1, 2), conf.COLUMN_COA),
    ("cta", (2, 2), conf.COLUMN_CTA),
)
for _tag, _grid, _columns in _hist_specs:
    graph.histchart(
        dfs=[dfoutl, dfnorm.iloc[: len(dfoutl)]],
        labels=["outlier", "normal"],
        cols=_columns,
        grid=_grid,
        pngfile="outl-norm-hist-" + _tag + ".png",
    )

In [None]:
# box chart: spread of each column group, outlier vs. normal
# the normal frame is cut to the outlier frame's row count before it is passed in
# file-name tag -> (grid passed to the chart, columns to draw)
_box_specs = {
    "tem": ((1, 1), conf.COLUMN_TEM),
    "pma": ((1, 3), conf.COLUMN_PMA),
    "coa": ((1, 2), conf.COLUMN_COA),
    "cta": ((2, 2), conf.COLUMN_CTA),
}
for _tag, (_grid, _columns) in _box_specs.items():
    graph.boxchart(
        dfs=[dfoutl, dfnorm.iloc[: len(dfoutl)]],
        labels=["outlier", "normal"],
        cols=_columns,
        grid=_grid,
        pngfile="outl-norm-box-" + _tag + ".png",
    )

In [None]:
# violin chart: density of each column group, outlier vs. normal
# the normal frame is cut to the outlier frame's row count before it is passed in
_violin_tags = ("tem", "pma", "coa", "cta")
_violin_grids = ((1, 1), (1, 3), (1, 2), (2, 2))
_violin_columns = (conf.COLUMN_TEM, conf.COLUMN_PMA, conf.COLUMN_COA, conf.COLUMN_CTA)
for _tag, _grid, _columns in zip(_violin_tags, _violin_grids, _violin_columns):
    graph.violinchart(
        dfs=[dfoutl, dfnorm.iloc[: len(dfoutl)]],
        labels=["outlier", "normal"],
        cols=_columns,
        grid=_grid,
        pngfile="outl-norm-violin-" + _tag + ".png",
    )

### 데이터 상관관계 분석

In [None]:
# heatmap chart based on correlation, restricted to the columns listed in conf.COLUMN_GRAPH
_heatmap_input = dfnorm[conf.COLUMN_GRAPH]
graph.heatmapchart(_heatmap_input, pngfile="norm-heatmap-all.png")

# NOTE: with a large dataset, strong (>0.75) and moderate (>0.50) correlations were found:
# TEM-NH3: 0.88, PM1-PM2_5: 0.92, PM1-PM10: 0.98, PM2_5-PM10: 0.95, PM2_5-CT1: 0.56, CO-CT1: 0.57

In [None]:
# scatter chart: one chart per unordered pair of columns in conf.COLUMN_GRAPH
# (skipped entirely unless conf.SCATTER_INCLUDE is truthy)
if conf.SCATTER_INCLUDE:
    for c1idx, c1name in enumerate(conf.COLUMN_GRAPH):
        # Pair each column only with the columns after it. The slice is empty for the last
        # column, so no explicit bound check is needed. The previous check compared against
        # len(conf.COLUMN_NAMES), a different list, and could drop pairs.
        for c2name in conf.COLUMN_GRAPH[c1idx + 1 :]:
            graph.scatterchart(dfnorm, cols=[c1name, c2name], pngfile=f"norm-scatter-{c1name}-{c2name}.png")

# Correlation between features (as recorded by the original author):
# - TEM has no correlation with the other features.
# - PM1, PM2_5 and PM10 have some linear correlation.
# - CO and NH3 have some linear correlation.
# - CO, NH3 and CT1~CT4 have some linear correlation.
# - CT1~CT4 have no correlation among themselves.
# NOTE(review): the heatmap cell records TEM-NH3 = 0.88, which contradicts the first bullet;
# confirm which observation is current.

In [None]:
# total wall-clock time since mainstart was taken
print("main elapsed time: " + human.format_timespan(time.time() - mainstart))
# 1 min. for conf.INPUT_MAXSIZE = 400 MB

### eof