In [None]:
import numpy as np
import pandas as pd
import wandb
import weave

In [None]:
# Authenticate this kernel session with Weights & Biases before any run is created.
wandb.login()

In [None]:
# W&B entity and project under which this notebook's runs are created.
ENTITY = 'dpaiton'
PROJECT = 'exploring-tabular-data'
# Open a 'load-data' run; using it as a context manager ends the run when the block exits.
with wandb.init(entity=ENTITY, project=PROJECT, job_type='load-data') as run:
    # Declare the latest 'data-library' artifact from the 'splitting-tabular-data'
    # project as an input of this run.
    split_artifact = run.use_artifact(f'{ENTITY}/splitting-tabular-data/data-library:latest')
    # Fetch the object stored in the artifact under the name 'data-table'.
    # NOTE(review): presumably a wandb.Table -- later cells use its .columns,
    # .data and .get_column(); confirm against the producing notebook.
    data_table = split_artifact.get('data-table')

In [None]:
# Rebuild the loaded table as a pandas DataFrame from its column names and row data.
data_df = pd.DataFrame(columns=data_table.columns, data=data_table.data)

# pandas' built-in summary statistics, shown as this cell's output.
data_df.describe()

In [None]:
def dropna(col: list) -> np.ndarray:
    """Return the finite entries of ``col`` as a float array.

    The input is first converted to a float ndarray; every entry that is
    not finite (NaN, +inf, -inf) is then removed. Note that, despite the
    name, infinities are dropped as well as NaNs.

    Args:
        col: Sequence of values convertible to float.

    Returns:
        A float ndarray holding only the finite values, in their original order.
    """
    values = np.array(col, dtype=float)
    finite_mask = np.isfinite(values)
    return values[finite_mask]

@weave.op()
def col_count(col: list) -> int:
    """Weave op: number of finite entries in ``col`` (non-finite values are excluded)."""
    finite_values = dropna(col)
    return finite_values.size

@weave.op()
def mean(col: list) -> float:
    """Weave op: arithmetic mean of the finite entries of ``col``, as a Python float."""
    finite_values = dropna(col)
    average = np.mean(finite_values)
    return float(average)

@weave.op()
def std(col: list) -> float:
    """Weave op: standard deviation of the finite entries of ``col``.

    Uses ``np.std`` with its default arguments, i.e. the population
    standard deviation (``ddof=0``).
    NOTE(review): pandas' ``describe()`` reports the sample standard
    deviation (``ddof=1``), so this value will differ slightly from the
    pandas summary -- confirm which is intended.
    """
    finite_values = dropna(col)
    spread = np.std(finite_values)
    return float(spread)

@weave.op()
def col_min(col: list) -> float:
    """Weave op: smallest finite entry of ``col``, as a Python float."""
    finite_values = dropna(col)
    smallest = np.min(finite_values)
    return float(smallest)

@weave.op()
def first_quantile(col: list) -> float:
    """Weave op: 0.25 quantile (25th percentile) of the finite entries of ``col``."""
    finite_values = dropna(col)
    lower_quartile = np.quantile(finite_values, 0.25)
    return float(lower_quartile)

@weave.op()
def second_quantile(col: list) -> float:
    """Weave op: 0.5 quantile (median) of the finite entries of ``col``."""
    finite_values = dropna(col)
    median = np.quantile(finite_values, 0.5)
    return float(median)

@weave.op()
def third_quantile(col: list) -> float:
    """Weave op: 0.75 quantile (75th percentile) of the finite entries of ``col``."""
    finite_values = dropna(col)
    upper_quartile = np.quantile(finite_values, 0.75)
    return float(upper_quartile)

@weave.op()
def col_max(col: list) -> float:
    """Weave op: largest finite entry of ``col``, as a Python float."""
    finite_values = dropna(col)
    largest = np.max(finite_values)
    return float(largest)

@weave.op()
def describe(col: list) -> list:
    """Weave op: eight-number summary of a column.

    Each statistic is produced by calling the corresponding op on ``col``
    and resolving the result with ``weave.use``.

    Args:
        col: Column values to summarise.

    Returns:
        A list ``[count, mean, std, min, 25% quantile, 50% quantile,
        75% quantile, max]``, in that order.
    """
    # Order matters: it fixes the row order of the resulting summary.
    summary_ops = (
        col_count,
        mean,
        std,
        col_min,
        first_quantile,
        second_quantile,
        third_quantile,
        col_max,
    )
    return [weave.use(stat_op(col)) for stat_op in summary_ops]

In [None]:
# Try the describe op on a single column ('AGE') before applying it to the whole table.
col = data_table.get_column('AGE')
# weave.use resolves the op call to a concrete value; list(...) is shown as the cell output.
list(weave.use(describe(col)))

In [None]:
def describe_table(in_table):
    """Build a table of summary statistics, one column per summarisable input column.

    For every column of ``in_table`` the ``describe`` op is evaluated and
    its result (count, mean, std, min, 25%, 50%, 75%, max -- in that row
    order) is added to the output table under the same column name.

    Columns for which the summary cannot be computed are reported and
    skipped, so they do not appear in the result.
    NOTE(review): presumably these are the non-numeric columns, since the
    statistics convert values to float -- confirm against the dataset.

    Args:
        in_table: Table-like object exposing ``.columns`` and
            ``.get_column(name)`` (the notebook passes the loaded
            ``data_table``).

    Returns:
        A ``wandb.Table`` holding one summary column per successfully
        described input column.
    """
    out_table = wandb.Table(columns=[])
    for column in in_table.columns:
        col = in_table.get_column(column)
        print(column)
        try:
            summary = list(weave.use(describe(col)))
        except Exception as exc:
            # Skip this column. Falling through to add_column() here would
            # either raise NameError (no summary computed yet) or silently
            # store the previous column's statistics under this column's name.
            print(f"Skipping column {column!r}: {type(exc).__name__}: {exc}")
            continue
        out_table.add_column(
            name=column,
            data=summary
        )
    return out_table

In [None]:
# Compute the per-column summary for the whole loaded table.
out_table = describe_table(data_table)

In [None]:
# Open a separate 'log-data' run and log the summary table under the key "dataset_summary".
with wandb.init(entity=ENTITY, project=PROJECT, job_type='log-data') as run:
    run.log({"dataset_summary":out_table})