In [None]:
import numpy as np
import pandas as pd
import wandb
import weave

In [None]:
# Log in to Weights & Biases; the cells below call the W&B API.
wandb.login()

In [None]:
# Name of the W&B project that the run created in this cell is attached to.
PROJECT_NAME = 'exploring-tabular-data'
# The run is used as a context manager and tagged with job_type='load-data'.
with wandb.init(project=PROJECT_NAME, job_type='load-data') as run:
    # Request the ':latest' alias of the data-splits artifact through this run.
    split_artifact = run.use_artifact('splitting-tabular-data/data-splits:latest')
    # Fetch the object stored in the artifact under the name 'train-data'.
    # NOTE(review): presumably a wandb.Table, given how data_table is used in the
    # cells below — confirm against the notebook that produced the artifact.
    data_table = split_artifact.get('train-data')

In [None]:
# True when every value in the table's column type map is exactly a
# wandb.data_types._ForeignIndexType (np.all of an empty list is also True).
# NOTE(review): `_column_types` and `_ForeignIndexType` are private wandb names —
# confirm they exist in the wandb version this notebook is run with.
linked_table = np.all([
    type(value) is wandb.data_types._ForeignIndexType
    for value in data_table._column_types.params['type_map'].values()
])
# Displayed as the cell output
linked_table

In [None]:
# Exploratory look at how one row is reached through the table's first column.
# NOTE(review): `columns` is assigned the same expression as `ref_table` below and
# is not read again in the cells shown here.
columns = data_table.get_column(data_table.columns[0])
ndx = 0
ref_table = data_table.get_column(data_table.columns[0])
# get_row() is called on the entry at position ndx; its values are kept as a list
ref_row = list(ref_table[ndx].get_row().values())
# Displayed as the cell output: the keys of that same row
ref_table[ndx].get_row().keys()

In [None]:
# Number of values in the row reached through the first entry of the first column
len(data_table.get_column(data_table.columns[0])[0].get_row().values())

In [None]:
def check_linked_table(table):
    """
    Check if the table's contents are pointers to another table or not
    Arguments:
        table (wandb.Table) table whose column type map is inspected
    Returns:
        (bool) True only when the type map is non-empty and every entry in it is
            exactly a wandb.data_types._ForeignIndexType, False otherwise
    """
    # NOTE(review): `_column_types` and `_ForeignIndexType` are private wandb names —
    # confirm they exist in the wandb version in use.
    column_types = list(table._column_types.params['type_map'].values())
    # An empty type map must not count as linked: all()/np.all() over nothing is
    # vacuously True, and callers then index table.columns[0] on a table without columns.
    if not column_types:
        return False
    return all(
        type(value) is wandb.data_types._ForeignIndexType
        for value in column_types
    )


def dereference_linked_table(table):
    """
    Return the reference column of a linked table
    Assumes that there is only one reference column, so the first column is used.
    Arguments:
        table (wandb.Table) table whose first column is read
    Returns:
        whatever table.get_column returns for the table's first column name
    """
    reference_column = table.columns[0]
    return table.get_column(reference_column)


def get_table_columns(table):
    """
    Return the column names of the data a table holds or points to
    Arguments:
        table (wandb.Table) can be a standard table of data or a pointer to a reference table
    Returns:
        table.columns for a standard table; for a linked table, the keys of the
            row obtained from the first entry of the reference column
    """
    if not check_linked_table(table):
        return table.columns
    # Linked table: the column names are the keys of the first dereferenced row
    first_row = dereference_linked_table(table)[0].get_row()
    return first_row.keys()

    
def get_table_row(table, ndx):
    """
    Given a table and index, return the corresponding row (or rows)
    Arguments:
        table (wandb.Table) can be a standard table of data or a pointer to a reference table
        ndx (int or list of int) row index, or list of row indices, to select
    Returns:
        ref_row (list) of data entries for the row referenced by ndx; when ndx is a
            list, a list holding one such row per index, in the order given
    Raises:
        ValueError if the table is a linked table and ndx is neither an integer nor a list
    """
    many = isinstance(ndx, list)
    if check_linked_table(table): # The table entries reference another table
        ref_table = dereference_linked_table(table)
        # numpy integers (e.g. taken from an array or a DataFrame index) are valid
        # positions too; bool stays rejected, as it was by the former exact-type check
        is_int = isinstance(ndx, (int, np.integer)) and not isinstance(ndx, bool)
        if not (many or is_int):
            raise ValueError(
                f'Input argument ndx must be of type int or list, not {type(ndx)}'
            )
        # The pointers are dereferenced using the get_row() function
        if many:
            return [list(ref_table[i].get_row().values()) for i in ndx]
        return list(ref_table[ndx].get_row().values())
    # Standard w&b Table containing the data
    if many:
        # Gather the rows one by one so that a list of indices behaves the same as
        # for a linked table (indexing a plain Python list with a list fails)
        return [table.data[i] for i in ndx]
    return table.data[ndx]

def wandb_table_to_df(table):
    """
    Convert a w&b table into a pandas DataFrame
    Arguments:
        table (wandb.Table) can be a standard table of data or a pointer to a reference table
    Returns:
        data_df (pd.DataFrame) with the columns given by get_table_columns(table)
            and one row per entry of table.data, indexed 0..n-1
    """
    columns = list(get_table_columns(table))
    n_rows = len(table.data)
    if check_linked_table(table):
        # A single call with every index dereferences the reference column once
        # instead of once per row
        rows = get_table_row(table, list(range(n_rows)))
    else:
        rows = [get_table_row(table, ndx) for ndx in range(n_rows)]
    # Build the frame in one step rather than enlarging it row by row with .loc,
    # which re-allocates the frame on every insertion; each column also gets its
    # dtype inferred from all of its values.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Convert the loaded W&B table into a pandas DataFrame
data_df = wandb_table_to_df(data_table)

# Summary statistics as computed by pandas, displayed as the cell output
data_df.describe()

In [None]:
def dropna(col: list) -> np.ndarray:
    """
    Return the finite entries of a column as a numpy array
    Arguments:
        col (list) of numeric values; None is accepted and treated as missing
    Returns:
        (np.ndarray) the entries of col that are neither NaN, +/-inf nor None,
            in their original order
    Raises:
        ValueError or TypeError if col holds values that cannot be read as numbers
    """
    values = np.asarray(col)
    if values.dtype == object:
        # A list containing None becomes an object array, which np.isfinite
        # rejects; casting to float turns None into NaN so it is dropped below.
        values = values.astype(float)
    return values[np.isfinite(values)]
    
@weave.op()
def mean(col: list) -> float:
    """Mean of the entries of col that are kept by dropna, as a Python float."""
    kept = dropna(col)
    return float(np.mean(kept))

@weave.op()
def std(col: list) -> float:
    """
    Standard deviation of the entries of col that are kept by dropna, as a Python float.
    np.std is called with its defaults, i.e. the population standard deviation (ddof=0).
    NOTE(review): pandas' describe() reports the sample standard deviation (ddof=1),
    so this value can differ from data_df.describe() — confirm which one is intended.
    """
    kept = dropna(col)
    return float(np.std(kept))

@weave.op()
def col_min(col: list) -> float:
    """Smallest of the entries of col that are kept by dropna, as a Python float."""
    kept = dropna(col)
    return float(np.min(kept))

@weave.op()
def first_quantile(col: list) -> float:
    """0.25 quantile of the entries of col that are kept by dropna, as a Python float."""
    kept = dropna(col)
    return float(np.quantile(kept, 0.25))

@weave.op()
def second_quantile(col: list) -> float:
    """0.5 quantile (median) of the entries of col that are kept by dropna, as a Python float."""
    kept = dropna(col)
    return float(np.quantile(kept, 0.5))

@weave.op()
def third_quantile(col: list) -> float:
    """0.75 quantile of the entries of col that are kept by dropna, as a Python float."""
    kept = dropna(col)
    return float(np.quantile(kept, 0.75))

@weave.op()
def col_max(col: list) -> float:
    """Largest of the entries of col that are kept by dropna, as a Python float."""
    kept = dropna(col)
    return float(np.max(kept))

@weave.op()
def describe(col: list) -> list:
    """
    Collect the summary statistics of a column
    Arguments:
        col (list) column values handed unchanged to each statistic op
    Returns:
        col_desc (list) of seven values, in this order: mean, std, min,
            0.25 quantile, 0.5 quantile, 0.75 quantile, max
    """
    stat_ops = (
        mean,
        std,
        col_min,
        first_quantile,
        second_quantile,
        third_quantile,
        col_max,
    )
    # Each op call is passed through weave.use, one statistic after the other
    return [weave.use(stat_op(col)) for stat_op in stat_ops]

In [None]:
# Displayed as the cell output: the column labels of the converted DataFrame
data_df.columns

In [None]:
# Take the 'AGE' column as a plain list, matching the `col: list` annotation of the ops above
col = list(data_df['AGE'])

In [None]:
# Summary statistics of the selected column, obtained by passing the describe op call to weave.use
desc = weave.use(describe(col))

In [None]:
# Displayed as the cell output: the columns attribute of the loaded table itself
# (for a linked table this differs from get_table_columns, which reads the referenced row)
data_table.columns

In [None]:
def describe_table(input_table):
    """
    Build a table of summary statistics for every column of a table
    Arguments:
        input_table (wandb.Table) can be a standard table of data or a pointer to a reference table
    Returns:
        out_table (wandb.Table) with the same column names as get_table_columns(input_table)
            and one row per statistic, in the order returned by describe:
            mean, std, min, 0.25 quantile, 0.5 quantile, 0.75 quantile, max
    """
    table_columns = list(get_table_columns(input_table))
    df = wandb_table_to_df(input_table)
    # NOTE(review): describe() goes through np.isfinite, so this assumes every
    # column is numeric — confirm for the tables this is used with.
    summary = [
        weave.use(describe(list(df[column])))
        for column in table_columns
    ]
    # summary holds one list of statistics per column, i.e. it is shaped
    # (n_columns, n_statistics). Label those rows with the column names and
    # transpose, so the result has one column per input column.
    summary_df = pd.DataFrame(summary, index=table_columns).T
    out_table = wandb.Table(dataframe=summary_df)
    return out_table

In [None]:
# Summary-statistics table for the loaded training data
out_table = describe_table(data_table)