# bigfunctions/explore_column.yaml

# Routine kind: the `code` entry below is a multi-statement SQL script.
type: procedure
category: explore
author:
  name: Paul Marcombes
  url: https://www.linkedin.com/in/paul-marcombes
  avatar_url: "https://lh3.googleusercontent.com/a-/ACB-R5RDf2yxcw1p_IYLCKmiUIScreatDdhG8B83om6Ohw=s260"
description: Show column statistics
arguments:
  # Dot-separated column path. The script strips backticks, splits the value
  # on "." and reads four parts in order: project, dataset, table, column.
  - name: fully_qualified_column
    type: string
examples:
  - description: ""
    arguments:
      # NOTE(review): for this example to yield the four parts the script
      # reads, the placeholder presumably expands to "project.dataset" —
      # confirm against the deployment tooling.
      - '"{BIGFUNCTIONS_DATASET}.natality.weight_pounds"'
    screenshot: explore_column.png
# Multi-statement BigQuery script. It looks up the column's data type in
# INFORMATION_SCHEMA.COLUMNS, renders a type-dependent query through
# render_template, and executes it to (re)create the temp table
# `bigfunction_result` holding one row with a single `json` column
# (name, data_type, stats, hist).
code: |
  declare project, dataset, table, column, data_type, query string;
  declare context json;
  -- Number of equal-width histogram bins used for numeric columns.
  declare nbins string default '10';
  -- Maximum number of distinct values kept for non-numeric columns.
  declare nbins_top_values string default '100';

  -- Strip backticks, then split the path. All four parts are read below,
  -- so a shorter path fails on the missing array offset.
  declare parts array<string> default split(replace(fully_qualified_column, '`', ''), '.');
  set project = parts[offset(0)];
  set dataset = parts[offset(1)];
  set table = parts[offset(2)];
  set column = parts[offset(3)];


  -- Fetch the declared type of the column. The column name is compared
  -- case-insensitively because BigQuery resolves column names that way;
  -- a case-sensitive match would leave data_type null and silently push a
  -- numeric column into the non-numeric branch below.
  execute immediate replace(replace(replace(replace(
    '''
    select data_type
    from `{{project}}.{{dataset}}.INFORMATION_SCHEMA.COLUMNS`
    where
      table_name = '{{table}}'
      and lower(column_name) = lower('{{column}}')
    ''',
    '{{project}}', project),
    '{{dataset}}', dataset),
    '{{table}}', table),
    '{{column}}', column
  ) into data_type;

  -- Query template rendered with the context built further down.
  -- Three shapes depending on data_type:
  --   FLOAT64 / NUMERIC : equal-width histogram over [min, max]
  --   INT64             : one bin per integer when the range is small enough,
  --                       otherwise the same equal-width histogram
  --   anything else     : most frequent values
  set query = '''
    create or replace temp table bigfunction_result as

    with

    stats as (
      select
        count(*) as row_count,
        count(distinct `{{column}}`) as distinct_count,
        -- safe_divide returns null instead of failing when the table is empty
        safe_divide(count(distinct `{{column}}`), count(*)) as distinct_ratio,
        count(*) - count(`{{column}}`) as missing_count,
        safe_divide(count(*) - count(`{{column}}`), count(*)) as missing_ratio,
        {% if data_type in ['FLOAT64', 'INT64', 'NUMERIC'] %}cast(min(`{{column}}`) as float64){% else %}null{% endif %} as min,
        {% if data_type in ['FLOAT64', 'INT64', 'NUMERIC'] %}cast(avg(`{{column}}`) as float64){% else %}null{% endif %} as mean,
        {% if data_type in ['FLOAT64', 'INT64', 'NUMERIC'] %}cast(max(`{{column}}`) as float64){% else %}null{% endif %} as max,
      from `{{project}}.{{dataset}}.{{table}}`
    ),

    {% if data_type in ['FLOAT64', 'NUMERIC'] %}
      hist_bins as (
        select
          i,
          `min` + i/{{nbins}}. * (`max`-`min`)  as low,
          -- the last bin is widened by 1 so that the maximum value falls inside it
          `min` + (i+1)/{{nbins}}. * (`max`-`min`) + if(i={{nbins}}-1, 1, 0)  as up,
          (
            '[' ||
              cast(round(`min` + i/{{nbins}}. * (`max`-`min`), 2) as string) || ', ' ||
              cast(round(`min` + (i+1)/{{nbins}}. * (`max`-`min`), 2) as string) ||
            if(i={{nbins}}-1, ']', '[')
          ) as label,
          'real' as bin_mode,
        from stats, unnest(generate_array(0, {{nbins}})) i
        where i < {{nbins}}
      ),

      hist_counts as (
        select
          bin.i,
          count(*) as row_count,
        from `{{project}}.{{dataset}}.{{table}}`
        inner join hist_bins as bin
        on `{{column}}` >= bin.low and `{{column}}` < bin.up
        group by i
      ),

      -- left join keeps empty bins, reported with a count of 0
      hist as (
        select array_agg(
          struct(hist_bins.label as x, cast(ifnull(row_count, 0) as float64) as y)
          order by i
        ) hist
        from hist_bins
        left join hist_counts using(i)
      )

    {% elif data_type == 'INT64' %}
      hist_bins_for_reals as (
        select
          i,
          `min` + i/{{nbins}}. * (`max`-`min`)  as low,
          -- the last bin is widened by 1 so that the maximum value falls inside it
          `min` + (i+1)/{{nbins}}. * (`max`-`min`) + if(i={{nbins}}-1, 1, 0)  as up,
          (
            '[' ||
              cast(round(`min` + i/{{nbins}}. * (`max`-`min`), 2) as string) || ', ' ||
              cast(round(`min` + (i+1)/{{nbins}}. * (`max`-`min`), 2) as string) ||
            if(i={{nbins}}-1, ']', '[')
          ) as label,
          'real' as bin_mode,
        from stats, unnest(generate_array(0, {{nbins}})) i
        where i < {{nbins}}
      ),

      -- one unit-wide bin per integer, capped at min + 60 (at most 61 bins)
      hist_bins_for_integers as (
        select
          i,
          i - 0.5  as low,
          i + 0.5  as up,
          cast(i as string) as label,
          'integer' as bin_mode,
        from stats, unnest(generate_array(stats.min, least(stats.max, stats.min + 60))) i
      ),

      -- keep exactly one of the two bin sets: integer bins when there are
      -- at most 60 of them, equal-width bins otherwise
      hist_bins as (
        select * from hist_bins_for_reals
        where bin_mode = if('{{data_type}}' = 'INT64' and (select count(*) from hist_bins_for_integers) <= 60, 'integer', 'real')
        union all
        select * from hist_bins_for_integers
        where bin_mode = if('{{data_type}}' = 'INT64' and (select count(*) from hist_bins_for_integers) <= 60, 'integer', 'real')
      ),

      hist_counts as (
        select
          bin.i,
          count(*) as row_count,
        from `{{project}}.{{dataset}}.{{table}}`
        inner join hist_bins as bin
        on `{{column}}` >= bin.low and `{{column}}` < bin.up
        group by i
      ),

      -- left join keeps empty bins, reported with a count of 0
      hist as (
        select array_agg(
          struct(hist_bins.label as x, cast(ifnull(row_count, 0) as float64) as y)
          order by i
        ) hist
        from hist_bins
        left join hist_counts using(i)
      )


    {% else %}
      top_values as (
        select
          {% if data_type in ['TIMESTAMP', 'DATETIME'] %}
            cast(date(`{{column}}`) as string) as value,
          {% else %}
            cast(`{{column}}` as string) as value,
          {% endif %}
          count(*) as value_count,
          count(*) * 1. / (select row_count from stats) as value_ratio,
        from `{{project}}.{{dataset}}.{{table}}`
        where `{{column}}` is not null
        group by 1
        -- rank before limiting so the kept values are the most frequent ones
        order by value_count desc
        limit {{nbins_top_values}}
      ),

      hist as (
        select array_agg(
          struct(value as x, cast(value_count as float64) as y)
          order by value_count desc
        ) hist
        from top_values
      )
    {% endif %}

    select to_json(struct(
      '{{column}}' as name,
      '{{data_type}}' as data_type,
      (select stats from stats) as stats,
      (select hist.hist from hist) as hist
    )) as json;
  ''';

  -- Values substituted into the query template above.
  set context = to_json(struct(
    project as project,
    dataset as dataset,
    table as table,
    column as column,
    data_type as data_type,
    nbins as nbins,
    nbins_top_values as nbins_top_values
  ));

  set query = (select {BIGFUNCTIONS_DATASET}.render_template(query, to_json(context)));
  execute immediate query;
# HTML rendering of the JSON result (name, data_type, stats, hist): a table
# of statistics on the left and a bar chart of `hist` on the right.
# min / mean / max rows are shown only for FLOAT64, INT64 and NUMERIC, the
# types for which the SQL in `code` computes them; for every other type those
# statistics are null and the rows are omitted.
template: |
  <div class="container">
    <div class="box">
      <p class="is-size-4 mb-6 mt-4">Column <code>{{ name }}</code> <code>{{ data_type }}</code></p>
        <div class="columns">
          <div class="column is-one-quarter">
            <table class="table is-narrow">
              {% if data_type in ['FLOAT64', 'INT64', 'NUMERIC'] %}
                <tr><th class="has-text-weight-bold">min           </th><td>{{ stats.min            | add_thousands_separators_to_integers }}</td></tr>
                <tr><th class="has-text-weight-bold">mean          </th><td>{{ stats.mean           | add_thousands_separators_to_integers }}</td></tr>
                <tr><th class="has-text-weight-bold">max           </th><td>{{ stats.max            | add_thousands_separators_to_integers }}</td></tr>
              {% endif %}
              {% if data_type != 'BOOL' %}
                <tr><th class="has-text-weight-bold">distinct_count</th><td>{{ stats.distinct_count | add_thousands_separators_to_integers }}</td></tr>
                {% if stats.distinct_ratio > 0.001 %}
                  <tr><th class="has-text-weight-bold">distinct_ratio</th><td {% if stats.distinct_ratio == 1 %}class="has-background-success-light"{% elif stats.distinct_ratio > 0.9 %}class="has-background-danger-light"{% endif %}>{{ stats.distinct_ratio | as_percentage }}</td></tr>
                {% endif %}
              {% endif %}
              <tr><th class="has-text-weight-bold">missing_count </th><td {% if stats.missing_ratio == 0 %}class="has-background-success-light"{% endif %}>{{ stats.missing_count  | add_thousands_separators_to_integers }}</td></tr>
              {% if stats.missing_count != 0 %}
                <tr><th class="has-text-weight-bold">missing_ratio </th><td class="has-background-danger-light">{{ stats.missing_ratio  | as_percentage }}</td></tr>
              {% endif %}
              <tr><th class="has-text-weight-bold">row_count     </th><td>{{ stats.row_count      | add_thousands_separators_to_integers }}</td></tr>
            </table>
          </div>
          <div class="column">
            {{ chart(hist, 'bar', ylabel='value_count') }}
          </div>
        </div>
    </div>
  </div>