# Overview

This notebook generates test, train, and validation data from the SANTOS benchmark. The resulting files are serialized for input to a language model, such as in the Rotom framework.

## Setup

1. Download the SANTOS labeled benchmark.
2. The data lake tables -- in the `datalake` directory -- are the tables to be sampled. Update the `table_dir` variable under the *Cluster Tables* section to point to this directory, relative to this notebook.
3. The test, train, and validation sets are written to separate `.txt` files. Update the `test_txt`, `train_txt`, and `valid_txt` variables under the *Open Files* section to the desired output files.

# Imports

In [None]:
import os
import pandas as pd
from itertools import combinations
from random import sample
from pandas.api.types import is_string_dtype

# Cluster Tables

In [None]:
table_dir = "datalake/"
# os.listdir returns entries in arbitrary order. The splits slice this list by
# position, so sort it to get the same test/train/validation assignment on
# every run and every machine. Keep only CSV files so that stray entries
# (.DS_Store, .ipynb_checkpoints, ...) are never treated as tables.
all_tables = sorted( f for f in os.listdir(table_dir) if f.lower().endswith(".csv") )
# A table set is named by the file name minus its last "_"-separated component.
# dict.fromkeys removes duplicates while preserving order; names without any
# "_" yield an empty set name and are dropped.
table_sets = [ s for s in dict.fromkeys( "_".join(f.split("_")[:-1]) for f in all_tables ) if s ]
table_sets

['311_calls_historic_data',
 'abandoned_wells',
 'albums',
 'animal_tag_data',
 'biodiversity',
 'business_rates',
 'cdc_nutrition_physical_activity_and_obesity_legislation',
 'cihr_co-applicant',
 'civic_building_locations',
 'complaint_by_practice',
 'contributors_parties',
 'data_mill',
 'deaths_2012_2018',
 'film_locations_in_san_francisco',
 'HMRC_exceptions_to_spending_controls_April_to_June_2018',
 'HMRC_exceptions_to_spending_controls_April_to_June_2018_facilities',
 'HMRC_exceptions_to_spending_controls_April_to_June_2019',
 'HMRC_WMI_headcount_and_payroll_data_Mar',
 'HMRC_exceptions_to_spending_controls_October_to_December_2017',
 'HMRC_Officials_meetings_with_tobacco_stakeholders_Apr_2015_to_Jun',
 'HMRC_Officials_meetings_with_tobacco_stakeholders_Apr_2017_to_June',
 'HMRC_Officials_meetings_with_tobacco_stakeholders_Apr_2018_to_June',
 'HMRC_Officials_meetings_with_tobacco_stakeholders_Jan_2020_to_Mar',
 'HMRC_Officials_meetings_with_tobacco_stakeholders_Jul_2014_to_Sept'

# Open Files

In [None]:
def _open_output(path: str):
  '''
  Open an output file for writing, creating its parent directory if needed.

  open(path, "w") does not create missing directories, and the default text
  encoding is platform dependent, so both are handled explicitly here.

  Params:
  path: output file path, relative to this notebook

  Returns:
  text file handle opened for writing as UTF-8 (closed in the Cleanup section)
  '''
  os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
  return open(path, "w", encoding="utf-8")

test_txt = _open_output("exp/test.txt")
train_txt = _open_output("exp/train.txt")
valid_txt = _open_output("exp/valid.txt")

# Iterate and Write Test/Train/Validation Sets

In [None]:
def get_splits(pre: str, tables: list = None) -> tuple:
  '''
  Split the tables of one table set into test, train, and validation sets.

  A table belongs to the set when its file name, minus the final
  "_"-separated component, is exactly `pre`. A plain prefix match would
  also pull in the tables of any longer-named set that merely starts with
  `pre`, putting the same table into several sets.

  Split sizes, by number of tables n in the set:
    n < 6      : 1 test, 1 validation, the rest train
    6 <= n < 8 : 2 test, 2 validation, the rest train
    n >= 8     : 2 test, 2 validation, 4 train (remaining tables are unused)
  With a single table (n == 1) test and validation both hold that table.

  Params:
  pre: prefix (name) of table set to split
  tables: table file names to pick from; defaults to the global `all_tables`

  Returns:
  table names of test, train, validation sets
  '''
  if tables is None:
    tables = all_tables
  s = [ t for t in tables if "_".join(t.split("_")[:-1]) == pre ]
  if len(s) < 6:
    test = s[:1]
    train = s[1:-1]
    valid = s[-1:]
  # elif, not if: otherwise the small-set split above is always overwritten
  # and test/validation overlap for sets of three tables
  elif len(s) < 8:
    test = s[:2]
    train = s[2:-2]
    valid = s[-2:]
  else:
    test = s[:2]
    train = s[2:6]
    valid = s[-2:]
  return test, train, valid

In [None]:
TOKEN_CT = 15  # MAX TOKENS SERIALIZED PER COLUMN

def _column_tokens(df: pd.DataFrame) -> dict:
  '''
  Collect the whitespace-separated tokens of every usable column of a table.

  A column is usable when it is a string column, has at least TOKEN_CT
  non-null values, and those values contain at least TOKEN_CT tokens.

  Params:
  df: table to scan

  Returns:
  dict mapping column name -> list of tokens, in the table's column order
  '''
  usable = {}
  for name in df.columns:
    # only consider string columns
    if not is_string_dtype(df[name]):
      continue
    # remove nan values
    values = df[name].dropna()
    # too few rows left, skip
    if len(values) < TOKEN_CT:
      continue
    toks = [ w for v in values for w in str(v).split() ]
    # rows are not tokens: whitespace-only cells contribute none, and
    # sample() raises ValueError when asked for more tokens than exist
    if len(toks) < TOKEN_CT:
      continue
    usable[name] = toks
  return usable

def get_labels(table_lst: list) -> list:
  '''
  Generate and serialize positive and negative samples for the set of tables.

  Every pair of tables is compared column by column. Columns with the same
  name form a positive sample (label 1), all other pairs a negative sample
  (label 0). Per table pair, negatives are only generated for the first
  usable column of the first table; positives are generated for all columns.
  Each sample is one line: "COL <tokens>\\tCOL <tokens>\\t<label>\\n", with
  TOKEN_CT randomly chosen tokens per column.

  Params:
  table_lst: list of table names (CSV files inside `table_dir`)

  Returns:
  list of serialized samples
  '''
  # tokenize each table once instead of once per column pair
  tokens_by_table = [ _column_tokens(pd.read_csv(os.path.join(table_dir, csv))) for csv in table_lst ]
  labeled = []
  for cols1, cols2 in combinations(tokens_by_table, 2):
    sample_neg = True
    for c1, toks1 in cols1.items():
      for c2, toks2 in cols2.items():
        # get class label
        cls = 1 if c1 == c2 else 0
        # skip if got enough negative samples
        if cls == 0 and not sample_neg:
          continue
        # serialize
        line1 = "COL " + " ".join(sample(toks1, TOKEN_CT))
        line2 = "COL " + " ".join(sample(toks2, TOKEN_CT))
        labeled.append(f"{line1}\t{line2}\t{cls}\n")
      # negatives are only drawn for the first usable column of a table pair
      sample_neg = False
  return labeled

In [None]:
def get_cluster_labels(pre: str, table_lst: list) -> list:
  '''
  Records the table prefix for each sample generated.
  Unused currently, but was used to help split up existing labeled files for closer analysis,
  without having to regenerate the data sets (due to randomization when choosing tokens).

  Walks the same table pairs and column pairs as get_labels and emits one
  line per column pair that yields a sample. A column only counts if it
  can actually supply TOKEN_CT tokens, since no sample can be serialized
  from a column with fewer.

  Params:
  pre: table set prefix
  table_lst: list of table names (CSV files inside `table_dir`)

  Returns:
  list of prefixes, one per sample generated by this set of tables
  '''
  def usable_columns(df: pd.DataFrame) -> list:
    '''Names of the columns of df that a sample can be drawn from.'''
    names = []
    for name in df.columns:
      # only consider string columns
      if not is_string_dtype(df[name]):
        continue
      # remove nan values
      values = df[name].dropna()
      # too few rows left, skip
      if len(values) < TOKEN_CT:
        continue
      # rows are not tokens: whitespace-only cells contribute none
      if sum(len(str(v).split()) for v in values) < TOKEN_CT:
        continue
      names.append(name)
    return names

  # filter each table's columns once instead of once per column pair
  cols_by_table = [ usable_columns(pd.read_csv(os.path.join(table_dir, csv))) for csv in table_lst ]
  labeled = []
  for cols1, cols2 in combinations(cols_by_table, 2):
    sample_neg = True
    for c1 in cols1:
      for c2 in cols2:
        # skip negatives (differently named columns) once enough were taken
        if c1 != c2 and not sample_neg:
          continue
        # record cluster
        labeled.append(pre + '\n')
      # negatives are only counted for the first usable column of a table pair
      sample_neg = False
  return labeled

In [None]:
# MAIN LOOP
# For every table set: split its tables, then append the serialized samples of
# each split to the matching output file (test, train, validation, in that order).
for pre in table_sets:
  print(f"Sampling from {pre}")
  splits = get_splits(pre)
  outputs = (test_txt, train_txt, valid_txt)
  for out_file, split_tables in zip(outputs, splits):
    out_file.writelines(get_labels(split_tables))

Sampling from 311_calls_historic_data
Sampling from abandoned_wells
Sampling from albums
Sampling from animal_tag_data
Sampling from biodiversity
Sampling from business_rates
Sampling from cdc_nutrition_physical_activity_and_obesity_legislation
Sampling from cihr_co-applicant
Sampling from civic_building_locations
Sampling from complaint_by_practice
Sampling from contributors_parties
Sampling from data_mill
Sampling from deaths_2012_2018
Sampling from film_locations_in_san_francisco
Sampling from HMRC_exceptions_to_spending_controls_April_to_June_2018
Sampling from HMRC_exceptions_to_spending_controls_April_to_June_2018_facilities
Sampling from HMRC_exceptions_to_spending_controls_April_to_June_2019
Sampling from HMRC_WMI_headcount_and_payroll_data_Mar
Sampling from HMRC_exceptions_to_spending_controls_October_to_December_2017
Sampling from HMRC_Officials_meetings_with_tobacco_stakeholders_Apr_2015_to_Jun
Sampling from HMRC_Officials_meetings_with_tobacco_stakeholders_Apr_2017_to_June


# Cleanup

In [None]:
# Close the output files in the order they were opened so that all buffered
# samples are flushed to disk.
for out_file in (test_txt, train_txt, valid_txt):
  out_file.close()