In [1]:
import pandas as pd

In [2]:
# Load the CSV produced at the end of 01-outcomes-prep.R.
# Cells containing the literal text 'NA' are read in as missing values.
df_tanf = pd.read_csv(
    'df_replication_anon.csv',
    na_values=['NA'],
)

In [3]:
# Define function to remove times from some date-times, and account for 2 date formats:
from datetime import datetime
strptime = datetime.strptime

def dater(x):
    """
    Parse a date string written in either of two formats.

    The date may be ``MM/DD/YYYY`` or ``YYYY-MM-DD``. Surrounding whitespace
    is ignored, and anything after the first space (for example a trailing
    time such as ``00:00:00``) is discarded before parsing, so both bare
    dates and date-time strings are accepted.

    Args:
        x (str): The date or date-time string to parse.

    Returns:
        datetime.datetime: The parsed date; the time fields are left at
            their defaults because only the date part is parsed.

    Raises:
        ValueError: If the date part matches neither format.
    """
    # Keep only the leading date token so raw date-time strings can be passed as-is.
    date_part = x.strip().split(' ', 1)[0]
    # A slash identifies the month/day/year layout; otherwise assume ISO year-month-day.
    fmt = '%m/%d/%Y' if '/' in date_part else '%Y-%m-%d'
    return datetime.strptime(date_part, fmt)

In [4]:
# Normalise appt_date to a single date representation:
# keep the text before the first space, then parse that text with dater().
df_tanf['appt_date'] = (
    df_tanf['appt_date']
    .apply(lambda raw: raw.split(' ')[0])
    .apply(dater)
)

In [5]:
"""
Tools for anonymizing pandas DataFrames.

@author Kevin H. Wilson <kevin.wilson@dc.gov>
"""
import math

try:
  import secrets
except ImportError:
  from . import secretsport as secrets


def anonymize_str_columns(df, cols, current_map=None, inplace=False):
  """
  Given a collection of columns `cols` in a DataFrame `df`,
  assume they all are strings. Replace all the values, including
  those common across columns, with a unique, random string.
  If `current_map` is not None, then it will be used first to determine
  the values of the columns.

  For example, if `df` looks like::

    A | B | C
   -----------
    a | b | x
    a | a | y
    b | c | z

  and `cols == ['A', 'B']`, then the output might look like::

    A | B | C
   -----------
    1 | 2 | x
    1 | 1 | y
    2 | 3 | z

  This function also preserves missing data as missing.

  Args:
    df (pd.DataFrame): The DataFrame whose columns to anonymize
    cols (list[str]|str): The list of column names to anonymize; a single
      column name is also accepted and treated as a one-element list
    current_map (dict[str, str]|None): The map of keys to values to append to.
      When a dict is passed (even an empty one) it is extended in place and
      is the same object that is returned.
    inplace (bool): Should the anonymization be done in place? If False,
      a copy of `df` is modified and the input is left untouched.

  Returns:
    pd.DataFrame: The anonymized DataFrame
    dict[str, str]: The map of anonymized values

  Raises:
    KeyError: If a name in `cols` is not a column of `df`.
  """
  if not isinstance(cols, (list, tuple)):
    cols = [cols]

  # Only a missing map is replaced; a caller-supplied dict is always appended to.
  if current_map is None:
    current_map = {}

  # Tokens already handed out. This set is extended as new tokens are drawn
  # so that two new keys can never end up sharing a token.
  used_vals = set(current_map.values())

  # Every distinct non-missing value across the columns that has no token yet.
  new_keys = set()
  for col in cols:
    new_keys.update(df[col].dropna())
  new_keys.difference_update(current_map)

  # Keep the probability of collisions relatively low, but
  # keep the size at least a standard 16 bytes
  n_keys = len(new_keys) + len(current_map)
  # math.log(0) raises ValueError, so fall back to the 16-byte floor when
  # there is nothing to map (no columns, or only missing values).
  nbytes = max(4 + int(math.log(n_keys) / math.log(8)), 16) if n_keys else 16
  for new_key in new_keys:
    new_val = secrets.token_hex(nbytes=nbytes)
    while new_val in used_vals:
      new_val = secrets.token_hex(nbytes=nbytes)
    used_vals.add(new_val)
    current_map[new_key] = new_val

  if not inplace:
    df = df.copy()

  for col in cols:
    # TODO(khw): Allow an option to map missing data to something else.
    # Assign the whole column (rather than writing through .loc[:, col]) so
    # that a non-object column can be replaced by string tokens.
    df[col] = df[col].map(current_map, na_action='ignore')

  return df, current_map

In [6]:
# Swap the values in the listed columns for random tokens. The call returns the
# anonymized frame together with the original-value -> token lookup.
# NOTE(review): binding the lookup to `map` shadows the Python builtin from this
# point on; the name is kept as-is in case other code refers to it.
df_tanf_anon, map = anonymize_str_columns(
    df_tanf,
    cols=[
        'ic_case_id',
        'head_hh',
        'address',
        'city_state',
        'tel',
        'pdc_number',
    ],
)

In [7]:
# Remove the 'Unnamed: 0' column (presumably a row-index column carried over
# from the upstream CSV export -- TODO confirm).
# errors='ignore' keeps this self-reassigning cell re-runnable: once the column
# has been dropped, running the cell again is a no-op instead of a KeyError.
df_tanf_anon = df_tanf_anon.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Write the anonymized frame to disk; index=False leaves the row index out of the file.
df_tanf_anon.to_csv(
    'df_replication_anonymized.csv',
    index=False,
)