# ZRP User Guide
This notebook illustrates how to use `ZRP`, the main class of the `zrp` package, which processes user input data and returns race/ethnicity predictions.

In [1]:
# Load the autoreload extension and use mode 2, which re-imports modules
# before each cell execution so edits to source files are picked up.
%load_ext autoreload
%autoreload 2
# Disable Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi=False

In [2]:
from os.path import join, expanduser
import pandas as pd
import sys
import os
import re
import warnings

In [3]:
# Home directory of the current user; used below to locate the zrp checkout.
home = expanduser('~')

# Emit each distinct warning only the first time it is triggered.
warnings.filterwarnings('once')

In [4]:
# Location of the local zrp source checkout, built with os.path.join rather
# than manual string formatting.
src_path = join(home, 'zrp')

# Make the checkout importable. Guard against duplicates so re-running this
# cell does not keep growing sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [5]:
# Record identifiers used to pull a small test sample out of the full dataset.
test_ids = [
    'GA_10961114',
    'GA_07588296',
    'GA_11951308',
    'GA_03567641',
    'GA_11493478',
    'GA_08063136',
    'GA_02144077',
    'GA_06757359',
    'GA_10561962',
    'GA_07690722',
    'GA_11003386',
]

In [6]:
from zrp import ZRP
from zrp.prepare.utils import load_file

  from numpy.dual import register_func
  supported_dtypes = [np.typeDict[x] for x in supported_dtypes]
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,
  version = LooseVersion(pd.__version__)


## load data
Load a voter file that stands in for user input data.

In [7]:
# Name of the column that uniquely identifies each record.
key = 'ZEST_KEY'

# Root directory of the shared ZRP support files.
support_files_path = "/d/shared/zrp/shared_data"

In [8]:
# Build the input path from `support_files_path` instead of repeating the
# root directory literal, so the location is configured in one place.
df = load_file(
    join(
        support_files_path,
        "processed/data/state_level/voters/base_ga_2022q1.parquet",
    )
)
# Show (rows, columns) of the loaded frame.
df.shape

  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
  _pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15")


(7517881, 14)

### sample data
Select a small sample of records (by ID) to use as the test case.

In [9]:
# Filter first, then copy: only the matching rows are duplicated, instead of
# copying the entire input frame before discarding almost all of it.
# The trailing .copy() keeps `samp` independent of `df`.
samp = df[df[key].isin(test_ids)].copy()
# Show (rows, columns) of the sample.
samp.shape

(11, 14)

## ZRP   
- Processes input data
- Generates geo mappings
    - Requires standard address categories:
        - House Number
        - Street Address (including directional, street name, & street suffix)
        - City
        - State
        - Zip/postal code (ZCTA5 preferred)
- Integrates processed data with American Community Survey (ACS) data
- Generates feature engineered data
- Predicts race & ethnicity

In [13]:
%%time
# Run the end-to-end pipeline on the sample: construct the predictor, call
# fit(), then transform() the sample; the result is bound to `output`.
# NOTE(review): `support_files_path` and `key` defined earlier are not passed
# to ZRP() here, so this relies on the class defaults — confirm they match.
zest_race_predictor = ZRP()
# fit() is called with no arguments — presumably it initializes the pipeline
# rather than training on data; TODO confirm against the zrp documentation.
zest_race_predictor.fit()
output = zest_race_predictor.transform(samp)

  0%|          | 0/11 [00:00<?, ?it/s][Parallel(n_jobs=49)]: Using backend ThreadingBackend with 49 concurrent workers.
100%|██████████| 11/11 [00:00<00:00, 38035.73it/s]

Data is loaded
   Formatting P1
   Formatting P2
reduce whitespace

[Start] Preparing geo data
  The following states are included in the data: ['GA']
   ... on state: GA

   Data is loaded
   [Start] Processing geo data
/d/shared/zrp/shared_data
      ...address cleaning
      ...replicating address
         ...Base
         ...Map street suffixes...



[Parallel(n_jobs=49)]: Done  11 out of  11 | elapsed:    0.0s finished


         ...Mapped & split by street suffixes...
         ...Number processing...

     Address dataframe expansion is complete! (n=17)
      ...formatting
   [Completed] Processing geo data
   [Start] Mapping geo data
      ...merge user input & lookup table
      ...mapping


  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


Output saved
   [Completed] Mapping geo data
[Completed] Preparing geo data

[Start] Preparing ACS data
User input data is loaded
   ...loading ACS lookup tables
   ... combining ACS & user input data
ZEST_KEY
 ...Copy dataframes
 ...Block group
 ...Census tract
 ...Zip code
 ...No match
 ...Merge
 ...Merging complete
[Complete] Preparing ACS data


  return self._constructor_sliced(result, index=self._get_agg_axis(axis))


Output saved
Handle Compounds (in transform): (11, 92)
Handle Compounds (in transform reset): (11, 93)
Handle Compounds (end transform): (12, 93)
App FE (in transform) (12, 94)


  0%|          | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=90)]: Using backend ThreadingBackend with 90 concurrent workers.
100%|██████████| 1/1 [00:00<00:00, 3155.98it/s]

App FE (in transform post data_fe 1) (12, 94)
App FE (in transform post data_fe 2) (12, 105)
App FE (end transform) (12, 105)
Custom Ratios (in transform) (12, 105)
Custom Ratios (end transform) (12, 126)
Name Aggregation (in transform) (12, 126)



[Parallel(n_jobs=90)]: Done   1 out of   1 | elapsed:    0.0s finished


(11, 15)
(11, 15)
Empty DataFrame
Columns: [HISPANIC_last_name, BLACK_middle_name, AAPI_middle_name, WHITE_last_name, BLACK_last_name, AAPI_last_name, AIAN_first_name, WHITE_middle_name, AAPI_first_name, HISPANIC_first_name, AIAN_middle_name, HISPANIC_middle_name, WHITE_first_name, AIAN_last_name, BLACK_first_name]
Index: []

(12, 126)

(11, 110)

(11, 15)

Output saved
CPU times: user 44.3 s, sys: 14.2 s, total: 58.5 s
Wall time: 27.9 s


  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


In [14]:
# Display the predictions with a fresh 0..n-1 index. drop=True discards the
# existing index instead of keeping it as a column; `output` is not reassigned.
output.reset_index(drop=True)

Unnamed: 0,AAPI,AIAN,BLACK,HISPANIC,WHITE,source_block_group
0,0.003693,0.012285,0.68988,0.010139,0.284002,1
1,0.003317,0.008846,0.08652,0.006012,0.895306,1
2,0.015528,0.001163,0.001545,0.025489,0.956274,1
3,0.13645,0.03533,0.537617,0.071682,0.218921,1
4,0.009961,0.000892,0.948494,0.010603,0.03005,1
5,0.00314,6.6e-05,0.000432,0.994575,0.001788,1
6,0.003461,0.002152,0.013817,0.007392,0.973178,1
7,0.013305,0.010755,0.013567,0.024665,0.937708,1
8,0.013309,0.001035,0.003674,0.024191,0.95779,1
9,0.016148,0.000652,0.125726,0.012625,0.844849,1
