# ZRP Example Usage
The purpose of this notebook is to illustrate how to use ZRP, the main class of the zrp package that processes user input data &  returns race/ethnicity predictions

In [31]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi=False

  and should_run_async(code)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
from os.path import join, expanduser
import pandas as pd
import sys
import os
import re
import warnings

## Set source code path here

In [33]:
warnings.filterwarnings(action='once')
home = expanduser('~')

src_path = os.path.dirname(os.getcwd())
src_path

'/d/shared/zrp/zrp'

In [34]:
from zrp import ZRP
from zrp.prepare.utils import load_file

## Load sample data for prediction
Load list of New Jersey Mayors downloaded from https://www.nj.gov/dca/home/2022mayors.csv 

In [35]:
nj_mayors = load_file(src_path + "/examples/2022-nj-mayors.csv")
nj_mayors.shape

(565, 18)

In [36]:
nj_mayors

Unnamed: 0,MUNI CODE,MUNI NAME,COUNTY,ADDRESS 1,ADDRESS 2,CITY,STATE,ZIP,PHONE,FAX,MAYOR NAME,TERM START,TERM END,FORM,TERM LEGNTH,EMAIL,SOCIAL MEDIA HANDLE,Municipal Contact List
0,1330,Aberdeen Township,Monmouth,One Aberdeen Square,,Aberdeen,NJ,07747-2300,(732) 583-4200,,Fred Tagliarini,,12/31/2025,COUNCIL-MANAGER,4,fred.tagliarini@aberdeennj.org,,
1,0101,Absecon City,Atlantic,Absecon Municipal Complex,500 Mill Road,Absecon,NJ,08201,(609) 641-0663,(609) 645-5098,Kimberly Horton,,12/31/2024,MAYOR-COUNCIL,3,khorton@abseconnj.org,,
2,1001,Alexandria Township,Hunterdon,782 Frenchtown Road,,Milford,NJ,08848,(908) 996-7071,,Gabe Plumer,,12/31/2022,TOWNSHIP,3,clerk@alexandrianj.gov,,
3,2101,Allamuchy Township,Warren,Post Office Box A,,Allamuchy,NJ,07820,(908) 852-5132,,Rosemary Tuohy,,12/31/2024,FAULKNER ACT,3,mayor@allamuchynj.org,,
4,0201,Allendale Borough,Bergen,500 West Crescent Avenue,,Allendale,NJ,07401,(201) 818-4400,,Ari Bernstein,,12/31/2022,,,aribernstein@allendalenj.gov,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,0269,Wood-Ridge Borough,Bergen,85 Humboldt Street,,Wood-Ridge,NJ,07075-2344,(201) 939-0202,,Paul A Sarlo,,12/31/2023,,,psarlo@njwoodridge.org,,
561,1715,Woodstown Borough,Salem,Post Office Box 286,,Woodstown,NJ,08098,(856) 769-2200,,Donald Dietrich,,12/31/2023,,,Don.dietrich@comcast.net,,
562,0824,Woolwich Township,Gloucester,120 Village Green Drive,,Woolwich Township,NJ,08085-3180,(856) 467-2666,,Craig Frederick,,12/31/2024,,,cfrederick@woolwichtwp.org,,
563,0340,Wrightstown Borough,Burlington,21 Saylors Pond Road,,Wrightstown,NJ,08562,(609) 723-4450,(609) 723-7137,Donald Cottrell,,12/31/2022,,,mayor@wrightstownborough.com,,


### Wrangle NJ mayor data for predictions
Prepare the NJ mayor data.  This parsing of the NJ mayors file will leave some NA's, but it is sufficient for demonstration purposes


In [37]:
zrp_sample = pd.DataFrame(columns=['first_name', 'middle_name', 'last_name', 'house_number', 'street_address', 'city', 'state', 'zip_code'])

Prepare Names

In [38]:
split_mayor_names = nj_mayors['MAYOR NAME'].str.split(' ')
zrp_sample['first_name'] = split_mayor_names.str[0]
zrp_sample['last_name'] = split_mayor_names.str[-1]

City, State, Zip

In [39]:
zrp_sample['city'] = nj_mayors['CITY']
zrp_sample['state'] = nj_mayors['STATE']
zrp_sample['zip_code'] = nj_mayors['ZIP']

Address

In [40]:
zrp_sample['house_number'] = nj_mayors['ADDRESS 1'].str.extract('([0-9]+)')
zrp_sample['street_address'] = nj_mayors['ADDRESS 1'].str.extract('.*[0-9]+([^0-9]+)')


In [41]:
zrp_sample['ZEST_KEY'] = zrp_sample.index.astype(str)  #must specify key to establish correspondence between inputs and outputs
zrp_sample

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY
0,Fred,,Tagliarini,,,Aberdeen,NJ,07747-2300,0
1,Kimberly,,Horton,,,Absecon,NJ,08201,1
2,Gabe,,Plumer,782,Frenchtown Road,Milford,NJ,08848,2
3,Rosemary,,Tuohy,,,Allamuchy,NJ,07820,3
4,Ari,,Bernstein,500,West Crescent Avenue,Allendale,NJ,07401,4
...,...,...,...,...,...,...,...,...,...
560,Paul,,Sarlo,85,Humboldt Street,Wood-Ridge,NJ,07075-2344,560
561,Donald,,Dietrich,286,,Woodstown,NJ,08098,561
562,Craig,,Frederick,120,Village Green Drive,Woolwich Township,NJ,08085-3180,562
563,Donald,,Cottrell,21,Saylors Pond Road,Wrightstown,NJ,08562,563


### Invoke the Zest Race Predictor on the sample data

In [42]:
%%time
zest_race_predictor = ZRP()
zest_race_predictor.fit()
zrp_output = zest_race_predictor.transform(zrp_sample)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/565 [00:00<?, ?it/s][A[Parallel(n_jobs=49)]: Using backend ThreadingBackend with 49 concurrent workers.
[Parallel(n_jobs=49)]: Done 102 tasks      | elapsed:    0.0s
[Parallel(n_jobs=49)]: Done 352 tasks      | elapsed:    0.0s
100%|██████████| 565/565 [00:00<00:00, 16291.19it/s]

Directory already exists
Data is loaded
   [Start] Validating input data
     Number of observations: 565
     Is key unique: True
   [Completed] Validating input data

   Formatting P1
   Formatting P2
   reduce whitespace

[Start] Preparing geo data

  The following states are included in the data: ['NJ']
   ... on state: NJ

   Data is loaded
   [Start] Processing geo data
/home/csw/.conda/envs/py37/lib/python3.7/site-packages/zrp/prepare/../data/processed
      ...address cleaning



[Parallel(n_jobs=49)]: Done 565 out of 565 | elapsed:    0.0s finished


      ...replicating address
         ...Base
         ...Map street suffixes...
         ...Mapped & split by street suffixes...
         ...Number processing...

         Address dataframe expansion is complete! (n=1003)
         ...Base
         ...Number processing...
         House number dataframe expansion is complete! (n=1003)
      ...formatting
   [Completed] Processing geo data
   [Start] Mapping geo data


  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


      ...merge user input & lookup table
      ...mapping


  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
100%|██████████| 1/1 [00:05<00:00,  5.21s/it]

Directory already exists
Output saved
   [Completed] Mapping geo data

[Completed] Preparing geo data

[Start] Preparing ACS data
   [Start] Validating ACS input data
     Number of observations: 565
     Is key unique: True
   [Completed] Validating ACS input data

   ...loading ACS lookup tables



  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


   ... combining ACS & user input data
 ...Copy dataframes
 ...Block group
 ...Census tract
 ...Zip code
 ...No match
 ...Merge
 ...Merging complete
[Complete] Preparing ACS data

   [Start] Validating pipeline input data
     Number of observations: 1629
     Is key unique: False
   [Completed] Validating pipeline input data



  0%|          | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 60 concurrent workers.
100%|██████████| 1/1 [00:00<00:00, 1046.48it/s]
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
  0%|          | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 60 concurrent workers.
100%|██████████| 1/1 [00:00<00:00, 1057.83it/s]
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
  0%|          | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 60 concurrent workers.
100%|██████████| 1/1 [00:00<00:00, 910.22it/s]
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


Directory already exists
Output saved


  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


Output saved
CPU times: user 50.3 s, sys: 6.46 s, total: 56.7 s
Wall time: 51.7 s


  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


### Inspect the output and join

In [43]:
zrp_output

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY,AAPI,AIAN,BLACK,HISPANIC,WHITE,race_proxy,source_block_group,source_census_tract,source_zip_code,source_bisg
0,Fred,,Tagliarini,,,Aberdeen,NJ,07747-2300,0,0.000445,0.000367,0.001623,0.000926,0.996639,WHITE,0.0,0.0,1.0,0.0
1,Kimberly,,Horton,,,Absecon,NJ,08201,1,0.020005,0.022037,0.385148,0.041842,0.530967,WHITE,0.0,0.0,1.0,0.0
2,Gabe,,Plumer,782,Frenchtown Road,Milford,NJ,08848,2,0.000117,0.000096,0.031971,0.002676,0.965141,WHITE,1.0,0.0,0.0,0.0
3,Rosemary,,Tuohy,,,Allamuchy,NJ,07820,3,0.000584,0.000502,0.001105,0.002336,0.995472,WHITE,0.0,0.0,1.0,0.0
4,Ari,,Bernstein,500,West Crescent Avenue,Allendale,NJ,07401,4,0.016728,0.008392,0.002305,0.050236,0.922338,WHITE,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,Paul,,Sarlo,85,Humboldt Street,Wood-Ridge,NJ,07075-2344,560,0.000146,0.000116,0.000424,0.000582,0.998732,WHITE,1.0,0.0,0.0,0.0
561,Donald,,Dietrich,286,,Woodstown,NJ,08098,561,0.009935,0.000523,0.007039,0.008381,0.974121,WHITE,0.0,0.0,1.0,0.0
562,Craig,,Frederick,120,Village Green Drive,Woolwich Township,NJ,08085-3180,562,0.012623,0.018644,0.190711,0.023586,0.754436,WHITE,0.0,0.0,1.0,0.0
563,Donald,,Cottrell,21,Saylors Pond Road,Wrightstown,NJ,08562,563,0.011897,0.027253,0.118932,0.023193,0.818726,WHITE,1.0,0.0,0.0,0.0


### Check the most likely Hispanic 

In [44]:
zrp_output.nlargest(10, "HISPANIC")

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY,AAPI,AIAN,BLACK,HISPANIC,WHITE,race_proxy,source_block_group,source_census_tract,source_zip_code,source_bisg
377,Hector,,Lora,330.0,Passaic Street,Passaic,NJ,07055-5815,377,0.000121,0.00023,0.001202,0.98207,0.016378,HISPANIC,1.0,0.0,0.0,0.0
286,Marcial,,Mojena,249.0,,Columbus,NJ,08022,286,0.00102,0.000309,0.001682,0.965945,0.031043,HISPANIC,0.0,0.0,1.0,0.0
536,Gabriel,,Rodriguez,428.0,th Street,West New York,NJ,07093-2222,536,0.001721,0.005429,0.002667,0.958142,0.032041,HISPANIC,0.0,0.0,1.0,0.0
388,Helmin,,Caba,,,Perth Amboy,NJ,08861,388,0.021552,0.000426,0.00847,0.953645,0.015907,HISPANIC,0.0,0.0,1.0,0.0
236,Alberto,,Santos,402.0,Kearny Avenue,Kearny,NJ,07032,236,0.019797,0.000704,0.002792,0.949895,0.026812,HISPANIC,1.0,0.0,0.0,0.0
543,Ray,,Arroyo,101.0,Washington Avenue,Westwood,NJ,07675,543,0.005317,0.006722,0.005754,0.917283,0.064924,HISPANIC,1.0,0.0,0.0,0.0
499,Manuel,,Figueiredo,,,Union,NJ,07083-3597,499,0.000594,0.000619,0.001788,0.916256,0.080742,HISPANIC,0.0,0.0,1.0,0.0
378,Andre,,Sayegh,125.0,st Floor,Paterson,NJ,07505-1414,378,0.075077,0.000379,0.005269,0.889816,0.029458,HISPANIC,0.0,0.0,1.0,0.0
418,Ramopn,,Hache,131.0,North Maple Avenue,Ridgewood,NJ,07450-3236,418,0.001468,0.000549,0.001547,0.84619,0.150247,HISPANIC,0.0,0.0,1.0,0.0
556,Carlos,,Rendo,188.0,Pascack Road,Woodcliff Lake,NJ,07677-7921,556,0.000315,0.000395,0.0013,0.837821,0.160169,HISPANIC,0.0,0.0,1.0,0.0


### Check the most likely Black

In [45]:
zrp_output.nlargest(10, "BLACK")

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY,AAPI,AIAN,BLACK,HISPANIC,WHITE,race_proxy,source_block_group,source_census_tract,source_zip_code,source_bisg
343,Ras,,Baraka,920.0,Broad Street,Newark,NJ,07102,343,0.000497,7e-05,0.978793,0.002025,0.018616,BLACK,1.0,0.0,0.0,0.0
215,Dahlia,,Vertreese,,,Hillside,NJ,07205,215,0.000495,0.000783,0.973164,0.009093,0.016466,BLACK,0.0,0.0,1.0,0.0
229,Anthony,,Vauss,,,Irvington,NJ,07111-2412,229,0.002764,0.001271,0.970029,0.016721,0.009215,BLACK,0.0,0.0,1.0,0.0
549,Tiffani,,Worthy,1.0,Salem Road,Willingboro,NJ,08046,549,0.00084,0.000422,0.963874,0.008897,0.025967,BLACK,1.0,0.0,0.0,0.0
370,Dwayne,,Warren,29.0,North Day Street,Orange,NJ,07050,370,0.002516,0.01233,0.96043,0.005826,0.018898,BLACK,0.0,0.0,1.0,0.0
397,Adrian,,Mapp,515.0,Watchung Avenue,Plainfield,NJ,07060-1720,397,0.004099,0.008725,0.944972,0.020197,0.022007,BLACK,1.0,0.0,0.0,0.0
258,Derek,,Armstead,301.0,North Wood Avenue,Linden,NJ,07036-4296,258,0.01619,0.009768,0.941253,0.002946,0.029844,BLACK,0.0,0.0,1.0,0.0
250,Mary,,Wardlow,4.0,East Douglas Avenue,Lawnside,NJ,08045-1597,250,0.000452,0.000842,0.925571,0.019787,0.053347,BLACK,0.0,0.0,1.0,0.0
78,Jamila,,Odom-Bremmer,201.0,Grant Avenue,Chesilhurst,NJ,08089,78,0.013521,0.016294,0.919407,0.018952,0.031827,BLACK,1.0,0.0,0.0,0.0
540,Sandy,,Henley,710.0,Rancocas Road,Westampton,NJ,08060-5642,540,0.021035,0.011138,0.808005,0.054403,0.105419,BLACK,1.0,0.0,0.0,0.0


In [46]:
bisg_probs = pd.read_feather(src_path + "/examples/artifacts/bisg_proxy_probs.feather")

  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


In [47]:
bisg_probs

Unnamed: 0,ZEST_KEY,WHITE,BLACK,AAPI,AIAN,OTHER,HISPANIC,race,source_bisg
0,270,0.968087,0.000000,0.001908,0.001175,0.005464,0.023365,WHITE,1
1,302,0.946149,0.011806,0.016053,0.000379,0.014805,0.010807,WHITE,1
2,356,0.885378,0.001248,0.076241,0.000000,0.006771,0.030363,WHITE,1
3,359,0.957466,0.001720,0.007673,0.000478,0.010949,0.021715,WHITE,1
4,2,0.990258,0.006154,0.000478,0.000463,0.000715,0.001932,WHITE,1
...,...,...,...,...,...,...,...,...,...
560,556,,,,,,,,1
561,557,,,,,,,,1
562,559,,,,,,,,1
563,560,,,,,,,,1
