# ZRP Example Usage
The purpose of this notebook is to illustrate how to use ZRP, the main class of the zrp package that processes user input data &  returns race/ethnicity predictions

In [1]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi=False

In [2]:
from os.path import join, expanduser
import pandas as pd
import sys
import os
import re
import warnings

## Set source code path here

In [3]:
warnings.filterwarnings(action='once')
home = expanduser('~')

src_path = '{}/zest/zrp'.format(home)
sys.path.append(src_path)

In [4]:
from zrp import ZRP
from zrp.prepare.utils import load_file

  from scipy.sparse.csr import csr_matrix
  version = LooseVersion(pd.__version__)
  from pandas import Int64Index as NumericIndex
  from pandas import MultiIndex, Int64Index


## Load sample data for prediction
Load list of New Jersey Mayors downloaded from https://www.nj.gov/dca/home/2022mayors.csv 

In [5]:
nj_mayors = load_file(src_path + "/examples/2022-nj-mayors.csv")
nj_mayors.shape

(565, 18)

In [6]:
nj_mayors

Unnamed: 0,MUNI CODE,MUNI NAME,COUNTY,ADDRESS 1,ADDRESS 2,CITY,STATE,ZIP,PHONE,FAX,MAYOR NAME,TERM START,TERM END,FORM,TERM LEGNTH,EMAIL,SOCIAL MEDIA HANDLE,Municipal Contact List
0,1330,Aberdeen Township,Monmouth,One Aberdeen Square,,Aberdeen,NJ,07747-2300,(732) 583-4200,,Fred Tagliarini,,12/31/2025,COUNCIL-MANAGER,4,fred.tagliarini@aberdeennj.org,,
1,0101,Absecon City,Atlantic,Absecon Municipal Complex,500 Mill Road,Absecon,NJ,08201,(609) 641-0663,(609) 645-5098,Kimberly Horton,,12/31/2024,MAYOR-COUNCIL,3,khorton@abseconnj.org,,
2,1001,Alexandria Township,Hunterdon,782 Frenchtown Road,,Milford,NJ,08848,(908) 996-7071,,Gabe Plumer,,12/31/2022,TOWNSHIP,3,clerk@alexandrianj.gov,,
3,2101,Allamuchy Township,Warren,Post Office Box A,,Allamuchy,NJ,07820,(908) 852-5132,,Rosemary Tuohy,,12/31/2024,FAULKNER ACT,3,mayor@allamuchynj.org,,
4,0201,Allendale Borough,Bergen,500 West Crescent Avenue,,Allendale,NJ,07401,(201) 818-4400,,Ari Bernstein,,12/31/2022,,,aribernstein@allendalenj.gov,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,0269,Wood-Ridge Borough,Bergen,85 Humboldt Street,,Wood-Ridge,NJ,07075-2344,(201) 939-0202,,Paul A Sarlo,,12/31/2023,,,psarlo@njwoodridge.org,,
561,1715,Woodstown Borough,Salem,Post Office Box 286,,Woodstown,NJ,08098,(856) 769-2200,,Donald Dietrich,,12/31/2023,,,Don.dietrich@comcast.net,,
562,0824,Woolwich Township,Gloucester,120 Village Green Drive,,Woolwich Township,NJ,08085-3180,(856) 467-2666,,Craig Frederick,,12/31/2024,,,cfrederick@woolwichtwp.org,,
563,0340,Wrightstown Borough,Burlington,21 Saylors Pond Road,,Wrightstown,NJ,08562,(609) 723-4450,(609) 723-7137,Donald Cottrell,,12/31/2022,,,mayor@wrightstownborough.com,,


### Wrangle NJ mayor data for predictions
Prepare the NJ mayor data.  This parsing of the NJ mayors file will leave some NA's, but it is sufficient for demonstration purposes


In [7]:
zrp_sample = pd.DataFrame(columns=['first_name', 'middle_name', 'last_name', 'house_number', 'street_address', 'city', 'state', 'zip_code'])

Prepare Names

In [8]:
split_mayor_names = nj_mayors['MAYOR NAME'].str.split(' ')
zrp_sample['first_name'] = split_mayor_names.str[0]
zrp_sample['last_name'] = split_mayor_names.str[-1]

City, State, Zip

In [9]:
zrp_sample['city'] = nj_mayors['CITY']
zrp_sample['state'] = nj_mayors['STATE']
zrp_sample['zip_code'] = nj_mayors['ZIP']

Address

In [10]:
zrp_sample['house_number'] = nj_mayors['ADDRESS 1'].str.extract('([0-9]+)')
zrp_sample['street_address'] = nj_mayors['ADDRESS 1'].str.extract('.*[0-9]+([^0-9]+)')


In [11]:
zrp_sample['ZEST_KEY'] = zrp_sample.index.astype(str)  #must specify key to establish correspondence between inputs and outputs
zrp_sample

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY
0,Fred,,Tagliarini,,,Aberdeen,NJ,07747-2300,0
1,Kimberly,,Horton,,,Absecon,NJ,08201,1
2,Gabe,,Plumer,782,Frenchtown Road,Milford,NJ,08848,2
3,Rosemary,,Tuohy,,,Allamuchy,NJ,07820,3
4,Ari,,Bernstein,500,West Crescent Avenue,Allendale,NJ,07401,4
...,...,...,...,...,...,...,...,...,...
560,Paul,,Sarlo,85,Humboldt Street,Wood-Ridge,NJ,07075-2344,560
561,Donald,,Dietrich,286,,Woodstown,NJ,08098,561
562,Craig,,Frederick,120,Village Green Drive,Woolwich Township,NJ,08085-3180,562
563,Donald,,Cottrell,21,Saylors Pond Road,Wrightstown,NJ,08562,563


### Invoke the Zest Race Predictor on the sample data

In [12]:
%%time
zest_race_predictor = ZRP()
zest_race_predictor.fit()
zrp_output = zest_race_predictor.transform(zrp_sample)

Directory already exists
Data is loaded
   Formatting P1
   Formatting P2
reduce whitespace

[Start] Preparing geo data
  The following states are included in the data: ['NJ']
   ... on state: NJ

   Data is loaded
   [Start] Processing geo data
/Users/j/zest/zrp/zrp/prepare/../data/processed
      ...address cleaning


  0%|                                                   | 0/565 [00:00<?, ?it/s][Parallel(n_jobs=49)]: Using backend ThreadingBackend with 49 concurrent workers.
[Parallel(n_jobs=49)]: Done 102 tasks      | elapsed:    0.0s
[Parallel(n_jobs=49)]: Done 352 tasks      | elapsed:    0.0s
100%|██████████████████████████████████████| 565/565 [00:00<00:00, 14604.39it/s]

      ...replicating address
         ...Base
         ...Map street suffixes...



[Parallel(n_jobs=49)]: Done 565 out of 565 | elapsed:    0.0s finished


         ...Mapped & split by street suffixes...
         ...Number processing...

     Address dataframe expansion is complete! (n=1010)
      ...formatting
   [Completed] Processing geo data
   [Start] Mapping geo data
      ...merge user input & lookup table
      ...mapping
Directory already exists
Output saved
   [Completed] Mapping geo data
[Completed] Preparing geo data

[Start] Preparing ACS data
   ...loading ACS lookup tables
   ... combining ACS & user input data
ZEST_KEY
 ...Copy dataframes
 ...Block group
 ...Census tract
 ...Zip code
 ...No match
 ...Merge
 ...Merging complete
[Complete] Preparing ACS data


  identifiedRaces = subset.idxmax(axis=1)


Output saved


  joint_result = non_compound.append(compound_result).reset_index(drop=True)
  0%|                                                     | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 4485.89it/s]
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
  joint_result = non_compound.append(compound_result).reset_index(drop=True)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['B01003_001'], 5)
  data[col+'_pct'] = round(data[col]/ data['

Directory already exists
Output saved
CPU times: user 36.1 s, sys: 5.51 s, total: 41.6 s
Wall time: 35.7 s


  identifiedRaces = subset.idxmax(axis=1)


### Inspect the output and join

In [13]:
zrp_output

Unnamed: 0,AAPI,AIAN,BLACK,HISPANIC,WHITE,source_block_group,source_zip_code,OTHER,source_bisg
270,0.003071,0.113848,0.003382,0.006237,0.873462,1.0,,,
302,0.063315,0.000668,0.016008,0.065184,0.854824,1.0,,,
356,0.000608,0.000292,0.000695,0.001195,0.997210,1.0,,,
359,0.000897,0.000425,0.002105,0.038322,0.958252,1.0,,,
1,0.034340,0.016307,0.268386,0.023059,0.657908,,1.0,,
...,...,...,...,...,...,...,...,...,...
88,0.038122,0.000537,0.002971,0.027246,0.931124,,1.0,,
91,0.069381,0.000452,0.004539,0.022942,0.902686,,1.0,,
94,0.024983,0.008647,0.001998,0.004494,0.959878,,1.0,,
97,0.008535,0.000498,0.044080,0.008873,0.938014,,1.0,,


In [14]:
zrp_output['ZEST_KEY'] = zrp_output.index.astype(str)
joined = pd.merge(left=zrp_sample, right=zrp_output, how='left', left_on='ZEST_KEY', right_on='ZEST_KEY')
joined

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY,AAPI,AIAN,BLACK,HISPANIC,WHITE,source_block_group,source_zip_code,OTHER,source_bisg
0,Fred,,Tagliarini,,,Aberdeen,NJ,07747-2300,0,,,,,,,,,
1,Kimberly,,Horton,,,Absecon,NJ,08201,1,0.034340,0.016307,0.268386,0.023059,0.657908,,1.0,,
2,Gabe,,Plumer,782,Frenchtown Road,Milford,NJ,08848,2,0.000230,0.000135,0.010123,0.000238,0.989275,,1.0,,
3,Rosemary,,Tuohy,,,Allamuchy,NJ,07820,3,0.000989,0.000454,0.000942,0.001634,0.995982,,1.0,,
4,Ari,,Bernstein,500,West Crescent Avenue,Allendale,NJ,07401,4,0.033730,0.007690,0.002028,0.037997,0.918556,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,Paul,,Sarlo,85,Humboldt Street,Wood-Ridge,NJ,07075-2344,560,,,,,,,,,
561,Donald,,Dietrich,286,,Woodstown,NJ,08098,561,0.019148,0.000475,0.005890,0.006393,0.968095,,1.0,,
562,Craig,,Frederick,120,Village Green Drive,Woolwich Township,NJ,08085-3180,562,,,,,,,,,
563,Donald,,Cottrell,21,Saylors Pond Road,Wrightstown,NJ,08562,563,0.002410,0.011150,0.044157,0.004736,0.937548,,1.0,,


### Check the most likely Hispanic 

In [15]:
joined.nlargest(10, "HISPANIC")

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY,AAPI,AIAN,BLACK,HISPANIC,WHITE,source_block_group,source_zip_code,OTHER,source_bisg
286,Marcial,,Mojena,249.0,,Columbus,NJ,8022,286,0.003759,0.000455,0.002409,0.957815,0.035562,,1.0,,
388,Helmin,,Caba,,,Perth Amboy,NJ,8861,388,0.089394,0.000665,0.014554,0.867471,0.027916,,1.0,,
543,Ray,,Arroyo,101.0,Washington Avenue,Westwood,NJ,7675,543,0.046443,0.007562,0.007524,0.80878,0.129691,,1.0,,
236,Alberto,,Santos,402.0,Kearny Avenue,Kearny,NJ,7032,236,0.163086,0.000413,0.003644,0.7791,0.053758,,1.0,,
398,Peter,,Cantu,641.0,Plainsboro Road,Plainsboro,NJ,8536,398,0.095828,0.019481,0.008296,0.6884,0.187995,,1.0,,
310,Al,,Ferro,240.0,,Perrineville,NJ,8535,310,0.096688,0.00115,0.010128,0.616156,0.275877,,1.0,,
231,Michael,,Reina,95.0,West Veterans Highway,Jackson,NJ,8527,231,0.003557,0.045545,0.014217,0.473806,0.462875,,1.0,,
263,Mauro,,Raguseo,215.0,Liberty Street,Little Ferry,NJ,7643,263,0.413985,0.001932,0.007669,0.357151,0.219262,,1.0,,
202,Louis,,Manzo,114.0,Bridgeton Pike,Mullica Hill,NJ,8062,202,0.032235,0.000335,0.000879,0.321358,0.645194,,1.0,,
299,Christine,,Serrano-Glassner,2.0,West Main Street,Mendham,NJ,7945,299,0.031365,0.000413,0.001019,0.240629,0.726573,,1.0,,


### Check the most likely Black

In [16]:
joined.nlargest(10, "BLACK")

Unnamed: 0,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,ZEST_KEY,AAPI,AIAN,BLACK,HISPANIC,WHITE,source_block_group,source_zip_code,OTHER,source_bisg
343,Ras,,Baraka,920.0,Broad Street,Newark,NJ,7102,343,0.000172,0.000132,0.991151,0.002718,0.005827,,1.0,,
215,Dahlia,,Vertreese,,,Hillside,NJ,7205,215,0.001001,0.000785,0.972889,0.007082,0.018243,,1.0,,
549,Tiffani,,Worthy,1.0,Salem Road,Willingboro,NJ,8046,549,0.005268,0.000557,0.969393,0.00119,0.023592,,1.0,,
370,Dwayne,,Warren,29.0,North Day Street,Orange,NJ,7050,370,0.004937,0.012164,0.954348,0.004891,0.023661,,1.0,,
78,Jamila,,Odom-Bremmer,201.0,Grant Avenue,Chesilhurst,NJ,8089,78,0.071559,0.022216,0.760723,0.009119,0.136383,,1.0,,
330,Kareem,,Pritchett,100.0,Mount Laurel Road,Mount Laurel,NJ,8054,330,0.007597,0.044503,0.743321,0.059827,0.144752,,1.0,,
13,Marty,,Small,,,Atlantic City,NJ,8401,13,0.006202,0.014564,0.722801,0.006486,0.249946,,1.0,,
384,Ladaena,,Thomas,527.0,,Penns Grove,NJ,8069,384,0.132811,0.013427,0.704326,0.026943,0.122493,,1.0,,
423,Corey,,Kimble,237.0,S. Pavilion Avenue,Riverside,NJ,8075,423,0.050291,0.01075,0.592927,0.00648,0.339551,,1.0,,
295,Glenn,,Ewan,590.0,Main Street,Leesburg,NJ,8327,295,0.001342,0.000848,0.570913,0.001679,0.425217,,1.0,,
