Skip to content

Commit

Permalink
Merge pull request #20 from zestai/cw/dev
Browse files Browse the repository at this point in the history
Cw/dev
  • Loading branch information
chriswill21 committed Jun 3, 2022
2 parents 9b7faa8 + 4fdf68e commit 6cf2564
Show file tree
Hide file tree
Showing 8 changed files with 921 additions and 89 deletions.
755 changes: 755 additions & 0 deletions examples/build/zsub_build_ZRP.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions tests/data/data_descriptions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"sm_1": "A small pandas data frame with shape, (5, 14). Contains 5 entries of data which can be entered into ZRP including all requisite column names for ZRP to run.",
"sm_2": "A small pandas data frame with shape, (5, 14). Contains 5 entries of data which can be entered into ZRP including all requisite column names for ZRP to run. Contains missings, non-unique keys, and 100% of first_name is na.",
"sm_3": "A small pandas data frame with shape, (5, 14). Contains an unformatted row with extra spaces, additional numeric characters, and lowercase names. ",
"sm_4": "A small pandas data frame with shape, (5, 15). Contains a column not named the expected name (the 'race' column has been renamed to 'r').",
"sm_5": "A small pandas data frame with shape, (14, 15). The test dataset consists of valid address and name records, records without names but with valid ZRP addresses, records with invalid ZRP but valid BISG addresses and names, records with invalid ZRP and BISG addresses and names, and records with invalid ZRP and BISG addresses and no names. The expected behavior respectively for the dataset are proxies with 'source_zrp_block_group'/'source_zrp_census_tract'/'source_zrp_zip_code', 'source_zrp_block_group_geo_only'/'source_zrp_census_tract_geo_only'/'source_zrp_zip_code_geo_only', 'source_bisg','source_zrp_name_only', 'source_no_proxy'."
}
6 changes: 6 additions & 0 deletions tests/data/sm_1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
ZEST_KEY,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,original_race,race,original_sex,sex,age
SC_514427307,SAM,T,JONES,121,GRAYS MARKET RD,EARLY BRANCH,SC,29916,WHITE,WHITE,FEMALE,FEMALE,19
SC_514495249,HENRY,None,NGO,632,SANDBAR PT,CLOVER,SC,29710,ASIAN,AAPI,MALE,MALE,32
SC_514476575,GABBY,L,BRIDGES,605,KERSHAW ST,CHERAW,SC,29520,WHITE,WHITE,FEMALE,FEMALE,50
SC_514414510,JAMES,M,HORN,3401,DUNCAN ST,COLUMBIA,SC,29205,WHITE,WHITE,MALE,MALE,26
SC_514405450,LONDON,Z,ABARA,26,PECAN CIR,YORK,SC,29745,BLACK/AFRICAN,BLACK,FEMALE,FEMALE,22
6 changes: 6 additions & 0 deletions tests/data/sm_3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
ZEST_KEY,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,original_race,race,original_sex,sex,age
SC_514427307,sam,T,j0nes,1 2 1,GRAYS MARKET RD,EARLY BRANCH,SC,29916,WHITE,WHITE,FEMALE,FEMALE,19
SC_514495249,henry,None,ngo,632,SANDBAR PT,CLOVER,SC,29710,ASIAN,AAPI,MALE,MALE,32
SC_514476575,GABBY,L,BRIDGES,605,KERSHAW ST,CHERAW,SC,29520,WHITE,WHITE,FEMALE,FEMALE,50
SC_514414510,JAMES,M,HORN,3401,DUNCAN ST,COLUMBIA,SC,29205,WHITE,WHITE,MALE,MALE,26
SC_514405450,LONDON,Z,ABARA,26,PECAN CIR,YORK,SC,29745,BLACK/AFRICAN,BLACK,FEMALE,FEMALE,22
15 changes: 15 additions & 0 deletions tests/data/sm_5.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
,ZEST_KEY,first_name,middle_name,last_name,house_number,street_address,city,state,zip_code,original_race,race,original_sex,sex,age
0,GA_02144077,MICHAEL,EARL,WILLIAMS,3793,EMPERORS CV,SNELLVILLE,GA,30039,BH,BLACK,M,MALE,32
1,GA_03567641,REBEKAH,WILLIS,MITCHELL,20,TIMBERWOODS DR,COVINGTON,GA,30016,WH,WHITE,F,FEMALE,37
2,GA_06757359,JAMAAL,RASHAD,ROBINSON,200,ENGRACIA DR,WARNER ROBINS,GA,310885834,BH,BLACK,M,MALE,24
3,GA_07588296,JEDIDIAH,DALE,ARTMAN,5790,CHARLESTON LN,CUMMING,GA,30041,WH,WHITE,M,MALE,29
4,GA_07690722,RAAVIN,ROCKELLE-KATRICE,EVANS,714,CALIBRE WOODS DR NE,ATLANTA,GA,30329,BH,BLACK,F,FEMALE,27
5,GA_08063136,NIKITA,PATRICE,COBB,405,CAMERON LANDING DR,STOCKBRIDGE,GA,30281,BH,BLACK,F,FEMALE,23
6,GA_10561962,RUBI,None,MARTINEZ,156,HONEY BEAR RD,NORMAN PARK,GA,31771,HP,HISPANIC,F,FEMALE,25
7,GA_10961114,JEFFREY,STANLEY,BLACK,552,LEES TRCE SW,MARIETTA,GA,30064,WH,WHITE,M,MALE,30
8,GA_11003386,GRACE,ELIZABETH,MCMULLEN,416,7TH ST NE,ATLANTA,GA,30308,WH,WHITE,F,FEMALE,24
9,GA_11493478,JULIE,MAY,PIERIDES,120,NORTH AVE NW,ATLANTA,GA,30313,OT,OTHER,F,FEMALE,20
10,GA_11951308,,,,5145,BEDE DR NE,COVINGTON,GA,30014,BH,BLACK,M,MALE,52
11,GA_10561963,Christien,S,Williams,156,HONEY BEAR RD,S,CA,00130,HP,HISPANIC,F,FEMALE,25
12,GA_10561964,,,,156,HONEY BEAR RD,S,CA,00130,HP,HISPANIC,F,FEMALE,25
13,GA_10561965,Christien,S,Williams,156,HONEY BEAR RD,S,CA,601,HP,HISPANIC,F,FEMALE,25
210 changes: 124 additions & 86 deletions zrp/modeling/pipeline_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,17 @@ class ZRP_Build_Pipeline(BaseZRP):
zrp_model_name: str
Name of zrp_model
zrp_model_source: str
Indicates the source of zrp_modeling data to use. There are three optional sources 'bg' (for block_group), 'ct' (for census_tract), and 'zp' (for zip_code). By default 'census_tract' is inferred.
Indicates the source of zrp_modeling data to use. There are three optional sources 'block_group', 'census_tract', and 'zip_code'. By default 'census_tract' is inferred.
"""

def __init__(self, file_path=None, zrp_model_name='zrp_0', zrp_model_source='ct', *args, **kwargs):
def __init__(self, zrp_model_source, file_path=None, zrp_model_name='zrp_0', *args, **kwargs):
super().__init__(file_path=file_path, *args, **kwargs)
self.zrp_model_name = zrp_model_name
self.zrp_model_source = zrp_model_source
self.outputs_path = os.path.join(self.out_path,
"experiments",
self.zrp_model_name,
self.zrp_model_source,
"data")
self.zrp_model_source)
self.geo_key = 'GEOID'

def fit(self, X, y):
Expand Down Expand Up @@ -105,18 +104,17 @@ class ZRP_Build_Model(BaseZRP):
zrp_model_name: str
Name of zrp_model
zrp_model_source: str
Indicates the source of zrp_modeling data to use. There are three optional sources 'bg' (for block_group), 'ct' (for census_tract), and 'zp' (for zip_code). By default 'census_tract' is inferred.
Indicates the source of zrp_modeling data to use. There are three optional sources 'block_group', 'census_tract', and 'zip_code'. By default 'census_tract' is inferred.
"""

def __init__(self, file_path=None, zrp_model_name='zrp_0', zrp_model_source='ct', *args, **kwargs):
def __init__(self, zrp_model_source, file_path=None, zrp_model_name='zrp_0', *args, **kwargs):
super().__init__(file_path=file_path, *args, **kwargs)
self.zrp_model_name = zrp_model_name
self.zrp_model_source = zrp_model_source
self.outputs_path = os.path.join(self.out_path,
"experiments",
self.zrp_model_name,
self.zrp_model_source,
"data")
self.zrp_model_source)
self.geo_key = 'GEOID'

def fit(self, X, y):
Expand Down Expand Up @@ -144,10 +142,7 @@ def fit(self, X, y):

self.y_unique = y[self.race].unique()
self.y_unique.sort()

return self

def transform(self, X):

make_directory(self.outputs_path)
# Save zrp_model
pickle.dump(self.zrp_model, open(os.path.join(self.outputs_path, "zrp_model.pkl"), "wb"))
Expand All @@ -156,6 +151,10 @@ def transform(self, X):
except:
pass

return self

def transform(self, X):

##### Return Race Probabilities
print('\n---\nGenerate & save race predictions (labels)')
y_hat_train = pd.DataFrame({'race': self.zrp_model.predict(X)}, index=X.index)
Expand Down Expand Up @@ -183,18 +182,17 @@ class ZRP_DataSampling(BaseZRP):
zrp_model_name: str
Name of zrp_model
zrp_model_source: str
Indicates the source of zrp_modeling data to use. There are three optional sources 'bg' (for block_group), 'ct' (for census_tract), and 'zp' (for zip_code). By default 'census_tract' is inferred.
Indicates the source of zrp_modeling data to use. There are three optional sources 'block_group', 'census_tract', and 'zip_code'. By default 'census_tract' is inferred.
"""

def __init__(self, file_path=None, zrp_model_name='zrp_0', zrp_model_source='ct', *args, **kwargs):
def __init__(self, zrp_model_source, file_path=None, zrp_model_name='zrp_0', *args, **kwargs):
super().__init__(file_path=file_path, *args, **kwargs)
self.zrp_model_name = zrp_model_name
self.zrp_model_source = zrp_model_source
self.outputs_path = os.path.join(self.out_path,
"experiments",
self.zrp_model_name,
self.zrp_model_source,
"data")
self.zrp_model_source)
self.geo_key = 'GEOID'

def fit(self):
Expand Down Expand Up @@ -251,35 +249,45 @@ def transform(self, data):

class ZRP_Build(BaseZRP):
"""
This class builds a new custom ZRP model trained off of user input data. Supply standard ZRP requirements including name, address, and race to build your custom model-pipeline. Race & ethnicity probablities and labels are returned from this class. The pipeline, model, and supporting data is saved automatically to "./artifacts/experiments/{zrp_model_name}/{zrp_model_source}/data/" in the support files path defined.
This class builds a new custom ZRP model trained off of user input data. Supply standard ZRP requirements including name, address, and race to build your custom model-pipeline. The pipeline, model, and supporting data is saved automatically to "./artifacts/experiments/{zrp_model_name}/{zrp_model_source}/" in the support files path defined.
Parameters
----------
file_path: str
Path indicating where to put artifacts folder its files (pipeline, model, and supporting data), generated during intermediate steps.
zrp_model_name: str
Name of zrp_model.
zrp_model_source: str
Indicates the source of zrp_modeling data to use. There are three optional sources 'bg' (for block_group), 'ct' (for census_tract), and 'zp' (for zip_code). By default 'census_tract' is inferred.
"""

def __init__(self, file_path=None, zrp_model_name='zrp_0', zrp_model_source='ct', *args, **kwargs):
def __init__(self, file_path=None, zrp_model_name='zrp_0', *args, **kwargs):
super().__init__(file_path=file_path, *args, **kwargs)
self.zrp_model_name = zrp_model_name
self.zrp_model_source = zrp_model_source
self.outputs_path = os.path.join(self.out_path,
"experiments",
self.zrp_model_name,
self.zrp_model_source,
"data")
self.geo_key = 'GEOID'

def validate_input_columns(self, data):
    """
    Validate that the input data has the requisite columns to run ZRP Build.

    Parameters
    ----------
    data: DataFrame
        A pandas data frame of user input data.

    Returns
    -------
    bool
        True if all required modeling columns are present.

    Raises
    ------
    KeyError
        If any required modeling column is missing from the input data.
    """
    # get_column_names comes from BaseZRP and is presumed to return the list
    # of required input column names -- TODO confirm against BaseZRP.
    for name in self.get_column_names():
        if name not in data.columns:
            raise KeyError("Your input dataframe has incorrect columns provided. Ensure that the following data is in your input data frame: first_name, middle_name, last_name, house_number, street_address, city, state, zip_code, race. If you have provided this data, ensure that the column names for said data are either the same as the aforementioned data column names, or ensure that you have specified, via arguments, the column names for these data you have provided in your input data frame.")
    return True

def fit(self):
    """No-op fit kept for scikit-learn-style API compatibility; returns self for chaining."""
    return self

def transform(self, data):
make_directory(self.outputs_path)
sample_path = self.outputs_path
cur_path = dirname(__file__)

self.validate_input_columns(data)

# Prepare data
data = data.rename(columns = {self.first_name : "first_name",
self.middle_name : "middle_name",
Expand All @@ -290,68 +298,98 @@ def transform(self, data):
self.zip_code : "zip_code",
self.state : "state",
self.block_group : "block_group",
self.census_tract : "census_tract"
self.census_tract : "census_tract",
self.race: "race"
}
)
data = data.drop_duplicates(subset=['ZEST_KEY'])
z_prepare = ZRP_Prepare(file_path=self.file_path)
z_prepare.fit(data)
prepared_data = z_prepare.transform(data)

# Data Sampling
dsamp = ZRP_DataSampling(file_path=self.file_path)

X_train, X_test, y_train, y_test = dsamp.transform(prepared_data)

data = data.drop_duplicates(subset=['ZEST_KEY'])
print("Post-sampling shape: ", data.shape)

print("Unique labels: ", y_train['race'].unique())
print("Other unique labels: ", y_test['race'].unique())
cur_path = dirname(__file__)
feature_list = load_json(os.path.join(cur_path, f'feature_list_{self.zrp_model_source}.json'))

y_train = y_train.drop_duplicates(self.key)
train_keys = list(y_train[self.key].values)
X_train = X_train[X_train[self.key].isin(train_keys)]
X_train = X_train.drop_duplicates(self.key)

y_train[[self.geo_key, self.key]] = y_train[[self.geo_key, self.key]].astype(str)
sample_weights = y_train[[self.key, 'sample_weight']].copy()

if X_train.shape[0] != y_train.shape[0]:
raise AssertionError("Unexpected mismatch between shapes. There are duplicates in the data, please remove duplicates & resubmit the data")

#### Set Index
X_train.set_index(self.key, inplace=True)
y_train.set_index(self.key, inplace=True)
sample_weights.set_index(self.key, inplace=True)
X_train.sort_index(inplace=True)
y_train.sort_index(inplace=True)
sample_weights.sort_index(inplace=True)

feature_cols = list(set(X_train.columns) - set([self.key, self.geo_key, 'GEOID_BG', 'GEOID_CT',
'GEOID_ZIP', "first_name", "middle_name",
"last_name", 'ZEST_KEY_COL']))

print(' train to numeric')
X_train[feature_cols] = X_train[feature_cols].apply(pd.to_numeric, errors='coerce')

print('\n---\nSaving raw data')
save_feather(X_train, self.outputs_path, "train_raw_data.feather")
save_feather(y_train, self.outputs_path, "train_raw_targets.feather")

# Build Pipeline
build_pipe = ZRP_Build_Pipeline(file_path=self.file_path)
build_pipe.fit(X_train, y_train)
X_train_fe = build_pipe.transform(X_train)

# Build Model
build_model = ZRP_Build_Model(file_path=self.file_path)
build_model.fit(X_train_fe, y_train)
y_hat_train, y_phat_train = build_model.transform(X_train_fe)

pred_dict = {}
pred_dict['labels'] = y_hat_train
pred_dict['probablities'] = y_phat_train
return (pred_dict)
ft_list_source_map = {'census_tract': 'ct', 'block_group': 'bg', 'zip_code': 'zp'}
source_to_geoid_level_map = {'census_tract': 'GEOID_CT', 'block_group': 'GEOID_BG', 'zip_code': 'GEOID_ZIP'}
sources = ['block_group', 'census_tract', 'zip_code']

for source in sources:
print("=========================")
print(f"BUILDING {source} MODEL.")
print("=========================\n")
outputs_path = os.path.join(self.out_path,
"experiments",
self.zrp_model_name,
source)

make_directory(outputs_path)

# Get features to drop from prepared data
print(f"Dropping {list(set(sources).difference({source}))} features")

features_to_keep_list = load_json(os.path.join(cur_path, f'feature_list_{ft_list_source_map[source]}.json'))
features_to_keep_list.append('race')

print(" ...Len features to keep list: ", len(features_to_keep_list))

# Get records that can be geocoded down to given source geo level
geoid_level = source_to_geoid_level_map[source]
relevant_source_data = prepared_data[~prepared_data[geoid_level].isna()]

print(" ...Data shape pre feature drop: ", relevant_source_data.shape)
relevant_source_data = relevant_source_data[relevant_source_data.columns.intersection(features_to_keep_list)]
print(" ...Data shape post feature drop: ", relevant_source_data.shape)

# Data Sampling
dsamp = ZRP_DataSampling(file_path=self.file_path, zrp_model_source=source, zrp_model_name=self.zrp_model_name)

X_train, X_test, y_train, y_test = dsamp.transform(relevant_source_data)

data = data.drop_duplicates(subset=['ZEST_KEY'])
print("Post-sampling shape: ", data.shape)
print("\n")
print("Unique train labels: ", y_train['race'].unique())
print("Unique test labels: ", y_test['race'].unique())

y_train = y_train.drop_duplicates(self.key)
train_keys = list(y_train[self.key].values)
X_train = X_train[X_train[self.key].isin(train_keys)]
X_train = X_train.drop_duplicates(self.key)

y_train[[self.geo_key, self.key]] = y_train[[self.geo_key, self.key]].astype(str)
sample_weights = y_train[[self.key, 'sample_weight']].copy()

if X_train.shape[0] != y_train.shape[0]:
raise AssertionError("Unexpected mismatch between shapes. There are duplicates in the data, please remove duplicates & resubmit the data")

#### Set Index
X_train.set_index(self.key, inplace=True)
y_train.set_index(self.key, inplace=True)
sample_weights.set_index(self.key, inplace=True)
X_train.sort_index(inplace=True)
y_train.sort_index(inplace=True)
sample_weights.sort_index(inplace=True)

feature_cols = list(set(X_train.columns) - set([self.key, self.geo_key, 'GEOID_BG', 'GEOID_CT',
'GEOID_ZIP', "first_name", "middle_name",
"last_name", 'ZEST_KEY_COL']))

print(' train to numeric')
X_train[feature_cols] = X_train[feature_cols].apply(pd.to_numeric, errors='coerce')

print('\n---\nSaving raw data')
save_feather(X_train, outputs_path, "train_raw_data.feather")
save_feather(y_train, outputs_path, "train_raw_targets.feather")

# Build Pipeline
build_pipe = ZRP_Build_Pipeline(file_path=self.file_path, zrp_model_source=source, zrp_model_name=self.zrp_model_name)
build_pipe.fit(X_train, y_train)
X_train_fe = build_pipe.transform(X_train)

# Build Model
build_model = ZRP_Build_Model(file_path=self.file_path, zrp_model_source=source, zrp_model_name=self.zrp_model_name)
build_model.fit(X_train_fe, y_train)

print(f"Completed building {source} model.")

print("\n##############################")
print("Custom ZRP model build complete.")

Loading

0 comments on commit 6cf2564

Please sign in to comment.