## Data Cleaning for New York

In [289]:
import geopandas as gpd
import maup

maup.progress.enabled = True

### Import the necessary files from the redistricting hub:

- **ny_pl2020_vtd**: https://redistrictingdatahub.org/dataset/new-york-vtd-pl-94171-2020/
- **ny_vest_18**: https://redistrictingdatahub.org/dataset/vest-2018-new-york-precinct-and-election-results/
- **ny_pl2020_cd**: https://redistrictingdatahub.org/dataset/new-york-congressional-district-pl-94171-2020/
- **ny_cong_adopted_2022**: https://redistrictingdatahub.org/dataset/2022-new-york-congressional-districts-plan/

In [290]:
population_df = gpd.read_file("./new_states/vt_pl2020_cd/vt_pl2020_cd.shp")
# election_df = gpd.read_file("./new_states/wa_vest_20/wa_vest_20.shp")
# cong_df = gpd.read_file("./new_states/wa_pl2020_cd/wa_pl2020_cd.shp")

In [291]:
# population_df = gpd.read_file("./new_states/wa_pl2020_vtd/wa_pl2020_vtd.shp")
election_df = gpd.read_file("./new_states/vt_vest_20/vt_vest_20.shp")
# cong_df = gpd.read_file("./new_states/wa_pl2020_cd/wa_pl2020_cd.shp")

In [292]:
# population_df = gpd.read_file("./new_states/wa_pl2020_vtd/wa_pl2020_vtd.shp")
# election_df = gpd.read_file("./new_states/wa_vest_20/wa_vest_20.shp")
cong_df = gpd.read_file("./new_states/vt_2020_gen_2020_blocks/vt_2020_gen_2020_blocks.shp")

#### Let's clean up the column names from ```election_df```, and remove the ones we don't need

In [293]:
for i in population_df.columns:
    print(i)

STATEFP20
GEOID20
CD116FP
NAMELSAD20
LSAD20
CDSESSN
MTFCC20
FUNCSTAT20
ALAND20
AWATER20
INTPTLAT20
INTPTLON20
FILEID
STUSAB
SUMLEV
GEOVAR
GEOCOMP
CHARITER
LOGRECNO
GEOID
GEOCODE
REGION
DIVISION
STATE
STATENS
CD116
AREALAND
AREAWATR
BASENAME
NAME
FUNCSTAT
POP100
HU100
INTPTLAT
INTPTLON
LSADC
P0010001
P0010002
P0010003
P0010004
P0010005
P0010006
P0010007
P0010008
P0010009
P0010010
P0010011
P0010012
P0010013
P0010014
P0010015
P0010016
P0010017
P0010018
P0010019
P0010020
P0010021
P0010022
P0010023
P0010024
P0010025
P0010026
P0010027
P0010028
P0010029
P0010030
P0010031
P0010032
P0010033
P0010034
P0010035
P0010036
P0010037
P0010038
P0010039
P0010040
P0010041
P0010042
P0010043
P0010044
P0010045
P0010046
P0010047
P0010048
P0010049
P0010050
P0010051
P0010052
P0010053
P0010054
P0010055
P0010056
P0010057
P0010058
P0010059
P0010060
P0010061
P0010062
P0010063
P0010064
P0010065
P0010066
P0010067
P0010068
P0010069
P0010070
P0010071
P0020001
P0020002
P0020003
P0020004
P0020005
P0020006
P0020007
P00200

In [294]:
for i in cong_df.columns:
    print(i)

GEOID20
STATEFP
COUNTYFP
PRECINCTID
VAP_MOD
G20PREDBID
G20PRERTRU
G20PRELJOR
G20PREGHAW
G20PREIWES
G20PREOOTH
G20PREOSAN
G20PREOWRI
G20HALDWEL
G20HALDRBE
G20HALCHEL
G20HALIBEC
G20HALIHOR
G20HALIORR
G20HALITRU
G20HALOWRI
G20GOVDZUC
G20GOVRSCO
G20GOVIHOY
G20GOVTPEY
G20GOVIWHI
G20GOVIBIL
G20GOVIDEV
G20GOVUDIC
G20GOVOWRI
G20LTGDGRA
G20LTGRMIL
G20LTGPERI
G20LTGIBIL
G20LTGBCOR
G20LTGOWRI
G20ATGDDON
G20ATGRPAI
G20ATGPERI
G20ATGOWRI
G20SOSDCON
G20SOSRPAI
G20SOSPERI
G20SOSISMI
G20SOSOWRI
G20TREDPEA
G20TRERBRA
G20TREPERI
G20TREIWRI
G20TREOWRI
G20AUDOHOF
G20AUDPERI
G20AUDOWRI
geometry


In [295]:
election_df.columns

Index(['STATEFP20', 'COUNTYFP20', 'NAME20', 'G20PREDBID', 'G20PRERTRU',
       'G20PRELJOR', 'G20PREGHAW', 'G20PREIWES', 'G20PREOOTH', 'G20PREOSAN',
       'G20PREOWRI', 'G20HALDWEL', 'G20HALDRBE', 'G20HALCHEL', 'G20HALIBEC',
       'G20HALIHOR', 'G20HALIORR', 'G20HALITRU', 'G20HALOWRI', 'G20GOVDZUC',
       'G20GOVRSCO', 'G20GOVIHOY', 'G20GOVTPEY', 'G20GOVIWHI', 'G20GOVIBIL',
       'G20GOVIDEV', 'G20GOVUDIC', 'G20GOVOWRI', 'G20LTGDGRA', 'G20LTGRMIL',
       'G20LTGPERI', 'G20LTGIBIL', 'G20LTGBCOR', 'G20LTGOWRI', 'G20ATGDDON',
       'G20ATGRPAI', 'G20ATGPERI', 'G20ATGOWRI', 'G20SOSDCON', 'G20SOSRPAI',
       'G20SOSPERI', 'G20SOSISMI', 'G20SOSOWRI', 'G20TREDPEA', 'G20TRERBRA',
       'G20TREPERI', 'G20TREIWRI', 'G20TREOWRI', 'G20AUDOHOF', 'G20AUDPERI',
       'G20AUDOWRI', 'geometry'],
      dtype='object')

In [296]:
# "G18SEND", "G18SENR"

replacing_columns_info = {
    "G20PREDBID": "G18PRED",
    "G20PRERTRU": "G18PRER",
    "G20GOVDZUC": "G18GOVD",
    "G20GOVRSCO": "G18GOVR",
    # "G18COMDDIN": "G18COMD",
    # "G18COMRTRI": "G18COMR",
    "G20ATGDDON": "G18ATGD",
    "G20ATGRPAI": "G18ATGR",
    # "G20ATGDFER": "G18ATGD",
    # "G20ATGRLAR": "G18ATGR"
}

election_df.rename(columns=replacing_columns_info, inplace=True)

In [297]:
election_df.columns

Index(['STATEFP20', 'COUNTYFP20', 'NAME20', 'G18PRED', 'G18PRER', 'G20PRELJOR',
       'G20PREGHAW', 'G20PREIWES', 'G20PREOOTH', 'G20PREOSAN', 'G20PREOWRI',
       'G20HALDWEL', 'G20HALDRBE', 'G20HALCHEL', 'G20HALIBEC', 'G20HALIHOR',
       'G20HALIORR', 'G20HALITRU', 'G20HALOWRI', 'G18GOVD', 'G18GOVR',
       'G20GOVIHOY', 'G20GOVTPEY', 'G20GOVIWHI', 'G20GOVIBIL', 'G20GOVIDEV',
       'G20GOVUDIC', 'G20GOVOWRI', 'G20LTGDGRA', 'G20LTGRMIL', 'G20LTGPERI',
       'G20LTGIBIL', 'G20LTGBCOR', 'G20LTGOWRI', 'G18ATGD', 'G18ATGR',
       'G20ATGPERI', 'G20ATGOWRI', 'G20SOSDCON', 'G20SOSRPAI', 'G20SOSPERI',
       'G20SOSISMI', 'G20SOSOWRI', 'G20TREDPEA', 'G20TRERBRA', 'G20TREPERI',
       'G20TREIWRI', 'G20TREOWRI', 'G20AUDOHOF', 'G20AUDPERI', 'G20AUDOWRI',
       'geometry'],
      dtype='object')

In [298]:
safe_cols = ["geometry", 'COUNTYFP20', 'NAME20', 'PRECCODE', 'COUNTYNAME', 'ST_CODE', 'PRECNAME'] + list(replacing_columns_info.values())

cols_to_drop = [i for i in election_df.columns if i.startswith("G20")]

# print(cols_to_drop)
election_df.drop(columns=cols_to_drop, inplace=True)

#### Check to make sure we only have the election columns we need

In [288]:
election_df.columns

Index(['STATEFP20', 'COUNTYFP20', 'NAME20', 'G18PRED', 'G18PRER', 'geometry'], dtype='object')

#### Find the column from ```cong_df``` that gives us the unique district identifier

In [299]:
cong_df.head()

Unnamed: 0,GEOID20,STATEFP,COUNTYFP,PRECINCTID,VAP_MOD,G20PREDBID,G20PRERTRU,G20PRELJOR,G20PREGHAW,G20PREIWES,...,G20SOSOWRI,G20TREDPEA,G20TRERBRA,G20TREPERI,G20TREIWRI,G20TREOWRI,G20AUDOHOF,G20AUDPERI,G20AUDOWRI,geometry
0,500019601001000,50,1,Starksboro,35,15.84,8.3,0.45,0.05,0.17,...,0.0,13.75,8.05,1.27,1.42,0.0,19.38,3.21,0.17,"POLYGON ((-73.01554 44.29290, -73.01545 44.292..."
1,500019601001001,50,1,Starksboro,48,21.73,11.38,0.61,0.07,0.24,...,0.0,18.86,11.03,1.74,1.95,0.0,26.58,4.41,0.24,"POLYGON ((-73.01559 44.29146, -73.01555 44.291..."
2,500019601001002,50,1,Starksboro,24,10.86,5.69,0.31,0.03,0.12,...,0.0,9.43,5.52,0.87,0.97,0.0,13.29,2.2,0.12,"POLYGON ((-73.01678 44.27761, -73.01675 44.277..."
3,500019601001003,50,1,Starksboro,78,35.31,18.49,1.0,0.11,0.39,...,0.0,30.64,17.93,2.83,3.16,0.0,43.19,7.16,0.39,"POLYGON ((-73.04444 44.23707, -73.04427 44.237..."
4,500019601001004,50,1,Starksboro,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((-72.99836 44.27129, -72.99834 44.271..."


##### *We'll save column DISTRICT for later*

In [300]:
district_col_name = "DISTRICT"

#### Next we need to align the precincts from the 2018 data with the 2020 census data, but first we must make sure the CRS values match

In [305]:
print(population_df.crs)
print(cong_df.crs)
print(election_df.crs)

EPSG:4269
EPSG:4269
EPSG:4269


##### Looks like the CRS values don't match. This will cause issues while calling ```maup.assign()```, so let's update them to match `population_df`

In [316]:
# election_df.to_crs(population_df.crs, inplace=True)
# cong_df.to_crs(population_df.crs, inplace=True)

population_df.to_crs(election_df.crs, inplace=True)
cong_df.to_crs(election_df.crs, inplace=True)

print(population_df.crs)
print(cong_df.crs)

EPSG:4269
EPSG:4269


#### Great! Now lets go ahead and get our mappings between the census and election data

In [317]:
# vtds_to_precincts_assignment = maup.assign(election_df.geometry, population_df.geometry)

vtds_to_precincts_assignment = maup.assign(election_df.geometry, population_df.geometry)


100%|██████████| 1/1 [00:00<00:00, 11.79it/s]


##### A lot of these columns names don't make sense, but we'll copy the population data columns from `population_df` into `election_df` for now

In [318]:
pop_column_names = ['P0020001', 'P0020002', 'P0020005', 'P0020006',
                    'P0020007', 'P0020008', 'P0020009', 'P0020010']

vap_column_names = ['P0040001', 'P0040002', 'P0040005', 'P0040006',
                    'P0040007', 'P0040008', 'P0040009', 'P0040010']

In [319]:
for i in population_df.columns:
    print(i)

STATEFP20
GEOID20
CD116FP
NAMELSAD20
LSAD20
CDSESSN
MTFCC20
FUNCSTAT20
ALAND20
AWATER20
INTPTLAT20
INTPTLON20
FILEID
STUSAB
SUMLEV
GEOVAR
GEOCOMP
CHARITER
LOGRECNO
GEOID
GEOCODE
REGION
DIVISION
STATE
STATENS
CD116
AREALAND
AREAWATR
BASENAME
NAME
FUNCSTAT
POP100
HU100
INTPTLAT
INTPTLON
LSADC
P0010001
P0010002
P0010003
P0010004
P0010005
P0010006
P0010007
P0010008
P0010009
P0010010
P0010011
P0010012
P0010013
P0010014
P0010015
P0010016
P0010017
P0010018
P0010019
P0010020
P0010021
P0010022
P0010023
P0010024
P0010025
P0010026
P0010027
P0010028
P0010029
P0010030
P0010031
P0010032
P0010033
P0010034
P0010035
P0010036
P0010037
P0010038
P0010039
P0010040
P0010041
P0010042
P0010043
P0010044
P0010045
P0010046
P0010047
P0010048
P0010049
P0010050
P0010051
P0010052
P0010053
P0010054
P0010055
P0010056
P0010057
P0010058
P0010059
P0010060
P0010061
P0010062
P0010063
P0010064
P0010065
P0010066
P0010067
P0010068
P0010069
P0010070
P0010071
P0020001
P0020002
P0020003
P0020004
P0020005
P0020006
P0020007
P00200

In [320]:
election_df[pop_column_names] = population_df[vap_column_names].groupby(vtds_to_precincts_assignment).sum()

election_df[pop_column_names].head()

Unnamed: 0,P0020001,P0020002,P0020005,P0020006,P0020007,P0020008,P0020009,P0020010
0,524482.0,11113.0,474146.0,6143.0,1632.0,9117.0,144.0,1929.0
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,


In [321]:
population_df.columns

Index(['STATEFP20', 'GEOID20', 'CD116FP', 'NAMELSAD20', 'LSAD20', 'CDSESSN',
       'MTFCC20', 'FUNCSTAT20', 'ALAND20', 'AWATER20',
       ...
       'P0050002', 'P0050003', 'P0050004', 'P0050005', 'P0050006', 'P0050007',
       'P0050008', 'P0050009', 'P0050010', 'geometry'],
      dtype='object', length=338)

#### Time to check to see if we lost any of the population in the merge

In [322]:
print('population_df:')
print(population_df[pop_column_names].sum())

print('election_df:')
print(election_df[pop_column_names].sum())

population_df:
P0020001    643077
P0020002     15504
P0020005    573201
P0020006      8649
P0020007      1986
P0020008     11457
P0020009       170
P0020010      2561
dtype: int64
election_df:
P0020001    524482.0
P0020002     11113.0
P0020005    474146.0
P0020006      6143.0
P0020007      1632.0
P0020008      9117.0
P0020009       144.0
P0020010      1929.0
dtype: float64


#### And now comes the mapping between 2018 and 2020 data using `maup.prorate`. This will give us population weights that we can use to reassign the district population to the 2020 districts

In [199]:
# weights2018 = population_df["P0040001"] / vtds_to_precincts_assignment.map(population_df["P0040001"].groupby(vtds_to_precincts_assignment).sum())
# weights2018 = weights2018.fillna(0)

In [200]:
# prorated2018 = maup.prorate(vtds_to_precincts_assignment, election_df[pop_column_names], weights2018)

# prorated2018.head()

#### Next we'll store the prorated election columns from `election_df` in `population_df`

In [201]:
# election_cols = ["G18SEND", "G18SENR", "G18GOVD", "G18GOVR", "G18COMD", "G18COMR", "G18ATGD", "G18ATGR"]

# population_df[election_cols] = prorated2018

#### One more check to make sure we didn't lose anyone in the proration step

In [302]:
print(population_df[pop_column_names].sum())
print(election_df[pop_column_names].sum())

P0020001    643077
P0020002     15504
P0020005    573201
P0020006      8649
P0020007      1986
P0020008     11457
P0020009       170
P0020010      2561
dtype: int64


KeyError: "None of [Index(['P0020001', 'P0020002', 'P0020005', 'P0020006', 'P0020007', 'P0020008',\n       'P0020009', 'P0020010'],\n      dtype='object')] are in the [columns]"

#### Perfect! Now that we know we haven't lost anyone, let's make sure `maup.doctor()` runs without any holes in the map

In [203]:
maup.doctor(population_df)

100%|██████████| 7434/7434 [00:04<00:00, 1542.52it/s]

  overlaps = inters[inters.area > 0].make_valid()


True

#### This step will feel familiar, we'll call `maup.assign()` to map the congressional districts from `cong_df` to the precincts in `population_df`
#### Once we have this assignment, we'll add a new `CD` column to `population_df` with the district that the given precinct falls under

In [204]:
precincts_to_districts_assignment = maup.assign(election_df.geometry, cong_df.geometry)
population_df["CD"] = precincts_to_districts_assignment


# precincts_to_districts_assignment = maup.assign(population_df.geometry, cong_df.geometry)
# population_df["CD"] = precincts_to_districts_assignment

for precinct_index in range(len(population_df)):
    population_df.at[precinct_index, "CD"] = int(cong_df.at[population_df.at[precinct_index, "CD"], district_col_name])

100%|██████████| 10/10 [00:00<00:00, 60.23it/s]
100%|██████████| 10/10 [00:00<00:00, 20.63it/s]

  df = df[df.area > area_cutoff].reset_index(drop=True)

  geometries = geometries[geometries.area > area_cutoff]

  return assign_to_max(intersections(sources, targets, area_cutoff=0).area)


#### Almost done! Now it's time to rename those columns from before so that we know what they stand for

In [205]:
rename_dict = {'P0020001': 'TOTPOP', 'P0020002': 'HISP', 'P0020005': 'NH_WHITE', 'P0020006': 'NH_BLACK', 'P0020007': 'NH_AMIN',
                    'P0020008': 'NH_ASIAN', 'P0020009': 'NH_NHPI', 'P0020010': 'NH_OTHER',
                    'P0040001': 'VAP', 'P0040002': 'HVAP', 'P0040005': 'WVAP', 'P0040006': 'BVAP', 'P0040007': 'AMINVAP',
                                        'P0040008': 'ASIANVAP', 'P0040009': 'NHPIVAP', 'P0040010': 'OTHERVAP'}

population_df.rename(columns=rename_dict, inplace=True)

population_df.columns

Index(['STATEFP20', 'COUNTYFP20', 'VTDST20', 'GEOID20', 'VTDI20', 'NAME20',
       'NAMELSAD20', 'LSAD20', 'MTFCC20', 'FUNCSTAT20',
       ...
       'P0050003', 'P0050004', 'P0050005', 'P0050006', 'P0050007', 'P0050008',
       'P0050009', 'P0050010', 'geometry', 'CD'],
      dtype='object', length=349)

#### Finally, our last step is to save our dataframe into a shapefile we can use for future analysis!

In [206]:
population_df.to_file("./WA/WA.shp")