# Injecting the Vest 2018 data into the map

@authors: vcle, bpuhani

In [44]:
import io
import time
import warnings
from contextlib import redirect_stdout
import geopandas as gpd

import maup

import utilities as util

In [45]:
# Show progress bars for the long-running maup operations below.
maup.progress.enabled = True

# NOTE(review): this silences ALL warnings for the rest of the session, including
# potentially relevant geopandas/maup warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Wall-clock start of the notebook run; not read again in the cells visible here
# (presumably used for a total-runtime report — TODO confirm).
start_time = time.time()

## Loading the needed data.
This notebook assumes that you have already run the following notebooks:
* `0_IL_import_and_explore_data.ipynb`
* `B_2_IL_clean_maup_with_congress.ipynb`
* `B_4_IL_find_map_without_holes_vest20_cong.ipynb`

In [46]:
# Load all input layers in one go; the order of the paths matches the order of
# the names on the left-hand side.
il_df, vest18_df, county_df, vap_df, population_df = (
    util.load_shapefile(shapefile_path)
    for shapefile_path in (
        "il_data/IL_congress_without_holes.shp",
        "il_data/il_vest_18/il_vest_18.shp",
        "il_data/il_pl2020_cnty/il_pl2020_cnty.shp",
        "il_data/il_pl2020_b/il_pl2020_p4_b.shp",
        "il_data/il_pl2020_b/il_pl2020_p2_b.shp",
    )
)

Loading shapefile from il_data/IL_congress_without_holes.shp...
Shapefile data loaded from cache.
Loading shapefile from il_data/il_vest_18/il_vest_18.shp...
Shapefile data loaded from cache.
Loading shapefile from il_data/il_pl2020_cnty/il_pl2020_cnty.shp...
Shapefile data loaded from cache.
Loading shapefile from il_data/il_pl2020_b/il_pl2020_p4_b.shp...
Shapefile data loaded from cache.
Loading shapefile from il_data/il_pl2020_b/il_pl2020_p2_b.shp...
Shapefile data loaded from cache.


## Cleaning the vest18 data
### Reprojecting `vest18_df` (and the county/census layers) to a metric UTM `crs`.

In [47]:
# Reproject each layer to the UTM CRS that geopandas estimates for it.
# NOTE(review): il_df is not reprojected here; it is used as loaded.
vest18_df, county_df, vap_df, population_df = (
    layer.to_crs(layer.estimate_utm_crs())
    for layer in (vest18_df, county_df, vap_df, population_df)
)

### Examining the il_df data (MAUP Doctor)


In [48]:
# Run maup's diagnostic on il_df; report a failure instead of aborting the notebook.
try:
    il_doctor_ok = maup.doctor(il_df)
except Exception as err:
    print(f"Error in MAUP Doctor for il_df: {err}")
else:
    print(il_doctor_ok)

100%|██████████| 10083/10083 [00:09<00:00, 1013.27it/s]


True


### Examining the vest18 data (MAUP Doctor)

In [49]:
# Run maup's diagnostic on the raw 2018 data; report a failure instead of aborting.
try:
    vest18_doctor_ok = maup.doctor(vest18_df)
except Exception as err:
    print(f"Error in MAUP Doctor for vest18_df: {err}")
else:
    print(vest18_doctor_ok)

100%|██████████| 10116/10116 [00:10<00:00, 1007.86it/s]


There are 4 overlaps.
There are 5 holes.
False


### Clean the data
* `maup.smart_repair` is run with its default settings
* neither `min_rook_length` nor `nest_within_regions = county_df` is passed

In [50]:
# Repair the overlaps and holes reported by the doctor, using smart_repair's
# default settings (no optional arguments are passed).
repaired_vest18_df = maup.smart_repair(vest18_df)

Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|██████████| 10746/10746 [00:05<00:00, 1959.12it/s]


Resolving overlaps...
Assigning order 2 pieces...
Filling gaps...


Gaps to simplify: 100%|██████████| 4/4 [00:10<00:00,  2.75s/it]
Gaps to fill: 100%|██████████| 1/1 [00:05<00:00,  5.55s/it]


### Examining the vest18 data (MAUP Doctor) after the cleaning

In [51]:
# Re-run the diagnostic on the repaired frame; report a failure instead of aborting.
try:
    repaired_doctor_ok = maup.doctor(repaired_vest18_df)
except Exception as err:
    print(f"Error in MAUP Doctor for repaired_vest18_df: {err}")
else:
    print(repaired_doctor_ok)

100%|██████████| 10116/10116 [00:09<00:00, 1021.83it/s]


True


Looks good! Now we can clean the `repaired_vest18_df` by removing and renaming the columns.

In [52]:
# Inspect the column names of the repaired frame before renaming/dropping columns.
repaired_vest18_df.columns

Index(['STATEFP20', 'COUNTYFP20', 'VTDST20', 'GEOID20', 'NAME20', 'G18GOVDPRI',
       'G18GOVRRAU', 'G18GOVCMCC', 'G18GOVLJAC', 'G18ATGDRAO', 'G18ATGRHAR',
       'G18ATGLHAR', 'G18SOSDWHI', 'G18SOSRHEL', 'G18SOSLDUT', 'G18COMDMEN',
       'G18COMRSEN', 'G18COMLBAL', 'G18TREDFRE', 'G18TRERDOD', 'G18TRELLEH',
       'geometry'],
      dtype='object')

Let's rename the columns we need and remove the columns we don't need.

In [53]:
# Adapted from SC_MAUP.ipynb for the 2018 (G18) Illinois columns.
# Each kept column is shortened to its first seven characters, i.e. the
# `G18` prefix, the three-letter office code and one further letter
# (presumably the party, D or R — verify against the VEST documentation).
rename_dict = dict([
    ('G18GOVDPRI', 'G18GOVD'), ('G18GOVRRAU', 'G18GOVR'),  # GOV
    ('G18ATGDRAO', 'G18ATGD'), ('G18ATGRHAR', 'G18ATGR'),  # ATG
    ('G18SOSDWHI', 'G18SOSD'), ('G18SOSRHEL', 'G18SOSR'),  # SOS
    ('G18COMDMEN', 'G18COMD'), ('G18COMRSEN', 'G18COMR'),  # COM
    ('G18TREDFRE', 'G18TRED'), ('G18TRERDOD', 'G18TRER'),  # TRE
])

In [54]:
# Candidate columns that are not part of rename_dict; they are removed from the
# frame. Grouped by office code for readability.
drop_list = [
    'G18GOVCMCC', 'G18GOVLJAC',  # GOV
    'G18ATGLHAR',                # ATG
    'G18SOSLDUT',                # SOS
    'G18COMLBAL',                # COM
    'G18TRELLEH',                # TRE
]

In [55]:
# Rename the columns listed in rename_dict and remove the ones in drop_list.
# Re-assigning the result (instead of `inplace=True`) together with
# `errors="ignore"` keeps this cell idempotent: re-running it no longer raises a
# KeyError because the dropped columns are already gone.
repaired_vest18_df = (
    repaired_vest18_df
    .rename(columns=rename_dict)
    .drop(columns=drop_list, errors="ignore")
)

# `rename` silently skips keys that do not exist, so verify that every expected
# short column name is really present afterwards.
missing_cols = set(rename_dict.values()) - set(repaired_vest18_df.columns)
assert not missing_cols, f"Expected columns missing after rename: {sorted(missing_cols)}"

In [56]:
# Show the columns again to confirm that the rename and drop took effect.
repaired_vest18_df.columns

Index(['STATEFP20', 'COUNTYFP20', 'VTDST20', 'GEOID20', 'NAME20', 'G18GOVD',
       'G18GOVR', 'G18ATGD', 'G18ATGR', 'G18SOSD', 'G18SOSR', 'G18COMD',
       'G18COMR', 'G18TRED', 'G18TRER', 'geometry'],
      dtype='object')

In [57]:
# The short names produced by the rename are the 2018 election columns used below.
elec2018_cols = [*rename_dict.values()]
print(elec2018_cols)

['G18GOVD', 'G18GOVR', 'G18ATGD', 'G18ATGR', 'G18SOSD', 'G18SOSR', 'G18COMD', 'G18COMR', 'G18TRED', 'G18TRER']


Now that it's clean, we can add the 2018 election data to the `il_df`.

Remember that the 2020 and 2018 election years have different precincts, so we need to disaggregate this data to the block level as previously discussed, and re-aggregate to the 2020 precincts.

In [58]:
# Preview the first rows of the cleaned 2018 data.
repaired_vest18_df.head()

Unnamed: 0,STATEFP20,COUNTYFP20,VTDST20,GEOID20,NAME20,G18GOVD,G18GOVR,G18ATGD,G18ATGR,G18SOSD,G18SOSR,G18COMD,G18COMR,G18TRED,G18TRER,geometry
0,17,19,CN0100,17019CN0100,Cunningham 1,554,40,510,94,588,22,569,28,566,29,"POLYGON ((395000.017 4443248.947, 395062.695 4..."
1,17,19,CC0600,17019CC0600,City of Champaign 06,708,186,674,252,772,146,724,186,734,174,"POLYGON ((392826.416 4443312.151, 392828.282 4..."
2,17,19,CC0100,17019CC0100,City of Champaign 01,497,23,463,69,517,14,514,17,503,17,"POLYGON ((395286.570 4441435.160, 395283.825 4..."
3,17,19,CC0900,17019CC0900,City of Champaign 09,454,81,444,125,505,57,480,83,470,82,"POLYGON ((392819.261 4442698.203, 392818.473 4..."
4,17,19,CC0300,17019CC0300,City of Champaign 03,892,182,896,195,956,137,900,167,867,184,"POLYGON ((394718.724 4440992.295, 394872.839 4..."


In [59]:
# Row-wise vote totals over all kept 2018 columns, then the grand total.
# This number is the reference for the comparison after re-aggregation.
votes_per_row_2018 = repaired_vest18_df[elec2018_cols].sum(axis=1)
print(f"Sum of all Votes: {sum(votes_per_row_2018):_}")

Sum of all Votes: 21_874_163


In [69]:
# Number of rows (2018 precincts) in the repaired frame.
print(len(repaired_vest18_df))

10116


In [70]:
# Assign every geometry of vap_df (the census blocks) to one geometry of the
# repaired 2018 precinct frame. The result has one entry per row of vap_df.
# Arguments are kept positional: (sources, targets).
blocks_to_2018precincts_assignment = maup.assign(vap_df.geometry, repaired_vest18_df.geometry)

100%|██████████| 10116/10116 [00:12<00:00, 793.45it/s]
100%|██████████| 10116/10116 [00:46<00:00, 216.00it/s]


In [78]:
# Sanity check: the assignment has one entry per block in vap_df.
print(len(blocks_to_2018precincts_assignment))

369978


In [79]:
# P0040001 is the voting-age population (VAP) of each block.
block_vap = vap_df["P0040001"]

# Total VAP of the 2018 precinct every block was assigned to, aligned to the blocks.
precinct_vap_per_block = blocks_to_2018precincts_assignment.map(
    block_vap.groupby(blocks_to_2018precincts_assignment).sum()
)

# Share of its precinct's VAP that each block holds.
# NOTE(review): blocks in a precinct with zero total VAP (0/0) or without an
# assigned precinct yield NaN and are set to weight 0 here; votes of such
# precincts are presumably not distributed to any block, which would explain a
# small shortfall in the re-aggregated totals — verify.
weights2018 = (block_vap / precinct_vap_per_block).fillna(0)

In [80]:
# Sanity check: there is one weight per block.
len(weights2018)

369978

In [81]:
# Inspect the computed block weights.
weights2018

0         0.022222
1         0.009119
2         0.015267
3         0.111702
4         0.000000
            ...   
369973    0.016362
369974    0.012388
369975    0.018930
369976    0.009322
369977    0.040449
Length: 369978, dtype: float64

In [82]:
# Disaggregate the 2018 vote columns from the 2018 precincts to the blocks,
# splitting each precinct's counts according to weights2018.
prorated2018 = maup.prorate(blocks_to_2018precincts_assignment, repaired_vest18_df[elec2018_cols], weights2018)

In [83]:
# Store the block-level 2018 votes as new columns on vap_df (modifies vap_df in place).
vap_df[elec2018_cols] = prorated2018

In [84]:
# Assign every block of vap_df to one geometry of il_df (the target precinct map).
# Arguments are kept positional: (sources, targets).
blocks_to_precincts_assignment = maup.assign(vap_df.geometry, il_df.geometry)

100%|██████████| 10083/10083 [00:12<00:00, 821.67it/s] 
100%|██████████| 10083/10083 [00:43<00:00, 229.36it/s]


In [85]:
# Re-aggregate the block-level 2018 votes to the il_df precincts.
# The assignment aligns on il_df's index, so a precinct that received no block
# ends up with NaN in these columns — NOTE(review): check for NaN rows.
il_df[elec2018_cols] = vap_df[elec2018_cols].groupby(blocks_to_precincts_assignment).sum()

In [86]:
# Preview il_df with the newly added 2018 election columns.
il_df.head()

Unnamed: 0,boundary_n,area,STATEFP20,COUNTYFP20,VTDST20,GEOID20,NAME20,G20PRED,G20PRER,G20USSD,...,G18GOVD,G18GOVR,G18ATGD,G18ATGR,G18SOSD,G18SOSR,G18COMD,G18COMR,G18TRED,G18TRER
0,False,44420250.0,17,89,00HA01,1708900HA01,HAMPSHIRE 1,533,951,496,...,307.0,659.0,321.0,679.0,499.0,510.0,383.0,609.0,328.0,645.0
1,False,15959620.0,17,89,00HA03,1708900HA03,HAMPSHIRE 3,817,1075,784,...,503.0,689.0,544.0,706.0,714.0,534.0,588.0,655.0,520.0,684.0
2,False,1689435.0,17,37,00DK09,1703700DK09,DEKALB 9,716,346,691,...,530.0,294.0,542.0,317.0,649.0,213.0,574.0,271.0,533.0,295.0
3,False,9126377.0,17,37,00DK02,1703700DK02,DEKALB 2,677,133,604,...,811.0,127.0,801.0,142.0,864.0,98.0,812.0,119.0,766.0,148.0
4,False,83994330.0,17,201,00Bu01,1720100Bu01,Burritt 1,187,432,183,...,119.0,300.0,123.0,346.0,193.0,281.0,149.0,313.0,144.0,319.0


In [87]:
# Compare the grand total of 2018 votes before (2018 precincts) and after
# (il_df precincts) the disaggregate/re-aggregate round trip.
votes_vest18_total = sum(repaired_vest18_df[elec2018_cols].sum(axis=1))
votes_projected_total = sum(il_df[elec2018_cols].sum(axis=1))

print(f"Sum of all Votes (VEST18) \t\t\t\t: {votes_vest18_total:_}")
print(f"Sum of all Votes (projected to VEST20)\t: {votes_projected_total:_}")

Sum of all Votes (VEST18) 				: 21_874_163
Sum of all Votes (projected to VEST20)	: 21_873_265.0


As you can see, we lost 898 votes (about 0.004 % of all votes) in the process.

So it would not be a good idea to use the elections from 2018 for the following analysis.