# Illinois Data Cleaning and Assembling

@authors: vcle, bpuhani



Load the needed libraries.

In [1]:
import utilities as util
import maup
import warnings
import time

In [2]:
# Turn on maup's progress bars for the long-running geometric operations.
maup.progress.enabled = True

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/geometry warnings — confirm that is intended.
warnings.filterwarnings(action="ignore")

# Timestamp taken when the notebook starts running.
start_time = time.time()

## Loading the data
1. Setting all the paths to the data.

In [3]:
# Paths to the data (all shapefiles live below one shared folder)
data_dir = "il_data"
pl2020_dir = f"{data_dir}/il_pl2020_b"

population_path = f"{pl2020_dir}/il_pl2020_p2_b.shp"  # population data
vap_path = f"{pl2020_dir}/il_pl2020_p4_b.shp"  # voting age population data
vest20_path = f"{data_dir}/il_vest_20/il_vest_20.shp"  # election data
county_path = f"{data_dir}/il_pl2020_cnty/il_pl2020_cnty.shp"  # county data
sen_path = f"{data_dir}/il_sldu_2021/il_sldu_2021.shp"  # senate data

2. Loading the data using the `load_shapefile` function from the `utilities.py` file.

In [4]:
def _announce_and_load(banner, shapefile_path):
    """Print `banner`, then return `util.load_shapefile(shapefile_path)`."""
    print(banner)
    return util.load_shapefile(shapefile_path)


# population data
population_df = _announce_and_load("Loading population data...", population_path)

# voting age population data
vap_df = _announce_and_load("\nLoading voting age population data...", vap_path)

# election data
vest20_df = _announce_and_load("\nLoading election data...", vest20_path)

# county data
county_df = _announce_and_load("\nLoading county data...", county_path)

# senate data
sen_df = _announce_and_load("\nLoading senate data...", sen_path)

Loading population data...
Loading shapefile from il_data/il_pl2020_b/il_pl2020_p2_b.shp...
Shapefile data loaded from cache.

Loading voting age population data...
Loading shapefile from il_data/il_pl2020_b/il_pl2020_p4_b.shp...
Shapefile data loaded from cache.

Loading election data...
Loading shapefile from il_data/il_vest_20/il_vest_20.shp...
Shapefile data loaded from cache.

Loading county data...
Loading shapefile from il_data/il_pl2020_cnty/il_pl2020_cnty.shp...
Shapefile data loaded from cache.

Loading senate data...
Loading shapefile from il_data/il_sldu_2021/il_sldu_2021.shp...
Shapefile data loaded from cache.


## Cleaning the data
### Reprojecting all dataframes to a metric (UTM) `crs`

In [5]:
# Reproject every layer into ONE shared metric (UTM) CRS.
# The UTM zone is estimated once, from the population layer, and reused for all
# other layers so that they are guaranteed to share the same CRS; estimating it
# separately per layer gives no such guarantee, and the later maup overlay
# steps need all layers in one CRS.
target_crs = population_df.estimate_utm_crs()

population_df = population_df.to_crs(target_crs)
vap_df = vap_df.to_crs(target_crs)
county_df = county_df.to_crs(target_crs)
sen_df = sen_df.to_crs(target_crs)
vest20_df = vest20_df.to_crs(target_crs)

### Examining the data (MAUP Doctor)

In [6]:
# Run maup's diagnostic on the population layer and print its verdict;
# a failure is reported as a message instead of stopping the notebook.
try:
    print(maup.doctor(population_df))
except Exception as doctor_error:
    print(f"Error in MAUP Doctor for population_df: {doctor_error}")

100%|██████████| 369978/369978 [04:07<00:00, 1495.34it/s]


True


In [7]:
# Run maup's diagnostic on the voting-age-population layer and print its verdict;
# a failure is reported as a message instead of stopping the notebook.
try:
    print(maup.doctor(vap_df))
except Exception as doctor_error:
    print(f"Error in MAUP Doctor for vap_df: {doctor_error}")

100%|██████████| 369978/369978 [04:11<00:00, 1469.88it/s]


True


In [8]:
# Run maup's diagnostic on the election (precinct) layer and print its verdict;
# a failure is reported as a message instead of stopping the notebook.
try:
    print(maup.doctor(vest20_df))
except Exception as doctor_error:
    print(f"Error in MAUP Doctor for vest20_df: {doctor_error}")

100%|██████████| 10083/10083 [00:10<00:00, 973.48it/s] 


True


In [9]:
# Run maup's diagnostic on the county layer and print its verdict;
# a failure is reported as a message instead of stopping the notebook.
try:
    print(maup.doctor(county_df))
except Exception as doctor_error:
    print(f"Error in MAUP Doctor for county_df: {doctor_error}")

100%|██████████| 102/102 [00:00<00:00, 243.54it/s]


True


In [10]:
# Run maup's diagnostic on the state-senate district layer and print its verdict;
# a failure is reported as a message instead of stopping the notebook.
try:
    print(maup.doctor(sen_df))
except Exception as doctor_error:
    print(f"Error in MAUP Doctor for sen_df: {doctor_error}")

100%|██████████| 59/59 [00:00<00:00, 103.87it/s]


True


MAUP Doctor reports that all the dataframes are valid.


## Assembling `vest20_df` by merging population data

We are going to use `vest20_df` to create an initial partition that a GerryChain run can start from.

So we need to add the population data to it, assigning it based on the geometric shapes.

### Making sure to use queen adjacencies for the geometric shapes

In [11]:
# Repair the precinct geometries. Rook adjacencies shorter than `min_rook_length`
# are converted to queen adjacencies.
# NOTE(review): the threshold is presumably in CRS units (metres after the UTM
# reprojection) — confirm against the maup documentation.
vest20_df_repaired_0 = maup.smart_repair(vest20_df, min_rook_length=30)

Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|██████████| 10705/10705 [00:06<00:00, 1710.87it/s]


Resolving overlaps...
Filling gaps...


Gaps to simplify: 0it [00:00, ?it/s]
Gaps to fill: 0it [00:00, ?it/s]

Converting small rook adjacencies to queen...



100%|██████████| 10083/10083 [00:11<00:00, 905.80it/s] 
100%|██████████| 4/4 [00:00<00:00, 720.86it/s]
100%|██████████| 4/4 [00:00<00:00, 1332.69it/s]
100%|██████████| 4/4 [00:00<00:00, 1330.79it/s]
100%|██████████| 4/4 [00:00<00:00, 1124.55it/s]
100%|██████████| 4/4 [00:00<00:00, 1324.48it/s]
100%|██████████| 4/4 [00:00<00:00, 2018.43it/s]
100%|██████████| 4/4 [00:00<00:00, 1333.01it/s]
100%|██████████| 5/5 [00:00<00:00, 1249.72it/s]
100%|██████████| 4/4 [00:00<00:00, 999.12it/s]
100%|██████████| 4/4 [00:00<00:00, 1253.15it/s]
100%|██████████| 4/4 [00:00<00:00, 1483.92it/s]
100%|██████████| 4/4 [00:00<00:00, 2001.10it/s]
100%|██████████| 4/4 [00:00<00:00, 991.21it/s]
100%|██████████| 4/4 [00:00<00:00, 1294.04it/s]
100%|██████████| 4/4 [00:00<00:00, 1991.36it/s]
100%|██████████| 4/4 [00:00<00:00, 1136.74it/s]
100%|██████████| 4/4 [00:00<00:00, 1999.67it/s]
100%|██████████| 3/3 [00:00<00:00, 1492.46it/s]
100%|██████████| 4/4 [00:00<00:00, 1331.00it/s]
100%|██████████| 4/4 [00:00<00:00,

### Assigning the population data to `vest20_df`

1. Selecting the columns we are interested in from the `population_df` and `vap_df` dataframes.

In [12]:
# Field numbers selected from both census tables (the same nine in each table).
_selected_field_numbers = (1, 2, 5, 6, 7, 8, 9, 10, 11)

# Total-population columns: 'P0020001', 'P0020002', 'P0020005', ... 'P0020011'
pop_column_names = [f"P002{field_number:04d}" for field_number in _selected_field_numbers]

# Voting-age-population columns: 'P0040001', 'P0040002', 'P0040005', ... 'P0040011'
vap_column_names = [f"P004{field_number:04d}" for field_number in _selected_field_numbers]

2. Assigning the population data to `vest20_df` using the `assign_population_data_to` function from `utilities.py`.

In [13]:
# Attach the selected population and voting-age-population columns to the
# repaired precinct frame. The return value is not used; the following cells
# read the new columns directly from `vest20_df_repaired_0`.
util.assign_population_data_to(
    vest20_df_repaired_0, population_df, vap_df, pop_column_names, vap_column_names
)

100%|██████████| 10083/10083 [00:13<00:00, 754.84it/s]
100%|██████████| 10083/10083 [00:47<00:00, 210.12it/s]
100%|██████████| 10083/10083 [00:12<00:00, 812.12it/s]
100%|██████████| 10083/10083 [00:48<00:00, 209.26it/s]


3. Testing if the population data was assigned correctly and if anyone is missing.

In [14]:
# Compare statewide totals before and after the assignment onto precincts.
# The totals are printed for the reader AND asserted, so that a mismatch stops
# the notebook instead of relying on someone noticing it in the output.

# P0020001 is the total population
state_total_pop = population_df['P0020001'].sum()
precinct_total_pop = vest20_df_repaired_0['P0020001'].sum()
print(f"Total pop in Illinois:\t {state_total_pop:_}")
print(f"Total pop in vest20_df:\t {precinct_total_pop:_}")
# P0040001 is the total voting age population
state_total_vap = vap_df['P0040001'].sum()
precinct_total_vap = vest20_df_repaired_0['P0040001'].sum()
print(f"Total vap in Illinois:\t {state_total_vap:_}")
print(f"Total vap in vest20_df:\t {precinct_total_vap:_}")

assert precinct_total_pop == state_total_pop, (
    f"Total population mismatch after assignment: {precinct_total_pop} != {state_total_pop}"
)
assert precinct_total_vap == state_total_vap, (
    f"Voting age population mismatch after assignment: {precinct_total_vap} != {state_total_vap}"
)

Total pop in Illinois:	 12_812_508
Total pop in vest20_df:	 12_812_508
Total vap in Illinois:	 9_999_469
Total vap in vest20_df:	 9_999_469


    Great! There is no one missing.

4. Let's check if there are any `NaN` values in the `vest20_df` dataframe.

In [15]:
# Display every row that contains at least one missing value
# (an empty table means nothing is missing).
row_has_missing_value = vest20_df_repaired_0.isna().any(axis=1)
vest20_df_repaired_0[row_has_missing_value]

Unnamed: 0,STATEFP20,COUNTYFP20,VTDST20,GEOID20,NAME20,G20PREDBID,G20PRERTRU,G20PRELJOR,G20PREGHAW,G20PREACAR,...,P0020011,P0040001,P0040002,P0040005,P0040006,P0040007,P0040008,P0040009,P0040010,P0040011


    Even better, there are no `NaN` values in the dataframe.
5. Let's rename the columns to be more readable.

In [16]:
# Copied from SC_MAUP.ipynb and modified for the G20USS candidates from Illinois.

# Total-population census fields -> readable names
_population_renames = {
    'P0020001': 'TOTPOP',
    'P0020002': 'HISP',
    'P0020005': 'NH_WHITE',
    'P0020006': 'NH_BLACK',
    'P0020007': 'NH_AMIN',
    'P0020008': 'NH_ASIAN',
    'P0020009': 'NH_NHPI',
    'P0020010': 'NH_OTHER',
    'P0020011': 'NH_2MORE',
}

# Voting-age-population census fields -> readable names
_vap_renames = {
    'P0040001': 'VAP',
    'P0040002': 'HVAP',
    'P0040005': 'WVAP',
    'P0040006': 'BVAP',
    'P0040007': 'AMINVAP',
    'P0040008': 'ASIANVAP',
    'P0040009': 'NHPIVAP',
    'P0040010': 'OTHERVAP',
    'P0040011': '2MOREVAP',
}

# Election-result columns that are kept, shortened to drop the candidate suffix
_election_renames = {
    'G20PREDBID': 'G20PRED',
    'G20PRERTRU': 'G20PRER',
    'G20USSDDUR': 'G20USSD',
    'G20USSRCUR': 'G20USSR',
}

# Merged in the same order as the groups above.
rename_dict = {**_population_renames, **_vap_renames, **_election_renames}

In [17]:
# Inspect the column names before the rename is applied.
vest20_df_repaired_0.columns

Index(['STATEFP20', 'COUNTYFP20', 'VTDST20', 'GEOID20', 'NAME20', 'G20PREDBID',
       'G20PRERTRU', 'G20PRELJOR', 'G20PREGHAW', 'G20PREACAR', 'G20PRESLAR',
       'G20USSDDUR', 'G20USSRCUR', 'G20USSIWIL', 'G20USSLMAL', 'G20USSGBLA',
       'geometry', 'P0020001', 'P0020002', 'P0020005', 'P0020006', 'P0020007',
       'P0020008', 'P0020009', 'P0020010', 'P0020011', 'P0040001', 'P0040002',
       'P0040005', 'P0040006', 'P0040007', 'P0040008', 'P0040009', 'P0040010',
       'P0040011'],
      dtype='object')

In [18]:
# Apply the readable column names; columns not listed in `rename_dict` keep their names.
vest20_df_repaired_0 = vest20_df_repaired_0.rename(columns=rename_dict)

In [19]:
# Inspect the column names after the rename.
vest20_df_repaired_0.columns

Index(['STATEFP20', 'COUNTYFP20', 'VTDST20', 'GEOID20', 'NAME20', 'G20PRED',
       'G20PRER', 'G20PRELJOR', 'G20PREGHAW', 'G20PREACAR', 'G20PRESLAR',
       'G20USSD', 'G20USSR', 'G20USSIWIL', 'G20USSLMAL', 'G20USSGBLA',
       'geometry', 'TOTPOP', 'HISP', 'NH_WHITE', 'NH_BLACK', 'NH_AMIN',
       'NH_ASIAN', 'NH_NHPI', 'NH_OTHER', 'NH_2MORE', 'VAP', 'HVAP', 'WVAP',
       'BVAP', 'AMINVAP', 'ASIANVAP', 'NHPIVAP', 'OTHERVAP', '2MOREVAP'],
      dtype='object')

6. Let's drop unused columns from the `vest20_df` dataframe.

In [20]:
# Vote-count columns that were not renamed and are not used further.
unused_vote_columns = [
    'G20PRELJOR',
    'G20PREGHAW',
    'G20PREACAR',
    'G20PRESLAR',
    'G20USSIWIL',
    'G20USSLMAL',
    'G20USSGBLA',
]
vest20_df_repaired_0 = vest20_df_repaired_0.drop(columns=unused_vote_columns)

In [21]:
# Inspect the columns that remain after dropping the unused ones.
vest20_df_repaired_0.columns

Index(['STATEFP20', 'COUNTYFP20', 'VTDST20', 'GEOID20', 'NAME20', 'G20PRED',
       'G20PRER', 'G20USSD', 'G20USSR', 'geometry', 'TOTPOP', 'HISP',
       'NH_WHITE', 'NH_BLACK', 'NH_AMIN', 'NH_ASIAN', 'NH_NHPI', 'NH_OTHER',
       'NH_2MORE', 'VAP', 'HVAP', 'WVAP', 'BVAP', 'AMINVAP', 'ASIANVAP',
       'NHPIVAP', 'OTHERVAP', '2MOREVAP'],
      dtype='object')

## Assembling `vest20_df` by adding the approved senate plan data
1. Assign the senate plan data to the `vest20_df` dataframe using the `maup.assign()` function on the `geometry` field.

In [22]:
# Match every precinct geometry to a senate district geometry.
precincts_to_districts_assignment = maup.assign(
    vest20_df_repaired_0.geometry,
    sen_df.geometry,
)
# Store the district assignment as a new "SEND" column of `vest20_df_repaired_0`.
vest20_df_repaired_0["SEND"] = precincts_to_districts_assignment

100%|██████████| 59/59 [00:00<00:00, 130.14it/s]
100%|██████████| 59/59 [00:03<00:00, 16.98it/s]


In [23]:
# Preview the first rows, which now include the SEND column.
vest20_df_repaired_0.head()

Unnamed: 0,STATEFP20,COUNTYFP20,VTDST20,GEOID20,NAME20,G20PRED,G20PRER,G20USSD,G20USSR,geometry,...,VAP,HVAP,WVAP,BVAP,AMINVAP,ASIANVAP,NHPIVAP,OTHERVAP,2MOREVAP,SEND
0,17,19,CN0100,17019CN0100,Cunningham 1,753,62,684,51,"POLYGON ((395000.017 4443248.947, 395062.695 4...",...,2619,147,498,953,1,934,0,10,76,51
1,17,19,CC0600,17019CC0600,City of Champaign 06,1035,264,958,253,"POLYGON ((392826.416 4443312.151, 392828.282 4...",...,3806,673,1512,1067,6,395,0,14,139,51
2,17,19,CC0100,17019CC0100,City of Champaign 01,590,34,532,28,"POLYGON ((395286.570 4441435.160, 395283.825 4...",...,1683,180,202,844,8,391,1,11,46,51
3,17,19,CC0900,17019CC0900,City of Champaign 09,618,98,578,84,"POLYGON ((392819.261 4442698.203, 392818.473 4...",...,1971,213,556,1060,2,54,0,4,82,51
4,17,19,CC0300,17019CC0300,City of Champaign 03,1073,209,1007,232,"POLYGON ((394718.724 4440992.295, 394872.839 4...",...,4209,309,1535,120,5,2091,0,17,132,51


In [24]:
# Show the distinct values currently stored in SEND.
print(set(vest20_df_repaired_0["SEND"].values))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58}


The Illinois senate districts are numbered consecutively, but starting at 1 rather than 0. The values assigned above are the row labels of `sen_df`, so let's replace them with the matching `DISTRICTN` values.


In [25]:
# Rename the districts with the matching `DISTRICTN` from `sen_df`.
#
# The mapping starts from the original assignment series rather than from the
# current "SEND" column, so re-running this cell gives the same result instead
# of remapping district numbers that were already mapped. It is vectorised and
# index-aligned, so it does not depend on the precinct frame having a 0..n-1 index.
if precincts_to_districts_assignment.isna().any():
    raise ValueError("Some precincts were not assigned to any senate district.")

# Row labels of `sen_df`, as whole numbers, for the lookup.
district_row_labels = precincts_to_districts_assignment.astype("int64")

# Look up `DISTRICTN` for every row label. A label missing from `sen_df` would
# produce NaN and make the integer cast fail loudly.
vest20_df_repaired_0["SEND"] = (
    district_row_labels.map(sen_df["DISTRICTN"]).astype("int64")
)

In [26]:
# Show the distinct district labels stored in SEND after the relabelling.
print(set(vest20_df_repaired_0["SEND"]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59}


## Save our cleaned vest20_df as a shapefile

In [27]:
# Write the final repaired precinct frame to disk as a shapefile.
output_shapefile_path = "il_data/IL_state_senate.shp"
vest20_df_repaired_0.to_file(output_shapefile_path)