In [1]:
import ast
import json
from itertools import combinations

import haversine
import numpy as np
import pandas as pd
import shapefile
import us
from shapely.geometry import shape as shapely
pd.options.display.max_columns = 100

# Preprocess Congressional District Geography
This notebook will:
1. Calculate the distance between every pair of congressional districts, for each Congress going back to the 102nd.
2. Determine if districts are neighbors
3. Convert the shape file to geojson for ease of use with D3.
4. Calculate distances between state centroids (for senators)

In [163]:
def construct_coordinates(row):
    """Describe a district's bounding box as a Polygon mapping for shapely.

    `row` must expose the attributes `bbox0`, `bbox1`, `bbox2` and `bbox3`.
    The returned dict holds a single ring made of the four box corners, in
    the order (bbox0, bbox1), (bbox0, bbox3), (bbox2, bbox3), (bbox2, bbox1).
    """
    lon_a, lat_a, lon_b, lat_b = row.bbox0, row.bbox1, row.bbox2, row.bbox3
    ring = [
        (lon_a, lat_a),
        (lon_a, lat_b),
        (lon_b, lat_b),
        (lon_b, lat_a),
    ]
    return {'coordinates': [ring], 'type': 'Polygon'}

# Member lookup: one row per (member id, session) carrying a
# "<state>_<district>" key. At-large seats are recoded to district 0 before
# the key is built; the state and district columns are dropped afterwards.
df_members = (
    pd.read_csv('../propublica/members_house.csv')
    [['id','session','state','district']]
    .assign(district=lambda members: members.district.replace({'At-Large': 0}))
    .assign(dist_full=lambda members: members.apply(
        lambda member: member.state + "_" + str(member.district), axis=1))
    .drop(['state', 'district'], axis = 1)
)

In [165]:
# State name -> postal abbreviation crosswalk. It does not depend on the
# congress being processed, so it is built once instead of on every pass.
cw = us.states.mapping('name','abbr')

# For every congress from 114 down to 102: read its district shapefile, write
# a pairwise distance/neighbor table to geo_preprocessed/, and write the
# shapes as GeoJSON to geojson/.
for congress in reversed(range(102,115)):
    print(congress)

    # load shape file
    file_loc = 'districts' + str(congress)
    file_loc += ("/" + file_loc + ".shp")
    districts = shapefile.Reader('shapefiles/' + file_loc)

    # get bounding box and its midpoint for every district
    # (lat/lon is the centre of the bounding box, not a true polygon centroid)
    rows = []
    shapes = districts.shapes()
    records = districts.records()
    for i, (shape, record) in enumerate(zip(shapes, records)):
        try:
            bbox = [shape.bbox[b] for b in range(4)]
        except (AttributeError, IndexError, TypeError):
            # No usable bounding box on this shape: fill with NaN so the
            # dropna() below discards the row.
            bbox = [np.nan] * 4

        rows.append({'state': record[0],
                     'district': record[2],
                     'lat': (bbox[1] + bbox[3]) / 2,
                     'lon': (bbox[0] + bbox[2]) / 2,
                     'bbox0': bbox[0],
                     'bbox1': bbox[1],
                     'bbox2': bbox[2],
                     'bbox3': bbox[3],
                     'i': i})

    # convert district info to df; rows with any missing value are dropped
    df_districts = pd.DataFrame(rows).dropna()
    df_districts['state_abbr'] = df_districts.state.replace(cw)
    df_districts['dist_full'] = df_districts.apply(lambda x: x.state_abbr + "_" + str(x.district), axis=1)
    df_districts['coords'] = df_districts.apply(lambda x: tuple([x.lat, x.lon]), axis=1)

    # Build each district's (lat, lon) pair and bounding-box polygon once.
    # Looking them up per pair with a boolean mask rescans the whole frame
    # four times for every combination. When a dist_full occurs more than
    # once, the first record is used, matching the previous `.iloc[0]` lookup.
    coords_by_district = {}
    shape_by_district = {}
    for _, district_row in df_districts.iterrows():
        key = district_row['dist_full']
        if key in coords_by_district:
            continue
        coords_by_district[key] = district_row['coords']
        shape_by_district[key] = shapely(construct_coordinates(district_row))

    # calculate distances between all districts
    # and if districts touch
    distances = []
    for d1, d2 in combinations(df_districts.dist_full.unique(), 2):
        # NOTE(review): the `miles=` keyword belongs to older haversine
        # releases; newer ones take `unit=` instead -- confirm against the
        # installed version before upgrading.
        distance = haversine.haversine(coords_by_district[d1], coords_by_district[d2], miles = True)

        # Neighbor flag is based on the bounding boxes, not the real district
        # outlines. Shapes that touch also intersect, so a single
        # intersects() test covers both of the former branches.
        neighbor = int(shape_by_district[d1].intersects(shape_by_district[d2]))

        distances.append({'d1': d1, 'd2': d2, 'distance': distance, 'neighbor': neighbor})

    # for ease of future lookup,
    # duplicate df, switch d1 and d2 columns, and concat
    df_distances = pd.DataFrame(distances)
    df_distances = pd.concat([df_distances, df_distances.rename(columns={'d1':'d2','d2':'d1'})])

    # Attach the district attributes for d1, then for d2. The second merge
    # applies the _d1/_d2 suffixes to the columns the two copies share.
    df_distances = pd.merge(df_distances,
                            df_districts,
                            how = 'left',
                            left_on='d1',
                            right_on = 'dist_full')
    df_distances = pd.merge(df_distances,
                            df_districts,
                            how = 'left',
                            left_on='d2',
                            right_on = 'dist_full',
                            suffixes=['_d1','_d2'])
    df_distances['congress'] = congress

    # merge in propublica member id
    # drop() returns a new frame, so nothing is modified in place on the
    # filtered slice (that in-place drop raised SettingWithCopyWarning).
    df_congress = df_members[df_members.session == congress].drop(['session'], axis = 1)
    df_distances = pd.merge(df_distances, df_congress.rename(columns={'dist_full': 'd1'}), how = 'left', on='d1')
    df_distances = pd.merge(df_distances,
                    df_congress.rename(columns={'dist_full': 'd2'}),
                    how = 'left',
                    on='d2', suffixes=['_d1','_d2'])

    df_distances.to_csv('geo_preprocessed/geo_' + str(congress) + '.csv', index = False)

    # convert shape file to geojson and export
    fields = districts.fields[1:]  # skip the first field entry
    field_names = [field[0] for field in fields]
    buffer = []
    for sr in districts.shapeRecords():
        atr = dict(zip(field_names, sr.record))
        geom = sr.shape.__geo_interface__
        buffer.append(dict(type="Feature", geometry=geom, properties=atr))

    # `with` guarantees the file is closed even if serialisation fails
    with open('geojson/congress_' + str(congress) + '.json', "w") as geojson_file:
        geojson_file.write(json.dumps({"type": "FeatureCollection","features": buffer}, indent=2) + "\n")

114


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


113
112
111
110
109
108
107
106
105
104
103
102


## State Distances

In [6]:
# Load per-state reference data, indexed by the `state_abbr` column. Later
# cells read the `lat`, `lon` and `neighbors` columns; `neighbors` holds the
# text of a Python list, e.g. "['FL', 'GA', 'MS', 'TN']".
df_states = pd.read_csv('state_geo.csv', encoding = 'latin1', index_col='state_abbr')

In [7]:
df_states.head()

Unnamed: 0_level_0,State,Location,Coordinates,lat,lon,neighbors
state_abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL,Alabama,12.1 mi (19.5 km) southwest of Clanton,32.7794°N 86.8287°W,32.7794,-86.8287,"['FL', 'GA', 'MS', 'TN']"
AK,Alaska,77.9 mi (125.4 km) northwest of Denali,64.0685°N 152.2782°W,64.0685,-152.2782,['WA']
AZ,Arizona,49.7 mi (80.0 km) east-southeast of Prescott,34.2744°N 111.6602°W,34.2744,-111.6602,"['CA', 'CO', 'NM', 'NV', 'UT']"
AR,Arkansas,14.2 mi (22.9 km) northwest of Little Rock,34.8938°N 92.4426°W,34.8938,-92.4426,"['LA', 'MO', 'MS', 'OK', 'TN', 'TX']"
CA,California,"36 mi (58 km) northeast of Madera,",37.1841°N 119.4696°W,37.1841,-119.4696,"['HI', 'NV', 'OR', 'AZ']"


In [12]:
# Build a table with one row per ordered pair of states (including a state
# paired with itself): the distance between the two states' lat/lon points and
# whether state2 is listed among state1's neighbors.
output = []
for state1, row1 in df_states.iterrows():
    coords1 = (row1.lat, row1.lon)
    # `neighbors` is stored as the text of a Python list. literal_eval parses
    # that literal without executing arbitrary code from the CSV (unlike
    # eval), and parsing once per state1 avoids repeating it for every pair.
    neighbors1 = ast.literal_eval(row1.neighbors)
    for state2, row2 in df_states.iterrows():
        coords2 = (row2.lat, row2.lon)
        # NOTE(review): the `miles=` keyword belongs to older haversine
        # releases; newer ones take `unit=` instead -- confirm against the
        # installed version before upgrading.
        distance = haversine.haversine(coords1, coords2, miles=True)
        output.append({'state1': state1,
                       'state2': state2,
                       'touching': state2 in neighbors1,
                       'distance': distance})
pd.DataFrame(output).to_csv('state_distances.csv', index = False)

In [18]:
# Crosswalk from state name to abbreviation.
# NOTE(review): no later cell visible in this notebook reads `cw` -- confirm
# this cell is still needed.
cw = us.states.mapping('name','abbr')

In [19]:
# Region/division lookup with columns Region, Division, State (FIPS) and Name.
# Besides states, the file also has rows for the regions and divisions
# themselves (e.g. "Northeast Region", "New England Division").
df_state_divs = pd.read_csv('state_divisions.csv')

In [22]:
# Attach the region/division columns to each state by matching on the state
# name. A column-on-column merge discards the left frame's index, so the
# `state_abbr` index is moved into a column first and restored afterwards;
# otherwise the abbreviations are lost from `df_states` and from any CSV
# written from it.
df_states = (
    pd.merge(df_states.reset_index(), df_state_divs, how = 'left', left_on='State', right_on='Name')
    .set_index('state_abbr')
)

In [23]:
# NOTE(review): this overwrites 'state_geo.csv', the same file an earlier cell
# reads as its input, so a later run of the notebook starts from the
# already-merged file rather than the original -- consider a separate output
# path.
df_states.to_csv('state_geo.csv')

In [20]:
df_state_divs.head()

Unnamed: 0,Region,Division,State (FIPS),Name
0,1,0,0,Northeast Region
1,1,1,0,New England Division
2,1,1,9,Connecticut
3,1,1,23,Maine
4,1,1,25,Massachusetts
