In [1]:
import pandas as pd
import topojson as tp
import json
import numpy as np
from Constants import Constants

In [13]:
def _data_path_getter(filename):
    """Return a zero-argument callable giving ``Constants.data_root + filename``.

    ``Constants.data_root`` is looked up each time the returned callable is
    invoked, not when the getter is created.
    """
    def resolve():
        return Constants.data_root + filename
    return resolve


# Boundary files (handed to merge_maps / format_geojson).
state_file = _data_path_getter('state_2018.json')
county_file = _data_path_getter('county_2018.json')
cd_file = _data_path_getter('cd_2018.json')

# Congressional-district spreadsheets (read by load_cd_data).
cd_vote_file = _data_path_getter('cd_voting.xlsx')
cd_education_file = _data_path_getter('cd_education.xlsx')
cd_age_file = _data_path_getter('cd_age.xlsx')
cd_sexpoverty_file = _data_path_getter('cd_sexpoverty.xlsx')

# Per-district vote history (read by load_2018_votedata).
cd_historical_votes_file = _data_path_getter('cd_historical_party_votes.csv')

# County-level usafacts CSVs (read by clean_usafacts_countyfips).
county_confirmed_covid_file = _data_path_getter('covid_confirmed_usafacts.csv')
county_population_file = _data_path_getter('covid_county_population_usafacts.csv')
county_covid_deaths_file = _data_path_getter('covid_deaths_usafacts.csv')

In [24]:
def read_topojson(file):
    """Parse the JSON document stored at ``file`` and return the resulting object.

    Nothing format-specific happens here: the file is read as plain JSON.
    The file is opened as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding (``save_maps`` writes UTF-8 as well).

    Parameters
    ----------
    file : path of the JSON file to read.

    Returns
    -------
    The decoded JSON value (a dict for the boundary files used in this notebook).
    """
    with open(file, encoding='utf-8') as f:
        return json.load(f)

def format_geojson(filename, top_key = 'GEOID'):
    """Flatten the features of a GeoJSON-style file into a list of plain dicts.

    Each returned dict stores the feature's geometry under ``'coordinates'``
    and ``'geometry_type'`` and then every entry of the feature's
    ``properties``; a property that uses one of those two names overwrites
    the geometry value.

    Parameters
    ----------
    filename : path of the JSON file, passed to ``read_topojson``.
    top_key : accepted for callers' sake but never read by this function.

    Returns
    -------
    list of dict, one per entry of ``features`` whose ``type`` is ``'Feature'``.
    """
    collection = read_topojson(filename)
    # Debug aid: prints the top-level keys of the loaded document.
    print(collection.keys())
    flattened = []
    for feature in collection['features']:
        if feature['type'] != 'Feature':
            # Report and skip anything that is not a Feature.
            print(feature['type'])
            continue
        geometry = feature['geometry']
        record = {
            'coordinates': geometry['coordinates'],
            'geometry_type': geometry['type'],
        }
        record.update(feature['properties'].items())
        flattened.append(record)
    return flattened

def merge_maps(files, names):
    """Load several boundary files and return them in one dict keyed by name.

    ``files`` and ``names`` are paired positionally with ``zip``, so surplus
    entries in the longer sequence are ignored. Each file is flattened with
    ``format_geojson``.
    """
    return {label: format_geojson(path) for path, label in zip(files, names)}

def save_maps():
    """Write the state, county and district maps to ``merged_borders.json``.

    The output path is relative to the current working directory (it is not
    prefixed with ``Constants.data_root``). Non-ASCII characters are written
    as-is in UTF-8 rather than escaped.
    """
    sources = [state_file(), county_file(), cd_file()]
    labels = ['state','county','district']
    merged = merge_maps(sources, labels)
    with open('merged_borders.json', 'w', encoding='utf-8') as out:
        json.dump(merged, out, ensure_ascii=False)
        
# Load and flatten the state, county and congressional-district boundary files
# into one dict keyed by 'state' / 'county' / 'district'.
maps = merge_maps([state_file(), county_file(), cd_file()],['state','county','district'])

dict_keys(['type', 'features'])
dict_keys(['type', 'features'])
dict_keys(['type', 'features'])


In [4]:
# Flatten the congressional-district boundary file into a DataFrame indexed by
# GEOID and display it for inspection.
district_map_df = pd.DataFrame(merge_maps([cd_file()],['district'])['district']).set_index('GEOID')
district_map_df

dict_keys(['type', 'features'])


Unnamed: 0_level_0,coordinates,geometry_type,STATEFP,CD116FP,AFFGEOID,LSAD,CDSESSN,ALAND,AWATER
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1710,"[[[-87.89799116363989, 42.4938770661944], [-87...",Polygon,17,10,5001600US1710,C2,116,777404163,31605644
4706,"[[[-87.114976, 36.642413999999995], [-87.11743...",Polygon,47,06,5001600US4706,C2,116,16770155959,324676580
4806,"[[[-97.321371, 32.64374], [-97.327344, 32.6352...",Polygon,48,06,5001600US4806,C2,116,5564805243,255530191
4807,"[[[-95.584796, 29.932987999999998], [-95.59785...",Polygon,48,07,5001600US4807,C2,116,419784487,3069802
4826,"[[[-97.39267, 33.183509], [-97.39847999999999,...",Polygon,48,26,5001600US4826,C2,116,2349987793,191353567
...,...,...,...,...,...,...,...,...,...
3605,"[[[-73.93426954323711, 40.5667651450013], [-73...",Polygon,36,05,5001600US3605,C2,116,134550083,184506173
4902,"[[[-114.046682520363, 40.1169354028727], [-114...",Polygon,49,02,5001600US4902,C2,116,103639513751,2944001576
3907,"[[[-82.06697199999999, 41.478654999999996], [-...",Polygon,39,07,5001600US3907,C2,116,10009974812,64567382
4505,"[[[-82.0484052363051, 35.189667965478], [-82.0...",Polygon,45,05,5001600US4505,C2,116,14260689226,265732852


In [5]:
def _format_cd_fips(district):
    """Return the two-character district code for one 'Congressional district' value."""
    # Special values are looked up first; everything else must be numeric.
    if district in Constants.CD_FIPS:
        return Constants.CD_FIPS[district]
    number = int(district)
    return '0' + str(number) if number < 10 else str(number)


def load_cd_data(file):
    """Read one congressional-district workbook and index it by GEOID.

    Steps:
      1. Read the sheet with a two-row header (rows 2-3); if that raises,
         retry with a three-row header (rows 2-4).
      2. Keep only rows that have a 'State abbreviation'.
      3. Collapse the multi-level header into single names joined with '_',
         skipping the auto-generated 'Unnamed...' levels.
      4. Add 'state_fips' ('00' when the abbreviation is not in
         ``Constants.STATE_FIPS``), 'cd_fips' and their concatenation 'GEOID'.

    Parameters
    ----------
    file : path of the Excel workbook.

    Returns
    -------
    pandas.DataFrame indexed by 'GEOID'.
    """
    try:
        cd_data = pd.read_excel(file, header=[2, 3], index_col=0)
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        cd_data = pd.read_excel(file, header=[2, 3, 4], index_col=0)
    rows_with_state = cd_data.loc[:, 'State abbreviation'].dropna().index
    # Copy so the column assignments below act on an independent frame
    # instead of a slice of the frame returned by read_excel.
    cd_data = cd_data.loc[rows_with_state, :].copy()
    cd_data.columns = cd_data.columns.map(lambda x: '_'.join([k for k in x if 'Unnamed' not in k]))
    cd_data['state_fips'] = cd_data['State abbreviation'].apply(lambda x: Constants.STATE_FIPS.get(x, '00'))
    cd_data['cd_fips'] = cd_data['Congressional district'].apply(_format_cd_fips)
    cd_data['GEOID'] = cd_data['state_fips'] + cd_data['cd_fips']
    return cd_data.set_index('GEOID')

# Preview the voting workbook after cleaning.
load_cd_data(cd_vote_file())

Unnamed: 0_level_0,State abbreviation,State name,Congressional district,"Votes cast for congressional representative for the November 6, 2018 election1",Citizen voting- age population2_Estimate,Citizen voting- age population2_Margin of error (MOE),Voting rate3_Estimate,Voting rate3_Margin of error (MOE),state_fips,cd_fips
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0101,AL,Alabama,1,242617.0,544464.0,3424.0,44.560706,0.280231,01,01
0102,AL,Alabama,2,226230.0,516295.0,5674.0,43.817972,0.481553,01,02
0103,AL,Alabama,3,231915.0,543854.0,4099.0,42.642878,0.321397,01,03
0104,AL,Alabama,4,230969.0,515701.0,4678.0,44.787386,0.406273,01,04
0105,AL,Alabama,5,260673.0,551968.0,2121.0,47.226107,0.181472,01,05
...,...,...,...,...,...,...,...,...,...,...
5505,WI,Wisconsin,5,364288.0,558805.0,5239.0,65.190541,0.611185,55,05
5506,WI,Wisconsin,6,325065.0,556485.0,2532.0,58.413973,0.265783,55,06
5507,WI,Wisconsin,7,322840.0,552651.0,2556.0,58.416614,0.270176,55,07
5508,WI,Wisconsin,8,328774.0,556205.0,2262.0,59.110220,0.240392,55,08


In [6]:
def merged_cd_demo_data():
    """Join the voting, education, age and sex/poverty district tables on GEOID.

    Tables are loaded with ``load_cd_data`` and inner-joined one after another.
    Columns that an incoming table shares with the accumulated result are
    dropped from the incoming table, so the first occurrence wins.
    """
    path_getters = [cd_vote_file, cd_education_file, cd_age_file, cd_sexpoverty_file]
    combined = load_cd_data(path_getters[0]())
    for get_path in path_getters[1:]:
        table = load_cd_data(get_path())
        already_present = set(table.columns) & set(combined.columns)
        new_columns_only = table.drop(list(already_present),axis=1)
        combined = combined.merge(new_columns_only,on='GEOID',how='inner')
    return combined
# List the column names of the merged district demographics table.
merged_cd_demo_data().columns

Index(['State abbreviation', 'State name', 'Congressional district',
       'Votes cast for congressional representative for the November 6, 2018 election1',
       'Citizen voting- age population2_Estimate',
       'Citizen voting- age population2_Margin of error (MOE)',
       'Voting rate3_Estimate', 'Voting rate3_Margin of error (MOE)',
       'state_fips', 'cd_fips', 'Citizen voting-age population_Estimate',
       'Citizen voting-age population_Margin of error (MOE)',
       'Educational attainment_Less than 9th grade_Estimate',
       'Educational attainment_Less than 9th grade_Margin of error (MOE)',
       'Educational attainment_Less than 9th grade_Percent of total',
       'Educational attainment_Less than 9th grade_Percent of total margin of error (MOE)',
       'Educational attainment_9th to 12 Grade, no diploma_Estimate',
       'Educational attainment_9th to 12 Grade, no diploma_Margin of error (MOE)',
       'Educational attainment_9th to 12 Grade, no diploma_Percent of

In [7]:
def load_2018_votedata():
    """Load the historical district vote file and keep the official 2018 totals.

    Rows are narrowed in three successive steps (year 2018, not unofficial,
    mode 'total'). Any party other than democrat / republican / libertarian
    is relabelled 'other', and bookkeeping columns are removed.

    Returns
    -------
    pandas.DataFrame with one row per remaining candidate record.
    """
    major_parties = ['democrat','republican','libertarian']
    unused_columns = ['candidate','writein','version','year','unofficial','mode','runoff']

    raw = pd.read_csv(cd_historical_votes_file(), engine='python',dtype = {'party':str})
    # Filters are applied one at a time, each on the result of the previous one.
    from_2018 = raw[raw.year == 2018]
    official = from_2018[~from_2018.unofficial]
    totals = official[official['mode'] == 'total']

    party = totals.party.where(totals.party.isin(major_parties), 'other')
    return totals.assign(party=party).drop(unused_columns, axis=1)

def to_2digit(num):
    """Return ``num`` as text, prefixed with a single '0' when it is 9 or less."""
    text = str(num)
    return text if num > 9 else '0' + text
    
def cleaned_votedata():
    """Aggregate the 2018 vote records into one row per congressional district.

    For every (state_fips, district) group the result holds:
      * 'GEOID'      - two-digit state code followed by two-digit district code,
      * 'totalvotes' - mean of the distinct 'totalvotes' values in the group,
      * '<party>_votes' for democrat, republican, libertarian and other - the
        summed 'candidatevotes' of that party, or 0 when the party is absent.

    Returns
    -------
    pandas.DataFrame indexed by 'GEOID'.
    """
    party_labels = ['democrat','republican','libertarian','other']
    records = []
    grouped = load_2018_votedata().groupby(['state_fips','district'])
    for (state_fips, district), rows in grouped:
        record = {'GEOID': to_2digit(state_fips) + to_2digit(district)}
        record['totalvotes'] = np.unique(rows.totalvotes).mean()
        present = np.unique(rows['party'])
        for label in party_labels:
            column = label + '_' + 'votes'
            if label in present:
                record[column] = rows[rows.party == label].candidatevotes.sum()
            else:
                record[column] = 0
        records.append(record)

    return pd.DataFrame(records).set_index('GEOID')

# Build the per-district 2018 vote totals and display them.
votedata = cleaned_votedata()
votedata

Unnamed: 0_level_0,totalvotes,democrat_votes,republican_votes,libertarian_votes,other_votes
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0101,242617.0,89226,153228,0,163
0102,226230.0,86931,138879,0,420
0103,231915.0,83996,147770,0,149
0104,230969.0,46492,184255,0,222
0105,260673.0,101388,159063,0,222
...,...,...,...,...,...
5505,364288.0,138385,225619,0,284
5506,325065.0,144536,180311,0,218
5507,322840.0,124307,194061,0,4472
5508,328774.0,119265,209410,0,99


In [10]:
def combine_data():
    """Assemble the per-district table: borders, demographics and 2018 votes.

    The district borders are left-joined with the merged demographic tables
    (gaps become the string 'null'), reduced to the columns named in
    ``Constants.CD_KEYS`` and renamed accordingly, then left-joined with the
    vote totals (gaps become 0).

    Returns
    -------
    pandas.DataFrame indexed by 'GEOID'.
    """
    borders = pd.DataFrame(merge_maps([cd_file()],['district'])['district']).set_index('GEOID')
    with_demographics = borders.merge(merged_cd_demo_data(), on='GEOID',how='left').fillna('null')
    wanted = list(Constants.CD_KEYS.keys())
    renamed = with_demographics.loc[:,wanted].rename(Constants.CD_KEYS,axis=1)
    with_votes = renamed.merge(cleaned_votedata(),on='GEOID',how='left')
    return with_votes.fillna(0)
# Write the combined district table under Constants.data_root, keyed by GEOID
# (orient='index').
# NOTE(review): 'congression' in the file name looks like a typo for
# 'congressional'; it is left as-is because readers of the file are not visible here.
combine_data().to_json(Constants.data_root + 'congression_district_data.json',orient='index')

dict_keys(['type', 'features'])


In [28]:
# Flatten the county boundary file into a DataFrame indexed by GEOID and display it.
county_map_df = pd.DataFrame(format_geojson(county_file())).set_index('GEOID')
county_map_df

dict_keys(['type', 'features'])


Unnamed: 0_level_0,coordinates,geometry_type,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,NAME,LSAD,ALAND,AWATER
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
37017,"[[[-78.90199799999999, 34.835268], [-78.805720...",Polygon,37,017,01026336,0500000US37017,Bladen,06,2265887723,33010866
37167,"[[[-80.29542099999999, 35.502919999999996], [-...",Polygon,37,167,01025844,0500000US37167,Stanly,06,1023370459,25242751
39153,"[[[-81.68495, 41.277145999999995], [-81.68699,...",Polygon,39,153,01074088,0500000US39153,Summit,06,1069181981,18958267
42113,"[[[-76.81373099999999, 41.590033999999996], [-...",Polygon,42,113,01213687,0500000US42113,Sullivan,06,1165338428,6617028
48459,"[[[-95.15211, 32.902640999999996], [-95.15274,...",Polygon,48,459,01384015,0500000US48459,Upshur,06,1509910100,24878888
...,...,...,...,...,...,...,...,...,...,...
38005,"[[[-99.492919, 48.370945999999996], [-99.84661...",Polygon,38,005,01034216,0500000US38005,Benson,06,3596569006,131708143
72079,"[[[-67.109044, 18.056085], [-67.115765, 18.031...",Polygon,72,079,01804520,0500000US72079,Lajas,13,155287827,106643202
31159,"[[[-97.368118, 41.046946999999996], [-97.36840...",Polygon,31,159,00835901,0500000US31159,Seward,06,1479995670,11542537
37023,"[[[-81.94318799999999, 35.960049999999995], [-...",Polygon,37,023,01008539,0500000US37023,Burke,06,1311146878,20719896


In [148]:
def drop_dates(data, start_date, date_interval):
    """Thin out the date columns of ``data`` to roughly one every ``date_interval`` days.

    A column counts as a date column when its name contains '/', which relies
    on the usafacts m/d/yy column naming. All other columns are kept and come
    first in the result, followed by the sampled date columns in their
    original order.

    Walking the date columns in order, a column is kept when it lies at least
    ``date_interval`` days after the previously kept date (initially
    ``start_date``, which is itself not kept unless ``date_interval`` is 0 or
    less). The comparison is ``>=`` rather than ``==`` so that a missing day
    in the source does not cause every later date to be dropped; the next
    available date is taken and the interval restarts from it.

    Parameters
    ----------
    data : pandas.DataFrame with string column names.
    start_date : anything ``pd.to_datetime`` accepts; reference point for sampling.
    date_interval : minimum number of days between kept date columns.

    Returns
    -------
    pandas.DataFrame restricted to the kept columns.
    """
    last_kept = pd.to_datetime(start_date)
    keep_cols = [k for k in data.columns if '/' not in k]
    for column in (k for k in data.columns if '/' in k):
        column_date = pd.to_datetime(column)
        if (column_date - last_kept).days >= date_interval:
            keep_cols.append(column)
            last_kept = column_date
    return data.loc[:, keep_cols]
    
def clean_usafacts_countyfips(file, start_date = '4/01/20', date_interval = 14):
    """Read a usafacts county CSV and index it by a five-character GEOID string.

    The GEOID is derived from the 'countyFIPS' column: four-digit codes get a
    leading '0'. Codes 0 and 1 (e.g. the 'Statewide Unallocated' rows) are
    replaced by the two-digit 'stateFIPS' followed by '000'; such rows will
    probably be dropped later since they do not show up in the map.

    Parameters
    ----------
    file : path of the CSV file; it must contain 'countyFIPS' and 'stateFIPS'.
    start_date : reference date passed to ``drop_dates``; ``None`` skips the
        date sampling entirely (used for files without date columns).
    date_interval : day spacing passed to ``drop_dates``.

    Returns
    -------
    pandas.DataFrame whose index is named 'GEOID'.
    """
    countydata = pd.read_csv(file)
    if start_date is not None:
        countydata = drop_dates(countydata, start_date, date_interval)

    # Iterate over the frame that was just read; the previous version looped
    # over an unrelated global name that is not defined in this notebook.
    geoids = []
    for county_code, state_code in zip(countydata['countyFIPS'], countydata['stateFIPS']):
        fips = str(int(county_code))
        if len(fips) == 4:
            fips = '0' + fips
        if fips == '0' or fips == '1':
            # int() first so a float-typed state code cannot leak a '.0' into the id.
            fips = str(int(state_code)).zfill(2) + '000'
        geoids.append(fips)

    countydata.index = pd.Index(geoids, name="GEOID")
    return countydata
# Preview the confirmed-case table with the default sampling
# (start_date '4/01/20', one date column every 14 days).
clean_usafacts_countyfips(county_confirmed_covid_file())

Unnamed: 0_level_0,countyFIPS,County Name,State,stateFIPS,4/15/20,4/29/20,5/13/20,5/27/20,6/10/20
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01000,0,Statewide Unallocated,AL,1,0,0,0,0,0
01001,1001,Autauga County,AL,1,25,42,93,192,295
01003,1003,Baldwin County,AL,1,98,173,231,281,331
01005,1005,Barbour County,AL,1,13,37,69,130,208
01007,1007,Bibb County,AL,1,19,42,46,71,89
...,...,...,...,...,...,...,...,...,...
56037,56037,Sweetwater County,WY,56,10,10,13,25,35
56039,56039,Teton County,WY,56,59,64,68,100,101
56041,56041,Uinta County,WY,56,4,6,7,12,39
56043,56043,Washakie County,WY,56,5,5,5,33,39


In [149]:
def nondate_cols(df):
    """Return the set of column names of ``df`` that do not contain a '/'."""
    return {column for column in df.columns if '/' not in column}

def merged_county_data(to_drop = None):
    """Join county population, COVID deaths and COVID cases on GEOID.

    The population table is the base. Non-date columns that an incoming table
    shares with the accumulated result are dropped from the incoming table.
    Date columns present in both the deaths and the cases tables are told
    apart with the '_deaths' / '_cases' suffixes.

    Parameters
    ----------
    to_drop : optional column label(s) removed from the final table.

    Returns
    -------
    pandas.DataFrame indexed by 'GEOID'.
    """
    deaths = clean_usafacts_countyfips(county_covid_deaths_file())
    cases = clean_usafacts_countyfips(county_confirmed_covid_file())
    # The population file is read without date sampling.
    population = clean_usafacts_countyfips(county_population_file(), None)

    repeated = nondate_cols(deaths) & nondate_cols(population)
    merged = population.merge(deaths.drop(list(repeated),axis=1), on='GEOID')

    repeated = nondate_cols(merged) & nondate_cols(cases)
    merged = merged.merge(cases.drop(list(repeated),axis=1),on='GEOID',suffixes = ('_deaths','_cases'))

    return merged if to_drop is None else merged.drop(to_drop,axis=1)

# Build the merged county population / deaths / cases table and display it.
county_data = merged_county_data()
county_data

Unnamed: 0_level_0,countyFIPS,County Name,State,population,stateFIPS,4/15/20_deaths,4/29/20_deaths,5/13/20_deaths,5/27/20_deaths,6/10/20_deaths,4/15/20_cases,4/29/20_cases,5/13/20_cases,5/27/20_cases,6/10/20_cases
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
01000,0,Statewide Unallocated,AL,0,1,0,0,0,0,0,0,0,0,0,0
01001,1001,Autauga County,AL,55869,1,1,3,3,3,6,25,42,93,192,295
01003,1003,Baldwin County,AL,223234,1,0,2,7,9,9,98,173,231,281,331
01005,1005,Barbour County,AL,24686,1,0,1,1,1,1,13,37,69,130,208
01007,1007,Bibb County,AL,22394,1,0,0,1,1,1,19,42,46,71,89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,56037,Sweetwater County,WY,42343,56,0,0,0,0,0,10,10,13,25,35
56039,56039,Teton County,WY,23464,56,0,1,1,1,1,59,64,68,100,101
56041,56041,Uinta County,WY,20226,56,0,0,0,0,0,4,6,7,12,39
56043,56043,Washakie County,WY,7805,56,0,0,0,3,3,5,5,5,33,39


In [180]:
def county_vote_data():
    """Load the 2018 election-context CSV and return the county demographic columns.

    * 'GEOID' is the 'fips' value as text, with one '0' prepended when it has
      four characters or fewer.
    * For each of rep/dem/other a '<party>gov' column is added: the
      '<party>gov16' value, falling back to '<party>gov14' where the 2016
      value is missing. The column is float, as before.
    * Only the columns listed in ``Constants.COUNTY_DEMOGRAPHICS`` are
      returned, in the order of that list (previously the order came from a
      set and could differ between runs). Listed names that are absent from
      the file are printed as a set.

    Returns
    -------
    pandas.DataFrame indexed by 'GEOID'.
    """
    df = pd.read_csv(Constants.data_root + 'election-context-2018.csv')
    df['GEOID'] = df.fips.apply(lambda x: str(x) if len(str(x)) > 4 else '0' + str(x))

    for party in ['rep','dem','other']:
        name = party + 'gov'
        # Vectorised form of "use the 2016 value unless it is NaN, else 2014".
        df[name] = df[name+'16'].fillna(df[name+'14']).astype(float)

    # dict.fromkeys removes duplicates while keeping the configured order.
    wanted = list(dict.fromkeys(Constants.COUNTY_DEMOGRAPHICS))
    available = set(df.columns)
    missing = {field for field in wanted if field not in available}
    if len(missing) > 0:
        print(missing)
    valid_fields = [field for field in wanted if field in available]
    return df.set_index('GEOID').loc[:, valid_fields]
# Preview the county demographic / vote columns.
county_vote_data()

Unnamed: 0_level_0,trump16,demgov,age65andolder_pct,lesscollege_pct,othergov,repgov,foreignborn_pct,hispanic_pct,clf_unemploy_pct,black_pct,rural_pct,age29andunder_pct,lesshs_pct,clinton16,female_pct,white_pct,median_hh_inc,otherpres16,ruralurban_cc,cvap
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
01001,18172,3638.0,13.978456,75.407229,0.0,9427.0,1.838362,2.572254,5.591657,18.370906,42.002162,40.037058,12.417046,5936,51.176225,75.683482,53099.0,865,2.0,40690.0
01003,72883,8416.0,18.714851,70.452889,0.0,37650.0,3.269510,4.366698,6.286843,9.225603,42.279099,35.474412,9.972418,18458,51.194928,83.178788,51365.0,3874,3.0,151770.0
01005,5454,3651.0,16.528895,87.132213,0.0,3111.0,2.859397,4.309762,12.824738,47.888329,67.789635,37.664387,26.235928,4871,46.498084,45.885624,33956.0,144,6.0,20375.0
01007,6738,1368.0,14.885699,88.000000,0.0,3525.0,1.351232,2.223994,7.146827,21.212121,68.352607,37.329435,19.301587,1874,46.464646,74.765196,39776.0,207,1.0,17590.0
01009,22859,2178.0,17.192916,86.950243,0.0,12074.0,4.271801,8.727298,5.953833,1.557951,89.951502,37.240053,19.968585,2156,50.485235,87.657701,46212.0,573,1.0,42430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,12154,3886.0,9.417120,78.628507,796.0,6501.0,5.509685,15.859591,5.072255,0.865840,10.916313,44.153352,9.314606,3231,47.824244,79.815674,68233.0,1745,5.0,30565.0
56039,3921,3573.0,11.837510,46.211584,307.0,3998.0,11.475048,15.174822,2.123447,0.614419,46.430920,35.569995,4.633570,7314,48.097069,81.200548,75594.0,1392,7.0,16335.0
56041,6154,1620.0,10.678218,81.793082,564.0,3251.0,3.986981,8.959939,6.390755,0.186665,43.095937,43.205858,10.361224,1202,49.327526,87.718375,53323.0,1114,7.0,14355.0
56043,2911,498.0,19.650341,78.923920,225.0,2039.0,3.783978,13.962400,7.441860,0.790325,35.954529,34.774279,12.577108,532,51.359119,82.397318,46212.0,371,7.0,6135.0


In [167]:
def merge_left(df_list, how = 'inner'):
    """Fold a list of DataFrames into one, joining on the first frame's index name.

    Frames are merged from left to right. Before each merge, columns of the
    incoming frame that already exist in the accumulated result are dropped,
    so the leftmost occurrence of a column wins. Despite the function name,
    the join type is whatever ``how`` says and defaults to 'inner'.

    Parameters
    ----------
    df_list : non-empty sequence of DataFrames sharing an index name.
    how : join type forwarded to ``DataFrame.merge``.
    """
    merged = df_list[0]
    for incoming in df_list[1:]:
        already_present = set(merged.columns) & set(incoming.columns)
        trimmed = incoming.drop(list(already_present),axis=1)
        merged = merged.merge(trimmed,how=how,on=merged.index.name)
    return merged

def all_county_data():
    """Build the full per-county table: borders, COVID figures and demographics.

    The county border frame is reduced to the columns named in
    ``Constants.COUNTY_KEYS`` and renamed with that mapping; the merged COVID
    table is renamed the same way. Both are then combined with the county
    vote data through ``merge_left`` (inner join), and remaining gaps are
    filled with the string 'null'.

    Returns
    -------
    pandas.DataFrame indexed by 'GEOID'.
    """
    borders = pd.DataFrame(format_geojson(county_file())).set_index('GEOID')
    unwanted = set(borders.columns) - set(Constants.COUNTY_KEYS.keys())
    borders = borders.drop(unwanted ,axis=1).rename(Constants.COUNTY_KEYS, axis=1)
    covid = merged_county_data().rename(Constants.COUNTY_KEYS, axis=1)
    combined = merge_left([borders, covid, county_vote_data()])
    return combined.fillna('null')
# Preview the fully merged county table.
all_county_data()

dict_keys(['type', 'features'])


Unnamed: 0_level_0,coordinates,geometry_type,state_fips,county_fips,county_name,land_area,water_area,state,population,4/15/20_deaths,...,lesshs_pct,clinton16,female_pct,white_pct,median_hh_inc,demgov16,otherpres16,ruralurban_cc,cvap,othergov16
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
37017,"[[[-78.90199799999999, 34.835268], [-78.805720...",Polygon,37,017,Bladen,2265887723,33010866,NC,32722,0,...,21.557306,7058,52.513496,54.240437,30408.0,7227,289,6.0,25675.0,189
37167,"[[[-80.29542099999999, 35.502919999999996], [-...",Polygon,37,167,Stanly,1023370459,25242751,NC,62806,1,...,15.258048,7094,50.242534,81.516251,44140.0,8163,859,6.0,46100.0,601
39153,"[[[-81.68495, 41.277145999999995], [-81.68699,...",Polygon,39,153,Summit,1069181981,18958267,OH,541013,15,...,9.016560,134256,51.520581,78.243057,51562.0,,11697,2.0,412230.0,
42113,"[[[-76.81373099999999, 41.590033999999996], [-...",Polygon,42,113,Sullivan,1165338428,6617028,PA,6066,0,...,10.957512,750,47.588067,93.525865,44926.0,,95,8.0,5445.0,
48459,"[[[-95.15211, 32.902640999999996], [-95.15274,...",Polygon,48,459,Upshur,1509910100,24878888,TX,41753,0,...,16.602730,2380,50.594367,80.948008,47724.0,,424,3.0,29600.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22003,"[[[-92.978453, 30.878175], [-92.97917, 30.5983...",Polygon,22,003,Allen,1973083810,10634063,LA,25627,8,...,20.817761,2106,42.199149,71.064444,41801.0,,272,6.0,19415.0,
38005,"[[[-99.492919, 48.370945999999996], [-99.84661...",Polygon,38,005,Benson,3596569006,131708143,ND,6832,0,...,15.022015,842,48.662158,41.502499,41530.0,817,240,9.0,4465.0,81
31159,"[[[-97.368118, 41.046946999999996], [-97.36840...",Polygon,31,159,Seward,1479995670,11542537,NE,17284,0,...,7.214503,1875,49.184830,95.336878,61563.0,,593,2.0,12965.0,
37023,"[[[-81.94318799999999, 35.960049999999995], [-...",Polygon,37,023,Burke,1311146878,20719896,NC,90485,4,...,20.395955,11251,50.507398,82.287106,39759.0,13372,1431,2.0,68740.0,876


In [168]:
def reduce_date_data(date_data, days_per_step = 14):
    """Keep every ``days_per_step``-th entry of a date-keyed mapping.

    Keys are ordered with ``sorted`` (so string keys are ordered
    lexicographically, not chronologically). The first key is always kept,
    then each key that lies at least ``days_per_step`` positions after the
    previously kept one. The step counts positions in the sorted key list,
    not calendar days.

    Parameters
    ----------
    date_data : mapping from date key to value.
    days_per_step : number of positions between kept entries.

    Returns
    -------
    dict with the retained key/value pairs; empty when ``date_data`` is empty
    (previously an empty mapping raised IndexError).
    """
    if not date_data:
        return {}
    dates = sorted(date_data.keys())
    new_dates = {dates[0]: date_data[dates[0]]}
    last_kept_pos = 0
    for pos, key in enumerate(dates):
        if pos - last_kept_pos >= days_per_step:
            new_dates[key] = date_data[key]
            last_kept_pos = pos
    return new_dates

def county_data_to_json(acd, savefile = None):
    """Convert the county table into a nested dict and optionally save it as JSON.

    Every row becomes ``{column: value, ..., 'dates': {date: {kind: int}}}``.
    A column is treated as date data when its name contains '/'; such names
    must look like '<date>_<kind>' with exactly one underscore (for example
    '4/15/20_deaths'). The string 'null' in a date column is stored as 0.

    Parameters
    ----------
    acd : pandas.DataFrame, one row per county.
    savefile : optional file name; when given, the result is written to
        ``Constants.data_root + savefile``.

    Returns
    -------
    dict keyed by the index values of ``acd``.
    """
    new_dict = {}
    for geoid, row in acd.to_dict(orient='index').items():
        static_fields = {}
        by_date = {}
        for column, value in row.items():
            if '/' in column:
                day, kind = column.split('_')
                count = 0 if value == 'null' else value
                by_date.setdefault(day, {})[kind] = int(count)
            else:
                static_fields[column] = value
        static_fields['dates'] = by_date
        new_dict[geoid] = static_fields

    if savefile is not None:
        with open(Constants.data_root + savefile, 'w') as f:
            json.dump(new_dict, f)
    return new_dict

# Build the nested per-county dict and save it as 'merged_county_data.json'
# under Constants.data_root; the dict is also this cell's displayed value.
county_data_to_json(all_county_data(), 'merged_county_data.json')

dict_keys(['type', 'features'])


{'37017': {'coordinates': [[[-78.90199799999999, 34.835268],
    [-78.80572099999999, 34.689202],
    [-78.868961, 34.484778],
    [-78.830353, 34.462578],
    [-78.64837299999999, 34.460868999999995],
    [-78.44532, 34.3774],
    [-78.334569, 34.367536],
    [-78.256085, 34.399468999999996],
    [-78.176501, 34.465356],
    [-78.255468, 34.508614],
    [-78.254441, 34.553595],
    [-78.374363, 34.700722],
    [-78.391978, 34.741265],
    [-78.390748, 34.749463],
    [-78.494705, 34.856182],
    [-78.516123, 34.845918999999995],
    [-78.79959699999999, 34.850857],
    [-78.90199799999999, 34.835268]]],
  'geometry_type': 'Polygon',
  'state_fips': '37',
  'county_fips': '017',
  'county_name': 'Bladen',
  'land_area': 2265887723,
  'water_area': 33010866,
  'state': 'NC',
  'population': 32722,
  'trump16': 8550,
  'age65andolder_pct': 18.3839321994543,
  'lesscollege_pct': 85.1816719594177,
  'foreignborn_pct': 4.40877691995124,
  'hispanic_pct': 7.48824519649388,
  'clf_unemploy_pc

In [171]:
# Overrides the list of county columns that county_vote_data() selects.
# NOTE(review): this cell mutates the imported Constants class at runtime and
# sits below the cells that read Constants.COUNTY_DEMOGRAPHICS. On a fresh
# top-to-bottom run those cells see whatever the Constants module defines,
# not this list. Consider moving this into Constants itself or next to the imports.
Constants.COUNTY_DEMOGRAPHICS = ['trump16', 'clinton16', 'otherpres16', 
                        'repgov','demgov','othergov',
                        'cvap',
                        'white_pct', 'black_pct','hispanic_pct',
                        'foreignborn_pct',
                        'female_pct',
                        'age29andunder_pct','age65andolder_pct',
                        'median_hh_inc','clf_unemploy_pct',
                        'lesshs_pct','lesscollege_pct',
                        'rural_pct','ruralurban_cc'
                       ]