In [21]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
pd.options.display.max_columns = 100
from sklearn.metrics.pairwise import cosine_similarity
import os
import us

# Pre Process CBP Data
The County Business Patterns (CBP) data set provides an economic overview of each congressional district by industry. The data can be downloaded here: https://www.census.gov/data/datasets/2013/econ/cbp/2013-cbp.html. This notebook will:
1. Calculate cosine similarity between districts.
2. Compute industry percent of total employment for each district.
3. Group by state and perform the same calculations for senators.
4. Merge in ProPublica IDs.

In [154]:
# Load the ProPublica member tables and reduce them to the id columns needed
# to attach a member id to each district (house) or state (senate).
members_house = pd.read_csv('../propublica/members_house.csv')
members_senate = pd.read_csv('../propublica/members_senate.csv')

# House: build a "<state>_<district>" key. 'At-Large' districts are recoded
# to 0 before the key is built. Working on an explicit copy avoids pandas'
# SettingWithCopyWarning when the new key column is assigned.
house_ids = members_house[['id', 'session', 'state', 'district']].copy()
house_ids['district'] = (
    house_ids['state']
    + "_"
    + house_ids['district'].replace({'At-Large': 0}).map(str)
)
# Final columns: id, session, district (the combined key).
members_house = house_ids.drop(columns='state')

# Senate: members are keyed by state only; the column is renamed to
# 'state_abbr'. rename() returns a new frame, so nothing is mutated in place.
members_senate = (
    members_senate[['id', 'session', 'state']]
    .rename(columns={'state': 'state_abbr'})
)

path = 'cbp/'
export_dir = 'cbp_preprocessed/'
cw = us.states.mapping('name','abbr')

# Create the export folder up front so the to_csv calls below do not fail
# when the directory is missing.
os.makedirs(export_dir, exist_ok=True)


def _industry_composition(cbp, group_col, code_col):
    """Return each group's employment by industry code as a percent of the group total.

    Employment is averaged per (group_col, code_col) pair, pivoted so that each
    industry code becomes a column, missing combinations are filled with 0, and
    every row is divided by its own row sum and multiplied by 100.
    """
    wide = cbp.groupby([group_col, code_col]).Employment.mean().unstack().fillna(0)
    return wide.div(wide.sum(axis=1), axis=0) * 100


def _pairwise_cosine(composition):
    """Return a long frame (columns d1, d2, cosine) with the cosine similarity of every pair of rows."""
    labels = composition.index
    similarity = pd.DataFrame(cosine_similarity(composition), columns=labels, index=labels)
    pairs = similarity.unstack()
    pairs.index.names = ['d1', 'd2']
    return pairs.reset_index(name='cosine')


for file in os.listdir(path):
    if not file.endswith('_2.xlsx'):
        continue

    # load and clean data
    df = pd.read_excel(os.path.join(path, file))
    # Characters 3-4 of the file name are prefixed with "1" to form the
    # congress number (e.g. "13" -> 113).
    # NOTE(review): assumes the file name carries a two-digit congress suffix there - confirm.
    congress = int("1" + file[3:5])
    code = [col for col in df.columns if col.lower().endswith('naics code')][0]
    district = [col for col in df.columns if col.lower().count('dist') > 0][0]
    df = df.rename(columns={'Unnamed: 1': 'state_full'})
    # Drop rows whose code is the '------' placeholder. The explicit copy
    # avoids SettingWithCopyWarning on the column assignments that follow.
    df = df[df[code] != '------'].copy()
    df['state_full'] = df.state_full.str.strip()
    # replace() leaves any state name not present in the mapping untouched.
    df['state_abbr'] = df.state_full.replace(cw)
    df['district'] = df.state_abbr + "_" + df[district].map(str)
    # Non-numeric employment values become NaN.
    df['Employment'] = pd.to_numeric(df.Employment, errors='coerce')

    # get relative industry composition
    # for both house (by district) and senate (by state)
    for gb in ['district', 'state_abbr']:
        df_wide = _industry_composition(df, gb, code)

        # cosine similarity between every pair of districts / states
        df_cosign = _pairwise_cosine(df_wide)

        if gb == 'district':
            df_ids = members_house
            body = 'house'
        else:
            df_ids = members_senate
            body = 'senate'
        df_ids = df_ids[df_ids.session == congress]

        # merge propub id
        # First merge: member columns describe d1.
        df_cosign = pd.merge(df_ids, df_cosign, how='left', left_on=gb, right_on='d1')
        # Second merge: the left frame is joined on d2, so its columns must get
        # the '_d2' suffix and the columns carried over from the first merge
        # '_d1'. The previous suffix order labelled them the wrong way round.
        df_cosign = pd.merge(df_ids, df_cosign, how='left', left_on=gb, right_on='d2',
                             suffixes=('_d2', '_d1'))
        df_wide = pd.merge(df_ids, df_wide.reset_index(), how='left', on=gb)

        # export files
        export_file = export_dir + body + "_" + str(congress)
        df_cosign.to_csv(export_file + "_cosine.csv", index=False)
        # NOTE(review): unlike the cosine file, this name has no "_" before
        # "composition". Kept as-is because other code may read this exact
        # file name - confirm before changing.
        df_wide.to_csv(export_file + "composition.csv", index=False)