In [1]:
import pandas as pd
import os
import numpy as np
from datetime import date
import numpy.matlib 

See https://osf.io/7xnju/ for the external data used in this notebook.


The participant (PID) data are not publicly available.

In [2]:

# Both folders sit under ~/Library/CloudStorage/Box-Box/COVID-19 Adolphs Lab/core_analysis.
core_analysis_dir = os.path.join(os.path.expanduser('~'), 'Library', 'CloudStorage', 'Box-Box',
                                 'COVID-19 Adolphs Lab', 'core_analysis')
data_path = os.path.join(core_analysis_dir, 'raw_data')        # raw input files
out_path = os.path.join(core_analysis_dir, 'processed_data')   # processed tables (read and written below)


# Wave dates

In [3]:
from datetime import timedelta

# Wave calendar: read everything as text, then parse the two date columns.
wave_date = pd.read_csv(os.path.join(data_path, 'wave_dates.csv'), dtype='str')
for date_col in ('start_date', 'next_monday'):
    wave_date[date_col] = wave_date[date_col].astype('datetime64[ns]')

# One column per look-back day before the wave start: 'start_date-1' ... 'start_date-7'.
lagged_starts = {'start_date-' + str(lag): wave_date['start_date'] - timedelta(days=lag)
                 for lag in range(1, 8)}
# 'next_monday' is not needed downstream and is removed again.
wave_date = wave_date.assign(**lagged_starts).drop(columns=['next_monday'])



# PID DATA

In [4]:
# Participant state/county table: one row per participant, wave and wave day
# (the wave start date plus each of the seven preceding days).
pid_county = pd.read_csv(os.path.join(out_path, 'participant_county_data.csv'), dtype='str')

unused_location_cols = ['zip_code', 'moved', 'nearest_town', 'county_by_zip',
                        'county_by_city_and_state', 'county_man_edit']
pid_county = (
    pid_county.loc[pid_county['flag'] != '9.0', :]   # flag '9.0' marks non-informative county data
    .merge(wave_date, on=['wave'])
    .drop(columns=unused_location_cols)
)

# Wide -> long: the eight date columns become rows labelled by 'wave_day'.
wave_day_cols = ['start_date'] + ['start_date-' + str(lag) for lag in range(1, 8)]
pid_county = pd.melt(
    pid_county,
    id_vars=['PROLIFIC_PID', 'wave', 'state', 'county', 'loc_description', 'flag'],
    value_vars=wave_day_cols,
).rename(columns={'variable': 'wave_day', 'value': 'date'})

# Harmonise county spellings with the external data sets (applied in this order).
for old_text, new_text in ((' county', ''),
                           ('montgomery ', 'montgomery'),
                           ('baltimore city', 'baltimore')):
    pid_county['county'] = pid_county['county'].str.replace(old_text, new_text)


# Daily calendar from 2020-04-04 to 2021-12-31 (inclusive).
dates_incl = pd.date_range(date.fromisoformat('2020-04-04'), date.fromisoformat('2021-12-31'))

# Unique (state, county) pairs that occur in the participant data.
participant_locations = pid_county[['state', 'county']].drop_duplicates().reset_index(drop=True)

# One row of the full calendar per location, so that concat + melt below
# yields every (state, county, date) combination.
date_df = pd.DataFrame(np.tile(np.array(dates_incl), (participant_locations.shape[0], 1)))

county_data = (
    pd.concat([participant_locations, date_df], axis=1)
    .melt(id_vars=['state', 'county'], value_vars=date_df.columns)
    .drop('variable', axis=1)
    .rename(columns={'value': 'date'})
)
# Rows without a county cannot be matched at county level.
county_data = county_data.loc[~county_data.county.isna(), :].reset_index(drop=True)

# State-level grid: the same calendar with the county dropped and duplicates removed.
state_data = county_data.drop('county', axis=1).drop_duplicates().reset_index(drop=True)


# Add year, month and day separately (as text pieces of the 'YYYY-MM-DD' date string)
# to both the county-level and the state-level grid; they are merge keys further down.
for calendar_grid in (county_data, state_data):
    ymd = calendar_grid['date'].astype('str').str.split('-', expand=True)
    calendar_grid['year'] = ymd[0]
    calendar_grid['month'] = ymd[1]
    calendar_grid['day'] = ymd[2]


In [5]:

# Postal abbreviation -> full name for the 50 states, the District of Columbia and Puerto Rico.
# Used with `.replace(state_dict)` to spell out abbreviated state columns.
state_dict = {
    'AL': 'Alabama',        'AK': 'Alaska',         'AZ': 'Arizona',       'AR': 'Arkansas',
    'CA': 'California',     'CO': 'Colorado',       'CT': 'Connecticut',   'DE': 'Delaware',
    'DC': 'District of Columbia',
    'FL': 'Florida',        'GA': 'Georgia',        'HI': 'Hawaii',        'ID': 'Idaho',
    'IL': 'Illinois',       'IN': 'Indiana',        'IA': 'Iowa',          'KS': 'Kansas',
    'KY': 'Kentucky',       'LA': 'Louisiana',      'ME': 'Maine',         'MD': 'Maryland',
    'MA': 'Massachusetts',  'MI': 'Michigan',       'MN': 'Minnesota',     'MS': 'Mississippi',
    'MO': 'Missouri',       'MT': 'Montana',        'NE': 'Nebraska',      'NV': 'Nevada',
    'NH': 'New Hampshire',  'NJ': 'New Jersey',     'NM': 'New Mexico',    'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota',   'OH': 'Ohio',          'OK': 'Oklahoma',
    'OR': 'Oregon',         'PA': 'Pennsylvania',   'RI': 'Rhode Island',  'SC': 'South Carolina',
    'SD': 'South Dakota',   'TN': 'Tennessee',      'TX': 'Texas',         'UT': 'Utah',
    'VT': 'Vermont',        'VA': 'Virginia',       'WA': 'Washington',    'WV': 'West Virginia',
    'WI': 'Wisconsin',      'WY': 'Wyoming',        'PR': 'Puerto Rico',
}


# COVID cases / deaths

state level

In [6]:
# State-level rolling averages, downloaded from the NYT covid-19-data repository.
covid_avg_state = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-states.csv", dtype='str')
covid_avg_state = (
    covid_avg_state
    .assign(state=lambda frame: frame['state'].str.lower(),       # lower-case to match the participant data
            date=lambda frame: pd.to_datetime(frame['date']))
    .drop('geoid', axis=1)
    .drop_duplicates()
)

# State-level cumulative counts from the same repository.
covid_cumsum_state = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv", dtype='str')
covid_cumsum_state = (
    covid_cumsum_state
    .assign(state=lambda frame: frame['state'].str.lower(),
            date=lambda frame: pd.to_datetime(frame['date']))
    .drop('fips', axis=1)
    # prefix the counts so they cannot be confused with the rolling-average columns
    .rename(columns={'cases': 'cumsum_cases', 'deaths': 'cumsum_deaths'})
    .drop_duplicates()
)


county level

In [7]:
def _harmonize_nyt_county_names(nyt_frame):
    """Rewrite NYT county labels (already lower-cased) to the spellings used elsewhere in this notebook.

    The frame is modified in place and also returned. It needs the text columns
    'county' and 'state'.
    """
    # manually edit spelling differences (substring replacements, applied in this order)
    for old_text, new_text in (('doña ana', 'dona ana'),
                               ('new york city', 'new york'),
                               ('anchorage', 'anchorage municipality'),
                               ('bienville', 'bienville parish'),
                               ('rapides', 'rapides parish')):
        nyt_frame['county'] = nyt_frame['county'].str.replace(old_text, new_text)

    # exact-match renames
    nyt_frame.loc[nyt_frame['county'] == 'st. tammany', 'county'] = 'st. tammany parish'
    lafayette_rows = (nyt_frame['county'] == 'lafayette') & nyt_frame['state'].isin(['kentucky', 'louisiana'])
    nyt_frame.loc[lafayette_rows, 'county'] = 'lafayette parish'
    for parish in ('livingston', 'orleans', 'ouachita', 'calcasieu'):
        louisiana_rows = (nyt_frame['county'] == parish) & (nyt_frame['state'] == 'louisiana')
        nyt_frame.loc[louisiana_rows, 'county'] = parish + ' parish'
    return nyt_frame


def _nyc_rows_relabelled(nyt_frame, borough):
    """Return a copy of the 'new york' rows of `nyt_frame` with `borough` as the county label."""
    borough_rows = nyt_frame.loc[nyt_frame['county'] == 'new york'].reset_index(drop=True).copy()
    borough_rows['county'] = borough
    return borough_rows


# county avg (2020 and 2021 files stacked)
covid_avg_county_2020 = pd.read_csv(os.path.join(data_path, "CVDrollingavgs_nytimes_counties2020.csv"), dtype='str')
covid_avg_county_2021 = pd.read_csv(os.path.join(data_path, "CVDrollingavgs_nytimes_counties2021.csv"), dtype='str')
covid_avg_county = pd.concat([covid_avg_county_2020, covid_avg_county_2021])
covid_avg_county['state'] = covid_avg_county['state'].str.lower()
covid_avg_county['county'] = covid_avg_county['county'].str.lower()
covid_avg_county = covid_avg_county.drop('geoid', axis=1)
covid_avg_county['date'] = pd.to_datetime(covid_avg_county['date'])
covid_avg_county = _harmonize_nyt_county_names(covid_avg_county)

# county cumsum
covid_cumsum_county = pd.read_csv(os.path.join(data_path, "CVDcumsum_nytimes_counties.csv"), dtype='str')
covid_cumsum_county['state'] = covid_cumsum_county['state'].str.lower()
covid_cumsum_county['county'] = covid_cumsum_county['county'].str.lower()
covid_cumsum_county = covid_cumsum_county.drop('fips', axis=1)
covid_cumsum_county['date'] = pd.to_datetime(covid_cumsum_county['date'])
covid_cumsum_county = covid_cumsum_county.rename(columns={'cases': 'cumsum_cases', 'deaths': 'cumsum_deaths'})
covid_cumsum_county = _harmonize_nyt_county_names(covid_cumsum_county)

# The NYT includes Bronx and Queens county COVID data in the NYC data --> copying NYC data
bronx_county_cumsum = _nyc_rows_relabelled(covid_cumsum_county, 'bronx')
bronx_county_avg = _nyc_rows_relabelled(covid_avg_county, 'bronx')
queens_county_cumsum = _nyc_rows_relabelled(covid_cumsum_county, 'queens')
queens_county_avg = _nyc_rows_relabelled(covid_avg_county, 'queens')

covid_cumsum_county = pd.concat([covid_cumsum_county, bronx_county_cumsum, queens_county_cumsum]).drop_duplicates()
covid_avg_county = pd.concat([covid_avg_county, bronx_county_avg, queens_county_avg]).drop_duplicates()


# UNEMPLOYMENT DATA

In [8]:

# State-level unemployment: only the state name needs lower-casing for the merge.
ue_state = pd.read_csv(os.path.join(data_path, 'unemployment_state.csv'), dtype='str')
ue_state['state'] = ue_state['state'].str.lower()

# County-level unemployment. Every replacement below is assigned back to the column:
# calling `.replace(..., inplace=True)` on `frame[col]` / `frame.col` acts on a temporary
# object and is a silent no-op under pandas Copy-on-Write (and a FutureWarning in pandas 2.2).
ue_county = pd.read_csv(os.path.join(data_path, 'unemployment_county.csv'), dtype='str')
area_col = 'County Name/State Abbreviation'

# 'District of Columbia' carries no ', <state>' suffix; add one so the split below yields two parts.
ue_county[area_col] = ue_county[area_col].replace({'District of Columbia': 'District of Columbia, DC'})
# Building a 2-D array fails loudly if any label does not split into the same number of parts.
county_state_tmp = np.array(list(ue_county[area_col].str.split(', ')))
ue_county['county'] = county_state_tmp[:, 0]
ue_county['county'] = ue_county['county'].str.lower()
ue_county['state'] = county_state_tmp[:, 1]
ue_county['state'] = ue_county['state'].replace(state_dict).str.lower()
ue_county = ue_county.drop(area_col, axis=1)

# Period clean-up, in the original order.
# NOTE(review): str.strip('(p') removes any leading/trailing '(' or 'p' characters, so it also
# eats the final 'p' of 'Sep'; the '-Se' -> '-Sep' replacement two lines down restores it.
ue_county['Period'] = ue_county['Period'].str.strip('(p')
ue_county['Period'] = ue_county['Period'].replace({'Dec-21': '21-Dec'})   # bring into '<yy>-<Mon>' order
ue_county['Period'] = ue_county['Period'].str.replace('-Se', '-Sep', regex=False)
period_tmp = np.array(list(ue_county['Period'].str.split('-')))
ue_county['year'] = period_tmp[:, 0]
ue_county['month'] = period_tmp[:, 1]
# Month names -> two-digit numbers and two-digit years -> four digits,
# the same text format as the year/month columns of the calendar grids.
ue_county['month'] = ue_county['month'].replace({'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                                                 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                                                 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'})
ue_county['year'] = ue_county['year'].replace({'20': '2020', '21': '2021', '22': '2022'})
ue_county = ue_county.drop('Period', axis=1)

# Harmonise county spellings with the participant data.
for old_text, new_text in ((' county', ''),
                           ('anchorage borough/municipality', 'anchorage municipality'),
                           ('denver/city', 'denver'),
                           ('honolulu/city', 'honolulu'),
                           ('philadelphia/city', 'philadelphia'),
                           ('san francisco/city', 'san francisco')):
    ue_county['county'] = ue_county['county'].str.replace(old_text, new_text, regex=False)

ue_county = ue_county.drop_duplicates()
ue_state = ue_state.drop_duplicates()

# Restrictions

In [9]:
# stay at home orders (one row per county and day; all columns read as text)
stay_home_csv = os.path.join(data_path,
                             'U.S._State_and_Territorial_Stay-At-Home_Orders__March_15__2020___August_15__2021_by_County_by_Day.csv')
drop_cols = ['FIPS_State', 'FIPS_County', 'URL', 'Citation', 'Source_of_Action', 'Express_Preemption']
stay_at_home_county = (
    pd.read_csv(stay_home_csv, low_memory=False, dtype='str')
    .drop(columns=drop_cols)
    .rename(columns={'State_Tribe_Territory': 'state', 'County_Name': 'county',
                     'Stay_at_Home_Order_Recommendation': 'stayHome_action',
                     'Order_code': 'stayHome_order_code'})
)
# Spell out state abbreviations, then keep only rows whose state is in state_dict.
stay_at_home_county['state'] = stay_at_home_county['state'].replace(state_dict)
known_state = stay_at_home_county['state'].isin(list(state_dict.values()))
stay_at_home_county = stay_at_home_county.loc[known_state, :].reset_index(drop=True)
# order code:
# (1) - 'Mandatory for all individuals'
# (2) - 'Mandatory only for all individuals in certain areas of the jurisdiction'
# (3) - 'Mandatory only for at-risk individuals in the jurisdiction'
# (4) - missing
# (5) - 'Mandatory only for at-risk individuals in certain areas of the jurisdiction'
# (6) - 'Advisory/Recommendation'
# (7) - 'NaN'

############################
# gathering bans (one row per county and day; all columns read as text)
gather_ban_csv = os.path.join(data_path,
                              'U.S._State_and_Territorial_Gathering_Bans__March_11__2020-August_15__2021_by_County_by_Day.csv')
drop_cols = ['FIPS_State', 'FIPS_County', 'URL', 'Express_Preemption', 'Citation', 'Source_of_Action', 'URL']
gather_ban_county = (
    pd.read_csv(gather_ban_csv, low_memory=False, dtype='str')
    .drop(columns=drop_cols)
    .rename(columns={'State_Tribe_Territory': 'state', 'County_Name': 'county',
                     'General_GB_order_group': 'gatherBan_action',
                     'General_GB_order_code': 'gatherBan_order_code',
                     'General_or_Under_6ft_Bans_Gatherings_Over': 'gatherBan_N_limit',
                     'Indoor_Outdoor': 'gatherBan_indoor_outdoor'})
)
# Spell out state abbreviations, then keep only rows whose state is in state_dict.
gather_ban_county['state'] = gather_ban_county['state'].replace(state_dict)
known_state = gather_ban_county['state'].isin(list(state_dict.values()))
gather_ban_county = gather_ban_county.loc[known_state, :].reset_index(drop=True)

# order code:
# (1) - 'No order found'
# (2) - 'Ban of gatherings over 101 or more people'
# (3) - 'Ban of gatherings over 51-100 people'
# (4) - 'Ban of gatherings over 26-50 people'
# (5) - 'Ban of gatherings over 11-25 people'
# (6) - 'Ban of gatherings over 1-10 people'
# (7) - 'Bans gatherings of any size'

############################
# public face-mask mandates (one row per county and day; all columns read as text)
mask_mandate_csv = os.path.join(data_path,
                                'U.S._State_and_Territorial_Public_Mask_Mandates_From_April_10__2020_through_August_15__2021_by_County_by_Day.csv')
drop_cols = ['FIPS_State', 'FIPS_County', 'URL', 'Citation', 'Source_of_Action', 'URL']
mask_mandate_county = (
    pd.read_csv(mask_mandate_csv, low_memory=False, dtype='str')
    .drop(columns=drop_cols)
    .rename(columns={'State_Tribe_Territory': 'state', 'County_Name': 'county',
                     'order_code': 'mask_order_code',
                     'Face_Masks_Required_in_Public': 'mask_required_in_public'})
)
# Spell out state abbreviations, then keep only rows whose state is in state_dict.
mask_mandate_county['state'] = mask_mandate_county['state'].replace(state_dict)
known_state = mask_mandate_county['state'].isin(list(state_dict.values()))
mask_mandate_county = mask_mandate_county.loc[known_state, :].reset_index(drop=True)

# order code - Face_Masks_Required_in_Public:
# (1) - 'Yes'
# (2) - 'NaN'

############################
# restaurant closures (one row per county and day; all columns read as text)
rest_closure_csv = os.path.join(data_path,
                                'U.S._State_and_Territorial_Orders_Closing_and_Reopening_Restaurants_Issued_from_March_11__2020_through_August_15__2021_by_County_by_Day.csv')
drop_cols = ['FIPS_State', 'FIPS_County', 'Business_Type', 'URL', 'Citation', 'Source_of_Action']
rest_closure_county = (
    pd.read_csv(rest_closure_csv, low_memory=False, dtype='str')
    .drop(columns=drop_cols)
    # prefix the generic column names so they stay distinguishable from the bar table after merging
    .rename(columns={'State_Tribe_Territory': 'state', 'County_Name': 'county',
                     'Action': 'rest_action', 'order_code': 'rest_order_code',
                     'Percent_Capacity_Outdoor': 'rest_%_capacity_outdoor',
                     'Percent_Capacity_Indoor': 'rest_%_capacity_indoor',
                     'Numeric_Capacity_Outdoor': 'rest_N_capacity_outdoor',
                     'Numeric_Capacity_Indoor': 'rest_N_capacity_indoor',
                     'Limited_Open_Outdoor_Only': 'rest_lim_capacity_outdoor',
                     'Limited_Open_General_Indoor': 'rest_lim_general_indoor'})
)
# Spell out state abbreviations, then keep only rows whose state is in state_dict.
rest_closure_county['state'] = rest_closure_county['state'].replace(state_dict)
known_state = rest_closure_county['state'].isin(list(state_dict.values()))
rest_closure_county = rest_closure_county.loc[known_state, :].reset_index(drop=True)

# order code:
# (1) - NaN
# (2) - 'Authorized to fully reopen'
# (3) - 'Open with social distancing/reduced seating/enhanced sanitation'
# (4) - 'Open with social distancing/reduced seating/enhanced sanitation'
# (5) - 'Curbside/carryout/delivery only'
# (6) - 'Closed'

############################
# bar closures (one row per county and day; all columns read as text)
bar_closure_csv = os.path.join(data_path,
                               'U.S._State_and_Territorial_Orders_Closing_and_Reopening_Bars_Issued_from_March_11__2020_through_August_15__2021_by_County_by_Day.csv')
drop_cols = ['FIPS_State', 'FIPS_County', 'Business_Type', 'URL', 'Citation', 'Source_of_Action']
bar_closure_county = (
    pd.read_csv(bar_closure_csv, low_memory=False, dtype='str')
    .drop(columns=drop_cols)
    # prefix the generic column names so they stay distinguishable from the restaurant table after merging
    # NOTE(review): 'bar_lim_general_intdoor' looks like a typo for '...indoor'; it is kept
    # unchanged because the column name ends up in the saved county CSV.
    .rename(columns={'State_Tribe_Territory': 'state', 'County_Name': 'county',
                     'Action': 'bar_action', 'order_code': 'bar_order_code',
                     'Percent_Capacity_Outdoor': 'bar_%_capacity_outdoor',
                     'Percent_Capacity_Indoor': 'bar_%_capacity_indoor',
                     'Numeric_Capacity_Outdoor': 'bar_N_capacity_outdoor',
                     'Numeric_Capacity_Indoor': 'bar_N_capacity_indoor',
                     'Limited_Open_Outdoor_Only': 'bar_lim_capacity_outdoor',
                     'Limited_Open_General_Indoor': 'bar_lim_general_intdoor'})
)
# Spell out state abbreviations, then keep only rows whose state is in state_dict.
bar_closure_county['state'] = bar_closure_county['state'].replace(state_dict)
known_state = bar_closure_county['state'].isin(list(state_dict.values()))
bar_closure_county = bar_closure_county.loc[known_state, :].reset_index(drop=True)


# order code:
# (1) - NaN
# (2) - 'Authorized to fully reopen'
# (3) - 'Open with social distancing/reduced seating/enhanced sanitation'
# (4) - 'Open with social distancing/reduced seating/enhanced sanitation'
# (5) - 'Curbside/carryout/delivery only'
# (6) - 'Closed'

############################
# Combine the five policy tables into one wide table keyed by state, county and date.
# Outer joins keep county-days that appear in only some of the tables.
restrictions = stay_at_home_county
for policy_table in (gather_ban_county, mask_mandate_county, rest_closure_county, bar_closure_county):
    restrictions = restrictions.merge(policy_table, on=['state', 'county', 'date'], how='outer')

# Results are assigned back rather than using `frame.col.replace(..., inplace=True)`, which acts
# on a temporary object and does nothing under pandas Copy-on-Write.
restrictions['state'] = restrictions['state'].replace(state_dict)

# The date text is split on '/' into month, day and year (in that order). Month and day are
# zero-padded to two characters so they match the 'MM' / 'DD' text keys of the calendar grids.
date_parts = restrictions['date'].str.split('/', expand=True)
restrictions['month'] = date_parts[0].str.zfill(2)
restrictions['day'] = date_parts[1].str.zfill(2)
restrictions['year'] = date_parts[2]
restrictions = restrictions.drop(columns=['date'])


###########################
# Harmonise state/county spellings with the participant data.
restrictions['county'] = restrictions['county'].str.lower()
restrictions['state'] = restrictions['state'].str.lower()
restrictions['county'] = restrictions['county'].str.replace(' county', '', regex=False)
restrictions['county'] = restrictions['county'].str.replace('doña ana', 'dona ana', regex=False)

restrictions = restrictions.drop_duplicates()

###########################
# rename and recode
restrictions = restrictions.rename(columns={'stayHome_action': 'stayAtHomeOrder',
                                            'gatherBan_action': 'gatherBan',
                                            'rest_action': 'restaurant_restriction',
                                            'bar_action': 'bar_restriction'})

# Text label -> ordinal code per policy column (0 = least restrictive).
# 'Closed' is mapped for restaurants as well as bars, so that label can never survive as a
# string and break (or silently drop) the column in the sum / mean below.
restriction_codes = {
    'stayAtHomeOrder': {'No order for individuals to stay home': 0,
                        'Advisory/Recommendation': 1,
                        'Mandatory only for at-risk individuals in certain areas of the jurisdiction': 2,
                        'Mandatory only for at-risk individuals in the jurisdiction': 3,
                        'Mandatory only for all individuals in certain areas of the jurisdiction': 4,
                        'Mandatory for all individuals': 5},
    'gatherBan': {'No order found': 0,
                  'Ban of gatherings over 101 or more people': 1,
                  'Ban of gatherings over 51-100 people': 2,
                  'Ban of gatherings over 26-50 people': 3,
                  'Ban of gatherings over 11-25 people': 4,
                  'Ban of gatherings over 1-10 people': 5,
                  'Bans gatherings of any size': 6},
    'restaurant_restriction': {'Authorized to fully reopen': 0,
                               'Open with social distancing/reduced seating/enhanced sanitation': 1,
                               'Curbside/carryout/delivery only': 2,
                               'Closed': 3},
    'bar_restriction': {'Authorized to fully reopen': 0,
                        'Open with social distancing/reduced seating/enhanced sanitation': 1,
                        'Curbside/carryout/delivery only': 2,
                        'Closed': 3},
}
for policy_col, codes in restriction_codes.items():
    # Assign back: `frame[col].replace(..., inplace=True)` is a no-op under pandas Copy-on-Write.
    recoded = restrictions[policy_col].replace(codes)
    # Make the column numeric explicitly; any label missing from the mapping becomes NaN.
    restrictions[policy_col] = pd.to_numeric(recoded, errors='coerce')

# restriction summary measure
# NOTE(review): sum(axis=1) skips NaN, so a row with no policy information at all gets 0
# rather than NaN - confirm this is intended.
restrictions['restriction_sum'] = restrictions[list(restriction_codes)].sum(axis=1)

# mean restrictions at state level (averaged over the counties of each state and day)
state_level_cols = ['state',
                    'stayAtHomeOrder', 'gatherBan', 'restaurant_restriction', 'bar_restriction', 'restriction_sum',
                    'month', 'day', 'year']
restrictions_state = restrictions[state_level_cols].groupby(by=['state', 'month', 'day', 'year']).mean().reset_index()



# Anti-Racism Crowd-Events

In [10]:
# Crowd-event records; every column is read as text.
arce = pd.read_csv(os.path.join(data_path, 'anti_racism_crowd_events.txt'), low_memory=False, dtype=str, na_values='NaN')
# to datetime
arce['date'] = pd.to_datetime(arce['date'])

# drop irrelevant time frame: keep events dated after '2020' and before '2022' that are not online
in_time_frame = (arce['date'] > '2020') & (arce['date'] < '2022')
not_online = arce['online'] == '0'
arce = arce.loc[in_time_frame & not_online, :].reset_index(drop=True)

# Keep events whose issues or claims text mentions 'raci' (case-sensitive substring).
# The original test was `str.find('raci') > 0`; find() returns 0 for a match at the very
# start of the text (-1 means "not found"), so events whose text begins with 'raci' were
# dropped by mistake. Missing text counts as no match.
mentions_raci = (arce['issues'].str.contains('raci', regex=False, na=False)
                 | arce['claims'].str.contains('raci', regex=False, na=False))
arce = arce.loc[mentions_raci, :].reset_index(drop=True)

# Each remaining row counts as one event; summed per county / state and day further down.
arce['raci_event_count'] = 1

# manually add counties

# Events whose location detail mentions the Bronx (missing detail counts as no match).
# This runs first, so the locality-based rules below can still overwrite it.
in_bronx = arce['location_detail'].str.contains('Bronx', na=False)
arce.loc[in_bronx, 'resolved_county'] = 'Bronx'

# Virginia localities whose resolved county is simply the locality name.
va_same_name_localities = [
    'Richmond', 'Norfolk', 'Newport News', 'Virginia Beach', 'Suffolk', 'Hampton',
    'Harrisonburg', 'Fredericksburg', 'Lynchburg', 'Manassas', 'Portsmouth', 'Roanoke',
    'Staunton', 'Williamsburg', 'Winchester', 'Franklin', 'Norton', 'Alexandria',
    'Danville', 'Fairfax', 'Galax', 'Lexington', 'Waynesboro', 'Chesapeake',
    'Covington', 'Salem', 'Radford', 'Petersburg', 'Falls Church',
]

# (locality, state abbreviation, county to assign)
manual_county_rules = [(locality, 'VA', locality) for locality in va_same_name_localities] + [
    ('New York', 'NY', 'New York'),
    ('Baltimore', 'MD', 'Baltimore'),
    ('St. Louis', 'MO', 'St. Louis'),
    ('Saint Louis', 'MO', 'St. Louis'),
    ('Wydown', 'MO', 'St. Louis'),
    # Misspelled locality in the data. It used to be resolved to the equally misspelled
    # 'Virgina Beach', which the later 'virginia beach' harmonisation can never match.
    ('Virgina Beach', 'VA', 'Virginia Beach'),
    # NOTE(review): 'Fort Monroe' is resolved to 'Albemarle' and the tab-suffixed variant to an
    # empty string, exactly as in the original - confirm both are intended.
    ('Fort Monroe', 'VA', 'Albemarle'),
    ('Fort Monroe\t', 'VA', ''),
    ('Canton', 'IL', 'Fulton County'),
    ('Carson City', 'NV', 'Carson City'),
]

# Each rule matches on the untouched locality/state columns, so the rules are independent
# of one another and their order does not matter.
for locality, state_abbr, county_name in manual_county_rules:
    rule_rows = (arce['locality'] == locality) & (arce['state'] == state_abbr)
    arce.loc[rule_rows, 'resolved_county'] = county_name





# drop irrelevant columns (raises if one of the explicitly listed columns is missing)
drop_cols = [col for col in arce.columns
             if col.startswith(('source', 'injuries', 'police', 'participant')) or col.endswith('death')]
drop_cols = drop_cols + ['online', 'location_detail', 'locality', 'fips_code', 'lat', 'lon',
                         'arrests_any', 'property_damage', 'resolved_locality',
                         'property_damage_any', 'chemical_agents', 'notes',
                         'valence', 'macroevent', 'actors', 'organizations', 'type', 'title',
                         'size_text', 'size_low', 'size_high', 'size_cat', 'arrests', 'claims', 'state', 'issues']
arce = arce.drop(columns=drop_cols)

# Number of events per day, county and state.
# NOTE(review): groupby drops rows with a missing key by default, so events without a
# resolved county or state disappear here and are also absent from the state totals
# computed from this table - confirm this is intended.
arce = arce.groupby(by=['date', 'resolved_county', 'resolved_state'])['raci_event_count'].sum().reset_index()

# Split the date into text year / month / day columns (merge keys), then drop it.
date_arce_events = arce['date'].astype(str).str.split('-', expand=True)
arce['year'] = date_arce_events[0]
arce['month'] = date_arce_events[1]
arce['day'] = date_arce_events[2]
arce = arce.drop(columns=['date'])
arce = arce.rename(columns={'resolved_county': 'county', 'resolved_state': 'state'})

# Spell out state abbreviations. Assigned back: `arce.state.replace(..., inplace=True)` acts on
# a temporary object and is a no-op under pandas Copy-on-Write, which would leave abbreviations
# in place and make the later merges on 'state' find no matches.
arce['state'] = arce['state'].replace(state_dict)

arce_county = arce.copy()

# State level: events per state and day, with lower-cased state names.
arce_state = arce.groupby(by=['state', 'year', 'month', 'day'])['raci_event_count'].sum().reset_index()
arce_state['state'] = arce_state['state'].str.lower()
arce_state = arce_state.drop_duplicates()

# County level: lower-case the keys, then harmonise county spellings.
arce_county['county'] = arce_county['county'].str.lower()
arce_county['state'] = arce_county['state'].str.lower()

# Substring replacements, applied in this order.
# NOTE(review): 'baltimore' becomes 'baltimore city' here, while the participant table turns
# 'baltimore city' into 'baltimore' - these rows may never match in the county merge; confirm.
for old_text, new_text in ((' county', ''),
                           ('doña ana', 'dona ana'),
                           ('anchorage', 'anchorage municipality'),
                           ('baltimore', 'baltimore city'),
                           ('chesapeake', 'chesapeake city'),
                           ('denali', 'denali borough'),
                           ('doòa ana', 'dona ana'),      # mis-encoded spelling of the same name
                           ('winchester', 'winchester city'),
                           ('virginia beach', 'virginia beach city'),
                           ('roanoke', 'roanoke city')):
    arce_county['county'] = arce_county['county'].str.replace(old_text, new_text)

# Exact-match renames restricted to Virginia.
for va_city in ('suffolk', 'richmond', 'norfolk', 'newport news'):
    va_rows = (arce_county['county'] == va_city) & (arce_county['state'] == 'virginia')
    arce_county.loc[va_rows, 'county'] = va_city + ' city'

arce_county['county'] = arce_county['county'].str.replace('fairbanks north star', 'fairbanks north star borough')
arce_county = arce_county.drop_duplicates()


# Merge all external data on county and state level

In [11]:
# State level: left-join every external source onto the state x date grid.
state_sources = [
    (covid_cumsum_state, ['date', 'state']),
    (covid_avg_state, ['date', 'state']),
    (ue_state, ['state', 'year', 'month']),                  # monthly data, hence no 'day' key
    (arce_state, ['state', 'year', 'month', 'day']),
    (restrictions_state, ['state', 'year', 'month', 'day']),
]
state_data_combined = state_data
for source_table, join_keys in state_sources:
    state_data_combined = state_data_combined.merge(source_table, on=join_keys, how='left')

# County level: the same for the county x date grid.
county_sources = [
    (covid_cumsum_county, ['date', 'state', 'county']),
    (covid_avg_county, ['date', 'state', 'county']),
    (arce_county, ['state', 'county', 'year', 'month', 'day']),
    (restrictions, ['state', 'county', 'year', 'month', 'day']),
    (ue_county, ['state', 'county', 'year', 'month']),       # monthly data, hence no 'day' key
]
county_data_combined = county_data
for source_table, join_keys in county_sources:
    county_data_combined = county_data_combined.merge(source_table, on=join_keys, how='left')



# merge with pid data

In [12]:
# State table: re-attach the combined measures to the state x date grid.
state_join_keys = ['state', 'date', 'year', 'month', 'day']
state_data_combined = pd.merge(state_data, state_data_combined, on=state_join_keys, how='left')

# County table: one row per participant wave day, with that county's measures for that date.
county_join_keys = ['state', 'county', 'date']
county_data_combined = pd.merge(pid_county, county_data_combined, on=county_join_keys, how='left')


# save output

In [13]:
# Write both merged tables to the processed-data folder, without the row index.
for csv_name, merged_table in (('externalMeasures_county.csv', county_data_combined),
                               ('externalMeasures_state.csv', state_data_combined)):
    merged_table.to_csv(os.path.join(out_path, csv_name), index=False)