In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import datetime

# Use a 20 x 10 default figure size for every plot in this notebook.
plt.rcParams.update({"figure.figsize": (20, 10)})

## Policy data by State

#### Data 1: Load and preprocess the policy data 

In [5]:
# Read the raw policy records into a DataFrame.
raw_policy_csv = './../../data/raw/state_policies.csv'
state_df = pd.read_csv(raw_policy_csv)

In [6]:
# Preview the first five policy records.
state_df.head(n=5)

Unnamed: 0,state_id,county,fips_code,policy_level,date,policy_type,start_stop,comments,source,total_phases
0,HI,,,state,2020-05-18,Manufacturing,start,"Policy_Details: Open with adjusted ""Safe Pract...",sip_submission_form: https://governor.hawaii.g...,
1,TX,Kinney,48271.0,county,2020-07-03,Mask Requirement,start,Policy_Details: County is approved to be exemp...,sip_submission_form: https://tdem.texas.gov/ga...,
2,ID,Custer,16037.0,county,2020-10-27,Phase 3,start,Policy_Details: No greater than 50 people at i...,sip_submission_form: https://coronavirus.idaho...,4.0
3,UT,Wayne,49055.0,county,2020-11-24,Phase 1,start,Policy_Details: Restrictions for highest level...,sip_submission_form: https://coronavirus.utah....,3.0
4,TX,Borden,48033.0,county,2020-07-03,Mask Requirement,start,Policy_Details: County is approved to be exemp...,sip_submission_form: https://tdem.texas.gov/ga...,


In [7]:
# We are primarily concerned with state-level policies, so keep only those rows.
is_state_level = state_df['policy_level'] == 'state'
state_df = state_df[is_state_level]

In [8]:
# Remove the county, FIPS code, source and policy-level columns.
policy_cols_to_drop = ['county', 'fips_code', 'source', 'policy_level']
state_df = state_df.drop(columns=policy_cols_to_drop)

In [9]:
# Convert the YYYY-MM-DD date strings into datetime.date objects.
# pd.to_datetime parses the whole column in one call instead of running
# strptime once per row, and a missing date becomes NaT rather than raising
# TypeError.
state_df['date'] = pd.to_datetime(state_df['date'], format='%Y-%m-%d').dt.date

In [10]:
# Expose the state identifier under the column name 'state' as well.
state_df = state_df.assign(state=state_df['state_id'])

In [62]:
# Save the cleaned policy table (row index included) under a relative path,
# i.e. in the current working directory.
cleaned_policy_csv = 'cleaned_state_policies.csv'
state_df.to_csv(cleaned_policy_csv)

## Feature data by state

#### Data 2: Load and preprocess state feature data

In [12]:
# Read the raw per-state history records into a DataFrame.
raw_history_csv = './../../data/raw/state_history.csv'
state_hist = pd.read_csv(raw_history_csv)

In [13]:
# Convert the YYYY-MM-DD date strings into datetime.date objects.
# pd.to_datetime parses the whole column in one call instead of running
# strptime once per row, and a missing date becomes NaT rather than raising
# TypeError.
state_hist['date'] = pd.to_datetime(state_hist['date'], format='%Y-%m-%d').dt.date

In [14]:
# For the features, we keep only the records that have a data quality grade of A+, A, or B; this means that the data is 
# validated and that it is reported daily or on alternate days.

Information on grades: https://docs.google.com/spreadsheets/u/1/d/e/2PACX-1vRL2zG1o-qj9l2sl19d1lj1oHd6WbkJ0ukFwN04a_ms_ANUdgxTMpI7AF-gbQzwOSreJUDx6PEK7Vnq/pubhtml

In [15]:
# Count the rows that have no data quality grade.
state_hist['dataQualityGrade'].isna().sum()

1306

In [16]:
# Keep only the rows that carry a data quality grade.
has_grade = state_hist['dataQualityGrade'].notna()
state_hist = state_hist[has_grade]

In [17]:
# List the distinct grade values that remain.
state_hist.loc[:, 'dataQualityGrade'].unique()

array(['A', 'A+', 'B', 'D', 'C', '#REF!', 'F'], dtype=object)

In [18]:
# Show how many rows carry each grade.
grade_column = state_hist['dataQualityGrade']
grade_column.value_counts()

A+       5675
A        4795
B        4304
C        1096
D         700
F          32
#REF!       6
Name: dataQualityGrade, dtype: int64

In [19]:
# Keep only the rows graded A+, A or B, the grades selected for the features.
# Listing the accepted grades, instead of excluding 'C', 'D', 'F' and '#REF!'
# one by one, gives the same result for those values and additionally rejects
# any other unexpected grade string.
ACCEPTED_GRADES = ['A+', 'A', 'B']
state_hist = state_hist[state_hist['dataQualityGrade'].isin(ACCEPTED_GRADES)]

In [20]:
# Remove the grade column (no longer needed once the rows are filtered) together
# with the death, hospitalisation, negative/positive test and recovery columns
# listed below.
history_cols_to_drop = [
    'dataQualityGrade',
    'death',
    'deathConfirmed',
    'deathProbable',
    'hospitalized',
    'hospitalizedCumulative',
    'hospitalizedCurrently',
    'inIcuCumulative',
    'negative',
    'negativeTestsAntibody',
    'negativeTestsPeopleAntibody',
    'negativeTestsViral',
    'onVentilatorCumulative',
    'positive',
    'positiveCasesViral',
    'positiveScore',
    'positiveTestsAntibody',
    'positiveTestsAntigen',
    'positiveTestsPeopleAntibody',
    'positiveTestsPeopleAntigen',
    'positiveTestsViral',
    'recovered',
]
state_hist = state_hist.drop(columns=history_cols_to_drop)

In [21]:
# Remove every 'totalTest...' column listed below.
total_test_cols_to_drop = [
    'totalTestEncountersViral',
    'totalTestEncountersViralIncrease',
    'totalTestResults',
    'totalTestResultsIncrease',
    'totalTestsAntibody',
    'totalTestsAntigen',
    'totalTestsPeopleAntibody',
    'totalTestsPeopleAntigen',
    'totalTestsPeopleViral',
    'totalTestsPeopleViralIncrease',
    'totalTestsViral',
    'totalTestsViralIncrease',
]
state_hist = state_hist.drop(columns=total_test_cols_to_drop)

In [22]:
# Replace every remaining missing value with 0.
state_hist = state_hist.fillna(value=0)

In [23]:
# Save the cleaned feature table (row index included) under a relative path,
# i.e. in the current working directory.
cleaned_history_csv = 'cleaned_state_data.csv'
state_hist.to_csv(cleaned_history_csv)

### Get relevant features and policies

In [26]:
# load state data 
state_data = pd.read_csv('./../../data/cleaned_state_data.csv')
# convert dates to datetime objects
# The cleaning cells earlier in this notebook store datetime.date values, which
# to_csv writes as YYYY-MM-DD, so that layout is tried first. DD-MM-YYYY (the only
# layout this cell accepted before) is kept as a fallback for files saved that way.
state_dates_raw = state_data['date']
state_dates = pd.to_datetime(state_dates_raw, format='%Y-%m-%d', errors='coerce')
state_dates = state_dates.fillna(
    pd.to_datetime(state_dates_raw, format='%d-%m-%Y', errors='coerce')
)
# Fail loudly, as before, when a date matches neither layout or is missing.
if state_dates.isna().any():
    raise ValueError("cleaned_state_data.csv: 'date' has values that are neither YYYY-MM-DD nor DD-MM-YYYY")
state_data['date'] = state_dates

# load policy data
state_policy = pd.read_csv('./../../data/cleaned_state_policies.csv')
# convert the dates to datetime objects
# The cleaning cells earlier in this notebook store datetime.date values, which
# to_csv writes as YYYY-MM-DD, so that layout is tried first. DD-MM-YYYY (the only
# layout this cell accepted before) is kept as a fallback for files saved that way.
policy_dates_raw = state_policy['date']
policy_dates = pd.to_datetime(policy_dates_raw, format='%Y-%m-%d', errors='coerce')
policy_dates = policy_dates.fillna(
    pd.to_datetime(policy_dates_raw, format='%d-%m-%Y', errors='coerce')
)
# Fail loudly, as before, when a date matches neither layout or is missing.
if policy_dates.isna().any():
    raise ValueError("cleaned_state_policies.csv: 'date' has values that are neither YYYY-MM-DD nor DD-MM-YYYY")
state_policy['date'] = policy_dates
# Drop the unnamed column that read_csv created from the CSV's header-less first column.
state_policy = state_policy.drop(columns=['Unnamed: 0'])

# define all policy_types to consider
# convert all the policies to lowercase
# The vectorised .str.lower() does not depend on the index labels being
# 0..n-1 (the per-row .at[i, ...] loop did) and leaves a missing policy_type
# as NaN instead of raising AttributeError.
state_policy['policy_type'] = state_policy['policy_type'].str.lower()
selected_policies = ['manufacturing', 'entertainment', 'non-essential businesses', 'outdoor and recreation', 'shelter in place', 'travel', 'phase 1', 'phase 2', 'phase 3', 'phase 4']
# Keep only the rows whose (lower-cased) policy type is one of the selected ones.
state_policy = state_policy[state_policy['policy_type'].isin(selected_policies)]

# Write the feature table and the filtered policy table, both without the row index.
clean_outputs = (
    (state_data, './../../data/clean/features.csv'),
    (state_policy, './../../data/clean/policies.csv'),
)
for frame, out_path in clean_outputs:
    frame.to_csv(out_path, index=False)