In [1]:
import pandas as pd
import csv

In [2]:
# Full state names, used to filter the confirmed/deaths tables by state.
states = [
    'Alabama',
    'Georgia',
    'Louisiana',
    'Mississippi',
    'South Carolina',
]
# Two-letter abbreviations, used to filter the testing table by state.
states_2 = ['AL', 'GA', 'LA', 'MS', 'SC']
# County labels of the form "Out of <abbr>" that are excluded from the county-level data.
out_of = ['Out of ' + abbr for abbr in states_2]

In [3]:
# Remote CSV sources. The long literals are split with implicit string
# concatenation purely for readability; the resulting URLs are unchanged.
url_confirmed = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_confirmed_US.csv'
)
url_deaths = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_deaths_US.csv'
)
url_tests = (
    'https://covidtracking.com/'
    'api/v1/states/daily.csv'
)

In [4]:
# Download the three source tables, in the same order as the URL definitions.
confirmed, deaths, tests = (
    pd.read_csv(source_url)
    for source_url in (url_confirmed, url_deaths, url_tests)
)

In [5]:
# Keep a local copy of each download before any transformation is applied.
confirmed.to_csv(path_or_buf='time_series_covid19_confirmed_US.csv', index=False)
deaths.to_csv(path_or_buf='time_series_covid19_deaths_US.csv', index=False)
tests.to_csv(path_or_buf='CovidTracking_Daily.csv', index=False)

In [6]:
# Remove identifier / geography columns from the confirmed-cases table.
confirmed = confirmed.drop(
    columns=[
        'UID', 'iso2', 'iso3', 'code3', 'FIPS',
        'Country_Region', 'Lat', 'Long_', 'Combined_Key',
    ]
)
# Same list for the deaths table, plus its extra 'Population' column.
deaths = deaths.drop(
    columns=[
        'UID', 'iso2', 'iso3', 'code3', 'FIPS',
        'Country_Region', 'Lat', 'Long_', 'Combined_Key',
        'Population',
    ]
)
# Remove the listed metric/bookkeeping columns from the testing table.
tests = tests.drop(
    columns=[
        'positive', 'negative', 'pending',
        'hospitalizedCurrently', 'hospitalizedCumulative',
        'inIcuCurrently', 'inIcuCumulative',
        'onVentilatorCurrently', 'onVentilatorCumulative',
        'recovered', 'hash', 'dateChecked', 'death', 'hospitalized',
        'totalTestResults', 'posNeg', 'fips',
        'deathIncrease', 'hospitalizedIncrease', 'negativeIncrease',
        'positiveIncrease', 'totalTestResultsIncrease',
    ]
)

In [7]:
# Give the three tables a shared vocabulary: County / State / Date / Tested.
confirmed = confirmed.rename(columns={'Admin2': 'County', 'Province_State': 'State'})
deaths = deaths.rename(columns={'Admin2': 'County', 'Province_State': 'State'})
tests = tests.rename(columns={'date': 'Date', 'state': 'State', 'total': 'Tested'})

In [8]:
# Keep only the rows for the states of interest. reset_index() renumbers the
# rows and stores the old row labels in an 'index' column (dropped in the next cell).
confirmed = confirmed[confirmed['State'].isin(states)].reset_index()
deaths = deaths[deaths['State'].isin(states)].reset_index()
# The testing table identifies states by two-letter abbreviation.
tests = tests[tests['State'].isin(states_2)].reset_index()

In [9]:
# Discard the 'index' column left behind by reset_index().
confirmed = confirmed.drop(columns=['index'])
deaths = deaths.drop(columns=['index'])
tests = tests.drop(columns=['index'])

In [10]:
# Recode states as two-digit string codes (presumably the state FIPS codes;
# kept as strings so the leading zero of '01' survives).
state_code_by_name = {
    'Alabama': '01',
    'Georgia': '13',
    'Louisiana': '22',
    'Mississippi': '28',
    'South Carolina': '45',
}
state_code_by_abbr = {'AL': '01', 'GA': '13', 'LA': '22', 'MS': '28', 'SC': '45'}

# Assign the result back instead of calling `.replace(..., inplace=True)` on a
# column selection: that form is chained assignment, which warns on pandas 2.x
# and silently leaves the frame unchanged under Copy-on-Write.
confirmed['State'] = confirmed['State'].replace(state_code_by_name)
deaths['State'] = deaths['State'].replace(state_code_by_name)
tests['State'] = tests['State'].replace(state_code_by_abbr)

In [11]:
# Every column label after the first two columns of `confirmed`; these are
# the columns that get melted into a 'Date' column below.
col_list = list(confirmed.columns[2:])

In [12]:
# Seed the long-format tables with the first value column only; the remaining
# columns are added in the next cell.
confirmed_clean = confirmed.melt(
    id_vars=['County', 'State'],
    value_vars=[col_list[0]],
    var_name='Date',
    value_name='Cases',
)
deaths_clean = deaths.melt(
    id_vars=['County', 'State'],
    value_vars=[col_list[0]],
    var_name='Date',
    value_name='Deaths',
)
# Testing totals are reshaped so the literal column name 'Tested' lands in
# 'County' and the test count lands in 'Cases'.
tests_clean = tests.melt(
    id_vars=['Date', 'State'],
    value_vars=['Tested'],
    var_name='County',
    value_name='Cases',
)

In [13]:
# Melt every remaining value column and stack the pieces under the seed frames.
# The pieces are collected in lists and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic.
# NOTE(review): `deaths` is melted with the column labels taken from
# `confirmed`, so both tables are assumed to share the same value columns.
confirmed_parts = [confirmed_clean]
deaths_parts = [deaths_clean]
for item in col_list[1:]:
    confirmed_clean_temp = confirmed.melt(id_vars=['County', 'State'], value_vars=[item], var_name='Date', value_name='Cases')
    confirmed_parts.append(confirmed_clean_temp)
    deaths_clean_temp = deaths.melt(id_vars=['County', 'State'], value_vars=[item], var_name='Date', value_name='Deaths')
    deaths_parts.append(deaths_clean_temp)

# ignore_index=True gives a fresh 0..n-1 index, as the repeated append did.
confirmed_clean = pd.concat(confirmed_parts, ignore_index=True)
deaths_clean = pd.concat(deaths_parts, ignore_index=True)

In [14]:
# Drop the "Out of <state>" pseudo-counties and relabel 'Unassigned' as 'Unknown'.
# .copy() makes each filtered frame independent of the frame it was sliced
# from, and the replace result is assigned back rather than applied with
# inplace=True on a column selection (chained assignment: warns on pandas 2.x
# and has no effect under Copy-on-Write).
confirmed_clean = confirmed_clean.loc[~confirmed_clean['County'].isin(out_of)].copy()
confirmed_clean['County'] = confirmed_clean['County'].replace({'Unassigned': 'Unknown'})
deaths_clean = deaths_clean.loc[~deaths_clean['County'].isin(out_of)].copy()
deaths_clean['County'] = deaths_clean['County'].replace({'Unassigned': 'Unknown'})

In [15]:
# Re-render every Date value as a zero-padded mm/dd/yy string so the three
# tables use one format.
confirmed_clean['Date'] = (
    pd.to_datetime(confirmed_clean['Date'])
    .dt.strftime("%m/%d/%y")
)
deaths_clean['Date'] = (
    pd.to_datetime(deaths_clean['Date'])
    .dt.strftime("%m/%d/%y")
)
# The testing table's dates are parsed with an explicit YYYYMMDD format.
tests_clean['Date'] = (
    pd.to_datetime(tests_clean['Date'], format='%Y%m%d')
    .dt.strftime("%m/%d/%y")
)

In [16]:
# Store the test counts as 64-bit integers.
tests_clean['Cases'] = tests_clean['Cases'].astype('int64')

In [17]:
# Total deaths per (State, Date). 'Deaths' is selected before summing so the
# string 'County' column is never aggregated; summing it is version-dependent
# in pandas (dropped with a warning on 1.x, string-concatenated or a TypeError
# on 2.x) and its result was not used by the melt below anyway.
deaths_rows = (
    deaths_clean
    .groupby(['State', 'Date'])['Deaths']
    .sum()
    .reset_index()
)
# Reshape so the literal column name 'Deaths' lands in 'County' and the
# total lands in 'Cases', matching the layout of tests_clean.
deaths_rows = deaths_rows.melt(id_vars=['State', 'Date'], value_vars=['Deaths'], var_name='County', value_name='Cases')

In [18]:
# Join county-level cases and deaths on their shared keys (default inner join).
covid_combined = pd.merge(confirmed_clean, deaths_clean, on=['County', 'State', 'Date'])
# Stack the county rows with the state-level 'Tested' and 'Deaths' rows.
# reset_index() keeps the old row labels in an 'index' column, which the next
# cell drops.
dss = pd.concat([covid_combined, tests_clean, deaths_rows], sort=True).reset_index()
# Assign the result back instead of using inplace=True on a column selection
# (chained assignment: warns on pandas 2.x, no effect under Copy-on-Write).
dss['County'] = dss['County'].replace({'LaSalle': 'La Salle'})

In [19]:
# Discard the 'index' column left behind by reset_index().
dss = dss.drop(['index'], axis=1)
# Rows that came from tests_clean / deaths_rows have no 'Deaths' value, so
# fill them with 0 before casting. The fill is chained and assigned back:
# `dss.Deaths.fillna(0, inplace=True)` is chained assignment, which leaves the
# NaNs in place under Copy-on-Write and makes the int64 cast raise.
dss['Deaths'] = dss['Deaths'].fillna(0).astype('int64')
# Fix the output column order.
dss_clean = dss[['State', 'County', 'Date', 'Cases', 'Deaths']]

In [20]:
# One frame per state code. reset_index() renumbers the rows and keeps the
# old row labels in an 'index' column (dropped in the next cell).
al_clean = dss_clean[dss_clean['State'].eq('01')].reset_index()
ga_clean = dss_clean[dss_clean['State'].eq('13')].reset_index()
la_clean = dss_clean[dss_clean['State'].eq('22')].reset_index()
ms_clean = dss_clean[dss_clean['State'].eq('28')].reset_index()
sc_clean = dss_clean[dss_clean['State'].eq('45')].reset_index()

In [21]:
# Discard the 'index' column left behind by reset_index() in each state frame.
al_clean = al_clean.drop(columns=['index'])
ga_clean = ga_clean.drop(columns=['index'])
la_clean = la_clean.drop(columns=['index'])
ms_clean = ms_clean.drop(columns=['index'])
sc_clean = sc_clean.drop(columns=['index'])

In [22]:
# Write one CSV per state, without the row index.
al_clean.to_csv(path_or_buf='al-clean.csv', index=False)
ga_clean.to_csv(path_or_buf='ga-clean.csv', index=False)
la_clean.to_csv(path_or_buf='la-clean.csv', index=False)
ms_clean.to_csv(path_or_buf='ms-clean.csv', index=False)
sc_clean.to_csv(path_or_buf='sc-clean.csv', index=False)
# Write the combined table covering all five states.
dss_clean.to_csv(path_or_buf='dss-clean.csv', index=False)