## Objective

The aim of this notebook is to transform the COVID-19 confirmed-cases data into a time-series dataframe for subsequent processing tasks before modelling.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the US confirmed-cases time series (one row per location, one column
# per date, plus identifier/geography columns).
confirmed_us_path = 'data/time_series_covid19_confirmed_US.csv'
df_confirmed_us = pd.read_csv(confirmed_us_path)

In [3]:
# Preview the first five rows of the raw frame.
df_confirmed_us.head(n=5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,19732,19759,19759,19759,19759,19759,19759,19759,19790,19790
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,69641,69767,69767,69767,69767,69767,69767,69767,69860,69860
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,7451,7474,7474,7474,7474,7474,7474,7474,7485,7485
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,8067,8087,8087,8087,8087,8087,8087,8087,8091,8091
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,18616,18673,18673,18673,18673,18673,18673,18673,18704,18704


In [4]:
# Summarise the raw frame: row count, column count, dtypes and memory usage.
df_confirmed_us.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3342 entries, 0 to 3341
Columns: 1154 entries, UID to 3/9/23
dtypes: float64(3), int64(1145), object(6)
memory usage: 29.4+ MB


In [5]:
# List the column labels: identifier/geography fields followed by one column per date.
df_confirmed_us.columns

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_',
       ...
       '2/28/23', '3/1/23', '3/2/23', '3/3/23', '3/4/23', '3/5/23', '3/6/23',
       '3/7/23', '3/8/23', '3/9/23'],
      dtype='object', length=1154)

In [6]:
# Drop the identifier and geography columns so that only the per-date
# case-count columns remain.
# The result is reassigned (no inplace=True) and errors="ignore" skips labels
# that are already gone, so re-running this cell does not raise a KeyError.
metadata_columns = [
    "UID",
    "iso2",
    "iso3",
    "code3",
    "FIPS",
    "Admin2",
    "Province_State",
    "Combined_Key",
    "Lat",
    "Long_",
    "Country_Region",
]
df_confirmed_us = df_confirmed_us.drop(columns=metadata_columns, errors="ignore")

In [7]:
# Preview the first five rows now that only the date columns are left.
df_confirmed_us.head(n=5)

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,0,0,0,0,0,0,0,0,0,0,...,19732,19759,19759,19759,19759,19759,19759,19759,19790,19790
1,0,0,0,0,0,0,0,0,0,0,...,69641,69767,69767,69767,69767,69767,69767,69767,69860,69860
2,0,0,0,0,0,0,0,0,0,0,...,7451,7474,7474,7474,7474,7474,7474,7474,7485,7485
3,0,0,0,0,0,0,0,0,0,0,...,8067,8087,8087,8087,8087,8087,8087,8087,8091,8091
4,0,0,0,0,0,0,0,0,0,0,...,18616,18673,18673,18673,18673,18673,18673,18673,18704,18704


In [8]:
# Check the remaining column labels after the drop (expected: date columns only).
df_confirmed_us.columns

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '2/28/23', '3/1/23', '3/2/23', '3/3/23', '3/4/23', '3/5/23', '3/6/23',
       '3/7/23', '3/8/23', '3/9/23'],
      dtype='object', length=1143)

In [9]:
# Sum every date column across all rows to get the total cumulative case
# count per date. sum(axis=0) returns a Series indexed by the date labels,
# so it is named with .rename("cases"); assigning to `.columns` on a Series
# only sets a stray attribute and does not name anything.
df_1 = df_confirmed_us.sum(axis=0).rename("cases")
print(df_1)

1/22/20            1
1/23/20            1
1/24/20            2
1/25/20            2
1/26/20            5
             ...    
3/5/23     103646975
3/6/23     103655539
3/7/23     103690910
3/8/23     103755771
3/9/23     103802702
Length: 1143, dtype: int64


In [10]:
# Reshape the per-date totals into a single float column named "cases",
# indexed by a DatetimeIndex named "date".
# df_1 is left untouched (no in-place rename, no rebinding to a DataFrame),
# so this cell gives the same result when it is run again.
# NOTE: df_confirmed_us is rebound to the reshaped frame here; re-run from the
# load cell before re-running the aggregation that sums df_confirmed_us.
df_confirmed_us = df_1.rename("cases").astype(float).to_frame()
df_confirmed_us.index = pd.to_datetime(df_confirmed_us.index, format='%m/%d/%y')
df_confirmed_us.index.name = 'date'

In [11]:
# Preview the first five rows of the date-indexed "cases" frame.
df_confirmed_us.head(n=5)

Unnamed: 0_level_0,cases
date,Unnamed: 1_level_1
2020-01-22,1.0
2020-01-23,1.0
2020-01-24,2.0
2020-01-25,2.0
2020-01-26,5.0


In [12]:
# Write the processed time series to disk; the index is written as the first CSV column.
processed_path = 'data/processed_confirmed_cases_us.csv'
df_confirmed_us.to_csv(processed_path)