# ETL_Project

In [88]:
import pandas as pd
from sqlalchemy import create_engine

### Store SARS csv files into dataframe

In [89]:
sars_file = "Resources/sars_2003_complete_dataset_clean.csv"
df_sars = pd.read_csv(sars_file)
df_sars.head()

Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered
0,2003-03-17,Germany,1,0,0
1,2003-03-17,Canada,8,2,0
2,2003-03-17,Singapore,20,0,0
3,2003-03-17,"Hong Kong SAR, China",95,1,0
4,2003-03-17,Switzerland,2,0,0


### Transform SARS data

In [90]:
df_sars = df_sars[['Country', 'Cumulative number of case(s)', 'Number of deaths']].copy()
df_sars.columns = ['country', 'total_cases_sars', 'total_deaths_sars']
df_sars = df_sars.groupby('country').max().reset_index()
df_sars.head()

Unnamed: 0,country,total_cases_sars,total_deaths_sars
0,Australia,6,0
1,Belgium,1,0
2,Brazil,3,0
3,Bulgaria,1,0
4,Canada,252,38


In [91]:
df_sars.head()

Unnamed: 0,country,total_cases_sars,total_deaths_sars
0,Australia,6,0
1,Belgium,1,0
2,Brazil,3,0
3,Bulgaria,1,0
4,Canada,252,38


### Store nCoV csv files into dataframe

In [92]:
ncov_file = "Resources/2019_nCoV_data.csv"
df_ncov = pd.read_csv(ncov_file)
df_ncov.head()

Unnamed: 0,Sno,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020 12:00:00,Anhui,China,01/22/2020 12:00:00,1.0,0.0,0.0
1,2,01/22/2020 12:00:00,Beijing,China,01/22/2020 12:00:00,14.0,0.0,0.0
2,3,01/22/2020 12:00:00,Chongqing,China,01/22/2020 12:00:00,6.0,0.0,0.0
3,4,01/22/2020 12:00:00,Fujian,China,01/22/2020 12:00:00,1.0,0.0,0.0
4,5,01/22/2020 12:00:00,Gansu,China,01/22/2020 12:00:00,0.0,0.0,0.0


### Transform SARS data

In [93]:
# Rename cities in China.
df_ncov = df_ncov.replace(to_replace='China', value='Mainland China', regex=False)
df_ncov = df_ncov.replace(to_replace='Hong Kong', value='Hong Kong SAR, China', regex=False)
df_ncov = df_ncov.replace(to_replace='Macau', value='Macau SAR, China', regex=False)
df_ncov = df_ncov.replace(to_replace='Taiwan', value='Taiwan, China', regex=False)

In [94]:
df_ncov[['Confirmed', 'Deaths']] = df_ncov[['Confirmed', 'Deaths']].astype(int)
df_ncov = df_ncov.groupby('Country')['Confirmed', 'Deaths'].max().reset_index()
df_ncov.columns = ['country', 'total_cases_ncov', 'total_deaths_ncov']

  


In [95]:
df_ncov.head()

Unnamed: 0,country,total_cases_ncov,total_deaths_ncov
0,Australia,5,0
1,Belgium,1,0
2,Brazil,0,0
3,Cambodia,1,0
4,Canada,5,0


### Connect to local database

In [96]:
rds_connection_string = "postgres:postgres@localhost:5432/virus_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [97]:
engine.table_names()

['sars', 'ncov']

### Load

In [99]:
df_sars.to_sql(name='sars', con=engine, if_exists='append', index=False)
df_ncov.to_sql(name='ncov', con=engine, if_exists='append', index=False)