# Notebook for generating covid summary data set

Importing required dependencies

In [2]:
import os
import sys
import pandas as pd
import logging
logging.basicConfig(filename='logging.txt', filemode='w', format='%(asctime)s %(levelname)s %(message)s' , datefmt='%m/%d/%Y %H:%M:%S ')


reading the data

In [4]:
counties_df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
population_survey_df = pd.read_csv('https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv', encoding='latin-1', usecols=['POPESTIMATE2019', 'STATE', 'COUNTY'])


In [5]:
counties_df.head(3)

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0


In [6]:
counties_df.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths    float64
dtype: object

In [7]:
population_survey_df.head(3)

Unnamed: 0,STATE,COUNTY,POPESTIMATE2019
0,1,0,4903185
1,1,1,55869
2,1,3,223234


In [8]:
population_survey_df.dtypes

STATE              int64
COUNTY             int64
POPESTIMATE2019    int64
dtype: object

In [9]:
counties_array=counties_df.state.unique()

number of states

In [10]:
len(counties_array)

55

"""Function for data cleaning"""

In [11]:
def data_cleaning(covid19_df, population_df):
    """Function for data cleaning"""
    print('Cleaning the data . . .')
    # Renaming Column names
    covid19_df_rename = {'date': 'Date', 'county': 'County', 'state': 'State',
                         'fips': 'FIPS', 'cases': 'Cases', 'deaths': 'Deaths'}
    covid19_df.columns = [covid19_df_rename.get(x) for x in covid19_df.columns]
    #formating the state and county
    population_df['STATE'] = population_df['STATE'].map('{:0>2}'.format)
    population_df['COUNTY'] = population_df['COUNTY'].map('{:0>3}'.format)
    #creating fips by joining state and county
    population_df['FIPS'] = population_df['STATE'] + population_df['COUNTY']
    #dropping state and county column
    population_df = population_df.drop(['STATE', 'COUNTY'], axis=1)
    #dealing na values in fips column
    covid19_df['FIPS'] = covid19_df['FIPS'].fillna(0).astype(int)
    #formating fips column
    covid19_df['FIPS'] = covid19_df['FIPS'].map('{:0>5}'.format)

    #converting all the date formats to single format
    covid19_df['Date'] = pd.to_datetime(covid19_df['Date'], format='%Y-%m-%d')

    # sorting the df by date
    covid19_df = covid19_df.sort_values(by='Date', ascending=True, ignore_index=True)

    return (covid19_df, population_df)

"""Function for merging dataframes"""

In [12]:
def merge_dfs(covid_df, popltn_df):
    """Function for merging dataframes"""
    print('Merging data frames ... ')
    merged_df = pd.merge(covid_df, popltn_df, on=['FIPS'], how='left')
    #dealing na values
    merged_df['POPESTIMATE2019'] = merged_df['POPESTIMATE2019'].fillna(0).astype(int)
    merged_df.columns = ['Date', 'County', 'State', 'FIPS', 'Daily Cases',
                         'Daily Deaths', 'POPESTIMATE2019']
    #sorting the df
    merged_df = merged_df.sort_values(by=['Date', 'State', 'County'], ascending=[True, True, True])
    return merged_df

#calculating cumulative cases and deaths

In [13]:
def total_counts(merged_df):
    """Function for generating summary dataset"""
    #calculating cumulative cases and deaths
    print('Generating counts ...')
    merged_df['Cumulative Cases'] = merged_df.groupby(by=['FIPS'])['Daily Cases'].transform(lambda x: x.cumsum())
    merged_df['Cumulative Deaths'] = merged_df.groupby(by=['FIPS'])['Daily Deaths'].transform(lambda x: x.cumsum())
    merged_df = merged_df[['FIPS', 'Date', 'County', 'State', 'POPESTIMATE2019',
                           'Daily Cases', 'Daily Deaths', 'Cumulative Cases', 'Cumulative Deaths']]
    merged_df.to_csv('covid_summary.csv', index=False)
    print('Summary Data set generated')


  """Generating summary dataset"""

In [14]:
def main():
    """Generating summary dataset"""
    try:
        (covid_d, pop_d) = data_cleaning(counties_df, population_survey_df)
        merged_df = merge_dfs(covid_d, pop_d)
        #changing the directory according to output path
        os.chdir(sys.argv[1])
        total_counts(merged_df)
    except IndexError:
        print("Summary data set not generated, Please provide correct output")
        logging.warning('Summary data set not generated, Please provide correct output')
    except FileNotFoundError:
        print('Summary data set not generated, No such file or directory')
        logging.warning('Summary data set not generated, No such file or directory')

if __name__ == "__main__":
    main()
    

Cleaning the data . . .
Merging data frames ... 
Summary data set not generated, No such file or directory


In [15]:
print(pd.__version__)

1.0.5


In [18]:
print(logging.__version__)

0.5.1.2


percentage of Cook County Illinois has died of COVID 19

In [46]:
#reading the output generated
outputdf = pd.read_csv('covid_summary.csv')

In [47]:
#filtering cumulative dealths of counties and state as of today
cumulative_deaths = outputdf.loc[outputdf['Date'] == '2021-05-09']

In [48]:
#total deaths as of today for all counties in all states in usa
Total = cumulative_deaths['Cumulative Deaths'].sum()

In [49]:
Total

406677789.0

In [54]:
#total deaths of cook county from illinois state
cook_illinois_deaths=cumulative_deaths.loc[(cumulative_deaths['County']=='Cook') & (cumulative_deaths['State'] == 'Illinois')]

In [51]:
cook_illinois_deaths['Cumulative Deaths']

1300885    2535259.0
Name: Cumulative Deaths, dtype: float64

In [53]:
((cook_illinois_deaths['Cumulative Deaths'])/Total)*100

1300885    0.623407
Name: Cumulative Deaths, dtype: float64