In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the workbook into a DataFrame; header=1 takes the column names from the
# sheet's second row (row index 1), and skiprows=0 skips nothing before it.
dataset_path = "/content/Health_Science_Dataset.xlsx"
df = pd.read_excel(dataset_path, header=1, skiprows=0)
df.head()

Unnamed: 0,Data As Of,Start Week,End Week,MMWRyear,MMWRweek,Week Ending Date,Group,Indicator,Jurisdiction,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Influenza Deaths,Pneumonia or Influenza,"Pneumonia, Influenza, or COVID-19 Deaths"
0,11/02/2023,12/29/2019,01/04/2020,2020,1,01/04/2020,By Week,Week-ending,United States,All Ages,0.0,60028.0,4102.0,432.0,4534.0,4534.0
1,11/02/2023,12/29/2019,01/04/2020,2020,1,01/04/2020,By Week,Week-ending,United States,0-17 years,0.0,667.0,19.0,22.0,41.0,41.0
2,11/02/2023,12/29/2019,01/04/2020,2020,1,01/04/2020,By Week,Week-ending,United States,18-64 years,0.0,14706.0,767.0,183.0,950.0,950.0
3,11/02/2023,12/29/2019,01/04/2020,2020,1,01/04/2020,By Week,Week-ending,United States,65 years and over,0.0,44655.0,3316.0,227.0,3543.0,3543.0
4,11/02/2023,12/29/2019,01/04/2020,2020,1,01/04/2020,By Week,Week-ending,Alabama,All Ages,0.0,1098.0,67.0,,72.0,72.0


In [None]:
# 1. Remove the metadata columns and the two pre-aggregated death columns.
columns_to_drop = [
    'Data As Of',
    'Indicator',
    'Group',
    'Pneumonia or Influenza',
    'Pneumonia, Influenza, or COVID-19 Deaths',
]
df = df.drop(columns_to_drop, axis=1)

In [None]:
# 2. Rename 'Jurisdiction' to 'State' and replace the full jurisdiction names
#    with short codes (the format used for the choropleth map).
state_mapping = {
    # The 50 states, in alphabetical order.
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
    # Jurisdictions that are not states.
    'District of Columbia': 'DC',
    'United States': 'US',
    # 'HHS Region 1' .. 'HHS Region 10' -> 'HHS1' .. 'HHS10'
    **{f"HHS Region {region}": f"HHS{region}" for region in range(1, 11)},
}

# Series.map yields NaN for every jurisdiction that has no key in state_mapping.
df = (
    df.rename(columns={'Jurisdiction': 'State'})
      .assign(State=lambda frame: frame['State'].map(state_mapping))
)

In [None]:
# 3. Parse 'End Week' into a datetime 'Timestamp' column, then discard the
#    three raw week columns it replaces.
df = (
    df.assign(Timestamp=pd.to_datetime(df['End Week']))
      .drop(columns=['Start Week', 'End Week', 'Week Ending Date'])
)

In [None]:
# Imputation helper, applied per (State, Age Group) group in the next step.
def fill_missing(group):
    """Return ``group`` with its missing entries replaced by the group's mean.

    The mean is computed from ``group`` itself. If every entry is missing the
    mean is NaN as well, so the result is still entirely NaN. The input object
    is not modified; a new object is returned.
    """
    group_mean = group.mean()
    return group.fillna(group_mean)

In [None]:
# 4. Fill empty entries
# Treat empty strings as missing so they are imputed together with real NaNs.
df = df.replace('', np.nan)

numeric_columns = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths', 'Influenza Deaths']

# Impute each missing count with the mean of its (State, Age Group) group.
df[numeric_columns] = df.groupby(['State', 'Age Group'])[numeric_columns].transform(fill_missing)

# Recalculate 'Total Deaths' ONLY where it is still NaN after imputation.
# Overwriting the whole column would replace the reported totals with the
# COVID + pneumonia + influenza sum and make the column a duplicate of
# 'Pneumonia or Influenza or Covid'.
cause_columns = ['COVID-19 Deaths', 'Pneumonia Deaths', 'Influenza Deaths']
df['Total Deaths'] = df['Total Deaths'].fillna(df[cause_columns].sum(axis=1))

# Derived aggregates; sum(axis=1) skips NaN, so these are never NaN themselves.
df['Pneumonia or Influenza'] = df[['Pneumonia Deaths', 'Influenza Deaths']].sum(axis=1)
df['Pneumonia or Influenza or Covid'] = df[['Pneumonia Deaths', 'Influenza Deaths','COVID-19 Deaths']].sum(axis=1)

# Drop every row that still has a missing value in any column.
df = df.dropna()

# Convert all count columns (including both derived aggregates) to integer type.
# Fractional imputed means are truncated toward zero by the cast.
integer_columns = [*numeric_columns, 'Pneumonia or Influenza', 'Pneumonia or Influenza or Covid']
df = df.astype({column: int for column in integer_columns})

In [None]:
# Desired column order: identifiers first, then the individual death counts,
# the combined counts, and the overall total last.
new_column_order = [
    "MMWRyear",
    "Timestamp",
    "MMWRweek",
    "State",
    "Age Group",
    "COVID-19 Deaths",
    "Pneumonia Deaths",
    "Influenza Deaths",
    "Pneumonia or Influenza",
    "Pneumonia or Influenza or Covid",
    "Total Deaths",
]
df = df.loc[:, new_column_order]
df.head()

Unnamed: 0,MMWRyear,Timestamp,MMWRweek,State,Age Group,COVID-19 Deaths,Pneumonia Deaths,Influenza Deaths,Pneumonia or Influenza,Pneumonia or Influenza or Covid,Total Deaths
0,2020,2020-01-04,1,US,All Ages,0,4102,432,4534,4534,4534
1,2020,2020-01-04,1,US,0-17 years,0,19,22,41,41,41
2,2020,2020-01-04,1,US,18-64 years,0,767,183,950,950,950
3,2020,2020-01-04,1,US,65 years and over,0,3316,227,3543,3543,3543
4,2020,2020-01-04,1,AL,All Ages,0,67,1,68,68,68


In [None]:
# Summary statistics for the death-count columns; the cast to int truncates
# the statistics to whole numbers for display.
summary_columns = [
    "COVID-19 Deaths",
    "Pneumonia Deaths",
    "Influenza Deaths",
    "Pneumonia or Influenza",
    "Pneumonia or Influenza or Covid",
    "Total Deaths",
]
df.loc[:, summary_columns].describe().astype(int)

Unnamed: 0,COVID-19 Deaths,Pneumonia Deaths,Influenza Deaths,Pneumonia or Influenza,Pneumonia or Influenza or Covid,Total Deaths
count,49600,49600,49600,49600,49600,49600
mean,143,142,3,145,288,288
std,737,589,20,597,1317,1317
min,0,0,0,0,0,0
25%,0,0,0,0,0,0
50%,22,24,0,24,51,51
75%,69,89,0,91,161,161
max,25974,16884,1048,16920,42806,42806


In [None]:
# Sort the dataframe
df = df.sort_values(['Timestamp','State','Age Group'])

# Reset index
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,MMWRyear,Timestamp,MMWRweek,State,Age Group,COVID-19 Deaths,Pneumonia Deaths,Influenza Deaths,Pneumonia or Influenza,Pneumonia or Influenza or Covid,Total Deaths
0,2020,2020-01-04,1,AK,0-17 years,0,0,0,0,0,0
1,2020,2020-01-04,1,AK,18-64 years,0,3,0,3,3,3
2,2020,2020-01-04,1,AK,65 years and over,0,13,0,13,13,13
3,2020,2020-01-04,1,AK,All Ages,0,18,0,18,18,18
4,2020,2020-01-04,1,AL,0-17 years,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
95,2020,2020-01-04,1,ID,All Ages,0,19,0,19,19,19
96,2020,2020-01-04,1,IL,0-17 years,0,0,0,0,0,0
97,2020,2020-01-04,1,IL,18-64 years,0,30,0,30,30,30
98,2020,2020-01-04,1,IL,65 years and over,0,115,2,117,117,117
