In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import linregress
import seaborn as sns
import numpy as np
from scipy.stats import sem

In [None]:
# Read the full provisional-deaths table, then drop four columns
# ("Data As Of", "Start Date", "End Date", "Footnote") before any analysis.
health_df = pd.read_csv(
    "Resources/Provisional_COVID-19_Deaths_by_Sex_and_Age.csv"
).drop(columns=["Data As Of", "Start Date", "End Date", "Footnote"])
health_df.head()

In [None]:
# Keep every row whose "Age Group" is not one of the five brackets listed here.
# NOTE(review): presumably these are dropped because they overlap the other
# age brackets that remain in the data -- confirm against the source table.
age_adjusted_df = health_df.loc[
    ~health_df["Age Group"].isin(
        [
            "0-17 years",
            "18-29 years",
            "30-39 years",
            "40-49 years",
            "50-64 years",
        ]
    ),
    :,
]

In [None]:
# Deaths over the whole reporting period, regardless of Year and Month
# (rows whose "Group" is "By Total").
# This data frame can be filtered by State, Sex, and Age Group.
# State: any of the 50 states, Puerto Rico, or the United States as a whole
# Sex: Male, Female, All Sexes
# Age Group:
    # All ages
    # Under 1 year
    # 1-4 years old
    # 5-14 years old
    # 15-24 years old
    # 25-34 years old
    # 35-44 years old
    # 45-54 years old
    # 55-64 years old
    # 65-74 years old
    # 75-84 years old
    # 85 years and over
# The mask is built from age_adjusted_df itself (not health_df) so the filter
# does not depend on pandas aligning a longer boolean Series by index.
total_df = (
    age_adjusted_df
    .loc[age_adjusted_df["Group"] == "By Total", :]
    .drop(columns=["Year", "Month", "Group"])
)
total_df.head()

In [None]:
# Deaths by year, regardless of Month (rows whose "Group" is "By Year").
# Year: 2020, 2021, or 2022
# This data frame can be filtered by State, Sex, and Age Group.
# State: any of the 50 states, Puerto Rico, or the United States as a whole
# Sex: Male, Female, All Sexes
# Age Group:
    # All ages
    # Under 1 year
    # 1-4 years old
    # 5-14 years old
    # 15-24 years old
    # 25-34 years old
    # 35-44 years old
    # 45-54 years old
    # 55-64 years old
    # 65-74 years old
    # 75-84 years old
    # 85 years and over
# The mask is built from age_adjusted_df itself (not health_df) so the filter
# does not depend on pandas aligning a longer boolean Series by index.
year_df = (
    age_adjusted_df
    .loc[age_adjusted_df["Group"] == "By Year", :]
    .drop(columns=["Month", "Group"])
)
# Preview only the first rows, as the other cells do, instead of the full frame.
year_df.head()

In [None]:
# Deaths by month (rows whose "Group" is "By Month").
# Month: numeric (e.g. January = 1.0)
# This data frame can be filtered by Year, State, Sex, and Age Group.
# Year: 2020, 2021, or 2022
# State: any of the 50 states, Puerto Rico, or the United States as a whole
# Sex: Male, Female, All Sexes
# Age Group:
    # All ages
    # Under 1 year
    # 1-4 years old
    # 5-14 years old
    # 15-24 years old
    # 25-34 years old
    # 35-44 years old
    # 45-54 years old
    # 55-64 years old
    # 65-74 years old
    # 75-84 years old
    # 85 years and over
# The mask is built from age_adjusted_df itself (not health_df) so the filter
# does not depend on pandas aligning a longer boolean Series by index.
month_df = (
    age_adjusted_df
    .loc[age_adjusted_df["Group"] == "By Month", :]
    .drop(columns="Group")
)
month_df.head()

In [None]:
# Count the rows recorded for each sex in year_df.
# Comparing the column to a label gives a boolean Series; .sum() counts the
# True entries. (.count() would return the number of non-null rows, i.e. the
# same total for both sexes no matter what label is compared.)
# The labels follow the column description used elsewhere in this notebook:
# "Male", "Female", "All Sexes".
men = year_df["Sex"] == "Male"
print(f"There are {men.sum()} men in the dataframe.")

women = year_df["Sex"] == "Female"
print(f"There are {women.sum()} women in the dataframe.")

In [None]:
# Grouped bar chart of COVID-19 deaths per Year, split by Sex.
# seaborn's default estimator is the mean, so each bar is the mean of
# "COVID-19 Deaths" over the rows of that Year/Sex group; ci=None hides the error bars.
ax = sns.barplot(data = year_df, x="Year", y = "COVID-19 Deaths", hue = "Sex", ci=None, palette="viridis")
# Title matches the tick labels below and the other figures (data starts in 2020).
plt.title("COVID-19 Deaths from 2020-2022")
ax.set_xticklabels(["2020", "2021", "2022"])
plt.savefig("Resources/Images/COVIDdeathsYear&Gender.png")
plt.show()

# This plot shows that more men than women have died from COVID. The comparison
# relies on the previous cell, which reports the number of rows for men and for
# women in the dataframe: the bars are only comparable if those counts are equal.
# The most people died in 2021, but 2021 is also the only year for which we have
# a full year of data.

In [None]:
# Reading of the chart: COVID deaths peaked in Dec 2020 and Jan 2021, then
# declined steadily from Feb to Jun 2021 as vaccines became available.

months = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]

# One bar per (Month, Year) pair, drawn on an explicitly created 8x6 axes.
f, ax = plt.subplots(figsize=(8, 6))
ax = sns.barplot(
    data=month_df,
    x="Month",
    y="COVID-19 Deaths",
    hue="Year",
    ci=None,
    palette="rocket",
    ax=ax,
)
ax.set_title("COVID-19 Deaths from 2020-2022 by Month")
ax.set_xticklabels(months)  # replace the numeric month ticks with abbreviations
ax.legend(loc="upper center")
f.savefig("Resources/Images/COVIDdeathsbyMonth.png")
plt.show()

In [None]:
# Year and COVID-19 death counts from month_df, indexed by Month.
covid_death_by_month = month_df[["Year", "Month", "COVID-19 Deaths"]].set_index("Month")
covid_death_by_month


In [None]:
# Sum the monthly COVID-19 death counts within each Year.
# NOTE(review): month_df appears to still hold aggregate rows ("United States",
# "All Sexes", "All ages") next to their components, so this sum may count the
# same deaths more than once -- confirm before quoting absolute numbers.
covid_death_by_year = month_df.groupby("Year")[["COVID-19 Deaths"]].sum()
covid_death_by_year

In [None]:
# Bar chart of the yearly COVID-19 totals; labels are set on the returned axes.
covid_death_by_year.plot.bar(rot=0, color="salmon").set(
    title="Number of COVID-19 Deaths from 2020-2022",
    ylabel="Number of Deaths (Million)",
)
plt.savefig("Resources/Images/COVIDdeaths.png")
plt.show()

In [None]:
# Sum the monthly influenza death counts within each Year.
# NOTE(review): month_df appears to still hold aggregate rows ("United States",
# "All Sexes", "All ages") next to their components, so this sum may count the
# same deaths more than once -- confirm before quoting absolute numbers.
flu_death_by_year = month_df.groupby("Year")[["Influenza Deaths"]].sum()
flu_death_by_year

In [None]:
# Bar chart of the yearly influenza totals; labels are set on the returned axes.
flu_death_by_year.plot.bar(rot=0, color="salmon").set(
    title="Number of Flu Deaths from 2020-2022",
    ylabel="Number of deaths",
)
plt.savefig("Resources/Images/Fludeaths.png")
plt.show()

In [None]:
# Sum the monthly pneumonia death counts within each Year.
# NOTE(review): month_df appears to still hold aggregate rows ("United States",
# "All Sexes", "All ages") next to their components, so this sum may count the
# same deaths more than once -- confirm before quoting absolute numbers.
pna_death_by_year = month_df.groupby("Year")[["Pneumonia Deaths"]].sum()
pna_death_by_year

In [None]:
# Bar chart of the yearly pneumonia totals; labels are set on the returned axes.
pna_death_by_year.plot.bar(rot=0, color="salmon").set(
    title="Number of Pneumonia Deaths from 2020-2022",
    ylabel="Number of deaths (Million)",
)
plt.savefig("Resources/Images/PNAdeaths.png")
plt.show()

In [None]:
# Horizontal bar chart of summed COVID-19 deaths for every State except the
# national "United States" row.
f, ax = plt.subplots(figsize = (10, 10))
# Build the mask from year_df itself (not health_df) so the filter does not
# depend on pandas aligning a longer boolean Series by index.
states_df = year_df.loc[year_df["State"] != "United States", :]
# NOTE(review): the remaining rows appear to include "All Sexes" / "All ages"
# aggregates next to their components, so these sums may count deaths more
# than once -- confirm before reading the axis as absolute totals.
states_df = states_df.groupby(["State"])["COVID-19 Deaths"].sum()
# Draw on the axes created above explicitly rather than relying on pandas
# picking up the current axes.
states_df.plot(kind="barh", ax=ax)
plt.title("Total COVID-19 Deaths by State")
plt.xlabel("Number of COVID-19 Deaths")
plt.savefig("Resources/Images/COVIDdeathbyState.png")
plt.show()

In [None]:
# Distinct State labels present in year_df.
state = pd.unique(year_df["State"])
# NOTE(review): despite its name, `year` holds the distinct "Age Group" labels.
year = pd.unique(year_df["Age Group"])

In [None]:
# COVID-19 deaths (x) against pneumonia deaths (y), one marker per year_df row;
# marker colour encodes the Year column and is explained by the colorbar.
plt.scatter(
    x=year_df["COVID-19 Deaths"],
    y=year_df["Pneumonia Deaths"],
    c=year_df["Year"],
)
clb = plt.colorbar()
clb.set_label("Year")
plt.gca().set(
    title="COVID vs. Pneumonia Deaths from 2020-2022",
    xlabel="COVID-19 Deaths",
    ylabel="Pneumonia Deaths",
)
plt.savefig("Resources/Images/ScatterPNAvsCOVID.png")
plt.show()


In [None]:
# COVID-19 deaths (x) against influenza deaths (y), one marker per year_df row;
# marker colour encodes the Year column and is explained by the colorbar.
plt.scatter(
    x=year_df["COVID-19 Deaths"],
    y=year_df["Influenza Deaths"],
    c=year_df["Year"],
)
clb = plt.colorbar()
clb.set_label("Year")
plt.gca().set(
    title="COVID vs. Influenza Deaths from 2020-2022",
    xlabel="COVID-19 Deaths",
    ylabel="Influenza Deaths",
)
plt.savefig("Resources/Images/ScatterFluvsCOVID.png")
plt.show()

In [None]:
# COVID-19 deaths (x) against total deaths (y), one marker per year_df row;
# marker colour encodes the Year column and is explained by the colorbar.
plt.scatter(
    x=year_df["COVID-19 Deaths"],
    y=year_df["Total Deaths"],
    c=year_df["Year"],
)
clb = plt.colorbar()
clb.set_label("Year")
plt.gca().set(
    title="COVID vs. Total Deaths from 2020-2022",
    xlabel="COVID-19 Deaths",
    ylabel="Total Deaths (Millions)",
)
plt.savefig("Resources/Images/ScatterTotalvsCOVID.png")
plt.show()

In [None]:
# Heatmap of the per-state COVID-19 death sums.
# sns.heatmap needs 2-D data, so the per-state Series is wrapped in a
# one-column DataFrame (a later imshow cell also uses this 2-D form).
states_df = pd.DataFrame(states_df)
# `linewidths` is the documented seaborn.heatmap parameter for the gap between
# cells; the previous `linewidth` spelling only worked by being forwarded to
# matplotlib as an alias.
ax = sns.heatmap(states_df, linewidths=0.5, square=True)
plt.show()

In [None]:

# # states = states_df["State"]
# covid_deaths_state = states_df["COVID-19 Deaths"]
# covid_deaths_state = pd.DataFrame(covid_deaths_state)
# covid_deaths_state
# states_df.plot(kind="scatter")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for total_df.
total_df.describe()

In [None]:
# Standard error of the mean (SEM) for COVID-19 deaths in the monthly data.
# Rows with any missing value are dropped first.
new_month_df = month_df.dropna()
subset = new_month_df.sample(3000, random_state = 42)

# Report the SEM of the seeded 3000-row sample drawn above. (This previously
# referenced an undefined name, `sample`, and raised NameError on a fresh kernel.)
print(f"The SEM value for the sample COVID-19 Deaths is {sem(subset['COVID-19 Deaths'])}")

# Create a sample set of 35, each with 3000 data points.
# Each draw gets its own seed so the figure is reproducible across runs.
covid_sample_set = [new_month_df.sample(3000, random_state = seed) for seed in range(0, 35)]
means = [draw['COVID-19 Deaths'].mean() for draw in covid_sample_set]
standard_errors = [sem(draw['COVID-19 Deaths']) for draw in covid_sample_set]
x_axis = np.arange(0, len(covid_sample_set), 1) + 1

# One point per sample: the sample mean with its SEM as the error bar.
fig, ax = plt.subplots()
ax.errorbar(x_axis, means, standard_errors, fmt="o")
ax.set_xlim(0, len(covid_sample_set) + 1)

ax.set_xlabel("Sample Number")
ax.set_ylabel("Mean COVID-19 Deaths")
# Save before showing: once plt.show() has displayed the figure, a later
# plt.savefig() can write out an empty canvas.
plt.savefig("Resources/Images/SEMCOVID.png")
plt.show()

In [None]:
# Standard error of the mean (SEM) for pneumonia deaths in the monthly data.
# Rows with any missing value are dropped first.
new_month_df = month_df.dropna()
subset = new_month_df.sample(3000, random_state = 42)

# Report the SEM of the seeded 3000-row sample drawn above. (This previously
# referenced an undefined name, `sample`, and raised NameError on a fresh kernel.)
print(f"The SEM value for the sample Pneumonia Deaths is {sem(subset['Pneumonia Deaths'])}")

# Create a sample set of 35, each with 3000 data points.
# Each draw gets its own seed so the figure is reproducible across runs.
covid_sample_set = [new_month_df.sample(3000, random_state = seed) for seed in range(0, 35)]
means = [draw['Pneumonia Deaths'].mean() for draw in covid_sample_set]
standard_errors = [sem(draw['Pneumonia Deaths']) for draw in covid_sample_set]
x_axis = np.arange(0, len(covid_sample_set), 1) + 1

# One point per sample: the sample mean with its SEM as the error bar.
fig, ax = plt.subplots()
ax.errorbar(x_axis, means, standard_errors, fmt="o")
ax.set_xlim(0, len(covid_sample_set) + 1)

ax.set_xlabel("Sample Number")
ax.set_ylabel("Mean Pneumonia Deaths")
plt.savefig("Resources/Images/SEMPNA.png")
plt.show()

In [None]:
# Standard error of the mean (SEM) for influenza deaths in the monthly data.
# Rows with any missing value are dropped first.
new_month_df = month_df.dropna()
subset = new_month_df.sample(3000, random_state = 42)

# Report the SEM of the seeded 3000-row sample drawn above. (This previously
# referenced an undefined name, `sample`, and raised NameError on a fresh kernel.)
print(f"The SEM value for the sample Influenza Deaths is {sem(subset['Influenza Deaths'])}")

# Create a sample set of 35, each with 3000 data points.
# Each draw gets its own seed so the figure is reproducible across runs.
covid_sample_set = [new_month_df.sample(3000, random_state = seed) for seed in range(0, 35)]
means = [draw['Influenza Deaths'].mean() for draw in covid_sample_set]
standard_errors = [sem(draw['Influenza Deaths']) for draw in covid_sample_set]
x_axis = np.arange(0, len(covid_sample_set), 1) + 1

# One point per sample: the sample mean with its SEM as the error bar.
fig, ax = plt.subplots()
ax.errorbar(x_axis, means, standard_errors, fmt="o")
ax.set_xlim(0, len(covid_sample_set) + 1)

ax.set_xlabel("Sample Number")
# The plotted quantity is influenza deaths; the label previously said "Pneumonia".
ax.set_ylabel("Mean Influenza Deaths")
plt.savefig("Resources/Images/SEMFlu.png")
plt.show()

In [None]:
# Render states_df as an image on a 20x20 figure with the reversed "tab20" colormap.
# NOTE(review): imshow needs 2-D input, so this assumes states_df is still the
# one-column DataFrame built in the heatmap cell -- confirm cell order.
fig, ax = plt.subplots(figsize=(20, 20))
ax.imshow(states_df, interpolation="nearest", cmap="tab20_r")
plt.show()