In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import warnings
# Suppress every warning for the rest of the session to keep cell output tidy.
# NOTE(review): this blanket filter also hides library deprecation and
# chained-assignment warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the previously cleaned heart-disease and stroke mortality CSV.
# NOTE(review): the `_tn` suffix suggests this is a Tennessee subset — confirm.
data_tn = pd.read_csv('../data/cleaned_Heart_Disease_and_Stroke_Mortality_Among_US_Adults_35.csv')

In [None]:
# Peek at the first three rows to check the columns loaded as expected.
data_tn.head(3)

In [None]:
# Summarize column dtypes and non-null counts.
data_tn.info()

In [None]:
# Parse 'Year' as numeric; values that cannot be parsed become NaN.
# Rows with an unparseable year are then dropped (instead of being filled with
# a placeholder 0, which would make the drop a no-op and leave bogus "year 0"
# rows in the frame), and the surviving years are stored as integers.
data_tn = (
    data_tn
    .assign(Year=pd.to_numeric(data_tn['Year'], errors='coerce'))
    .dropna(subset=['Year'])
    .astype({'Year': 'int64'})
)

In [None]:
# Sanity check: count of missing 'Year' values left after the conversion cell (expected to be 0).
data_tn['Year'].isna().sum()

# Filtering the dataset to include only rows from the most recent years (2010-2019)

In [None]:
# Keep only the 2010-2019 window (both ends inclusive). The upper bound is
# enforced explicitly so the result matches the "(2010-2019)" wording used in
# the headings, the row-count message and the plot titles even if the source
# file contains later years. `.copy()` detaches the subset from `data_tn`.
data_tn_recent_years = data_tn[data_tn['Year'].between(2010, 2019)].copy()

In [None]:
# Preview the first rows of the filtered subset.
data_tn_recent_years.head()

In [None]:
# Report how many rows survived the year filter.
print(f"The filtered DataFrame (2010-2019) has {len(data_tn_recent_years)} rows.")

In [None]:
# Re-check dtypes and non-null counts on the filtered subset.
data_tn_recent_years.info()

# Checking for columns that hold a single unique value

In [None]:
# Report every column whose values are all identical (a constant column).
# `.unique()` keeps NaN as a distinct value, so a column that mixes one value
# with missing entries is not reported here.
for column, series in data_tn_recent_years.items():
    unique_values = series.unique()
    if len(unique_values) != 1:
        continue  # more than one distinct value (or none at all): nothing to report
    print(f"The '{column}' column has the same value across all rows.")
    print(f"The single value is: {unique_values[0]}")

# Checking for duplicates

In [None]:
# Flag rows that repeat an earlier row in every column, then report how many there are.
duplicate_mask = data_tn_recent_years.duplicated()
duplicate_rows = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicate_rows}")

In [None]:
# Display the filtered frame for a final visual check before saving.
data_tn_recent_years

# Saving the cleaned TN dataset to a CSV file

In [None]:
# Persist the filtered frame as a new CSV. index=False leaves the row index out
# of the file, so it does not reappear as an extra unnamed column on reload.
data_tn_recent_years.to_csv('../data/tn_cleaned_Heart_Disease_and_Stroke_Mortality_Among_US_Adults_35.csv', index=False)

In [None]:
# Read the filtered TN extract back from disk; the plotting cells below work
# from this `heart` frame rather than from the in-memory cleaning variables.
path = '../data/tn_cleaned_Heart_Disease_and_Stroke_Mortality_Among_US_Adults_35.csv'
heart = pd.read_csv(path)

# Render the loaded frame to confirm the round trip through CSV worked.
display(heart)

- Average Mortality Rate by Age Group

In [None]:
# Bar chart of the mean MortalityRate per AgeGroup, tallest bar first.
fig, ax = plt.subplots(figsize=(10, 6))
mean_rate_by_age = (
    heart.groupby('AgeGroup')['MortalityRate']
    .mean()
    .sort_values(ascending=False)
)
mean_rate_by_age.plot.bar(ax=ax, fontsize=12)
ax.set_title("Average Mortality Rate By Age Group (2010-2019)")
ax.set_ylabel('Number of Deaths per 100,000')
# Slant the category labels so longer age-group names do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()

- Mortality Rate by Age Group and Sex

In [None]:
# Box plot of MortalityRate for each AgeGroup, split by the Gender column
# (seaborn's default colors are used; no palette is set).
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=heart, x='AgeGroup', y='MortalityRate', hue='Gender', ax=ax)
ax.set_title('Mortality Rates by Age Group and Sex')
ax.set_xlabel('Age Group')
ax.set_ylabel('Number of Deaths per 100,000')
ax.legend(title='Sex')
# Write the figure to disk before plt.show(), which may clear it afterwards.
fig.savefig('Mortality Rates by Age Group and Sex.jpg', format='jpg')
plt.show()

- Mortality Rate by Gender

In [None]:
# Bar chart of the mean MortalityRate per Gender, tallest bar first.
fig, ax = plt.subplots(figsize=(14, 7))
mean_rate_by_gender = (
    heart.groupby('Gender')['MortalityRate']
    .mean()
    .sort_values(ascending=False)
)
mean_rate_by_gender.plot.bar(ax=ax, fontsize=12)
ax.set_title("Mortality Rate By Gender (2010-2019)")
ax.set_ylabel('Number of Deaths per 100,000')
plt.xticks(rotation=45, ha='right')
plt.show()