In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pycountry_convert as pc

# <center> Netflix Applied Data Science - Project 1 </center>
## <center> Part 2: Data visualization and analysis</center>
### <center> Group 6 </center>
### <center>Vu, Alex, Kwabena </center>
<br>
<center> 10/14/2021 </center>

### Pre-work
+ Recover **clean_df** and **all_clean_df** variables for visualization and analysis in this Jupyter Notebook
+ Declare **functions** for further visualization work

In [None]:
# Reload clean_df and all_clean_df from IPython's %store database.
# NOTE(review): both must have been saved with `%store` beforehand (presumably by the Part 1 notebook) — confirm.
%store -r clean_df all_clean_df

In [None]:
def alpha3_to_country(alpha3):
    """Return the country name for an ISO alpha-3 country code.

    The lookup goes through pycountry_convert in two steps: the alpha-3 code
    is mapped to its alpha-2 code, which is then mapped to the country name.

    Parameters
    ----------
    alpha3 : str
        ISO alpha-3 country code (the notebook passes values of the
        ``iso_alpha3`` column).

    Returns
    -------
    The country name reported by pycountry_convert for that code.
    """
    return pc.country_alpha2_to_country_name(
        pc.country_alpha3_to_country_alpha2(alpha3)
    )

### Alex's work on visualization

In [None]:
# Working copy of clean_df without the columns the age/sex histograms below never read.
# NOTE: "Age" keeps its stored dtype; an explicit int64 cast was tried here and left disabled.
al_clean_df = clean_df.drop(
    ["Area", "AgeInterval", "Type", "Access"],
    axis="columns",
)

In [None]:
# Histogram with Age on the x axis and Deaths aggregated on the y axis.
age_death_fig = px.histogram(
    al_clean_df,
    x="Age",
    y="Deaths",
    title='Deaths by age',
)

In [None]:
# Render the "Deaths by age" histogram.
age_death_fig.show()

+ Age 15 has a higher number of deaths (about 1.35 M) than any other age up to 30.
+ Possible explanations: e-cigarettes and vaping.
+ The distribution is left-skewed and peaks at age 85.

In [None]:
# Draw one Sex-vs-Deaths histogram per population code, titled with that code.
list_of_codes = al_clean_df["PopCode"].unique()
for cd in list_of_codes:
    # Rows belonging to the current population code only.
    s_df = al_clean_df[al_clean_df["PopCode"] == cd]
    px.histogram(s_df, x="Sex", y="Deaths", title=cd).show()

+ Over the whole period covered by the data, males have a higher number of deaths than females.
+ Males in Asia have more deaths than females.
+ Males in Oceania, however, have a similar number of deaths to females.
+ The overall trend points to males having a higher death rate than females.
+ Some outliers: GBR_NIR, HUN, FIN, AUT.

### Kwabena's work on visualization

In [None]:
# Total deaths per year, summed over every row of clean_df.
group_1 = clean_df.groupby("Year")["Deaths"].sum()

plot_up = px.line(group_1)

# Each update_* call returns the figure itself, so the styling is chained;
# the chain is the last expression of the cell and is therefore its output.
(
    plot_up
    .update_xaxes(
        tickangle=70,
        title_text="YEARS",
        title_font={"size": 20},
        title_standoff=25,
    )
    .update_yaxes(
        title_text="TOTAL NUMBER OF DEATHS",
        title_font={"size": 20},
        title_standoff=25,
    )
    .update_layout(
        title_text='LINE CHART OF TOTAL NUMBER OF DEATHS VERSUS YEARS',
        title_x=0.5,  # centre the title horizontally
    )
)

In [None]:
# Render the line chart of total deaths per year.
plot_up.show()

+ The line chart shows a steady increase in deaths from 2000 to 2019.
+ A sharp rise occurred from 2019 to 2020.
+ The curve then drops steeply from 2020 to 2021, indicating a reduction in mortality.

In [None]:
# Mean of the Deaths column for each population code.
death_ave = clean_df.groupby("PopCode")["Deaths"].mean()
death_ave_df = death_ave.to_frame()

bar_plot = px.bar(death_ave_df)

# The update_* calls return the figure, so they are chained; the chain is the
# cell's last expression and is displayed as its output.
(
    bar_plot
    .update_xaxes(
        tickangle=70,
        title_text="POPULATION CODE",
        title_font={"size": 20},
        title_standoff=20,
    )
    .update_yaxes(
        title_text="AVERAGE OF DEATHS",
        title_font={"size": 20},
        title_standoff=40,
    )
    .update_layout(
        title_text='BAR CHART OF AVERAGE DEATHS VERSUS POPULATION CODE',
        title_x=0.5,  # centre the title horizontally
    )
)

In [None]:
# Render the bar chart of average deaths per population code.
bar_plot.show()

+ The USA leads with an average of approximately 2,918 deaths.
+ Russia comes second with an average of approximately 1,010 deaths, followed by Canada.
+ From observation, North America appears to top the chart with a high mortality rate.

In [None]:
# Mean of the Deaths column per continent, reshaped so that "continent"
# is an ordinary column that px.pie can use for the slice names.
ave_group = clean_df.groupby("continent")["Deaths"].mean()
ave_df = ave_group.to_frame()
new_df = ave_df.reset_index()

fig = px.pie(
    new_df,
    names="continent",
    values='Deaths',
    title='AVERAGE OF DEATHS PER CONTINENT',
)

In [None]:
# Render the pie chart of average deaths per continent.
fig.show()

+ North America has the largest share of the chart, at 73.7%.
+ South America is the continent with the lowest share of the death average.

In [None]:
# Keep only the USA and CAN rows, then average Deaths per (Year, Sex) pair.
USA_CAN = clean_df[clean_df['PopCode'].isin(['USA', 'CAN'])].reset_index()
n_df = USA_CAN.groupby(['Year', 'Sex'])['Deaths'].mean()
reset = n_df.reset_index()

# One box per sex, built from the yearly averages.
box_plot = px.box(reset, x='Sex', y='Deaths')

# update_layout returns the figure; as the last expression it is the cell output.
box_plot.update_layout(
    title='BOX PLOT OF MALE & FEMALE DEATHS IN NORTH AMERICA',
    title_x=0.5,
    xaxis_title='SEX',
    yaxis_title='AVERAGE OF DEATHS',
)

In [None]:
# Render the male/female box plot for the USA and CAN rows.
box_plot.show()

+ With an upper quartile (Q3) of about 695 average deaths for males, against about 667 for females, males show a higher death rate than females in North America over the range of years considered.

### Vu's work on visualization

In [None]:
# Choropleth of deaths per country with a dropdown that switches the year.
#
# The per-year summaries and the dropdown buttons are both derived from the
# same `years` sequence, so a label can never point at the wrong year's data.
years = range(2010, 2021)  # 2010..2020 inclusive
agg_func = {'Deaths': 'sum', 'iso_alpha3': 'first', 'continent': 'first'}

# One per-country summary frame per year, in the same order as `years`.
years_df_l = []
for year in years:
    year_df = all_clean_df[all_clean_df['Year'] == year]
    d_by_country = year_df.groupby(by=['PopCode']).aggregate(agg_func)
    d_by_country['Country'] = d_by_country['iso_alpha3'].apply(alpha3_to_country)
    years_df_l.append(d_by_country)

# Create figure; the first year's data is what is shown before any selection.
years_fig = go.Figure(data=go.Choropleth(locations=years_df_l[0]['iso_alpha3'],
                                         z=years_df_l[0]['Deaths'],
                                         text=years_df_l[0]['Country'],
                                         colorbar_title="Deaths",
                                         colorscale="Jet"))

# One "restyle" button per year: selecting it swaps the trace's z, locations
# and text for that year's summary. Each value is wrapped in a list because
# restyle expects one entry per trace.
year_buttons = [
    dict(
        args=[{"z": [year_summary['Deaths']],
               "locations": [year_summary['iso_alpha3']],
               "text": [year_summary['Country']]}],
        label=str(button_year),
        method="restyle",
    )
    for button_year, year_summary in zip(years, years_df_l)
]

# Add dropdowns
years_fig.update_layout(
    updatemenus=[
        dict(
            buttons=year_buttons,
            direction="down",
            pad={"r": 1, "t": 1},
            showactive=True,
            x=0.7,
            xanchor="center",
            y=1.18,
            yanchor="top"
        ),
    ]
)

# Centred title; as the last expression, the returned figure is the cell output.
years_fig.update_layout(
    title={
        'text': "Deaths by Year",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'}
)

In [None]:
# Render the choropleth with its year dropdown.
years_fig.show()

In [None]:
# USA rows only, with Deaths summed per (Year, Week) pair.
usa_df = all_clean_df.loc[all_clean_df['PopCode'] == 'USA']
d_by_week = (
    usa_df
    .groupby(by=['Year', 'Week'])['Deaths']
    .sum()
    .reset_index()
)

# One line per year, plotted against the week number.
usa_fig = px.line(
    d_by_week,
    x='Week',
    y='Deaths',
    color='Year',
    title='Death counts in USA',
)

In [None]:
# Render the weekly USA death-count lines.
usa_fig.show()

+ In earlier years (2015-2019), US death counts vary only slightly with the seasons.
+ Since 2020, weekly death counts in the USA have reflected the impact of the COVID-19 pandemic.
+ Starting in January 2021, US death counts drop sharply because of the vaccination rollout.

In [None]:
# Per-country summary for 2021: total deaths plus the first ISO code and
# continent seen for each population code.
agg_func = {'Deaths': 'sum', 'iso_alpha3': 'first', 'continent': 'first'}
year2021_df = all_clean_df.loc[all_clean_df['Year'] == 2021]
d2021_by_country = year2021_df.groupby(by=['PopCode']).aggregate(agg_func)
d2021_by_country['Country'] = d2021_by_country['iso_alpha3'].map(alpha3_to_country)

# Bubble map: marker size follows Deaths, colour follows continent.
d2021_fig = px.scatter_geo(
    d2021_by_country,
    locations="iso_alpha3",
    color="continent",
    size="Deaths",
    hover_name="Country",
    projection="natural earth",
)

### CONCLUSION

# CONCLUSIONS
+ Understanding possible causes of death through age distribution and gender differences.
    + Children at the age of 15 have more deaths than people at ages 10, 20 and 25, which could be due to the vaping epidemic.
    + Males have a higher rate of deaths across the board. However, there are a couple of countries where females have more deaths.
+ Weekly death counts show short-term elevations, which might help authorities respond to temporary hazards such as the COVID-19 pandemic or temperature extremes.


In [None]:
# Render the 2021 deaths-by-country bubble map.
d2021_fig.show()

> ***Understanding death helps save lives***