In [143]:
import pandas as pd
import plotly.express as px
import numpy as np

In [144]:
# Load the raw table; the code below relies on the columns 'Year', 'Water' and 'Life'.
df = pd.read_csv('datasets/water_life.csv')

# Build the cleaned frame as one pipeline that never mutates `df`:
#   1. Coerce 'Water' to numeric and round to the nearest full percentage FIRST,
#      so values that cannot be parsed (NaN after errors='coerce') are removed by
#      the dropna that follows instead of surviving into later aggregations.
#   2. Drop rows missing either 'Water' or 'Life'.
#   3. Keep only every 4th year from 2002 to 2022 (2002, 2006, ..., 2022);
#      `Year % 4 == 2` selects exactly those years inside the window.
#   4. Sort by 'Year' ascending, then 'Water' descending.
#   5. Take an explicit copy so later cells can add or overwrite columns on
#      `df_cleaned` without triggering SettingWithCopyWarning.
df_cleaned = (
    df.assign(Water=pd.to_numeric(df['Water'], errors='coerce').round())
      .dropna(subset=['Water', 'Life'])
      .loc[lambda d: d['Year'].between(2002, 2022) & (d['Year'] % 4 == 2)]
      .sort_values(by=['Year', 'Water'], ascending=[True, False])
      .copy()
)

In [145]:
# Mean of 'Life' for each retained year, taken over all rows of the cleaned table.
# The result is a Series named 'Life' indexed by 'Year'.
avg_life_exp_by_year = df_cleaned['Life'].groupby(df_cleaned['Year']).mean()

print(avg_life_exp_by_year)

Year
2002    68.900640
2006    70.431927
2010    71.859859
2014    73.113792
2018    73.950967
2022    73.766179
Name: Life, dtype: float64


In [146]:
# Make sure 'Water' is numeric before binning (anything unparseable becomes NaN
# and is left out of every bin by pd.cut).
df_cleaned['Water'] = pd.to_numeric(df_cleaned['Water'], errors='coerce')

# Bin 'Water' access into 10-point bands whose contents match their labels.
# Bins are left-closed (right=False): [0, 10), [10, 20), ..., [80, 90), [90, 101).
# The last edge is 101 so that a value of exactly 100 falls in the final band.
# With the default right-closed bins a value of 10 would be labelled '0-9%' and
# 100 would be labelled '90-99%'.
bins = np.append(np.arange(0, 100, 10), 101)  # edges 0, 10, ..., 90, 101
labels = [f'{i}-{i+9}%' for i in range(0, 90, 10)] + ['90-100%']  # one label per band

df_cleaned['Water_Percentile'] = pd.cut(df_cleaned['Water'], bins=bins, labels=labels, right=False)

# Group by 'Year' and 'Water_Percentile', and calculate mean 'Life'.
# observed=True keeps only the (year, band) combinations that actually occur.
agg_data_by_year_percentile = (
    df_cleaned.groupby(['Year', 'Water_Percentile'], observed=True)['Life']
    .mean()
    .reset_index()
)

# Define custom colors for each year (one line per year in the figure)
custom_colors = {
    2002: '#31fcee',
    2006: '#5dbab1',
    2010: '#49817b',
    2014: '#b4a7b7',
    2018: '#9946b2',
    2022: '#4e4351'
}

# Plotting with Plotly Express.
# category_orders pins the x axis to ascending band order; otherwise the order is
# inferred from the order in which bands first appear in the data, which can be
# scrambled when the earliest year has no rows in some band.
fig = px.line(agg_data_by_year_percentile, x='Water_Percentile', y='Life', color='Year',
              color_discrete_map=custom_colors,
              category_orders={'Water_Percentile': labels},
              custom_data=['Year'],
              labels={'Life': 'Mean Life Expectancy', 'Water_Percentile': 'Water Access Percentile'})

axis_style = dict(gridcolor='darkgrey', gridwidth=1, zeroline=False)  # shared by both axes

fig.update_layout(
    title='Mean Life expectancy by Water Access percentile<br><sup>Access to water positively correlates to life expectancy</sup>',
    title_x=0.5,
    plot_bgcolor='#cff8d6',
    paper_bgcolor='#cff8d6',
    # Extra bottom margin leaves room for the annotation placed below the plot area.
    margin={'l': 90, 'b': 125, 'r': 50, 't': 90},
    xaxis=dict(title='Water access percentile (%)', **axis_style),
    yaxis=dict(title='Mean life expectancy (years)', **axis_style),
    width=780
)

# Same hover text on every trace; customdata[0] is the 'Year' passed via custom_data.
fig.update_traces(
    hovertemplate="<b>%{customdata[0]}</b><br>"
                  "Water access percentile: %{x}<br>"
                  "Mean life expectancy: %{y} years<br>"
                  "<extra></extra>"
)

# Footer note positioned in paper coordinates, below and left of the plot area.
# NOTE(review): 'Annotation' looks like placeholder text — confirm the intended caption.
fig.add_annotation(x=-0.08, y=-0.35,
                   showarrow=False,
                   xref='paper', yref='paper',
                   xanchor='left', yanchor='bottom',
                   align='left',
                   text='Annotation')

fig.show()