In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
import warnings
from plotly.graph_objects import Figure
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Route DataFrame.plot / Series.plot through plotly.
pd.options.plotting.backend = 'plotly'
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Input CSV with population and deaths by age and year
file_path = './data/data.csv' #from UN Population

# Load only the columns that are needed, key every row by (Age, Time) and
# derive the death rate of each row as deaths divided by population.
data = (
    pd.read_csv(file_path, usecols=['Age', 'Time', 'Population', 'Deaths'])
    .set_index(['Age', 'Time'])
    .assign(Rates=lambda frame: frame['Deaths'] / frame['Population'])
)

In [None]:
# Inspect raw death rates: one trace per age label, plotted on a log y axis.
# The labels are taken in order of first appearance in the index so that the
# trace / legend order is identical on every run (iterating a set of strings
# is not stable across interpreter sessions).
age_labels = data.index.get_level_values(0).unique()
rates_fig = Figure()
for age in age_labels:
    rates_for_age = data.loc[age]  # rows of this age label, indexed by Time
    rates_fig.add_trace(
        go.Scatter(x=rates_for_age.index, y=rates_for_age['Rates'], name=str(age))
    )
rates_fig.update_layout(height=600, title='Death rates by age group and year')
# update_yaxes returns the figure, so leaving it last also displays the plot.
rates_fig.update_yaxes(type='log')

In [None]:
# Broad age groups used for the aggregation below.
# Maps the group label to its (lowest age, highest age) bounds; both bounds are
# treated as inclusive by is_age_in_group.
age_groups = {
    '0-4': (0, 4),
    '5-14': (5, 14),
    '15-34': (15, 34),
    '35-49': (35, 49),
    '50+': (50, float('inf'))  # '50+' will be handled as 50 to infinity
}

# Function to check if a given age range falls within a specific age group
def is_age_in_group(age_range, group_start, group_end):
    """Tell whether an age label lies entirely inside an age group.

    Args:
        age_range: Age label of a data row. Supported forms are a closed range
            ('5-9'), an open-ended range ('100+') and a single age ('7' or 7).
        group_start: Lowest age of the group (inclusive).
        group_end: Highest age of the group (inclusive); ``float('inf')`` for
            an open-ended group.

    Returns:
        True if every age covered by ``age_range`` belongs to the group.
    """
    label = str(age_range).strip()

    if label.endswith('+'):
        # An open-ended range has no upper limit, so it can only fit inside a
        # group that is itself open-ended. Without this check '100+' would be
        # counted in every group whose start is <= 100.
        lowest_age = int(label[:-1])
        return lowest_age >= group_start and group_end == float('inf')

    if '-' in label:
        start, end = map(int, label.split('-'))
    else:
        # Single year of age, e.g. '7'
        start = end = int(label)
    return start >= group_start and end <= group_end

# Function to aggregate data by age group and calculate the death rate
def aggregate_data(data, age_groups):
    """Sum population and deaths per year and age group and derive death rates.

    Args:
        data: Frame with 'Population' and 'Deaths' columns. 'Age' and 'Time'
            may be regular columns or levels of the index.
        age_groups: Mapping of group label to ``(start, end)`` age bounds, as
            understood by ``is_age_in_group``.

    Returns:
        DataFrame with one row per (year, age group), ordered by year and then
        by the order of ``age_groups``, with columns 'Year', 'AgeGroup',
        'Population', 'Deaths' and 'DeathRate'. 'DeathRate' is deaths divided
        by population (a fraction, not per 1000) and is 0 when the population
        is 0.
    """
    # The notebook keys its frame by (Age, Time); bring those levels back as
    # columns so the filters below work. reset_index returns a new frame, so
    # the caller's frame is left untouched.
    index_levels = [
        name for name in ('Age', 'Time')
        if name not in data.columns and name in data.index.names
    ]
    if index_levels:
        data = data.reset_index(level=index_levels)

    # Group membership depends only on the age label, so evaluate it once per
    # group instead of once per (year, group) pair.
    group_masks = {}
    for age_group, (start, end) in age_groups.items():
        group_masks[age_group] = data['Age'].apply(
            lambda age: is_age_in_group(age, start, end)
        )

    aggregated_data = []
    for year in sorted(data['Time'].unique()):
        in_year = data['Time'] == year
        for age_group, in_group in group_masks.items():
            filtered_data = data[in_year & in_group]

            total_population = filtered_data['Population'].sum()
            total_deaths = filtered_data['Deaths'].sum()

            # Deaths per person; guard against an empty / zero population
            death_rate = (total_deaths / total_population) if total_population else 0

            aggregated_data.append({
                'Year': year,
                'AgeGroup': age_group,
                'Population': total_population,
                'Deaths': total_deaths,
                'DeathRate': death_rate
            })

    # Build the frame once from the collected records
    return pd.DataFrame(aggregated_data)

# Perform the aggregation and calculate the death rate.
# Age and Time were moved into the index when the data was loaded, so hand
# aggregate_data a flat frame in which both are available as regular columns.
df = aggregate_data(data.reset_index(), age_groups)
df.head(10)

In [None]:
# Replace every age-group label by the first age of the group so that the
# groups can be compared numerically with single years of age later on.
first_age_of_group = {
    "0-4": 0,
    "5-14": 5,
    "15-34": 15,
    "35-49": 35,
    "50+": 50,
}
for group_label, first_age in first_age_of_group.items():
    df.loc[df["AgeGroup"] == group_label, "AgeGroup"] = first_age

In [None]:
# Keep only the columns needed for the rate matrix.
new_df = df.loc[:, ['Year', 'AgeGroup', 'DeathRate']].copy()

# Reshape to age groups (rows) x years (columns) and drop both axis names.
raw_death_rates = (
    new_df.pivot(index="AgeGroup", columns="Year", values="DeathRate")
    .rename_axis(None, axis=1)
    .rename_axis(None, axis=0)
)

In [None]:
# Show the death-rate matrix: one row per age group, one column per year.
raw_death_rates

In [None]:
def adapt_death_rates_for_lifetable(
    upper_age: int, 
    rates: pd.Series,
) -> pd.Series:
    """Get the death rates applicable to each year of age.

    Every age ``a`` in ``0 .. upper_age - 1`` receives the rate of the age band
    with the greatest lower bound that does not exceed ``a``.

    Args:
        upper_age: The top year of age to consider (exclusive); rates are
            produced for ages ``0 .. upper_age - 1``
        rates: The raw data for the death rates, indexed by the lower bound of
            each age band. The index does not need to be sorted.

    Returns:
        The death rates by year of age, as a float series whose index is
        named 'age'

    Raises:
        ValueError: If a requested age lies below the lowest age band, so that
            no rate applies to it.

    """
    ages = np.arange(upper_age)

    # Order the bands by lower bound so the lookup does not depend on the
    # order of the incoming index.
    band_starts = np.asarray(rates.index, dtype=float)
    order = np.argsort(band_starts, kind='stable')
    band_starts = band_starts[order]
    band_rates = np.asarray(rates, dtype=float)[order]

    # Position of the last band starting at or before each age; -1 means the
    # age is younger than every band.
    band_of_age = np.searchsorted(band_starts, ages, side='right') - 1
    if (band_of_age < 0).any():
        # Indexing with -1 would silently hand the oldest band's rate to the
        # youngest ages, so refuse instead.
        raise ValueError(
            'no death-rate band covers the youngest requested ages '
            '(the lowest band must start at age 0 or below)'
        )

    return pd.Series(
        band_rates[band_of_age],
        index=pd.RangeIndex(len(ages), name='age'),
        dtype=float,
    )


def get_lifetable_from_rates(
    rates: pd.Series,
    initial_cohort: float = 100000,
) -> pd.Series:
    """Calculate cohort sizes - note this only works for increments of one year
    (intended to be used with adapt_death_rates_for_lifetable above).

    The entry for each age is the number of cohort members still alive at the
    start of that age; the cohort is then reduced by that age's death rate
    before moving on to the next age.

    Args:
        rates: Annual death rates from adapt_death_rates_for_lifetable
        initial_cohort: Size of the cohort at the first age in ``rates``

    Returns:
        The life table, as a float series sharing the index of ``rates``

    """
    survivors = []
    cohort_size = initial_cohort
    for rate in rates:
        # Record the survivors entering this age, then apply this age's deaths
        survivors.append(cohort_size)
        cohort_size -= cohort_size * rate
    # Explicit float dtype so an empty input still yields a numeric series
    return pd.Series(survivors, index=rates.index, dtype=float)

In [None]:
# Distinct years present in the aggregated data, in order of first appearance.
years = df['Year'].unique()

In [None]:
# Number of single years of age covered by the life tables (ages 0 .. 99).
UPPER_AGE = 100

# Death rate for every single year of age, one column per year.
# Each frame is built in one go from the per-year series, which keeps the
# series index (ages) as the row index.
all_rates = pd.DataFrame({
    year: adapt_death_rates_for_lifetable(UPPER_AGE, raw_death_rates[year])
    for year in years
})

# Survivors at the start of every age, one column per year.
lifetables = pd.DataFrame({
    year: get_lifetable_from_rates(all_rates[year])
    for year in years
})

In [None]:
# Plot the life tables, one line per column (year); the value axis is labelled 'surviving'.
lifetables.plot(labels={'value': 'surviving'}, height=600, title='life tables by year')