In [1]:
import duckdb
import polars as pl
import altair as alt

# Switch on Altair's 'ggplot2' theme so every chart rendered below shares it.
alt.theme.enable('ggplot2')

# Ten-colour categorical palette used as the colour-scale range for the
# per-region lines. The name suggests the "JCO" palette from ggsci (not
# verified here). Order is significant, so it is kept exactly as defined.
jco_colors = [
    # blue, yellow, gray, red, light blue
    "#0073C2", "#EFC000", "#868686", "#CD534C", "#7AA6DC",
    # dark blue, dark yellow, dark gray, dark red, slate blue
    "#003C67", "#8F7700", "#3B3B3B", "#A73030", "#4A6990",
]

## load data

In [2]:
# Location of the TidyTuesday (2025-11-18) WHO tuberculosis CSV.
uri = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-18/who_tb_data.csv'

# Let DuckDB read the remote CSV, treating both empty strings and the literal
# 'NA' as NULL, then materialise the result as a Polars DataFrame. Later cells
# query this frame by its Python variable name (`from tb_data`).
load_query = f"from read_csv('{uri}', nullstr=['', 'NA']);"
tb_data = duckdb.sql(load_query).pl()

## exploration

In [26]:
# Regional TB incidence trend: one line per WHO region, one point per year.
#
# NOTE(review): avg() is an unweighted mean of the per-100k rates of the rows
# in each region/year group (presumably one row per country -- confirm), so it
# is not a population-weighted regional rate.
df = duckdb.sql("""

    select g_whoregion as region,
        year,
        round(avg(e_inc_100k), 2) as avg_e_inc_100k
    from tb_data
    group by 1, 2
    order by 1, 2

""").pl()

# Year span of the aggregated data; drives both the title and the x-axis ticks.
min_year = df['year'].min()
max_year = df['year'].max()

# Explicit tick positions, materialised as plain lists so both axes hand
# Altair the same JSON-friendly type. The "+ 1" keeps the upper bound
# inclusive because range() excludes its stop value.
year_ticks = list(range(min_year, max_year + 1, 2))
rate_ticks = list(range(0, 400 + 1, 20))

# The chart is the cell's last expression so the notebook renders it.
alt.Chart(
    df,
    width=650,
    height=550,
    title=alt.Title(
        f'Estimated rate of tuberculosis incidence ({min_year} to {max_year})',
        anchor='start',
        fontSize=18.5, fontWeight='bold',
        subtitle=[
            'Incidence has decreased generally but there are recent increases in some regions',
            '',  # blank line separating the takeaway from the source credit
            'Source: World Health Organization (WHO)'
        ],
        subtitleFontSize=15,
        offset=10,
    ),
).mark_line(
    size=3.8,
    opacity=0.7,
    interpolate='catmull-rom',  # smooth curve drawn through the yearly points
).encode(
    x=alt.X(
        'year',
        title='',
        axis=alt.Axis(
            format='d',  # plain integers, no thousands separator in years
            values=year_ticks,
            labelFontSize=12,
        ),
    ),
    y=alt.Y(
        'avg_e_inc_100k',
        title='cases per 100,000 people (estimated)',
        axis=alt.Axis(values=rate_ticks, labelFontSize=12),
    ),
    color=alt.Color(
        'region',
        title='',
        scale=alt.Scale(range=jco_colors),
        legend=alt.Legend(
            symbolSize=100,
            symbolStrokeWidth=3.8,  # same value as the line size above
            labelFontSize=12,
        ),
    ),
    tooltip='region',
)

In [27]:
# Regional TB mortality trend: one line per WHO region, one point per year.
#
# NOTE(review): avg() is an unweighted mean of the per-100k rates of the rows
# in each region/year group (presumably one row per country -- confirm), so it
# is not a population-weighted regional rate.
df = duckdb.sql("""

    select g_whoregion as region,
        year,
        round(avg(e_mort_100k), 2) as avg_e_mort_100k
    from tb_data
    group by 1, 2
    order by 1, 2

""").pl()

# Year span of the aggregated data; drives both the title and the x-axis ticks.
min_year = df['year'].min()
max_year = df['year'].max()

# Explicit tick positions, materialised as plain lists so both axes hand
# Altair the same JSON-friendly type. The "+ 1" keeps the upper bound
# inclusive because range() excludes its stop value.
year_ticks = list(range(min_year, max_year + 1, 2))
rate_ticks = list(range(0, 120 + 1, 10))

# The chart is the cell's last expression so the notebook renders it.
alt.Chart(
    df,
    width=650,
    height=550,
    title=alt.Title(
        f'Estimated mortality from tuberculosis ({min_year} to {max_year})',
        anchor='start',
        fontSize=18.5, fontWeight='bold',
        subtitle=[
            '',  # blank line between the title and the source credit
            'Source: World Health Organization (WHO)'
        ],
        subtitleFontSize=15,
        offset=10,
    ),
).mark_line(
    size=3.8,
    opacity=0.7,
    interpolate='catmull-rom',  # smooth curve drawn through the yearly points
).encode(
    x=alt.X(
        'year',
        title='',
        axis=alt.Axis(
            format='d',  # plain integers, no thousands separator in years
            values=year_ticks,
            labelFontSize=12,
        ),
    ),
    y=alt.Y(
        'avg_e_mort_100k',
        title='deaths per 100,000 people (estimated)',
        axis=alt.Axis(values=rate_ticks, labelFontSize=12),
    ),
    color=alt.Color(
        'region',
        title='',
        scale=alt.Scale(range=jco_colors),
        legend=alt.Legend(
            symbolSize=100,
            symbolStrokeWidth=3.8,  # same value as the line size above
            labelFontSize=12,
        ),
    ),
    tooltip='region',
)