## Baby boomers ageing
Visualization of the demographic "movement" of the baby-boomer generation through the age groups in Europe.

In [17]:
import pandas as pd
DATA_URL='https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_pjan.tsv.gz'

# The separator is a regular expression (runs of tabs and/or commas). Only the
# Python parser engine supports regex separators, so name it explicitly instead
# of letting pandas fall back to it with a ParserWarning.
df_raw = pd.read_csv(DATA_URL, delimiter='[\t,]+', engine='python', compression='gzip')

# Give the "geo\time" column a friendlier name; it is accessed as an attribute below.
df_raw = df_raw.rename(columns={ "geo\\time": "region" })

print(f'{len(df_raw)} total records read.')
print(f'Countries/regions:{df_raw.region.unique()}')

  This is separate from the ipykernel package so we can avoid doing imports until


17529 total records read.
Countries/regions:['AD' 'AL' 'AM' 'AT' 'AZ' 'BA' 'BE' 'BG' 'BY' 'CH' 'CY' 'CZ' 'DE' 'DE_TOT'
 'DK' 'EA18' 'EA19' 'EE' 'EEA30_2007' 'EEA31' 'EFTA' 'EL' 'ES' 'EU27_2007'
 'EU27_2020' 'EU28' 'FI' 'FR' 'FX' 'GE' 'HR' 'HU' 'IE' 'IS' 'IT' 'LI' 'LT'
 'LU' 'LV' 'MC' 'MD' 'ME' 'MK' 'MT' 'NL' 'NO' 'PL' 'PT' 'RO' 'RS' 'RU'
 'SE' 'SI' 'SK' 'SM' 'TR' 'UA' 'UK' 'XK']


### Data filtering & cleanup

Clean up the data and keep only the desired parts - region, gender, etc.
Change the following constants in order to get a different selection.

In [18]:
# Region code(s) to include in the graphics (see the list printed above).
REGIONS = ['EU27_2007']

# Gender to select: 'F', 'M' or 'T' (total) - the stats are two-gendered.
SEX = 'T'

# Build one boolean mask per criterion, then combine them.
in_region = df_raw.region.isin(REGIONS)
has_sex = df_raw.sex == SEX
is_yearly_age = df_raw.age.str.match(r'Y\d+')  # age codes of the form 'Y<digits>'

df_clean = df_raw[in_region & has_sex & is_yearly_age].copy()

# Strip surrounding whitespace from every column name.
df_clean = df_clean.rename(columns=str.strip)
df_clean.shape

(99, 64)

In [19]:
import re

# Year columns are the ones whose name contains at least one digit.
all_years = list(filter(re.compile(r'\d+').search, df_clean.columns))

for y in all_years:
    # Remove every non-digit character, then convert to numbers; cells that end
    # up empty become NaN because of errors='coerce'.
    # regex=True is required explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the values untouched.
    digits_only = df_clean[y].str.replace(r'[^\d]+', '', regex=True)
    df_clean[y] = pd.to_numeric(digits_only, errors='coerce')

# Drop the year columns that hold no data at all for the selection.
df_clean.dropna(axis=1, how='all', inplace=True)

# One row per age. numeric_only=True keeps just the year columns next to the
# 'age' key, independent of how the installed pandas handles text columns in sum().
df_clean = df_clean.groupby(by='age', as_index=False).sum(numeric_only=True)
df_clean.shape

(99, 28)

In [20]:
# Clean the age column: drop the 'Y' / 'Y_' prefix and convert to numbers.
# regex=True must be explicit - pandas >= 2.0 defaults to a literal match, in
# which case 'Y_?' would never be found and to_numeric would fail on 'Y1'.
df_clean['age'] = pd.to_numeric(df_clean['age'].str.replace(r'Y_?', '', regex=True))

# Order the rows by age and renumber the index accordingly.
df_clean = df_clean.sort_values(by='age').reset_index(drop=True)

df_clean

Unnamed: 0,age,2019,2018,2017,2016,2015,2014,2013,2012,2011,...,2002,2001,2000,1999,1998,1997,1996,1995,1994,1993
0,1,5037194,5099349,5079469,5098572,5049528,5153428,5188922.0,5299636.0,5316177.0,...,5070006.0,5032724.0,5051247.0,5097512.0,5119053.0,5089074.0,5218244.0,5351718.0,0.0,0.0
1,2,5137604,5109130,5137505,5080994,5179908,5219696,5317380.0,5331664.0,5385111.0,...,5043958.0,5051909.0,5101699.0,5122382.0,5122763.0,5224500.0,5358099.0,5528843.0,0.0,0.0
2,3,5141980,5168164,5114535,5219009,5244147,5351795,5351059.0,5394254.0,5286789.0,...,5065174.0,5100810.0,5137249.0,5124382.0,5254020.0,5361099.0,5535325.0,5610174.0,0.0,0.0
3,4,5198152,5151040,5252134,5279610,5375443,5384196,5413283.0,5298156.0,5237756.0,...,5103016.0,5141840.0,5147770.0,5256369.0,5391962.0,5540378.0,5624773.0,5748441.0,0.0,0.0
4,5,5176355,5277602,5306099,5413361,5407084,5448645,5315752.0,5253675.0,5165227.0,...,5155236.0,5156599.0,5255547.0,5394679.0,5573992.0,5639446.0,5765231.0,5807856.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,95,343234,335240,333794,311263,236598,184790,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,96,252320,250542,237066,177609,140599,135405,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,97,185011,173547,133073,102217,100464,102342,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,98,126427,95453,74538,71164,73579,82126,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Now, finally, do some plotting

In [47]:
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

import numpy as np

# Every column whose name contains digits is a year; sort them for playback order.
year_pattern = re.compile(r'\d+')
YEARS = sorted(col for col in df_clean.columns if year_pattern.search(col))
print(f'Years:{YEARS}')

# Spacing of the population (y axis) ticks.
FRACTION = 500000

# Round the largest population value up to a whole multiple of FRACTION.
# The extra +2 pushes the (exclusive) end of NUM_RANGE past that multiple,
# so the topmost tick is included.
peak_population = df_clean[YEARS].max().max()
MAX_POP = int(np.ceil(peak_population / FRACTION)) * FRACTION + 2
NUM_RANGE = range(0, MAX_POP, FRACTION)

# Age (x axis) ticks: one every ten years.
AGE_RANGE = range(0, 100, 10)

plt.style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(10, 8))
fig.subplots_adjust(left=.1, bottom=.1)

# Data source credit, centred along the bottom edge of the figure.
source_note = 'Source: %s' % DATA_URL
fig.text(.5, .01, source_note,
         fontsize=8,
         horizontalalignment='center',
         wrap=True)

# Empty line artist; the animation callbacks fill in its data.
(line,) = ax.plot([], [])
all_frames = []

def init():
    """Set up the static parts of the plot and clear the line.

    Passed to FuncAnimation as ``init_func``. Fixes the axis limits, labels
    and tick labels (ages as '<n>y', population in millions as '<x>m') and
    empties the line's data.

    Returns:
        A one-element tuple containing the line artist.
    """
    ax.set_xlim(0, 100)
    ax.set_ylim(0, MAX_POP)

    ax.set_xlabel('Age')
    ax.set_ylabel('Population')
    ax.set_xticks(AGE_RANGE)
    # Plain %-formatting; the former f-prefix had no placeholders to fill.
    ax.set_xticklabels(['%dy' % n for n in AGE_RANGE])
    ax.set_yticks(NUM_RANGE)
    ax.set_yticklabels(['%.1fm' % (n / 1000000) for n in NUM_RANGE])

    line.set_data([], [])
    return line,

def animate(year):
    """Draw the population-by-age curve for a single year.

    Args:
        year: Label of the df_clean column to plot (one of YEARS).

    Returns:
        A one-element tuple containing the updated line artist.
    """
    ax.set_title('%s demographics, year %s' % (','.join(REGIONS), year))
    # Use the real age values for x. The positional index starts at 0 while
    # the ages start at 1, so plotting against the index shifts the curve.
    line.set_data(df_clean['age'], df_clean[year])
    return line,

# One frame per year, half a second apart.
# blit is disabled on purpose: animate() also changes the axes title, which is
# not among the artists it returns, and blitting redraws only returned artists -
# so with blit=True the title would not follow the displayed year.
anim = FuncAnimation(fig, animate,
                     init_func=init,
                     frames=YEARS,
                     interval=500,
                     blit=False)
plt.show()

Years:['1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']


<IPython.core.display.Javascript object>

Run the following cell if you want to save the result as an animated GIF (it uses the ImageMagick writer).

In [48]:
import os

# Output file name is derived from the selected region(s).
gif_name  = 'data/aging_%s.gif' % ('-'.join(REGIONS))

# Create the target directory if needed so saving does not fail on a fresh checkout.
os.makedirs(os.path.dirname(gif_name), exist_ok=True)

anim.save(gif_name, writer='imagemagick', dpi=72)
print('Done.')

Done.
