In [2]:
import pandas as pd
import numpy as np
from countries import country_labels
print("Libraries imported!")

Libraries imported!


# Import of data

In [41]:
# Read the DEMO_PJANGROUP CSV export (one observation per row) into a DataFrame
raw_df = pd.read_csv(filepath_or_buffer='demo_pjangroup__custom_11197792_linear.csv')
# Preview the first three rows to check that the columns were parsed as expected
raw_df.head(n=3)

Unnamed: 0,DATAFLOW,LAST UPDATE,freq,unit,sex,age,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG
0,ESTAT:DEMO_PJANGROUP(1.0),30/04/24 23:00:00,A,NR,F,TOTAL,AD,1986,20898.0,
1,ESTAT:DEMO_PJANGROUP(1.0),30/04/24 23:00:00,A,NR,F,TOTAL,AD,1987,22148.0,
2,ESTAT:DEMO_PJANGROUP(1.0),30/04/24 23:00:00,A,NR,F,TOTAL,AD,1988,22814.0,


In [4]:
# Summary of the raw table: number of entries, column names, non-null counts,
# dtypes and memory usage (useful to spot columns with missing values)
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113840 entries, 0 to 113839
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   DATAFLOW     113840 non-null  object 
 1   LAST UPDATE  113840 non-null  object 
 2   freq         113840 non-null  object 
 3   unit         113840 non-null  object 
 4   sex          113840 non-null  object 
 5   age          113840 non-null  object 
 6   geo          113840 non-null  object 
 7   TIME_PERIOD  113840 non-null  int64  
 8   OBS_VALUE    113835 non-null  float64
 9   OBS_FLAG     1872 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 8.7+ MB


# Cleaning the database and pivoting...

In [77]:
# Drop the metadata columns that are not needed for the analysis
# (dataflow id, last update timestamp, frequency, unit and observation flags).
df = raw_df.drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])

# Long -> wide: one row per (geo, sex, TIME_PERIOD), one column per age range.
# .pivot() is used instead of .set_index() because it states explicitly which
# fields become the index, the columns and the values.
df = df.pivot(index=['geo', 'sex', 'TIME_PERIOD'], columns='age', values='OBS_VALUE')

# Re-ordering population age ranges (the pivot sorts the column labels alphabetically)
cols_revised_order = ['TOTAL', 'Y_LT5', 'Y5-9', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39',
                      'Y40-44', 'Y45-49', 'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y_GE75']
df = df[cols_revised_order]

# Removing Y from age range columns; the two open-ended ranges get explicit labels
new_columns = {old_column: old_column.replace("Y", "") for old_column in df.columns}
new_columns['Y_LT5'] = "<5"
new_columns['Y_GE75'] = "75<"
df = df.rename(columns=new_columns)

# Replacing country and gender labels with their full names.
# Each mapping is restricted to its own index level so that a key can never
# relabel a value in another level (a level-less rename on a MultiIndex is
# applied to every level).
# NOTE(review): country_labels() is assumed to return a mapping (or callable)
# from geo codes to country names - confirm against countries.py.
df = (
    df.rename(index=country_labels(), level='geo')
      .rename(index={"F": "Female", "M": "Male", "T": "Both"}, level='sex')
      .reset_index()
)

In [89]:
# Display the arranged table: one row per country / sex / year, one column per age range
df

age,geo,sex,TIME_PERIOD,TOTAL,<5,5-9,10-14,15-19,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75<
0,Andorra,Female,1986,20898.0,1099.0,1496.0,1704.0,1530.0,1993.0,2298.0,2016.0,1764.0,1319.0,1081.0,1024.0,1003.0,862.0,592.0,483.0,
1,Andorra,Female,1987,22148.0,1153.0,1588.0,1759.0,1619.0,2153.0,2387.0,2190.0,1828.0,1472.0,1117.0,1079.0,1025.0,928.0,657.0,507.0,
2,Andorra,Female,1988,22814.0,1129.0,1574.0,1772.0,1685.0,2057.0,2527.0,2257.0,1883.0,1565.0,1163.0,1117.0,1058.0,954.0,784.0,521.0,
3,Andorra,Female,1989,23813.0,1099.0,1606.0,1795.0,1747.0,2070.0,2586.0,2406.0,1984.0,1690.0,1329.0,1120.0,1149.0,993.0,849.0,537.0,
4,Andorra,Female,1990,24171.0,1182.0,1562.0,1712.0,1798.0,1997.0,2558.0,2417.0,2019.0,1759.0,1450.0,1135.0,1134.0,1031.0,924.0,544.0,949.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7410,Kosovo*,Both,2018,1798506.0,153526.0,147177.0,149641.0,172350.0,155673.0,144850.0,132532.0,124428.0,120832.0,108030.0,95156.0,81541.0,66810.0,49751.0,38929.0,57280.0
7411,Kosovo*,Both,2019,1795666.0,146042.0,150812.0,141325.0,166789.0,152483.0,143229.0,133788.0,123967.0,120343.0,110194.0,98362.0,85308.0,69773.0,53199.0,38945.0,61107.0
7412,Kosovo*,Both,2020,1782115.0,139433.0,150864.0,138268.0,163371.0,147970.0,139857.0,133504.0,122738.0,118526.0,111374.0,100248.0,87709.0,70225.0,55578.0,39612.0,62838.0
7413,Kosovo*,Both,2021,1798186.0,139507.0,151145.0,138118.0,151523.0,159614.0,141623.0,135642.0,124647.0,118049.0,115335.0,101022.0,88886.0,73646.0,56294.0,41132.0,62003.0


In [90]:
# Export the arranged table to an Excel workbook
# (comment the line below out to skip the export when re-running the notebook)
df.to_excel('arranged_table.xlsx', engine='openpyxl', sheet_name='Sheet1')

# First drawings...


In [91]:
# Libraries needed for interactive dashboards
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
print("Libraries imported!")

Libraries imported!


In [92]:
# Selection used by the plots below
country = "Croatia"
year = 2001

In [94]:
# Keep only the rows of the selected country (all sexes, all years)
df_country = df.loc[df["geo"].eq(country)]
df_country

age,geo,sex,TIME_PERIOD,TOTAL,<5,5-9,10-14,15-19,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75<
3287,Croatia,Female,1981,2374579.0,,,,,,,,,,,,,,,,
3288,Croatia,Female,1982,2384760.0,,,,,,,,,,,,,,,,
3289,Croatia,Female,1983,2397661.0,,,,,,,,,,,,,,,,
3290,Croatia,Female,1984,2409164.0,,,,,,,,,,,,,,,,
3291,Croatia,Female,1985,2419923.0,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3432,Croatia,Both,2019,4076246.0,185127.0,202194.0,200465.0,204449.0,240575.0,239413.0,262027.0,282031.0,275383.0,266771.0,285324.0,296230.0,297658.0,262779.0,190347.0,385473.0
3433,Croatia,Both,2020,4058165.0,182408.0,196674.0,202389.0,198576.0,238386.0,238457.0,255340.0,280875.0,277900.0,266906.0,280613.0,290286.0,295571.0,267320.0,203186.0,383278.0
3434,Croatia,Both,2021,4036355.0,181550.0,190992.0,202345.0,195840.0,233001.0,238148.0,250514.0,277405.0,280584.0,269356.0,271652.0,287473.0,292648.0,271071.0,218929.0,374847.0
3435,Croatia,Both,2022,3862305.0,175765.0,180825.0,195621.0,188897.0,206977.0,215051.0,226765.0,254062.0,267208.0,259426.0,258212.0,278654.0,286296.0,280555.0,227474.0,360517.0


In [140]:
# NOTE(review): df_fem_anno / df_male_anno are not defined in any earlier cell,
# so this cell failed with a NameError on a fresh kernel. They are rebuilt here
# from df_country and year as "population per age range for one sex in the
# selected year" (first entry 'TOTAL', then the age ranges) - confirm that this
# matches the original intent.
pops_in_year = (
    df_country.loc[df_country["TIME_PERIOD"] == year]
    .set_index("sex")
    .drop(columns=["geo", "TIME_PERIOD"])
)
df_fem_anno = pops_in_year.loc["Female"]
df_male_anno = pops_in_year.loc["Male"]

# Female population per age range, absolute numbers ([1:] skips the 'TOTAL' entry).
# x and y are passed directly as array-likes; no data_frame is needed for that.
fem_bar = px.bar(
    x=df_fem_anno.values[1:],
    y=df_fem_anno.index[1:],
    labels={"y": "Age ranges (years)", "x": "Population"},
    title=f"Female Population of {country} in Year {year}",
)

# Male population per age range as a percentage of the male total (entry 0),
# so the x axis is labelled as a share rather than as an absolute population.
male_bar = px.bar(
    x=df_male_anno.values[1:] / df_male_anno.values[0] * 100,
    y=df_male_anno.index[1:],
    labels={"y": "Age ranges (years)", "x": "Share of male population (%)"},
    title=f"Male Population of {country} in Year {year}",
)

In [137]:
# One horizontal bar series per sex; element 0 of each series is skipped via [1:].
# NOTE(review): with barmode="group" both series are drawn side by side on the
# positive axis, i.e. the bars are not mirrored around zero.
trace_male = go.Bar(
    name="Male",
    orientation="h",
    y=df_male_anno.index[1:],
    x=df_male_anno.values[1:],
)

trace_female = go.Bar(
    name="Female",
    orientation="h",
    y=df_fem_anno.index[1:],
    x=df_fem_anno.values[1:],
)

layout = go.Layout(
    barmode="group",
    title=f"Population Pyramid of {country} in Year {year}",
    xaxis={"title": "Population"},
    yaxis={"title": "Age ranges (years)"},
)

fig = go.Figure(layout=layout, data=[trace_male, trace_female])
fig.show()

In [257]:
# `df` is a flat table after the cleaning cell (it ends with reset_index()),
# so the (geo, sex, TIME_PERIOD) index is rebuilt before selecting by label.
# Selecting on the first level only leaves a (sex, TIME_PERIOD) index.
df_country = df.set_index(['geo', 'sex', 'TIME_PERIOD']).loc[country]
# TIME_PERIOD of the second row of the selection
df_country.index[1][1]

1982

In [None]:
# Distinct countries in the table. `df` is flat after the cleaning cell, so the
# names are read from the 'geo' column (the row index only holds row numbers).
countries = set(df["geo"])
print(f"Number of countries: {len(countries)}")

In [None]:
# Population per age range for a single (country, sex, year) combination.
# `df` is flat after the cleaning cell, so the key columns are turned back into
# an index to allow the label-based lookup.
geo_pops_annual = df.set_index(['geo', 'sex', 'TIME_PERIOD']).loc[('Andorra', 'Male', 1999), :]
print(geo_pops_annual)

# Sum of the individual age ranges (everything except 'TOTAL'), to compare
# against the reported 'TOTAL' printed above. 'TOTAL' is excluded by label, not
# by position; a missing age range still propagates as NaN into the sum.
total = geo_pops_annual.drop(labels='TOTAL').tolist()
print(sum(total))
        

In [None]:
# Libraries needed for interactive dashboards
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
print("Libraries imported!")