In [1]:
import pandas as pd
import numpy as np
from countries import country_labels
print("Libraries imported!")

ModuleNotFoundError: No module named 'countries'

# Data Import

In [None]:
# Read the exported CSV into a DataFrame and preview its first three rows
raw_df = pd.read_csv(filepath_or_buffer='demo_pjangroup__custom_11197792_linear.csv')
raw_df.head(n=3)

In [None]:
# Basic information: columns, number of entries and data types
#raw_df.info()

# Data Cleaning and Rearrangement

In [None]:
# Target order of the age-range columns, youngest first, with the total last
cols_revised_order = [
    'Y_LT5', 'Y5-9', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39',
    'Y40-44', 'Y45-49', 'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y_GE75',
    'TOTAL',
]

# Drop the columns that are not needed here (data frequency, last update, unit,
# flags), rename "age" to "id", then pivot so that every (geo, sex, TIME_PERIOD)
# combination is one row and every age range is one column.
# set_index(['geo', 'sex', 'TIME_PERIOD']) would also work, but pivot() states
# explicitly what becomes the index, the columns and the values.
df = (
    raw_df
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
    .rename(columns={"age": "id"})
    .pivot(index=['geo', 'sex', 'TIME_PERIOD'], columns='id', values='OBS_VALUE')
)
df = df[cols_revised_order]

# Strip the "Y" from the age-range labels; the two open-ended ranges get
# hand-written labels instead.
new_columns = {label: label.replace("Y", "") for label in df.columns}
new_columns.update({'Y_LT5': "<5", 'Y_GE75': "75<"})

# Apply the new column labels, replace country and gender codes in the index
# with their full names, and turn the index levels back into ordinary columns.
df = (
    df
    .rename(columns=new_columns)
    .rename(index=country_labels())
    .rename(index={"F": "Female", "M": "Male", "T": "Both"})
    .reset_index()
)

In [None]:
# Display the table produced by the cleaning cell above
df

In [None]:
# Un-comment line below to export to Excel file
#df.to_excel('arranged_table.xlsx', engine='openpyxl', sheet_name='Sheet1')

# Exploratory Data Analysis


## Annual Population Bar Charts

In [None]:
# Libraries needed for interactive dashboards
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
print("Libraries imported!")

In [None]:
# Country and year selected for the charts below
country = "Andorra"
year = 2019

In [None]:
# Rows of the selected country and year (one per sex), reshaped so that the
# sexes become the columns and the age ranges become the row labels.
# Indexing by "sex" before transposing is what lets later code address a column
# such as df_country['Female']; a bare transpose would keep the original integer
# row labels as column names.  "geo" and "TIME_PERIOD" are constant after the
# filter and are dropped so that only population values remain.
is_selected = (df["geo"] == country) & (df["TIME_PERIOD"] == year)
df_country = (
    df.loc[is_selected]
    .drop(columns=["geo", "TIME_PERIOD"])
    .set_index("sex")
    .transpose()
)
df_country

In [None]:
# Bar chart with the 'Female' column of df_country on the y-axis.
# NOTE(review): this needs df_country to expose a 'Female' column - confirm the
# frame is keyed by sex before plotting.
px.bar(df_country, y="Female")

In [None]:
# Bar charts of population per age range, one figure per sex.
# NOTE(review): df_fem_anno and df_male_anno are not defined in the cells visible
# above - confirm they are created before this cell runs.  The [1:] slices skip
# the first entry of each series (presumably a total - TODO confirm).
fem_bar = px.bar(
    data_frame=df_fem_anno.index[1:],
    y=df_fem_anno.index[1:],
    x=df_fem_anno.values[1:],
    labels={"y": "Age ranges (years)", "x": "Population"},
    title=f"Female Population of {country} in Year {year}",
)

# Raw counts, matching the female chart and the "Population" axis label
# (not a percentage of the first entry).
male_bar = px.bar(
    data_frame=df_male_anno.index[1:],
    y=df_male_anno.index[1:],
    x=df_male_anno.values[1:],
    labels={"y": "Age ranges (years)", "x": "Population"},
    title=f"Male Population of {country} in Year {year}",
)

In [None]:
# One horizontal bar trace per sex; the first entry of each series is skipped.
trace_male = go.Bar(
    name="Male",
    orientation="h",
    y=df_male_anno.index[1:],
    x=df_male_anno.values[1:],
)
trace_female = go.Bar(
    name="Female",
    orientation="h",
    y=df_fem_anno.index[1:],
    x=df_fem_anno.values[1:],
)

# Title, axis captions and side-by-side ("group") placement of the two traces.
layout = go.Layout(
    title=f"Population Pyramid of {country} in Year {year}",
    xaxis={"title": "Population"},
    yaxis={"title": "Age ranges (years)"},
    barmode="group",
)

# Male trace first, then female, rendered with the layout above.
fig = go.Figure(data=[trace_male, trace_female], layout=layout)
fig.show()

In [None]:
# All rows of the selected country, keyed by (geo, sex, TIME_PERIOD).
# The cleaning cell ends with reset_index(), so on the flat table these three
# are ordinary columns and have to be turned into index levels first; a table
# that is already keyed this way is used as it is.
if isinstance(df.index, pd.MultiIndex):
    df_by_key = df
else:
    df_by_key = df.set_index(['geo', 'sex', 'TIME_PERIOD'])

# xs() with drop_level=False keeps all three index levels and avoids the
# ambiguity of passing a tuple with slices to .loc without a column indexer.
df_country = df_by_key.xs(country, level='geo', drop_level=False)

# Second index level (sex) of the second row
df_country.index[1][1]

In [None]:
# Distinct country names.  On the flat table "geo" is a column; on a table keyed
# by (geo, sex, TIME_PERIOD) it is an index level.  Reading level 0 of the flat
# table's index would count row numbers instead of countries.
if "geo" in df.columns:
    geo_values = df["geo"]
else:
    geo_values = df.index.get_level_values("geo")
countries = set(geo_values)
print(f"Number of countries: { len(countries) }")

In [None]:
# Population record for a single (country, sex, year) combination.
# The cleaning cell ends with reset_index(), so on the flat table geo, sex and
# TIME_PERIOD are ordinary columns; key the table by them before the lookup.
if isinstance(df.index, pd.MultiIndex):
    df_by_key = df
else:
    df_by_key = df.set_index(['geo', 'sex', 'TIME_PERIOD'])
geo_pops_annual = df_by_key.loc[('Andorra', 'Male', 1999)]
print(geo_pops_annual)

# Sum every age range; only the precomputed 'TOTAL' entry is left out, so the
# printed value can be compared against it.  Skipping the first entry as well
# would drop the "<5" group, which comes first in the column order.
total = geo_pops_annual.drop('TOTAL').tolist()
print(sum(total))
        

In [None]:
# Libraries needed for interactive dashboards
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
print("Libraries imported!")