<span style="font-family:Lucida Bright;">

<hr style="border:2px solid black"> </hr>

<p style="margin-bottom:1cm"></p>

<center>
<font size="7"><b>Social Data Analysis and Visualization</b></font>
<p style="margin-bottom:1cm"></p>
<font size="6.8"><b>Final Project</b></font>   
<p style="margin-bottom:0.8cm"></p>
<font size="5.8"><b>Data Import and Cleaning</b></font>   
<p style="margin-bottom:0.8cm"></p>
<font size="3"><b>Wojciech Mazurkiewicz, DTU, 14 May 2021</b></font>
<br>
<font size="3"><b></b></font>

</center>

<p style="margin-bottom:0.7cm"></p>

<hr style="border:2px solid black"> </hr>

<hr style="border:2px solid black"> </hr>

# Initialization

## How to read this notebook

<span style="font-family:Arial;">

Please note that the pre-rendered outputs will first display properly when the notebook is __trusted__.

## Imports

In [1]:
%matplotlib inline

import bokeh.plotting as bplt
import calendar
import datetime
import folium
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import plotly.express as px
import scipy.stats
import seaborn as sns
import urllib.request
import warnings

from bokeh.io import output_file
from bokeh.io import output_notebook
from bokeh.io import show
from bokeh.models import Legend
from bokeh.models.ranges import FactorRange
from bokeh.models.sources import ColumnDataSource
from folium.map import FeatureGroup
from folium.plugins import HeatMap, HeatMapWithTime
from functools import reduce
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
from IPython.display import Markdown
from IPython.display import YouTubeVideo
from matplotlib import cm
from matplotlib.colors import Normalize
from matplotlib.image import NonUniformImage
from mpl_toolkits.axes_grid1 import make_axes_locatable
from operator import itemgetter
from pathlib import Path
from scipy import stats
from sklearn.datasets import fetch_20newsgroups
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

##  Configuration

### Notebook options

In [None]:
# Decide which output is shown below the cells.
InteractiveShell.ast_node_interactivity = "none"

### Bokeh options

In [None]:
# Show bokeh figures in the notebook.
output_notebook()

### Matplotlib options

In [50]:
# Show matplotlib plots inline.
%matplotlib inline

### Pandas options

In [2]:
# Define the format in which the numbers will be shown in
# the pandas dataframes.
pd.options.display.float_format = '{:,.1f}'.format

# Decide how to handle the "SettingWithCopyWarning" warning
pd.options.mode.chained_assignment = None  # default='warn'

# Set the maximum number of rows and columns to show when 
# displaying a Pandas dataframe.
pd.options.display.max_rows = 75
pd.options.display.max_columns = 75

### Warnings

In [None]:
# Decide how to handle warnings.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=PerformanceWarning)

## Paths

In [3]:
# Project root.
path_root = Path(
    r'C:\GDrive\DTU\Kurser\Social_Data_Analysis_and_Visualization_02806\final_project')

# Resources root.
path_resources_root = path_root / 'resources'

# Data folders.
path_data_root = path_resources_root / 'data'
path_data_citizenship_root = path_data_root / 'citizenship'
path_data_dwellings_root = path_data_root / 'dwellings'
path_data_geo_root = path_data_root / 'geo'
path_data_marital_status_root = path_data_root / 'marital_status'
path_data_without_district_info_root = path_data_root / 'without_district_information'
path_data_clean_root = path_data_root / 'clean'

## Function definitions

### General functions

In [4]:
# A function that returns a dict of object attributes.
def get_obj_attributes(obj):
    return {attribute_name: getattr(obj, attribute_name)
            for attribute_name in dir(obj)
            if (not attribute_name.startswith('__')
                and not callable(getattr(obj, attribute_name)))}

# A function that will print a markdown text.
def printmd(string):
    display(Markdown(string))
    
# A function that returns unique values from a text.
def unique(list_):
    return list(set(list_))

### Load data

In [51]:
# Define a class th
class CphData:

    def __init__(self):
        # Country of origin (no distric info)
        df_country_of_origin = pd.read_pickle(
            path_data_clean_root / 'cph_population_by_country_of_origin_without_district.pkl')
        df_country_of_origin.attrs = {'name': 'Country of origin'}
        self.country_of_origin = df_country_of_origin

        # Citizenship
        df_citizenship = pd.read_pickle(
            path_data_clean_root / 'cph_population_by_citizenship.pkl')
        df_citizenship.attrs = {'name': 'Citizenship'}
        self.citizenship = df_citizenship

        # Marital status
        df_marital_status = pd.read_pickle(
            path_data_clean_root / 'cph_population_by_marital_status.pkl')
        df_marital_status.attrs = {'name': 'Marital status'}
        self.marital_status = df_marital_status

        # Family type and children
        df_family_type_and_children = pd.read_pickle(
            path_data_clean_root / 'cph_population_by_family_type_and_number_of_chidren.pkl')
        df_family_type_and_children.attrs = {
            'name': 'Family type and children'}
        self.family_type_and_children = df_family_type_and_children

        # Income
        df_income = pd.read_pickle(path_data_clean_root / 'cph_income.pkl')
        df_income.attrs = {'name': 'Income'}
        self.income = df_income

        # Life span
        df_life_span = pd.read_pickle(
            path_data_clean_root / 'cph_life_span.pkl')
        df_life_span.attrs = {'name': 'Life span'}
        self.life_span = df_life_span

        # Population movement
        df_population_movement = pd.read_pickle(
            path_data_clean_root / 'cph_population_movement.pkl')
        df_population_movement.attrs = {'name': 'Population movement'}
        self.population_movement = df_population_movement

        # Dwellings
        df_dwellings = pd.read_pickle(
            path_data_clean_root / 'cph_dwellings.pkl')
        df_dwellings.attrs = {'name': 'Dwellings'}
        self.dwellings = df_dwellings

    # Gets dataframes with district information:
    def get_dataframes_with_district(self):
        return [df
                for df in self.get_all_dataframes()
                if 'District' in df.columns]

    # Gets all the dataframes and returns them in a list.
    def get_all_dataframes(self):
        return list(get_obj_attributes(self).values())

    # Gets the names of all columns in all datasets.
    def get_all_column_names(self):
        # Initialize the list of all columns from all the dataframes:
        all_columns = list()

        # Get all columns.
        for df in self.get_all_dataframes():
            all_columns += df.columns.to_list()

        return list(sorted(set(all_columns)))

    # Displays the dataframes with specified names.
    def display_dataframes(self, names=None):
        for attribute_name, df in get_obj_attributes(self).items():
            if (names is None) or (df.attrs["name"] in names):
                printmd(f'\n**{df.attrs["name"]}**:')
                display(df)

    # Gets the names of the dataframes in the object.
    def get_dataframe_names(self):
        return [df.attrs['name'] for df in self.get_dataframes()]

    # Displays the names of the dataframes in the object.
    def display_dataframe_names(self):
        # Display all the dataframes.
        df_names = self.get_dataframe_names()
        printmd(f'The dataframes in the object are:')
        printmd(f'***{"***, ***".join(df_names)}***')

### Plotting functions

In [6]:
# A function that applies default formatting to an axes.
def format_axes(axes: plt.Axes,
                keep_box=False):
    if not keep_box:
        axes.spines['top'].set_color('white')
        axes.spines['right'].set_color('white')

    axes.set_facecolor("white")


# A function that applies default formatting to annotation
# of an axes.
def format_axes_annotation(axes: plt.Axes):
    axes.xaxis.label.set_fontsize(14)
    axes.yaxis.label.set_fontsize(14)
    axes.title.set_fontsize(16)


# A function for creating common x-label for the figure.
def figure_x_label(figure: plt.Figure,
                   label: str,
                   y_position=0.04,
                   font_size=16):
    figure.text(0.5, y_position, label,
                ha='center',
                fontdict={'size': font_size})


# A function for creating common y-label for the figure.
def figure_y_label(figure: plt.Figure,
                   label: str,
                   x_position=0.04,
                   font_size=16):
    figure.text(x_position, 0.5, label,
                va='center',
                rotation='vertical',
                fontdict={'size': font_size})


# A function that draws a horizontal line across the entire axes.
def draw_threshold(value: float,
                   axes: plt.Axes,
                   linewidth=1,
                   linestyle='-',
                   color=None,
                   title=None):
    
    # Get axes limits and ranges.
    x_min, x_max = axes.get_xlim()
    x_range = x_max - x_min
    y_min, y_max = axes.get_ylim()
    y_range = y_max - y_min
    
    # Plot the threshold line.
    axes.plot([x_min, x_max], [value, value],
              linewidth=1,
              linestyle='-',
              color=color)
    
    # Write a title above the threshold line
    if title is not None:
        axes.text(x_min + 0.01 * x_range,
                  value + 0.02 * y_range,
                  title)

###  Dataframe functions

In [7]:
# A function that gets column names of a dataframe.
def get_df_columns(df, exclude=None):
    # If columns to be excluded have not been defined,
    # represent it as an empty list.
    if exclude is None:
        exclude = list()

    # If the columns to be excluded are not specified using a list
    # or a tuple, represent them as a list.
    elif not isinstance(exclude, (list, tuple)):
        exclude = [exclude]

    # Return all column names except the ones to exclude.
    return [column for column in df.columns.to_list()
            if column not in exclude]

# A function that resets the names of indices
def reset_df_index_names(df):
    return (
        df
        .rename_axis(['' for level in range(df.columns.nlevels)],
                     axis="columns")
        .rename_axis(['' for level in range(df.index.nlevels)],
                     axis="rows")
    )


# A function for balancing a dataframe so that the number of rows
# containing each value present in the designated column will be the same.
def balance_dataframe(df: pd.DataFrame, column_name):
    # Get the number of crimes for the least frequent crime.
    lowest_frequency = df['Category'].value_counts().min()

    # Create an empty dataframe for storing the balanced data
    df_balanced = pd.DataFrame()

    # For each value in column, randomly choose the number of samples
    # that corresponds to the least frequent value in the column.
    for value in df[column_name].unique():
        df_balanced = df_balanced.append(
            df
            .loc[df[column_name] == value]
            .sample(lowest_frequency)
        )

    return df_balanced


# A function that evaluates a dictionary of models on data from
# a pandas dataframe.
def evaluate_models(models: dict,
                    df: pd.DataFrame,
                    predictor_labels: list,
                    target_label: str,
                    test_size=0.33):

    # Get the dataset.
    X = df.loc[:, predictor_labels].values
    y = df.loc[:, target_label].values

    # Split the dataset into a test and training set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=32)

    # Fit the models to the data.
    for model_name, model in models.items():

        # Print the name of the model.
        printmd(f'*__{model_name}:__*')

        # Train the model on the training set.
        model.fit(X_train, y_train)

        # Get the predictions on the test set.
        predictions = model.predict(X_test)

        # Print the classification report.
        print(classification_report(y_test, predictions,
                                    zero_division=0,
                                    digits=4))


# A function that sorts the columns in alphabethical order
# and puts the user-chosen columns first
def df_sort_columns(df: pd.DataFrame,
                    first_columns=['Year',
                                   'Quarter',
                                   'District',
                                   'District type',
                                   'Sex',
                                   'Age']):

    # Define a function that moves the chosen element to the
    # front of the list.
    def move_to_front(element, list_):
        if element in list_:
            list_.insert(0, list_.pop(list_.index(element)))

    # Make sure that the columns that are to be put in front
    # are represented as a list.
    if first_columns is None:
        first_columns = list()

    # Sort the columns in alphabetical order.
    sorted_columns = list(df.columns)
    sorted_columns.sort()

    # Move the user-chosen columns to the front.
    for column in first_columns[::-1]:
        move_to_front(column, sorted_columns)

    # Assign the ordered columns to the dataframe.
    df = df[sorted_columns]

    return df


# A function that creates a new column representing data in 'value_columns'
# for every unique value in 'category_columns'.
def df_create_column_for_each_unique_value(df,
                                           category_columns,
                                           value_columns,
                                           aggfunc='first'):

    # Always represent category and value columns as a list or tuple.
    if not isinstance(category_columns, (list, tuple)):
        category_columns = [category_columns]

    if not isinstance(value_columns, (list, tuple)):
        value_columns = [value_columns]

    # Create a colum order for grouping so that all the value columns
    # come last and category columns second last. We leave out 1
    # value column for the result
    cat_and_value_columns = category_columns + value_columns
    column_order = (
        get_df_columns(df, exclude=cat_and_value_columns)
        + cat_and_value_columns[:-1]
    )

    # Create columns from unique values by grouping and unstacking.
    df = (
        df
        .groupby(column_order)
        .first()
        .unstack(list(np.arange(-len(cat_and_value_columns) + 1,
                                0)))
        .reset_index()
    )

    # Delete the names of the index levels
    df = df.rename_axis(['' for level in range(df.columns.nlevels)],
                        axis="columns")
    return df

# OLD IMPLEMENTATION
#     return (
#         df
#         .pivot_table(values=value_columns,
#                      index=get_df_columns(df,
#                                           exclude=category_columns + value_columns),
#                      columns=category_columns,
#                      aggfunc='first')
#         .reset_index()
#         .rename_axis(('', ''), axis="columns")
#     )

###  Data cleaning functions

In [8]:
# Define a function that loads a dataframe from
# kk.statistikbank.dk
def load_cph_df(path_csv):
    return pd.read_csv(path_csv,
                       sep='\t',
                       skiprows=0,
                       encoding='windows-1252')


# A function that loads a dataframe from multiple files.
def load_split_dataframe(paths_csv):

    # Load the first two dataframes.
    df_1 = load_cph_df(paths_csv[0])
    df_2 = load_cph_df(paths_csv[1])

    # Extract common columns.
    common_columns = list(set(df_1.columns)
                          & set(df_2.columns))

    # Load the whole dataframe.
    for idx, path_csv in enumerate(paths_csv):
        # Merge the dataframes.
        if idx == 0:
            df = load_cph_df(path_csv)
        else:
            df = df.merge(load_cph_df(path_csv),
                          left_on=common_columns,
                          right_on=common_columns)

    # Return the merged dataframe.
    return df


# A function that cleans the data about copenhagen district
def clean_districts(df: pd.DataFrame):

    # Continue only if the dataframe contains the column: "district"
    if 'district' not in df.columns:
        return df

    df = df.copy()

    # Define the dame of the column containing info about districts.
    district_column_name = 'district'

    # Extract the district names.
    districts = df[district_column_name]

    # Initialize the list for storing district types.
    district_types = list()

    # Define valid districts.
    valid_districts = ['Indre By', 'Østerbro', 'Nørrebro', 'Vesterbro/Kongens Enghave', 'Valby',
                       'Vanløse', 'Brønshøj-Husum', 'Bispebjerg', 'Amager Øst', 'Amager Vest']

    # Define valid districts.
    valid_polling_areas = ['1. Østerbro', '1. Øst', '1. Nord', '1. Syd', '1. Vest', '1. Nordvest',
                           '2. Sundbyvester', '2. Nord', '2. Syd', '2. Øst', '2. Vest', '3. Indre By',
                           '3. Nord', '3. Syd', '3. Øst',
                           '4. Sundbyøster', '4. Nord', '4. Syd', '4. Øst',
                           '5. Nørrebro', '5. Nord', '5. Nørrebrohallen',
                           '5. Syd', '5. Øst', '5. Vest', '5. Nordvest',
                           '6. Bispebjerg', '6. Vest', '6. Nord', '6. Øst', '6. Syd',
                           '7. Brønshøj', '7. Nord', '7. Syd', '7. Øst', '7. Vest',
                           '7. Nordvest', '7. Katrinedal', '7. Kirkebjerg', '7. Vanløse',
                           '8. Valby', '8. Nord', '8. Syd', '8. Vest', '8. Sydøst', '8. Midt',
                           '9. Vesterbro', '9. Nord', '9. Syd',
                           '9. Øst', '9. Vest', '9. Midt', '9. Sydhavn']

    # Assign type to each district.
    for district in districts:
        if (('District' in district) | (district in valid_districts)):
            district_types.append('District')

        elif (('Polling area' in district) | (district in valid_polling_areas)):
            district_types.append('Polling area')

        elif 'Copenhagen total' in district:
            district_types.append('Entire Copenhagen')

        else:
            district_types.append('Unknown')

    # Insert the column "district type" next to the district column.
    idx_district_column = df.columns.to_list().index(district_column_name)
    df.insert(idx_district_column + 1, 'district type', district_types)

    # Clean the names in the district - column.
    for token in ['District -', 'Polling area -']:
        df[district_column_name] = (
            df[district_column_name]
            .str.replace(token, '')
            .str.strip()
        )

    return df


# A function that cleans all the information about the time in the
# dataframes for Copenhagen.
def clean_years(df: pd.DataFrame,
                non_year_columns=None,
                value_name='Value'):

    if non_year_columns is not None:
        # If "non_year_columns" is an integer, regard the first n
        # columns represented by the integer as non-year columns
        if isinstance(non_year_columns, int):
            non_year_columns = df.columns[:non_year_columns]
    else:
        non_year_columns = [column for column in df.columns.to_list()
                            if column[0] not in ['1', '2']]

    # Create a row for each Year and Quarter.
    df = df.melt(
        id_vars=non_year_columns,
        var_name="Time",
        value_name=value_name
    )

    # If the time is represented by a year and a quarter,
    # create columns "Year" and "Quarter" from the column "Time".
    if 'Q' in df.loc[0, 'Time']:
        df[['Year', 'Quarter']] = (
            df
            .pop('Time').str.split('Q', 2, expand=True)
            .astype(int)
        )

    # If year is presented as "XXXX:YYYY", take the second value.
    elif ':' in df.loc[0, 'Time']:
        df['Year'] = (
            df
            .pop('Time')
            .str.split(':').str[-1]
            .astype(int)
        )

    # Otherwise, transform the year into an integer
    else:
        df['Year'] = df.pop('Time').astype(int)

    return df


# A fuction that removes all other quarters of the year than last.
def choose_latest_quarter(df: pd.DataFrame, value_name):
    # Run only if the dataframe has a column "Age"

    if 'Quarter' not in df.columns:
        return df

    return (
        df
        .sort_values(by=['Year', 'Quarter'])
        .groupby([column for column in df.columns
                  if column not in ['Quarter', value_name]],
                 as_index=False)
        .last()
    )


# A function that perfoms standard cleaning of a dataframe
# from kk.statistikbank.dk
def clean_cph_dataframe(df, value_name='Value', df_name=''):
    # Clean district information.
    df = clean_districts(df)

    # Clean time information.
    df = clean_years(df, value_name=value_name)

    # Capitalize column names.
    df.columns = [column.capitalize() for column in df.columns]

    # Standardize age intervals
    df_standardize_age(df, value_name)

    # Choose only the last available quarter of the year.
    df = choose_latest_quarter(df, value_name)  
    
    # Order the columns.
    df = df_sort_columns(df)
    
    # Name the dataframe.
    df.name = df_name

    return df


# A function that displays all basic stats about the dataframe.
def show_stats(df):
    # Show data types.
    display(df.dtypes.to_frame('Data types'))

    # Show missing values.
    display(
        df.isna()
        .sum()
        .to_frame('Number of missing values')
    )

    # Describe values in columns.
    display(df.describe(include='all'))


# A function that unifies age intervals in the databases from
# from Københavns Kommune:
def df_standardize_age(df: pd.DataFrame, sum_column):

    # Run only if the dataframe has a column "Age"
    if 'Age' not in df.columns:
        return df

    # Use 10-year intervals for age: Create the mapping.
    mapping_5_to_10_years = dict()
    for interval_min in range(0, 90, 10):
        map_from_low = f'{interval_min}-{interval_min + 4} years'
        map_from_high = f'{interval_min + 5}-{interval_min + 9} years'
        map_to = f'{interval_min}-{interval_min + 9}'

        mapping_5_to_10_years[map_from_low] = map_to
        mapping_5_to_10_years[map_from_high] = map_to

    # Define the mapping for all the possible versions of the 90+ age.
    mapping_5_to_10_years['90-99 + years'] = '90+'
    mapping_5_to_10_years['95-99 years'] = '90+'
    mapping_5_to_10_years['90-94 years'] = '90+'
    mapping_5_to_10_years['100 years and over'] = '90+'
    mapping_5_to_10_years['95+years'] = '90+'

    # Apply the mapping to the column "Age".
    df['Age'] = df['Age'].map(mapping_5_to_10_years)

    # Sum the values in the new bins. First, get the names
    # of all the columns that are NOT the column which will summed up.
    non_sum_columns = [column for column in df.columns
                       if column != sum_column]

    # Sum the values together in the age bins.
    df[sum_column] = (
        df
        .groupby(non_sum_columns)
        .transform('sum')
    )

    # Drop duplicate rows
    df.drop_duplicates(ignore_index=True, inplace=True)

    return df

<hr style="border:2px solid black"> </hr>

# Load data

In [9]:
# Load the clean data.
cph_data = CphData()   

# Display all dataframes:
cph_data.display_dataframes()



**Citizenship**:

Unnamed: 0,Year,Quarter,District,District type,Sex,Age,Citizenship,Number of people
0,1980,1,1. Nord,Polling area,Men,0-9,Denmark,265.0
1,1981,1,1. Nord,Polling area,Men,0-9,Denmark,278.0
2,1982,1,1. Nord,Polling area,Men,0-9,Denmark,265.0
3,1983,1,1. Nord,Polling area,Men,0-9,Denmark,249.0
4,1984,1,1. Nord,Polling area,Men,0-9,Denmark,249.0
...,...,...,...,...,...,...,...,...
157435,2016,4,Østerbro,District,Women,90+,Western countries,3.0
157436,2017,4,Østerbro,District,Women,90+,Western countries,4.0
157437,2018,4,Østerbro,District,Women,90+,Western countries,5.0
157438,2019,4,Østerbro,District,Women,90+,Western countries,6.0



**Country of origin**:

Unnamed: 0,Year,Quarter,Sex,Age,Country of origin,Number of people
0,2008,4,Men,0-9,Abu Dhabi,0.0
1,2009,4,Men,0-9,Abu Dhabi,0.0
2,2010,4,Men,0-9,Abu Dhabi,0.0
3,2011,4,Men,0-9,Abu Dhabi,0.0
4,2012,4,Men,0-9,Abu Dhabi,0.0
...,...,...,...,...,...,...
61875,2016,4,Women,90+,Zimbabwe,0.0
61876,2017,4,Women,90+,Zimbabwe,0.0
61877,2018,4,Women,90+,Zimbabwe,0.0
61878,2019,4,Women,90+,Zimbabwe,0.0



**Dwellings**:

Unnamed: 0,Year,District,District type,Dwelling ownership,Total square meters occupied dwellings
0,1991,Copenhagen total,Entire Copenhagen,Owner-occupied,4667001.0
1,1991,Indre By,District,Owner-occupied,566699.0
2,1991,Østerbro,District,Owner-occupied,635362.0
3,1991,Nørrebro,District,Owner-occupied,298278.0
4,1991,Vesterbro/Kongens Enghave,District,Owner-occupied,273584.0
...,...,...,...,...,...
11899,2021,9. Syd,Polling area,Unknown,0.0
11900,2021,9. Øst,Polling area,Unknown,0.0
11901,2021,9. Vest,Polling area,Unknown,0.0
11902,2021,9. Midt,Polling area,Unknown,0.0



**Family type and children**:

Unnamed: 0,Year,Quarter,District,District type,Family type,Number of children,Number of families
0,1998,1,1. Nord,Polling area,Children below 18 years not living with parents,0 children,5.0
1,1999,1,1. Nord,Polling area,Children below 18 years not living with parents,0 children,10.0
2,2000,1,1. Nord,Polling area,Children below 18 years not living with parents,0 children,17.0
3,2001,1,1. Nord,Polling area,Children below 18 years not living with parents,0 children,19.0
4,2002,4,1. Nord,Polling area,Children below 18 years not living with parents,0 children,26.0
...,...,...,...,...,...,...,...
36795,2016,4,Østerbro,District,Single women,More than 3 children,21.0
36796,2017,4,Østerbro,District,Single women,More than 3 children,20.0
36797,2018,4,Østerbro,District,Single women,More than 3 children,21.0
36798,2019,4,Østerbro,District,Single women,More than 3 children,19.0



**Income**:

Unnamed: 0,Year,District,District type,Sex,Total income in district (kr.),Average income (kr.),Number of people
0,1987,1. Nord,Polling area,Men,480298000.0,190217.0,2525.0
1,1987,1. Nord,Polling area,Women,457876000.0,123617.0,3704.0
2,1987,1. Nordvest,Polling area,Men,637196000.0,145246.0,4387.0
3,1987,1. Nordvest,Polling area,Women,608784000.0,108402.0,5616.0
4,1987,1. Syd,Polling area,Men,844061000.0,152330.0,5541.0
...,...,...,...,...,...,...,...
4285,2019,Vanløse,District,Women,5566365000.0,319410.0,17427.0
4286,2019,Vesterbro/Kongens Enghave,District,Men,11307215000.0,381678.0,29625.0
4287,2019,Vesterbro/Kongens Enghave,District,Women,9399658000.0,319325.0,29436.0
4288,2019,Østerbro,District,Men,13865000000.0,448024.0,30947.0



**Life span**:

Unnamed: 0,Year,District,District type,Average life span
0,2009,Indre By,District,79.4
1,2009,Østerbro,District,78.4
2,2009,Nørrebro,District,72.3
3,2009,Vesterbro/Kongens Enghave,District,73.5
4,2009,Valby,District,75.9
...,...,...,...,...
127,2020,Brønshøj-Husum,District,78.5
128,2020,Bispebjerg,District,77.8
129,2020,Amager Øst,District,81.0
130,2020,Amager Vest,District,78.7



**Marital status**:

Unnamed: 0,Year,Quarter,District,District type,Sex,Age,Marital status,Number of people
0,1974,1,1. Nord,Polling area,Men,0-9,Divorced,0.0
1,1975,1,1. Nord,Polling area,Men,0-9,Divorced,0.0
2,1976,1,1. Nord,Polling area,Men,0-9,Divorced,0.0
3,1977,1,1. Nord,Polling area,Men,0-9,Divorced,0.0
4,1978,1,1. Nord,Polling area,Men,0-9,Divorced,0.0
...,...,...,...,...,...,...,...,...
240635,2016,4,Østerbro,District,Women,90+,Widowed,235.0
240636,2017,4,Østerbro,District,Women,90+,Widowed,220.0
240637,2018,4,Østerbro,District,Women,90+,Widowed,211.0
240638,2019,4,Østerbro,District,Women,90+,Widowed,208.0



**Population movement**:

Unnamed: 0,Year,District,District type,Deaths,Emigrated,Immigrated,Internal migration,Internal migration from other districts,Internal migration to to other districts,Internal migratrion from danish municipalities,Internal migratrion to danish municipalities,Live births,Natural increase,Netmigration,Netmigration Copenhagen,Netmigration Denmark
0,1975,1. Nord,Polling area,134.0,122.0,84.0,158.0,663.0,744.0,562.0,668.0,93.0,-41.0,-38.0,-81.0,-106.0
1,1976,1. Nord,Polling area,129.0,98.0,88.0,110.0,622.0,681.0,496.0,637.0,73.0,-56.0,-10.0,-59.0,-141.0
2,1977,1. Nord,Polling area,111.0,90.0,92.0,127.0,637.0,633.0,446.0,564.0,72.0,-39.0,2.0,4.0,-118.0
3,1978,1. Nord,Polling area,127.0,72.0,77.0,111.0,652.0,638.0,465.0,499.0,73.0,-54.0,5.0,14.0,-34.0
4,1980,1. Nord,Polling area,128.0,115.0,107.0,99.0,686.0,558.0,587.0,501.0,75.0,-53.0,-8.0,128.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2578,2016,Østerbro,District,461.0,1877.0,2905.0,3788.0,5533.0,5835.0,5576.0,5467.0,1267.0,806.0,1028.0,-302.0,109.0
2579,2017,Østerbro,District,475.0,2006.0,3000.0,3855.0,5642.0,5918.0,5818.0,5904.0,1244.0,769.0,994.0,-276.0,-86.0
2580,2018,Østerbro,District,463.0,2235.0,3017.0,3709.0,5310.0,5963.0,4930.0,5726.0,1251.0,788.0,782.0,-653.0,-796.0
2581,2019,Østerbro,District,443.0,2277.0,3018.0,3813.0,5493.0,6120.0,5255.0,5699.0,1303.0,860.0,741.0,-627.0,-444.0


<hr style="border:2px solid black"> </hr>

# Creation of a superset containing mixed informations from all datasets

We know that we have 1 dataset without the district information: the one that describes the total population of Copenhagen by the country of origin. We will treat this dataset separately.

Now, let's see how much information the dataset containing district information have in common:

## Identify common data categories in dataframes containing district information

In [10]:
# Get the names of all columns in all tables.
all_columns = cph_data.get_all_column_names()

# Get all dataframes containing district information.
all_dataframes_with_district = cph_data.get_dataframes_with_district()

# Get a dict containing the names of dataframes that
# contain a given column.
datasets_with_column = {column: [df.attrs["name"] for df in all_dataframes_with_district
                                 if column in df.columns]
                        for column in all_columns}

# Get the column names sorted by the number of datasets in which they appear.
columns_by_popularity = sorted(datasets_with_column,
                               key=lambda column: len(
                                   datasets_with_column[column]),
                               reverse=True)

# Create dictionary sorted by the the popularity of the column
# (from most to least popular).
datasets_with_column_sorted = {column: datasets_with_column[column]
                               for column in columns_by_popularity}

# Print out the most popular columns.
for column_name, df_names in datasets_with_column_sorted.items():
    print()
    printmd(f'Column **_{column_name}_** occurs in {len(df_names)} '
            f'out of {len(all_dataframes_with_district)} dataframes:')
    print(df_names)




Column **_District_** occurs in 7 out of 7 dataframes:

['Citizenship', 'Dwellings', 'Family type and children', 'Income', 'Life span', 'Marital status', 'Population movement']



Column **_District type_** occurs in 7 out of 7 dataframes:

['Citizenship', 'Dwellings', 'Family type and children', 'Income', 'Life span', 'Marital status', 'Population movement']



Column **_Year_** occurs in 7 out of 7 dataframes:

['Citizenship', 'Dwellings', 'Family type and children', 'Income', 'Life span', 'Marital status', 'Population movement']



Column **_Number of people_** occurs in 3 out of 7 dataframes:

['Citizenship', 'Income', 'Marital status']



Column **_Quarter_** occurs in 3 out of 7 dataframes:

['Citizenship', 'Family type and children', 'Marital status']



Column **_Sex_** occurs in 3 out of 7 dataframes:

['Citizenship', 'Income', 'Marital status']



Column **_Age_** occurs in 2 out of 7 dataframes:

['Citizenship', 'Marital status']



Column **_Average income (kr.)_** occurs in 1 out of 7 dataframes:

['Income']



Column **_Average life span_** occurs in 1 out of 7 dataframes:

['Life span']



Column **_Citizenship_** occurs in 1 out of 7 dataframes:

['Citizenship']



Column **_Deaths_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Dwelling ownership_** occurs in 1 out of 7 dataframes:

['Dwellings']



Column **_Emigrated_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Family type_** occurs in 1 out of 7 dataframes:

['Family type and children']



Column **_Immigrated_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Internal migration_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Internal migration from other districts_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Internal migration to to other districts_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Internal migratrion from danish municipalities_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Internal migratrion to danish municipalities_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Live births_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Marital status_** occurs in 1 out of 7 dataframes:

['Marital status']



Column **_Natural increase_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Netmigration_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Netmigration Copenhagen_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Netmigration Denmark_** occurs in 1 out of 7 dataframes:

['Population movement']



Column **_Number of children_** occurs in 1 out of 7 dataframes:

['Family type and children']



Column **_Number of families_** occurs in 1 out of 7 dataframes:

['Family type and children']



Column **_Total income in district (kr.)_** occurs in 1 out of 7 dataframes:

['Income']



Column **_Total square meters occupied dwellings_** occurs in 1 out of 7 dataframes:

['Dwellings']



Column **_Country of origin_** occurs in 0 out of 7 dataframes:

[]


We can use **Year**, **District**, and **District Type** as basis of a superdataset, where we cross informations from various datasets.

## Steamline districts

For simplicity, it might be beneficial to use only one district type. Let's see which ones we have available:

In [11]:
# Get the names of all columns in all tables.
all_columns = cph_data.get_all_column_names()

# Get all dataframes containing district information.
all_dataframes_with_district = cph_data.get_dataframes_with_district()

# Get all district types.
all_district_types = list()
for df in cph_data.get_dataframes_with_district():
    all_district_types += list(df['District type'].unique())

all_district_types = unique(all_district_types)

# Display the district types.
printmd('All district types:')
for district in all_district_types:
    district_string = '***, ***'.join(all_district_types)

printmd('***' + '***, ***'.join(all_district_types) +  '***')

All district types:

***District***, ***Entire Copenhagen***, ***Polling area***

***Entire Copenhagen*** is not a district, but a summation of all districts in Copenhagen. ***District*** and ***Polling area*** are real districts - ***District*** is of lower resolution and ***Polling area*** is of higher resolution.

In [12]:
# Get all dataframes containing district information.
all_dataframes_with_district = cph_data.get_dataframes_with_district()

# Get all Polling Areas.
all_polling_areas = list()
for df in cph_data.get_dataframes_with_district():
    all_polling_areas += list(df.loc[df['District type'] == 'Polling area', 'District'].unique())
all_polling_areas = unique(all_polling_areas)

# Get all Districts.
all_districts = list()
for df in cph_data.get_dataframes_with_district():
    all_districts += list(df.loc[df['District type'] == 'District', 'District'].unique())
all_districts = unique(all_districts)

# Show the Polling Areas and Districts:
display(pd.DataFrame(sorted(all_polling_areas), columns=['Polling areas']))
display(pd.DataFrame(sorted(all_districts), columns=['Districts']))

Unnamed: 0,Polling areas
0,1. Nord
1,1. Nordvest
2,1. Syd
3,1. Vest
4,1. Øst
5,1. Østerbro
6,2. Nord
7,2. Sundbyvester
8,2. Syd
9,2. Vest


Unnamed: 0,Districts
0,Amager Vest
1,Amager Øst
2,Bispebjerg
3,Brønshøj-Husum
4,Indre By
5,Nørrebro
6,Unlocated
7,Valby
8,Vanløse
9,Vesterbro/Kongens Enghave


Note that one of the districts is ***Unlocated***.

Let' now take a look at which datasets have which dataset type:

In [13]:
datasets_with_district_type = {district_type: [df.attrs["name"]
                                               for df in all_dataframes_with_district
                                               if district_type in df['District type'].values]
                               for district_type in all_district_types}

# Get the district names sorted by the number of datasets in which they appear.
district_types_by_popularity = sorted(datasets_with_district_type,
                                      key=lambda column: len(
                                          datasets_with_district_type[column]),
                                      reverse=True)

# Create dictionary sorted by the the popularity of the district type.
# (from most to least popular).
datasets_with_district_type_sorted = {column: datasets_with_district_type[column]
                                      for column in district_types_by_popularity}

# Print out the most popular columns.
for column_name, df_names in datasets_with_district_type_sorted.items():
    print()
    printmd(f'Dataset type **_{column_name}_** occurs in {len(df_names)} '
            f'out of {len(all_dataframes_with_district)} dataframes:')
    print(df_names)




Dataset type **_District_** occurs in 7 out of 7 dataframes:

['Citizenship', 'Dwellings', 'Family type and children', 'Income', 'Life span', 'Marital status', 'Population movement']



Dataset type **_Polling area_** occurs in 6 out of 7 dataframes:

['Citizenship', 'Dwellings', 'Family type and children', 'Income', 'Marital status', 'Population movement']



Dataset type **_Entire Copenhagen_** occurs in 5 out of 7 dataframes:

['Citizenship', 'Dwellings', 'Family type and children', 'Income', 'Marital status']


So, if we use ***District*** as an area, we will be able to divide Copenhagen in 10 districts and use all of our datasets containing district informattion.

If we use ***Polling area***, we will be able to divide Copenhagen in 52 districts and use all of our datasets except the one containing information about *Life span*.

It is a bit of a bummer, because I would like to use the high resolution, but I am also very interested in taking a look on how life span correlates with all the other factors. Therefore, at least as a starting point, I will use ***District***.

## Initialize the superset

Let's create an object for our superset, use only districts of type: ***District*** and drop the column: ***District type***, which becomes superficial.

In [14]:
# Initialize the dataset that will contain the cross of all the
# dataset available in this project.
cph_data_superset = CphData()

# Filter the districts to include only District type: District
# and drop the column "District type"
for attribute_name, df in get_obj_attributes(cph_data_superset).items():
    # Delete all the datasets from the object that do not contain
    # the district data
    if 'District' not in df.columns:
        delattr(cph_data_superset, attribute_name)
    
    # Limit the data to District Type: District and
    # remove the column: District type
    else:
        setattr(
            cph_data_superset,
            attribute_name,
            (df
             .loc[df['District type'] == 'District']
             .drop(['District type'], axis=1))
        )

# Display all dataframes:
cph_data_superset.display_dataframes()


**Citizenship**:

Unnamed: 0,Year,Quarter,District,Sex,Age,Citizenship,Number of people
2173,1980,1,Amager Vest,Men,0-9,Denmark,1972.0
2174,1981,1,Amager Vest,Men,0-9,Denmark,1917.0
2175,1982,1,Amager Vest,Men,0-9,Denmark,1888.0
2176,1983,1,Amager Vest,Men,0-9,Denmark,1889.0
2177,1984,1,Amager Vest,Men,0-9,Denmark,1856.0
...,...,...,...,...,...,...,...
157435,2016,4,Østerbro,Women,90+,Western countries,3.0
157436,2017,4,Østerbro,Women,90+,Western countries,4.0
157437,2018,4,Østerbro,Women,90+,Western countries,5.0
157438,2019,4,Østerbro,Women,90+,Western countries,6.0



**Dwellings**:

Unnamed: 0,Year,District,Dwelling ownership,Total square meters occupied dwellings
1,1991,Indre By,Owner-occupied,566699.0
2,1991,Østerbro,Owner-occupied,635362.0
3,1991,Nørrebro,Owner-occupied,298278.0
4,1991,Vesterbro/Kongens Enghave,Owner-occupied,273584.0
5,1991,Valby,Owner-occupied,551526.0
...,...,...,...,...
11846,2021,Vanløse,Unknown,0.0
11847,2021,Brønshøj-Husum,Unknown,0.0
11848,2021,Bispebjerg,Unknown,0.0
11849,2021,Amager Øst,Unknown,0.0



**Family type and children**:

Unnamed: 0,Year,Quarter,District,Family type,Number of children,Number of families
1219,1998,1,Amager Vest,Children below 18 years not living with parents,0 children,82.0
1220,1999,1,Amager Vest,Children below 18 years not living with parents,0 children,78.0
1221,2000,1,Amager Vest,Children below 18 years not living with parents,0 children,77.0
1222,2001,1,Amager Vest,Children below 18 years not living with parents,0 children,72.0
1223,2002,4,Amager Vest,Children below 18 years not living with parents,0 children,81.0
...,...,...,...,...,...,...
36795,2016,4,Østerbro,Single women,More than 3 children,21.0
36796,2017,4,Østerbro,Single women,More than 3 children,20.0
36797,2018,4,Østerbro,Single women,More than 3 children,21.0
36798,2019,4,Østerbro,Single women,More than 3 children,19.0



**Income**:

Unnamed: 0,Year,District,Sex,Total income in district (kr.),Average income (kr.),Number of people
106,1987,Amager Vest,Men,2368858000.0,138659.0,17084.0
107,1987,Amager Vest,Women,2056151000.0,103898.0,19790.0
108,1987,Amager Øst,Men,2614378000.0,140520.0,18605.0
109,1987,Amager Øst,Women,2191694000.0,103348.0,21207.0
110,1987,Bispebjerg,Men,2127197000.0,128679.0,16531.0
...,...,...,...,...,...,...
4285,2019,Vanløse,Women,5566365000.0,319410.0,17427.0
4286,2019,Vesterbro/Kongens Enghave,Men,11307215000.0,381678.0,29625.0
4287,2019,Vesterbro/Kongens Enghave,Women,9399658000.0,319325.0,29436.0
4288,2019,Østerbro,Men,13865000000.0,448024.0,30947.0



**Life span**:

Unnamed: 0,Year,District,Average life span
0,2009,Indre By,79.4
1,2009,Østerbro,78.4
2,2009,Nørrebro,72.3
3,2009,Vesterbro/Kongens Enghave,73.5
4,2009,Valby,75.9
...,...,...,...
127,2020,Brønshøj-Husum,78.5
128,2020,Bispebjerg,77.8
129,2020,Amager Øst,81.0
130,2020,Amager Vest,78.7



**Marital status**:

Unnamed: 0,Year,Quarter,District,Sex,Age,Marital status,Number of people
2491,1974,1,Amager Vest,Men,0-9,Divorced,0.0
2492,1975,1,Amager Vest,Men,0-9,Divorced,0.0
2493,1976,1,Amager Vest,Men,0-9,Divorced,0.0
2494,1977,1,Amager Vest,Men,0-9,Divorced,0.0
2495,1978,1,Amager Vest,Men,0-9,Divorced,0.0
...,...,...,...,...,...,...,...
240635,2016,4,Østerbro,Women,90+,Widowed,235.0
240636,2017,4,Østerbro,Women,90+,Widowed,220.0
240637,2018,4,Østerbro,Women,90+,Widowed,211.0
240638,2019,4,Østerbro,Women,90+,Widowed,208.0



**Population movement**:

Unnamed: 0,Year,District,Deaths,Emigrated,Immigrated,Internal migration,Internal migration from other districts,Internal migration to to other districts,Internal migratrion from danish municipalities,Internal migratrion to danish municipalities,Live births,Natural increase,Netmigration,Netmigration Copenhagen,Netmigration Denmark
2173,1975,Amager Vest,1041.0,1451.0,564.0,3454.0,6344.0,4817.0,2736.0,3974.0,559.0,-482.0,-887.0,1527.0,-1238.0
2174,1976,Amager Vest,1063.0,1295.0,720.0,2935.0,6162.0,4942.0,2979.0,4211.0,433.0,-630.0,-575.0,1220.0,-1232.0
2175,1977,Amager Vest,965.0,1689.0,815.0,3775.0,6626.0,4828.0,2676.0,3926.0,444.0,-521.0,-874.0,1798.0,-1250.0
2176,1978,Amager Vest,984.0,1357.0,870.0,2865.0,5656.0,4561.0,2695.0,3495.0,460.0,-524.0,-487.0,1095.0,-800.0
2177,1980,Amager Vest,1001.0,1297.0,655.0,2888.0,5757.0,4355.0,2869.0,3007.0,414.0,-587.0,-642.0,1402.0,-138.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2578,2016,Østerbro,461.0,1877.0,2905.0,3788.0,5533.0,5835.0,5576.0,5467.0,1267.0,806.0,1028.0,-302.0,109.0
2579,2017,Østerbro,475.0,2006.0,3000.0,3855.0,5642.0,5918.0,5818.0,5904.0,1244.0,769.0,994.0,-276.0,-86.0
2580,2018,Østerbro,463.0,2235.0,3017.0,3709.0,5310.0,5963.0,4930.0,5726.0,1251.0,788.0,782.0,-653.0,-796.0
2581,2019,Østerbro,443.0,2277.0,3018.0,3813.0,5493.0,6120.0,5255.0,5699.0,1303.0,860.0,741.0,-627.0,-444.0


## Remove the column `Quarter`

As we have already changed the temporal resolution of all the datasets to ***one year***, the column `Quarter` has become superficial.

In [15]:
for attribute_name, df in get_obj_attributes(cph_data_superset).items():
    # Delete all the datasets from the object that do not contain
    # the district data.
    if 'Quarter' in df.columns:
        df.drop(['Quarter'], axis=1, inplace=True)
    
# Display one dataframe to veryfy that the column Quarter is gone:
display(cph_data_superset.family_type_and_children.head(3))

Unnamed: 0,Year,District,Family type,Number of children,Number of families
1219,1998,Amager Vest,Children below 18 years not living with parents,0 children,82.0
1220,1999,Amager Vest,Children below 18 years not living with parents,0 children,78.0
1221,2000,Amager Vest,Children below 18 years not living with parents,0 children,77.0


## Prepare each dataset for inclusion in superset

### Citizenship

#### Display

In [16]:
display(cph_data_superset.citizenship)

Unnamed: 0,Year,District,Sex,Age,Citizenship,Number of people
2173,1980,Amager Vest,Men,0-9,Denmark,1972.0
2174,1981,Amager Vest,Men,0-9,Denmark,1917.0
2175,1982,Amager Vest,Men,0-9,Denmark,1888.0
2176,1983,Amager Vest,Men,0-9,Denmark,1889.0
2177,1984,Amager Vest,Men,0-9,Denmark,1856.0
...,...,...,...,...,...,...
157435,2016,Østerbro,Women,90+,Western countries,3.0
157436,2017,Østerbro,Women,90+,Western countries,4.0
157437,2018,Østerbro,Women,90+,Western countries,5.0
157438,2019,Østerbro,Women,90+,Western countries,6.0


#### Aggregate over columns `Age` and `Sex`

In [17]:
# Aggregate number of people over columns Sex and Age
columns_to_aggregate = ['Number of people']
columns_to_remove = ['Sex', 'Age'] 

# Create a dataframe where Age and Sex information will be removed.
df_temp_citizenship_1 = (
    cph_data_superset.citizenship
    .groupby(get_df_columns(cph_data_superset.citizenship,
                            exclude=columns_to_remove + columns_to_aggregate),
             as_index=False)
    .sum()
)

# Show the result.
display(df_temp_citizenship_1)

Unnamed: 0,Year,District,Citizenship,Number of people
0,1980,Amager Vest,Denmark,45262.0
1,1980,Amager Vest,Non-western countries,1064.0
2,1980,Amager Vest,Western countries,1502.0
3,1980,Amager Øst,Denmark,46257.0
4,1980,Amager Øst,Non-western countries,1197.0
...,...,...,...,...
1225,2020,Vesterbro/Kongens Enghave,Non-western countries,4482.0
1226,2020,Vesterbro/Kongens Enghave,Western countries,8949.0
1227,2020,Østerbro,Denmark,68643.0
1228,2020,Østerbro,Non-western countries,3866.0


#### Create new columns containing the number of people for each citizenship group:

In [18]:
# Create new columns containing the number of people for each citizenship group:
df_temp_citizenship_2 = (
    df_create_column_for_each_unique_value(
        df_temp_citizenship_1,
        category_columns=['Citizenship'],
        value_columns='Number of people'
    )
)

# Show the results.
display(df_temp_citizenship_2)

Unnamed: 0_level_0,Year,District,Number of people,Number of people,Number of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Denmark,Non-western countries,Western countries
0,1980,Amager Vest,45262.0,1064.0,1502.0
1,1980,Amager Øst,46257.0,1197.0,1019.0
2,1980,Bispebjerg,43259.0,1039.0,589.0
3,1980,Brønshøj-Husum,38858.0,299.0,526.0
4,1980,Indre By,42261.0,915.0,2582.0
...,...,...,...,...,...
405,2020,Nørrebro,67274.0,4953.0,7374.0
406,2020,Valby,51397.0,5105.0,5393.0
407,2020,Vanløse,36293.0,2248.0,2617.0
408,2020,Vesterbro/Kongens Enghave,60744.0,4482.0,8949.0


#### Let's express the number of people as fractions of the total.

In [19]:
# Initialize the processing stage.
df_temp_citizenship_3 = df_temp_citizenship_2.copy()

# Calculate the total number of people
df_temp_citizenship_3['Total number of people'] = (
    df_temp_citizenship_3
    .loc[:, ['Number of people']]
    .sum(axis=1)
)

# Calculate percentages
df_temp_citizenship_3['Number of people'] = (
    df_temp_citizenship_3['Number of people']
    .div(df_temp_citizenship_3['Total number of people'], axis='index')
    .mul(100, axis='index')
)

# Change the name from "number of people" to ""% of people"
df_temp_citizenship_3.columns = (
    df_temp_citizenship_3.columns.set_levels(
        (
            df_temp_citizenship_3
            .columns
            .levels[0]
            .str
            .replace('Number of people', r'% of people')
            .str
            .replace('Total number of people', r'Number of people')
        ),
        level=0
    )
)

# Put assign the dataframe to the object containing
# the data for the superset.
cph_data_superset.citizenship = df_temp_citizenship_3

# Show the resulting dataframe.
display(cph_data_superset.citizenship)

Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,Number of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Denmark,Non-western countries,Western countries,Unnamed: 6_level_1
0,1980,Amager Vest,94.6,2.2,3.1,47828.0
1,1980,Amager Øst,95.4,2.5,2.1,48473.0
2,1980,Bispebjerg,96.4,2.3,1.3,44887.0
3,1980,Brønshøj-Husum,97.9,0.8,1.3,39683.0
4,1980,Indre By,92.4,2.0,5.6,45758.0
...,...,...,...,...,...,...
405,2020,Nørrebro,84.5,6.2,9.3,79601.0
406,2020,Valby,83.0,8.2,8.7,61895.0
407,2020,Vanløse,88.2,5.5,6.4,41158.0
408,2020,Vesterbro/Kongens Enghave,81.9,6.0,12.1,74175.0


#### Check that the sum of citizens looks right

In [20]:
# Sum all the numbers of people in 2019.
sum_2019 =(
    cph_data_superset.citizenship
    .loc[cph_data_superset.citizenship['Year'] == 2019,
         'Number of people']
    .to_numpy()
    .sum()
)

# Display the result.
printmd(f'The sum of people in Copenhagen in 2019 '
        f'has been calculated to ***{sum_2019:,.0f}.***')

The sum of people in Copenhagen in 2019 has been calculated to ***629,513.***

### Dwellings

#### Display

In [21]:
display(cph_data_superset.dwellings)

Unnamed: 0,Year,District,Dwelling ownership,Total square meters occupied dwellings
1,1991,Indre By,Owner-occupied,566699.0
2,1991,Østerbro,Owner-occupied,635362.0
3,1991,Nørrebro,Owner-occupied,298278.0
4,1991,Vesterbro/Kongens Enghave,Owner-occupied,273584.0
5,1991,Valby,Owner-occupied,551526.0
...,...,...,...,...
11846,2021,Vanløse,Unknown,0.0
11847,2021,Brønshøj-Husum,Unknown,0.0
11848,2021,Bispebjerg,Unknown,0.0
11849,2021,Amager Øst,Unknown,0.0


#### Create new columns containing the number of total of square meters for each dwelling ownership group.

In [22]:
# Create new columns containing the number of people for dwelling ownership' group.
df_temp_dwellings_1 = (
    df_create_column_for_each_unique_value(
        cph_data_superset.dwellings,
        category_columns='Dwelling ownership',
        value_columns='Total square meters occupied dwellings'
    )
)

# Show the results.
display(df_temp_dwellings_1)

Unnamed: 0_level_0,Year,District,Total square meters occupied dwellings,Total square meters occupied dwellings,Total square meters occupied dwellings,Total square meters occupied dwellings,Total square meters occupied dwellings,Total square meters occupied dwellings
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Housing cooperative,Non-profit housing,Owner-occupied,Private rental,Public authorities,Unknown
0,1991,Amager Vest,304751.0,363436.0,422697.0,656597.0,47081.0,0.0
1,1991,Amager Øst,457312.0,279473.0,525356.0,588056.0,97804.0,0.0
2,1991,Bispebjerg,225184.0,600972.0,243548.0,397895.0,315623.0,0.0
3,1991,Brønshøj-Husum,66738.0,540095.0,572879.0,234872.0,92475.0,0.0
4,1991,Indre By,334860.0,123646.0,566699.0,1211717.0,141718.0,0.0
...,...,...,...,...,...,...,...,...
305,2021,Nørrebro,1356364.0,687400.0,305394.0,707955.0,55142.0,0.0
306,2021,Valby,589246.0,493886.0,790576.0,642010.0,13061.0,0.0
307,2021,Vanløse,412189.0,202948.0,669109.0,391645.0,2875.0,0.0
308,2021,Vesterbro/Kongens Enghave,1096714.0,459806.0,667276.0,912130.0,21060.0,0.0


#### Check if there are any square meters in the column: `Unknown`.

In [23]:
# Get the sum of m2 in the column "Unknown".
m2_in_uknown = (
    df_temp_dwellings_1
    .loc[:, ('Total square meters occupied dwellings', 'Unknown')].sum()
)

# Show the result.
printmd(f'The sum of square meters in the column `Unknown` is: ***{m2_in_uknown}***.')

The sum of square meters in the column `Unknown` is: ***0.0***.

Therefore, we will remove the column `Unknown`:

In [24]:
# Remove the column "Unknown":
df_temp_dwellings_2 = (
    df_temp_dwellings_1.drop(
        ('Total square meters occupied dwellings', 'Unknown'),
        axis=1
    )
)

# Show the results.
display(df_temp_dwellings_2)

Unnamed: 0_level_0,Year,District,Total square meters occupied dwellings,Total square meters occupied dwellings,Total square meters occupied dwellings,Total square meters occupied dwellings,Total square meters occupied dwellings
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Housing cooperative,Non-profit housing,Owner-occupied,Private rental,Public authorities
0,1991,Amager Vest,304751.0,363436.0,422697.0,656597.0,47081.0
1,1991,Amager Øst,457312.0,279473.0,525356.0,588056.0,97804.0
2,1991,Bispebjerg,225184.0,600972.0,243548.0,397895.0,315623.0
3,1991,Brønshøj-Husum,66738.0,540095.0,572879.0,234872.0,92475.0
4,1991,Indre By,334860.0,123646.0,566699.0,1211717.0,141718.0
...,...,...,...,...,...,...,...
305,2021,Nørrebro,1356364.0,687400.0,305394.0,707955.0,55142.0
306,2021,Valby,589246.0,493886.0,790576.0,642010.0,13061.0
307,2021,Vanløse,412189.0,202948.0,669109.0,391645.0,2875.0
308,2021,Vesterbro/Kongens Enghave,1096714.0,459806.0,667276.0,912130.0,21060.0


#### Let's express the number of people as fractions of the total.

In [25]:
# Initialize the processing stage.
df_temp_dwellings_3 = df_temp_dwellings_2.copy()

# Calculate the total number of m2
df_temp_dwellings_3['Total m2'] = (
    df_temp_dwellings_3
    .loc[:, ['Total square meters occupied dwellings']]
    .sum(axis=1)
)

# Calculate percentages.
df_temp_dwellings_3['Total square meters occupied dwellings'] = (
    df_temp_dwellings_3['Total square meters occupied dwellings']
    .div(df_temp_dwellings_3['Total m2'], axis='index')
    .mul(100, axis='index')
)

# Change the name from "Total square meters occupied dwellings"
# to "% of total dwellings area."
df_temp_dwellings_3.columns = (
    df_temp_dwellings_3.columns.set_levels(
        (
            df_temp_dwellings_3
            .columns
            .levels[0]
            .str
            .replace('Total square meters occupied dwellings', r'% of total dwellings area')
            .str
            .replace('Total m2', r'Total dwellings area')
        ),
        level=0
    )
)

# Put assign the dataframe to the object containing
# the data for the superset.
cph_data_superset.dwellings = df_temp_dwellings_3

# Show the resulting dataframe.
display(cph_data_superset.dwellings)

Unnamed: 0_level_0,Year,District,% of total dwellings area,% of total dwellings area,% of total dwellings area,% of total dwellings area,% of total dwellings area,Total dwellings area
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Housing cooperative,Non-profit housing,Owner-occupied,Private rental,Public authorities,Unnamed: 8_level_1
0,1991,Amager Vest,17.0,20.3,23.6,36.6,2.6,1794562.0
1,1991,Amager Øst,23.5,14.3,27.0,30.2,5.0,1948001.0
2,1991,Bispebjerg,12.6,33.7,13.7,22.3,17.7,1783222.0
3,1991,Brønshøj-Husum,4.4,35.8,38.0,15.6,6.1,1507059.0
4,1991,Indre By,14.1,5.2,23.8,50.9,6.0,2378640.0
...,...,...,...,...,...,...,...,...
305,2021,Nørrebro,43.6,22.1,9.8,22.7,1.8,3112255.0
306,2021,Valby,23.3,19.5,31.3,25.4,0.5,2528779.0
307,2021,Vanløse,24.6,12.1,39.9,23.3,0.2,1678766.0
308,2021,Vesterbro/Kongens Enghave,34.7,14.6,21.1,28.9,0.7,3156986.0


### Family type and children

#### Display

In [26]:
display(cph_data_superset.family_type_and_children)

Unnamed: 0,Year,District,Family type,Number of children,Number of families
1219,1998,Amager Vest,Children below 18 years not living with parents,0 children,82.0
1220,1999,Amager Vest,Children below 18 years not living with parents,0 children,78.0
1221,2000,Amager Vest,Children below 18 years not living with parents,0 children,77.0
1222,2001,Amager Vest,Children below 18 years not living with parents,0 children,72.0
1223,2002,Amager Vest,Children below 18 years not living with parents,0 children,81.0
...,...,...,...,...,...
36795,2016,Østerbro,Single women,More than 3 children,21.0
36796,2017,Østerbro,Single women,More than 3 children,20.0
36797,2018,Østerbro,Single women,More than 3 children,21.0
36798,2019,Østerbro,Single women,More than 3 children,19.0


#### Create new columns containing the number of families for each family type and number of children

In [27]:
# Create new columns containing the number of families 
# for each family type and number of children.
df_temp_family_type_and_children_1 = (
    df_create_column_for_each_unique_value(
        cph_data_superset.family_type_and_children,
        category_columns=['Family type', 'Number of children'],
        value_columns='Number of families'
    )
)

# Show the result
display(df_temp_family_type_and_children_1)

Unnamed: 0_level_0,Year,District,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families,Number of families
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Single men,Single men,Single men,Single men,Single men,Single women,Single women,Single women,Single women,Single women
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children
0,1998,Amager Vest,82.0,0.0,0.0,0.0,0.0,1996.0,561.0,294.0,61.0,16.0,3100.0,1060.0,1022.0,246.0,115.0,8562.0,150.0,42.0,2.0,0.0,8975.0,996.0,489.0,122.0,41.0
1,1998,Amager Øst,62.0,0.0,0.0,0.0,0.0,2617.0,640.0,275.0,42.0,7.0,3259.0,1043.0,909.0,255.0,68.0,9857.0,175.0,20.0,5.0,3.0,9893.0,937.0,369.0,59.0,17.0
2,1998,Bispebjerg,82.0,0.0,0.0,0.0,0.0,2116.0,397.0,191.0,29.0,3.0,2933.0,835.0,702.0,232.0,102.0,9656.0,155.0,20.0,2.0,1.0,10836.0,852.0,317.0,72.0,17.0
3,1998,Brønshøj-Husum,88.0,0.0,0.0,0.0,0.0,1129.0,490.0,320.0,62.0,11.0,3362.0,1179.0,1349.0,390.0,112.0,5015.0,121.0,31.0,3.0,3.0,6657.0,763.0,377.0,107.0,32.0
4,1998,Indre By,52.0,0.0,0.0,0.0,0.0,2374.0,605.0,289.0,48.0,10.0,2970.0,980.0,661.0,102.0,30.0,9666.0,146.0,34.0,1.0,0.0,9728.0,888.0,250.0,38.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,2020,Nørrebro,55.0,0.0,0.0,0.0,0.0,5358.0,1539.0,791.0,120.0,48.0,2530.0,1512.0,1551.0,545.0,162.0,16220.0,289.0,70.0,11.0,3.0,17342.0,1497.0,674.0,162.0,94.0
226,2020,Valby,57.0,0.0,0.0,0.0,0.0,3292.0,1039.0,586.0,86.0,25.0,2972.0,1476.0,1800.0,551.0,104.0,11416.0,301.0,64.0,10.0,2.0,10869.0,1307.0,592.0,120.0,43.0
227,2020,Vanløse,36.0,0.0,0.0,0.0,0.0,1875.0,587.0,381.0,81.0,15.0,2464.0,959.0,1454.0,419.0,53.0,6885.0,199.0,37.0,4.0,0.0,7202.0,769.0,327.0,77.0,18.0
228,2020,Vesterbro/Kongens Enghave,52.0,0.0,0.0,0.0,0.0,4622.0,1535.0,781.0,136.0,11.0,2950.0,1646.0,1805.0,409.0,51.0,15557.0,384.0,60.0,7.0,1.0,13758.0,1544.0,672.0,115.0,23.0


#### Let's express the number if families as fractions of the total.

In [28]:
# Initialize the processing stage.
df_temp_family_type_and_children_2 = df_temp_family_type_and_children_1.copy()

# Calculate the total number of people.
df_temp_family_type_and_children_2['Total number of families'] = (
    df_temp_family_type_and_children_2
    .loc[:, ['Number of families']]
    .sum(axis=1)
)

# Calculate percentages.
df_temp_family_type_and_children_2['Number of families'] = (
    df_temp_family_type_and_children_2['Number of families']
    .div(df_temp_family_type_and_children_2['Total number of families'], axis='index')
    .mul(100, axis='index')
)

# Change the name from "Number of families'"
# to ""% of families"
df_temp_family_type_and_children_2.columns = (
    df_temp_family_type_and_children_2.columns.set_levels(
        (
            df_temp_family_type_and_children_2
            .columns
            .levels[0]
            .str
            .replace('Number of families', r'% of families')
            .str
            .replace('Total number of families', r'Number of families')
        ),
        level=0
    )
)

# Put assign the dataframe to the object containing
# the data for the superset.
cph_data_superset.family_type_and_children = df_temp_family_type_and_children_2

# Show the resulting dataframe.
display(cph_data_superset.family_type_and_children)

Unnamed: 0_level_0,Year,District,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,Number of families
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Single men,Single men,Single men,Single men,Single men,Single women,Single women,Single women,Single women,Single women,Unnamed: 28_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,Unnamed: 28_level_2
0,1998,Amager Vest,0.3,0.0,0.0,0.0,0.0,7.1,2.0,1.1,0.2,0.1,11.1,3.8,3.7,0.9,0.4,30.7,0.5,0.2,0.0,0.0,32.1,3.6,1.8,0.4,0.1,27932.0
1,1998,Amager Øst,0.2,0.0,0.0,0.0,0.0,8.6,2.1,0.9,0.1,0.0,10.7,3.4,3.0,0.8,0.2,32.3,0.6,0.1,0.0,0.0,32.4,3.1,1.2,0.2,0.1,30512.0
2,1998,Bispebjerg,0.3,0.0,0.0,0.0,0.0,7.2,1.3,0.6,0.1,0.0,9.9,2.8,2.4,0.8,0.3,32.7,0.5,0.1,0.0,0.0,36.7,2.9,1.1,0.2,0.1,29550.0
3,1998,Brønshøj-Husum,0.4,0.0,0.0,0.0,0.0,5.2,2.3,1.5,0.3,0.1,15.6,5.5,6.2,1.8,0.5,23.2,0.6,0.1,0.0,0.0,30.8,3.5,1.7,0.5,0.1,21601.0
4,1998,Indre By,0.2,0.0,0.0,0.0,0.0,8.2,2.1,1.0,0.2,0.0,10.3,3.4,2.3,0.4,0.1,33.5,0.5,0.1,0.0,0.0,33.7,3.1,0.9,0.1,0.0,28879.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,2020,Nørrebro,0.1,0.0,0.0,0.0,0.0,10.6,3.0,1.6,0.2,0.1,5.0,3.0,3.1,1.1,0.3,32.1,0.6,0.1,0.0,0.0,34.3,3.0,1.3,0.3,0.2,50573.0
226,2020,Valby,0.2,0.0,0.0,0.0,0.0,9.0,2.8,1.6,0.2,0.1,8.1,4.0,4.9,1.5,0.3,31.1,0.8,0.2,0.0,0.0,29.6,3.6,1.6,0.3,0.1,36712.0
227,2020,Vanløse,0.2,0.0,0.0,0.0,0.0,7.9,2.5,1.6,0.3,0.1,10.3,4.0,6.1,1.8,0.2,28.9,0.8,0.2,0.0,0.0,30.2,3.2,1.4,0.3,0.1,23842.0
228,2020,Vesterbro/Kongens Enghave,0.1,0.0,0.0,0.0,0.0,10.0,3.3,1.7,0.3,0.0,6.4,3.6,3.9,0.9,0.1,33.7,0.8,0.1,0.0,0.0,29.8,3.3,1.5,0.2,0.0,46119.0


#### Check that the number of people looks right

In [29]:
# Sum all the number of people in 2019.
sum_2019 =(
    cph_data_superset.family_type_and_children
    .loc[cph_data_superset.family_type_and_children['Year'] == 2019,
         'Number of families']
    .to_numpy()
    .sum()
)

# Display the result.
printmd(f'The sum of families of Copenhagen in 2019 '
        f'has been calculated to ***{sum_2019:,.0f}.***')

The sum of families of Copenhagen in 2019 has been calculated to ***384,090.***

### Income

#### Display

In [30]:
display(cph_data_superset.income)

Unnamed: 0,Year,District,Sex,Total income in district (kr.),Average income (kr.),Number of people
106,1987,Amager Vest,Men,2368858000.0,138659.0,17084.0
107,1987,Amager Vest,Women,2056151000.0,103898.0,19790.0
108,1987,Amager Øst,Men,2614378000.0,140520.0,18605.0
109,1987,Amager Øst,Women,2191694000.0,103348.0,21207.0
110,1987,Bispebjerg,Men,2127197000.0,128679.0,16531.0
...,...,...,...,...,...,...
4285,2019,Vanløse,Women,5566365000.0,319410.0,17427.0
4286,2019,Vesterbro/Kongens Enghave,Men,11307215000.0,381678.0,29625.0
4287,2019,Vesterbro/Kongens Enghave,Women,9399658000.0,319325.0,29436.0
4288,2019,Østerbro,Men,13865000000.0,448024.0,30947.0


#### Aggregate over column `Sex`

In [31]:
# Drop the column: "Average income" as it is not suited for aggregating.
df_temp_income_1 = (
    cph_data_superset.income
    .drop(['Average income (kr.)'], axis=1)
)

# Aggregate number of people over columns Sex and Age
columns_to_aggregate = ['Total income in district (kr.)', 'Number of people']
columns_to_remove = ['Sex'] 

# Create a dataframe where Age and Sex information will be removed.
df_temp_income_1 = (
    df_temp_income_1
    .groupby(get_df_columns(df_temp_income_1,
                            exclude=columns_to_remove + columns_to_aggregate),
             as_index=False)
    .sum()
)

# Re-add the column: "Average income".
df_temp_income_1['Average income (kr.)'] = (
    df_temp_income_1['Total income in district (kr.)']
    .div(df_temp_income_1['Number of people'])
)

# Sort the columns.
df_temp_income_1 = df_sort_columns(df_temp_income_1)

# Show the result.
display(df_temp_income_1)

Unnamed: 0,Year,District,Average income (kr.),Number of people,Total income in district (kr.)
0,1987,Amager Vest,120003.5,36874.0,4425009000.0
1,1987,Amager Øst,120719.2,39812.0,4806072000.0
2,1987,Bispebjerg,109902.1,37841.0,4158804000.0
3,1987,Brønshøj-Husum,124875.5,31076.0,3880632000.0
4,1987,Indre By,146519.9,35839.0,5251126000.0
...,...,...,...,...,...
358,2019,Unlocated,144593.6,2955.0,427274000.0
359,2019,Valby,331929.4,48587.0,16127455000.0
360,2019,Vanløse,350026.9,33675.0,11787157000.0
361,2019,Vesterbro/Kongens Enghave,350601.5,59061.0,20706873000.0


#### Check that the number of people looks right

In [32]:
# Sum all the number of people in 2019.
sum_2019 =(
    df_temp_income_1
    .loc[df_temp_income_1['Year'] == 2019,
         'Number of people']
    .to_numpy()
    .sum()
)

# Display the result.
printmd(f'The sum of people in Copenhagen in 2019 '
        f'has been calculated to ***{sum_2019:,.0f}.***')

The sum of people in Copenhagen in 2019 has been calculated to ***515,890.***

This number is significantly lower than the ***629,513*** we get from the ***Citizenship*** dataset. Let's take a look why:

In [33]:
# Define the relevant columns.
columns = ['Year', 'District', 'Number of people']

# Pre process the dataframes.
df_n_people_citizenship = cph_data_superset.citizenship[columns]
df_n_people_citizenship.columns = df_n_people_citizenship.columns.droplevel(1)

# Create a dataframe showing the number of people by year and district
# for the Citizenship and Income dataset.
df_n_people = (
    df_n_people_citizenship
    .merge(cph_data_superset.income[columns],
           on=['Year', 'District'],
           suffixes=[', ' + cph_data_superset.citizenship.attrs['name'],
                     ', ' + cph_data_superset.income.attrs['name']],
           how='inner')
)

# Add a column showing the difference in the number of people.
df_n_people['Number of people, Citizenship - Income'] = (
    df_n_people['Number of people, Citizenship']
    .sub(df_n_people['Number of people, Income'])
)

# Display the results.
display(df_n_people)
printmd('Statistics about the difference:')
display(df_n_people['Number of people, Citizenship - Income']
        .describe()
        .to_frame())

Unnamed: 0,Year,District,"Number of people, Citizenship","Number of people, Income","Number of people, Citizenship - Income"
0,1987,Amager Vest,45978.0,17084.0,28894.0
1,1987,Amager Vest,45978.0,19790.0,26188.0
2,1987,Amager Øst,45879.0,18605.0,27274.0
3,1987,Amager Øst,45879.0,21207.0,24672.0
4,1987,Bispebjerg,43004.0,16531.0,26473.0
...,...,...,...,...,...
655,2019,Vanløse,41137.0,17427.0,23710.0
656,2019,Vesterbro/Kongens Enghave,72128.0,29625.0,42503.0
657,2019,Vesterbro/Kongens Enghave,72128.0,29436.0,42692.0
658,2019,Østerbro,79908.0,30947.0,48961.0


Statistics about the difference:

Unnamed: 0,"Number of people, Citizenship - Income"
count,660.0
mean,30049.3
std,7338.8
min,17930.0
25%,24801.2
50%,27824.0
75%,34145.0
max,48961.0


We can see that there are around 8,500, or around 20% less citizens in each district in the ***Income*** dataset than in the ***Citizenship*** dataset. I believe the difference caused by exclusion of children from the ***Income*** data.

#### Remove columns containing total income and number of people per district.

Because we already know the total number of people in Copenhagen from the ***Citizenship*** dataset, I choose to only keep the information about *average income*, for every year and district.

In [34]:
# Remove columns containing total income and number of people per district.
cph_data_superset.income = (
    df_temp_income_1
    .drop(['Number of people', 'Total income in district (kr.)'],
          axis=1)
)

# Show the result.
display(cph_data_superset.income)

Unnamed: 0,Year,District,Average income (kr.)
0,1987,Amager Vest,120003.5
1,1987,Amager Øst,120719.2
2,1987,Bispebjerg,109902.1
3,1987,Brønshøj-Husum,124875.5
4,1987,Indre By,146519.9
...,...,...,...
358,2019,Unlocated,144593.6
359,2019,Valby,331929.4
360,2019,Vanløse,350026.9
361,2019,Vesterbro/Kongens Enghave,350601.5


### Lifespan

#### Display

In [35]:
display(cph_data_superset.life_span)

Unnamed: 0,Year,District,Average life span
0,2009,Indre By,79.4
1,2009,Østerbro,78.4
2,2009,Nørrebro,72.3
3,2009,Vesterbro/Kongens Enghave,73.5
4,2009,Valby,75.9
...,...,...,...
127,2020,Brønshøj-Husum,78.5
128,2020,Bispebjerg,77.8
129,2020,Amager Øst,81.0
130,2020,Amager Vest,78.7


### Marital status

#### Display

In [36]:
display(cph_data_superset.marital_status)

Unnamed: 0,Year,District,Sex,Age,Marital status,Number of people
2491,1974,Amager Vest,Men,0-9,Divorced,0.0
2492,1975,Amager Vest,Men,0-9,Divorced,0.0
2493,1976,Amager Vest,Men,0-9,Divorced,0.0
2494,1977,Amager Vest,Men,0-9,Divorced,0.0
2495,1978,Amager Vest,Men,0-9,Divorced,0.0
...,...,...,...,...,...,...
240635,2016,Østerbro,Women,90+,Widowed,235.0
240636,2017,Østerbro,Women,90+,Widowed,220.0
240637,2018,Østerbro,Women,90+,Widowed,211.0
240638,2019,Østerbro,Women,90+,Widowed,208.0


#### Aggregate over columns `Age` and `Sex`

In [37]:
# Aggregate number of people over columns Sex and Age
columns_to_aggregate = ['Number of people']
columns_to_remove = ['Sex', 'Age'] 

# Create a dataframe where Age and Sex information will be removed.
df_temp_marital_status_1 = (
    cph_data_superset.marital_status
    .groupby(get_df_columns(cph_data_superset.marital_status,
                            exclude=columns_to_remove + columns_to_aggregate),
             as_index=False)
    .sum()
)

# Show the result.
display(df_temp_marital_status_1)

Unnamed: 0,Year,District,Marital status,Number of people
0,1974,Amager Vest,Divorced,5066.0
1,1974,Amager Vest,Married / seperated,22022.0
2,1974,Amager Vest,Never married,22094.0
3,1974,Amager Vest,Widowed,4838.0
4,1974,Amager Øst,Divorced,5064.0
...,...,...,...,...
1875,2020,Vesterbro/Kongens Enghave,Widowed,1027.0
1876,2020,Østerbro,Divorced,7375.0
1877,2020,Østerbro,Married / seperated,19476.0
1878,2020,Østerbro,Never married,51514.0


#### Create new columns containing the number of people for each marital status:

In [38]:
# Create new columns containing the number of people for each citizenship group:
df_temp_marital_status_2 = (
    df_create_column_for_each_unique_value(
        df_temp_marital_status_1,
        category_columns=['Marital status'],
        value_columns='Number of people'
    )
)

# Show the results.
display(df_temp_marital_status_2)

Unnamed: 0_level_0,Year,District,Number of people,Number of people,Number of people,Number of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Divorced,Married / seperated,Never married,Widowed
0,1974,Amager Vest,5066.0,22022.0,22094.0,4838.0
1,1974,Amager Øst,5064.0,25077.0,18434.0,5452.0
2,1974,Bispebjerg,5357.0,24056.0,15658.0,6239.0
3,1974,Brønshøj-Husum,3460.0,21320.0,16313.0,3661.0
4,1974,Indre By,6399.0,19686.0,24515.0,3946.0
...,...,...,...,...,...,...
465,2020,Nørrebro,5859.0,14183.0,58223.0,1336.0
466,2020,Valby,5445.0,15080.0,39733.0,1637.0
467,2020,Vanløse,3741.0,11432.0,24833.0,1152.0
468,2020,Vesterbro/Kongens Enghave,5999.0,15325.0,51824.0,1027.0


#### Let's express the number of people as fractions of the total.

In [39]:
# Initialize the processing stage.
df_temp_marital_status_3 = df_temp_marital_status_2.copy()

# Calculate the total number of people
df_temp_marital_status_3['Total number of people'] = (
    df_temp_marital_status_3
    .loc[:, ['Number of people']]
    .sum(axis=1)
)

# Calculate percentages.
df_temp_marital_status_3['Number of people'] = (
    df_temp_marital_status_3['Number of people']
    .div(df_temp_marital_status_3['Total number of people'], axis='index')
    .mul(100, axis='index')
)

# Change the name from "number of people" to ""% of people"
df_temp_marital_status_3.columns = (
    df_temp_marital_status_3.columns.set_levels(
        (
            df_temp_marital_status_3
            .columns
            .levels[0]
            .str
            .replace('Number of people', r'% of people')
            .str
            .replace('Total number of people', r'Number of people')
        ),
        level=0
    )
)

# Show the resulting dataframe.
display(df_temp_marital_status_3)

Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,% of people,Number of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Divorced,Married / seperated,Never married,Widowed,Unnamed: 7_level_1
0,1974,Amager Vest,9.4,40.8,40.9,9.0,54020.0
1,1974,Amager Øst,9.4,46.4,34.1,10.1,54027.0
2,1974,Bispebjerg,10.4,46.9,30.5,12.2,51310.0
3,1974,Brønshøj-Husum,7.7,47.6,36.5,8.2,44754.0
4,1974,Indre By,11.7,36.1,44.9,7.2,54546.0
...,...,...,...,...,...,...,...
465,2020,Nørrebro,7.4,17.8,73.1,1.7,79601.0
466,2020,Valby,8.8,24.4,64.2,2.6,61895.0
467,2020,Vanløse,9.1,27.8,60.3,2.8,41158.0
468,2020,Vesterbro/Kongens Enghave,8.1,20.7,69.9,1.4,74175.0


#### Check that the number of people looks right

In [40]:
# Sum all the number of people in 2019.
sum_2019 =(
    df_temp_marital_status_3
    .loc[df_temp_marital_status_3['Year'] == 2019,
         'Number of people']
    .to_numpy()
    .sum()
)

# Display the result.
printmd(f'The sum of people in Copenhagen in 2019 '
        f'has been calculated to ***{sum_2019:,.0f}.***')

The sum of people in Copenhagen in 2019 has been calculated to ***629,513.***

The number is the same as the one obtained from the ***Citizenship*** dataset. We can therefore drop the column `Number of people` and avoid redundancy.

#### Remove the column `Number of people`

In [41]:
# Remove the column "Number of people".
cph_data_superset.marital_status = (
    df_temp_marital_status_3
    .drop(['Number of people'],
          axis=1)
)

# Show the results.
display(cph_data_superset.marital_status)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,% of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Divorced,Married / seperated,Never married,Widowed
0,1974,Amager Vest,9.4,40.8,40.9,9.0
1,1974,Amager Øst,9.4,46.4,34.1,10.1
2,1974,Bispebjerg,10.4,46.9,30.5,12.2
3,1974,Brønshøj-Husum,7.7,47.6,36.5,8.2
4,1974,Indre By,11.7,36.1,44.9,7.2
...,...,...,...,...,...,...
465,2020,Nørrebro,7.4,17.8,73.1,1.7
466,2020,Valby,8.8,24.4,64.2,2.6
467,2020,Vanløse,9.1,27.8,60.3,2.8
468,2020,Vesterbro/Kongens Enghave,8.1,20.7,69.9,1.4


### Population movement

#### Display

In [42]:
display(cph_data_superset.population_movement)

Unnamed: 0,Year,District,Deaths,Emigrated,Immigrated,Internal migration,Internal migration from other districts,Internal migration to to other districts,Internal migratrion from danish municipalities,Internal migratrion to danish municipalities,Live births,Natural increase,Netmigration,Netmigration Copenhagen,Netmigration Denmark
2173,1975,Amager Vest,1041.0,1451.0,564.0,3454.0,6344.0,4817.0,2736.0,3974.0,559.0,-482.0,-887.0,1527.0,-1238.0
2174,1976,Amager Vest,1063.0,1295.0,720.0,2935.0,6162.0,4942.0,2979.0,4211.0,433.0,-630.0,-575.0,1220.0,-1232.0
2175,1977,Amager Vest,965.0,1689.0,815.0,3775.0,6626.0,4828.0,2676.0,3926.0,444.0,-521.0,-874.0,1798.0,-1250.0
2176,1978,Amager Vest,984.0,1357.0,870.0,2865.0,5656.0,4561.0,2695.0,3495.0,460.0,-524.0,-487.0,1095.0,-800.0
2177,1980,Amager Vest,1001.0,1297.0,655.0,2888.0,5757.0,4355.0,2869.0,3007.0,414.0,-587.0,-642.0,1402.0,-138.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2578,2016,Østerbro,461.0,1877.0,2905.0,3788.0,5533.0,5835.0,5576.0,5467.0,1267.0,806.0,1028.0,-302.0,109.0
2579,2017,Østerbro,475.0,2006.0,3000.0,3855.0,5642.0,5918.0,5818.0,5904.0,1244.0,769.0,994.0,-276.0,-86.0
2580,2018,Østerbro,463.0,2235.0,3017.0,3709.0,5310.0,5963.0,4930.0,5726.0,1251.0,788.0,782.0,-653.0,-796.0
2581,2019,Østerbro,443.0,2277.0,3018.0,3813.0,5493.0,6120.0,5255.0,5699.0,1303.0,860.0,741.0,-627.0,-444.0


#### Let's express the number of people as fractions of the total.

In [43]:
# Initialize the processing stage.
df_temp_population_movement_1 = cph_data_superset.population_movement.copy()

# Get the Number of people from the Citizenship dataset
columns = ['Year', 'District', 'Number of people']
df_n_people_citizenship = cph_data_superset.citizenship[columns]
df_n_people_citizenship.columns = df_n_people_citizenship.columns.droplevel(1)

df_temp_population_movement_1 = (
    df_temp_population_movement_1
    .merge(df_n_people_citizenship,
           on=['Year', 'District'],
           how='inner')
)

# Set the name again, as merge deletes it.
df_temp_population_movement_1.attrs['name'] = 'Population movement'

# Get the columns to express as fractions
columns_to_express_as_fractions = get_df_columns(
    df_temp_population_movement_1,
    exclude=['Year', 'District', 'Number of people']
)

# Calculate percentages.
df_temp_population_movement_1[columns_to_express_as_fractions] = (
    df_temp_population_movement_1[columns_to_express_as_fractions]
    .div(df_temp_population_movement_1['Number of people'], axis='index')
    .mul(100, axis='index')
)

# Drop the column "Number of people".
df_temp_population_movement_1 = (
    df_temp_population_movement_1
    .drop(['Number of people'],
          axis=1)
)

# Adjust the column names.
df_temp_population_movement_1.columns = pd.MultiIndex.from_tuples(
            [(column, ) + ('', )
             if column not in columns_to_express_as_fractions
             else
             (r'% of people', ) + (column, )
             for column in list(df_temp_population_movement_1.columns)]
)

# Assign the dataframe to the superset object.
cph_data_superset.population_movement = df_temp_population_movement_1
    
# Show the resulting dataframe.
display(cph_data_superset.population_movement)

Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Deaths,Emigrated,Immigrated,Internal migration,Internal migration from other districts,Internal migration to to other districts,Internal migratrion from danish municipalities,Internal migratrion to danish municipalities,Live births,Natural increase,Netmigration,Netmigration Copenhagen,Netmigration Denmark
0,1980,Amager Vest,2.1,2.7,1.4,6.0,12.0,9.1,6.0,6.3,0.9,-1.2,-1.3,2.9,-0.3
1,1981,Amager Vest,2.2,2.2,1.1,6.0,11.5,8.8,5.8,5.9,0.7,-1.5,-1.2,2.7,-0.0
2,1982,Amager Vest,1.9,2.1,1.0,5.8,12.0,8.9,5.9,6.0,0.9,-1.1,-1.2,3.2,-0.1
3,1987,Amager Vest,1.8,2.0,1.0,6.5,13.2,10.6,6.4,8.1,1.0,-0.9,-1.0,2.6,-1.7
4,1988,Amager Vest,1.9,2.2,1.0,6.6,14.3,11.4,6.3,7.0,1.0,-0.9,-1.1,2.9,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,2016,Østerbro,0.6,2.4,3.7,4.9,7.1,7.5,7.2,7.0,1.6,1.0,1.3,-0.4,0.1
366,2017,Østerbro,0.6,2.5,3.8,4.9,7.1,7.5,7.3,7.4,1.6,1.0,1.3,-0.3,-0.1
367,2018,Østerbro,0.6,2.8,3.8,4.7,6.7,7.5,6.2,7.2,1.6,1.0,1.0,-0.8,-1.0
368,2019,Østerbro,0.6,2.8,3.8,4.8,6.9,7.7,6.6,7.1,1.6,1.1,0.9,-0.8,-0.6


## Make sure all the tables have the same number of levels in the column index

In [44]:
# Get the maximal number of levels present in the column index of
# any of the dataframes in the superset.
max_n_levels = max([df.columns.nlevels
                    for df in cph_data_superset.get_all_dataframes()])

# Add empty index layers to columns with less than max levels.
for attribute_name, df in get_obj_attributes(cph_data_superset).items():
    if df.columns.nlevels < max_n_levels:
        df.columns = pd.MultiIndex.from_tuples(
            [idx + ('', ) * (max_n_levels - df.columns.nlevels)
             if isinstance(idx, tuple)
             else (idx, ) + ('', ) * (max_n_levels - df.columns.nlevels)
             for idx in list(df.columns)]
        )
        
    # Save the dataframe in the object: cph_data_superset.
    setattr(cph_data_superset, attribute_name, df)

## Create the superset

### Show all datasets to be merged

In [45]:
cph_data_superset.display_dataframes()


**Citizenship**:

Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,Number of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Denmark,Non-western countries,Western countries,Unnamed: 6_level_1
,,,,,,
0,1980,Amager Vest,94.6,2.2,3.1,47828.0
1,1980,Amager Øst,95.4,2.5,2.1,48473.0
2,1980,Bispebjerg,96.4,2.3,1.3,44887.0
3,1980,Brønshøj-Husum,97.9,0.8,1.3,39683.0
4,1980,Indre By,92.4,2.0,5.6,45758.0
...,...,...,...,...,...,...
405,2020,Nørrebro,84.5,6.2,9.3,79601.0
406,2020,Valby,83.0,8.2,8.7,61895.0
407,2020,Vanløse,88.2,5.5,6.4,41158.0



**Dwellings**:

Unnamed: 0_level_0,Year,District,% of total dwellings area,% of total dwellings area,% of total dwellings area,% of total dwellings area,% of total dwellings area,Total dwellings area
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Housing cooperative,Non-profit housing,Owner-occupied,Private rental,Public authorities,Unnamed: 8_level_1
,,,,,,,,
0,1991,Amager Vest,17.0,20.3,23.6,36.6,2.6,1794562.0
1,1991,Amager Øst,23.5,14.3,27.0,30.2,5.0,1948001.0
2,1991,Bispebjerg,12.6,33.7,13.7,22.3,17.7,1783222.0
3,1991,Brønshøj-Husum,4.4,35.8,38.0,15.6,6.1,1507059.0
4,1991,Indre By,14.1,5.2,23.8,50.9,6.0,2378640.0
...,...,...,...,...,...,...,...,...
305,2021,Nørrebro,43.6,22.1,9.8,22.7,1.8,3112255.0
306,2021,Valby,23.3,19.5,31.3,25.4,0.5,2528779.0
307,2021,Vanløse,24.6,12.1,39.9,23.3,0.2,1678766.0



**Family type and children**:

Unnamed: 0_level_0,Year,District,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,Number of families
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Single men,Single men,Single men,Single men,Single men,Single women,Single women,Single women,Single women,Single women,Unnamed: 28_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,Unnamed: 28_level_2
0,1998,Amager Vest,0.3,0.0,0.0,0.0,0.0,7.1,2.0,1.1,0.2,0.1,11.1,3.8,3.7,0.9,0.4,30.7,0.5,0.2,0.0,0.0,32.1,3.6,1.8,0.4,0.1,27932.0
1,1998,Amager Øst,0.2,0.0,0.0,0.0,0.0,8.6,2.1,0.9,0.1,0.0,10.7,3.4,3.0,0.8,0.2,32.3,0.6,0.1,0.0,0.0,32.4,3.1,1.2,0.2,0.1,30512.0
2,1998,Bispebjerg,0.3,0.0,0.0,0.0,0.0,7.2,1.3,0.6,0.1,0.0,9.9,2.8,2.4,0.8,0.3,32.7,0.5,0.1,0.0,0.0,36.7,2.9,1.1,0.2,0.1,29550.0
3,1998,Brønshøj-Husum,0.4,0.0,0.0,0.0,0.0,5.2,2.3,1.5,0.3,0.1,15.6,5.5,6.2,1.8,0.5,23.2,0.6,0.1,0.0,0.0,30.8,3.5,1.7,0.5,0.1,21601.0
4,1998,Indre By,0.2,0.0,0.0,0.0,0.0,8.2,2.1,1.0,0.2,0.0,10.3,3.4,2.3,0.4,0.1,33.5,0.5,0.1,0.0,0.0,33.7,3.1,0.9,0.1,0.0,28879.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,2020,Nørrebro,0.1,0.0,0.0,0.0,0.0,10.6,3.0,1.6,0.2,0.1,5.0,3.0,3.1,1.1,0.3,32.1,0.6,0.1,0.0,0.0,34.3,3.0,1.3,0.3,0.2,50573.0
226,2020,Valby,0.2,0.0,0.0,0.0,0.0,9.0,2.8,1.6,0.2,0.1,8.1,4.0,4.9,1.5,0.3,31.1,0.8,0.2,0.0,0.0,29.6,3.6,1.6,0.3,0.1,36712.0
227,2020,Vanløse,0.2,0.0,0.0,0.0,0.0,7.9,2.5,1.6,0.3,0.1,10.3,4.0,6.1,1.8,0.2,28.9,0.8,0.2,0.0,0.0,30.2,3.2,1.4,0.3,0.1,23842.0
228,2020,Vesterbro/Kongens Enghave,0.1,0.0,0.0,0.0,0.0,10.0,3.3,1.7,0.3,0.0,6.4,3.6,3.9,0.9,0.1,33.7,0.8,0.1,0.0,0.0,29.8,3.3,1.5,0.2,0.0,46119.0



**Income**:

Unnamed: 0,Year,District,Average income (kr.)
,,,
,,,
0,1987,Amager Vest,120003.5
1,1987,Amager Øst,120719.2
2,1987,Bispebjerg,109902.1
3,1987,Brønshøj-Husum,124875.5
4,1987,Indre By,146519.9
...,...,...,...
358,2019,Unlocated,144593.6
359,2019,Valby,331929.4



**Life span**:

Unnamed: 0,Year,District,Average life span
,,,
,,,
0,2009,Indre By,79.4
1,2009,Østerbro,78.4
2,2009,Nørrebro,72.3
3,2009,Vesterbro/Kongens Enghave,73.5
4,2009,Valby,75.9
...,...,...,...
127,2020,Brønshøj-Husum,78.5
128,2020,Bispebjerg,77.8



**Marital status**:

Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,% of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Divorced,Married / seperated,Never married,Widowed
,,,,,,
0,1974,Amager Vest,9.4,40.8,40.9,9.0
1,1974,Amager Øst,9.4,46.4,34.1,10.1
2,1974,Bispebjerg,10.4,46.9,30.5,12.2
3,1974,Brønshøj-Husum,7.7,47.6,36.5,8.2
4,1974,Indre By,11.7,36.1,44.9,7.2
...,...,...,...,...,...,...
465,2020,Nørrebro,7.4,17.8,73.1,1.7
466,2020,Valby,8.8,24.4,64.2,2.6
467,2020,Vanløse,9.1,27.8,60.3,2.8



**Population movement**:

Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Deaths,Emigrated,Immigrated,Internal migration,Internal migration from other districts,Internal migration to to other districts,Internal migratrion from danish municipalities,Internal migratrion to danish municipalities,Live births,Natural increase,Netmigration,Netmigration Copenhagen,Netmigration Denmark
,,,,,,,,,,,,,,,
0,1980,Amager Vest,2.1,2.7,1.4,6.0,12.0,9.1,6.0,6.3,0.9,-1.2,-1.3,2.9,-0.3
1,1981,Amager Vest,2.2,2.2,1.1,6.0,11.5,8.8,5.8,5.9,0.7,-1.5,-1.2,2.7,-0.0
2,1982,Amager Vest,1.9,2.1,1.0,5.8,12.0,8.9,5.9,6.0,0.9,-1.1,-1.2,3.2,-0.1
3,1987,Amager Vest,1.8,2.0,1.0,6.5,13.2,10.6,6.4,8.1,1.0,-0.9,-1.0,2.6,-1.7
4,1988,Amager Vest,1.9,2.2,1.0,6.6,14.3,11.4,6.3,7.0,1.0,-0.9,-1.1,2.9,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,2016,Østerbro,0.6,2.4,3.7,4.9,7.1,7.5,7.2,7.0,1.6,1.0,1.3,-0.4,0.1
366,2017,Østerbro,0.6,2.5,3.8,4.9,7.1,7.5,7.3,7.4,1.6,1.0,1.3,-0.3,-0.1
367,2018,Østerbro,0.6,2.8,3.8,4.7,6.7,7.5,6.2,7.2,1.6,1.0,1.0,-0.8,-1.0


###  Merge

In [46]:
# Merge 
for idx_df, df in enumerate(cph_data_superset.get_all_dataframes()):
    # The first dataframe is left unaltered.
    if idx_df == 0:
        df_superset = df
        
    # Each consecutive dataframe is merged with the merge of the previous
    # dataframes on columns "Year" and "District".
    else:
        df_superset = df_superset.merge(
            df,
            on=['Year', 'District'],
            how='inner',
        )

# Show the results.
printmd('The **superset**:')
display(df_superset)
display(pd.DataFrame(df_superset['Year'].unique(), columns=['Included years']))
display(pd.DataFrame(df_superset['District'].unique(), columns=['Included districts']))

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


The **superset**:

Unnamed: 0_level_0,Year,District,% of people,% of people,% of people,Number of people,% of total dwellings area,% of total dwellings area,% of total dwellings area,% of total dwellings area,% of total dwellings area,Total dwellings area,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,% of families,Number of families,Average income (kr.),Average life span,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people,% of people
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Denmark,Non-western countries,Western countries,Unnamed: 6_level_1,Housing cooperative,Non-profit housing,Owner-occupied,Private rental,Public authorities,Unnamed: 12_level_1,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Children below 18 years not living with parents,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Couples living in consensual union and cohabiting couples,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Married couples and registered partnership,Single men,Single men,Single men,Single men,Single men,Single women,Single women,Single women,Single women,Single women,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Divorced,Married / seperated,Never married,Widowed,Deaths,Emigrated,Immigrated,Internal migration,Internal migration from other districts,Internal migration to to other districts,Internal migratrion from danish municipalities,Internal migratrion to danish municipalities,Live births,Natural increase,Netmigration,Netmigration Copenhagen,Netmigration Denmark
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,0 children,1 child,2 children,3 children,More than 3 children,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2
0,2009,Amager Vest,86.9,6.4,6.7,56257.0,27.2,20.1,29.5,22.3,0.8,2469807.0,0.2,0.0,0.0,0.0,0.0,8.0,2.5,1.2,0.2,0.0,10.2,4.4,4.3,1.2,0.4,31.7,0.6,0.1,0.0,0.0,29.6,3.2,1.6,0.4,0.2,33983.0,271994.0,74.8,8.9,27.5,60.0,3.6,0.9,2.3,2.8,4.6,8.5,8.4,7.5,6.6,1.9,1.0,0.5,0.1,0.9
1,2009,Amager Øst,87.9,6.0,6.1,50390.0,35.2,14.7,29.3,20.3,0.5,2107253.0,0.2,0.0,0.0,0.0,0.0,8.9,2.3,1.0,0.2,0.0,8.8,3.7,4.0,0.9,0.2,32.5,0.7,0.1,0.0,0.0,31.7,3.1,1.3,0.3,0.1,31821.0,262985.5,76.2,9.1,25.0,62.0,3.9,0.8,2.0,3.0,4.6,7.1,8.4,7.1,6.4,1.6,0.8,0.9,-1.2,0.7
2,2009,Bispebjerg,84.0,10.4,5.6,47798.0,32.2,34.1,15.5,17.9,0.3,1951637.0,0.2,0.0,0.0,0.0,0.0,7.7,1.7,0.7,0.1,0.0,7.0,2.9,2.8,0.9,0.4,36.8,0.6,0.1,0.0,0.0,33.0,3.1,1.3,0.4,0.1,31999.0,239752.5,74.3,11.2,22.4,62.0,4.4,1.0,2.0,3.1,4.5,9.7,10.5,8.5,7.6,1.5,0.6,1.1,-0.8,0.9
3,2009,Brønshøj-Husum,84.7,11.2,4.1,39771.0,10.1,39.3,39.5,9.7,1.4,1628317.0,0.3,0.0,0.0,0.0,0.0,4.9,2.0,1.3,0.3,0.1,13.1,4.7,6.9,2.5,0.9,27.5,0.9,0.3,0.1,0.0,27.7,3.7,2.1,0.6,0.3,21476.0,264988.8,75.8,10.3,33.8,50.1,5.7,1.4,1.2,2.2,4.3,6.3,6.2,5.3,5.7,1.5,0.1,0.9,0.1,-0.4
4,2009,Indre By,86.9,2.9,10.2,47339.0,22.7,5.4,28.1,39.1,4.6,2710400.0,0.2,0.0,0.0,0.0,0.0,7.5,2.2,1.2,0.2,0.0,10.6,3.1,3.4,0.8,0.1,32.4,0.6,0.2,0.0,0.0,33.1,2.8,1.2,0.2,0.0,30605.0,326660.1,79.4,9.6,27.4,59.7,3.2,0.6,3.1,4.3,5.0,8.5,9.0,7.5,6.7,1.5,0.9,1.2,-0.5,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,2019,Nørrebro,84.2,6.5,9.2,80646.0,44.5,22.4,9.6,21.8,1.7,3064700.0,0.1,0.0,0.0,0.0,0.0,10.3,3.0,1.5,0.2,0.1,5.1,3.0,3.0,1.0,0.4,32.5,0.6,0.1,0.0,0.0,34.1,2.9,1.4,0.3,0.2,51332.0,292138.7,76.4,7.3,17.9,73.1,1.7,0.5,2.3,2.7,5.8,8.5,9.9,6.8,7.5,1.9,1.3,0.4,-1.4,-0.7
106,2019,Valby,83.1,8.3,8.6,59840.0,25.5,19.9,30.3,23.8,0.5,2341961.0,0.1,0.0,0.0,0.0,0.0,9.1,2.8,1.6,0.2,0.1,8.2,4.1,4.9,1.6,0.3,31.1,0.8,0.2,0.0,0.0,29.2,3.7,1.6,0.4,0.1,35397.0,331929.4,78.4,9.0,24.6,63.7,2.7,0.6,2.1,2.8,4.5,7.9,6.7,9.1,8.6,1.7,1.0,0.7,1.2,0.5
107,2019,Vanløse,88.0,5.6,6.4,41137.0,24.9,12.0,39.6,23.4,0.2,1668619.0,0.2,0.0,0.0,0.0,0.0,8.0,2.4,1.6,0.3,0.1,10.4,4.0,6.2,1.8,0.2,28.9,0.7,0.2,0.0,0.0,29.8,3.4,1.3,0.3,0.1,23763.0,350026.9,80.8,9.2,27.9,60.0,2.9,0.6,1.8,1.9,2.7,6.1,6.3,6.6,7.2,1.4,0.8,0.1,-0.2,-0.6
108,2019,Vesterbro/Kongens Enghave,81.9,5.9,12.2,72128.0,37.3,15.4,20.3,26.3,0.7,2935055.0,0.1,0.0,0.0,0.0,0.0,9.7,3.3,1.7,0.3,0.0,6.5,3.6,3.8,0.9,0.1,34.1,0.8,0.1,0.0,0.0,29.6,3.4,1.5,0.3,0.1,44900.0,350601.5,77.7,8.1,20.8,69.6,1.4,0.4,2.9,3.7,5.3,9.3,8.6,8.8,7.8,2.0,1.6,0.8,0.7,1.0


Unnamed: 0,Included years
0,2009
1,2010
2,2011
3,2012
4,2013
5,2014
6,2015
7,2016
8,2017
9,2018


Unnamed: 0,Included districts
0,Amager Vest
1,Amager Øst
2,Bispebjerg
3,Brønshøj-Husum
4,Indre By
5,Nørrebro
6,Valby
7,Vanløse
8,Vesterbro/Kongens Enghave
9,Østerbro


### Sort columns

In [47]:
# Sort the columns so that Year and District show up first
df_superset = (
    df_sort_columns(df_superset,
                    first_columns=[('Year', '', ''),
                                   ('District', '', '')])
)

### Save superset dataframe

In [49]:
# Save the superset to disk.
df_superset.to_pickle(path_data_clean_root / 'cph_clean_superset.pkl')

### Show data for a chosen district

In [None]:
# Choose district
district = 'Nørrebro'

# Sort the columns so that Year and District show up first
display(
    df_superset
    .loc[df_superset[('District', '', '')] == district]
)

<hr style="border:2px solid black"> </hr>

# Sandbox