# Synthetic Population

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

In [None]:
# Download locations of the source data sets (boundary files and 2011 census tables for London).
LONDON_BOUNDARY_FILE_URL = 'https://files.datapress.com/london/dataset/statistical-gis-boundary-files-london/2016-10-03T13:52:28/statistical-gis-boundaries-london.zip'
LONDON_CENSUS_LABOUR_URL = 'https://files.datapress.com/london/dataset/2011-census-labour-and-qualifications/visualisation-data-labour.zip'
LONDON_CENSUS_QUALIFICATION_URL = 'https://files.datapress.com/london/dataset/2011-census-labour-and-qualifications/visualisation-data-qualifications.zip'
LONDON_CENSUS_WARD_POPULATION_URL = 'https://files.datapress.com/london/dataset/2011-census-demography/ward-pop-ONS-GLA-Census.xls'
LONDON_CENSUS_BOROUGH_POPULATION_URL = 'https://files.datapress.com/london/dataset/2011-census-demography/london-unrounded-data.xls'

# Names of the Excel workbooks that are looked up inside the labour and qualification zip archives.
LABOUR_FILE_PATH = Path('./LABOUR.xlsx')
QUALIFICATION_FILE_PATH = Path('./QUALIFICATIONS.xlsx')

## Helper Functions

In [None]:
import io
import zipfile
import tempfile
import requests
import requests_cache

# Install a requests_cache cache named '../build/cache' for the HTTP requests below
# (presumably so that re-running the notebook does not download the files again -- TODO confirm).
requests_cache.install_cache('../build/cache')

def read_census_file(url, filename):
    """Reads census 2011 data from the London data store.

    The dataset is reduced to Haringey and ward resolution. All other data is discarded.

    Parameters:
        url: address of a zip archive that contains the Excel workbook.
        filename: name (str or Path) of the workbook inside the archive.

    Returns:
        A DataFrame indexed by ward name (index name 'ward') holding the
        remaining data columns of the '2011 Data' sheet.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the archive has no member called `filename`, or an
            expected column is missing from the sheet.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of trying to unzip an error page.
    response.raise_for_status()

    # Read the single workbook straight from the archive; Path normalises a
    # leading './' so that the name matches the archive member.
    member_name = Path(filename).as_posix()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        workbook = io.BytesIO(archive.read(member_name))

    df = pd.read_excel(
        workbook,
        sheet_name='2011 Data',
        skiprows=[0],
        header=[0]
    )

    # The area type is only filled in for the first row of each group, hence the forward fill.
    df = df.rename(columns={'Unnamed: 1': 'area_type'})
    df['area_type'] = df['area_type'].ffill()

    is_haringey_ward = (df['DISTLABEL'] == 'Haringey') & (df['area_type'] == 'ward')
    # Build a new frame instead of deleting columns from a filtered slice in place.
    df = df.loc[is_haringey_ward].drop(
        ['DISTLABEL', 'area_type', 'ZONEID', 'Unnamed: 2'],
        axis=1
    )
    df = df.set_index('ZONELABEL')
    df.index.name = 'ward'
    return df

In [None]:
def reduce_census_data_to_tottenham(census_data):
    """Sums up all rows whose index label contains 'Tottenham'.

    Rows with any other label are ignored. The result of `census_data.sum()`
    over the remaining rows is returned.
    """
    is_tottenham = [('Tottenham' in ward_label) for ward_label in census_data.index]
    tottenham_rows = census_data[is_tottenham]
    return tottenham_rows.sum()

## Example not considering households

Two attributes with two categories each:

* age: 0-50yrs, 50-100yrs
* sex: m, f

In [None]:
from enum import Enum

class OrderedEnum(Enum):
    """Enum base class whose members can be ordered by their values.

    Ordering is only defined between members of the very same enum class.
    For any other operand ``NotImplemented`` is returned, which lets Python
    try the reflected operation or raise ``TypeError``.
    """

    def __lt__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value < other.value

    def __le__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value <= other.value

    def __gt__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value > other.value

    def __ge__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value >= other.value

class Age(OrderedEnum):
    """Age category of an individual in the two-attribute example."""
    AGE0_50 = 1  # 0-50 years
    AGE50_100 = 2  # 50-100 years
    
class Sex(OrderedEnum):
    """Sex category of an individual in the two-attribute example."""
    MALE = 1  # m
    FEMALE = 2  # f
        

Let's create some microdata, the seed for the algorithm.

In [None]:
# Five seed individuals, each an (age, sex) tuple.
# p1 and p2 share the same combination of attributes.
p1 = (Age.AGE0_50, Sex.MALE)
p2 = (Age.AGE0_50, Sex.MALE)
p3 = (Age.AGE50_100, Sex.MALE)
p4 = (Age.AGE0_50, Sex.FEMALE)
p5 = (Age.AGE50_100, Sex.FEMALE)

Let's make up some statistics about the entire population.

In [None]:
# Made-up totals for the entire population, keyed by attribute category.
# The two age categories (75 + 25) and the two sex categories (65 + 35) each add up to 100.
averages = {
    Age.AGE0_50: 75,
    Age.AGE50_100: 25,
    Sex.MALE: 65,
    Sex.FEMALE: 35
}

## Iterative Proportional Fitting

In [None]:
# Seed table for the fitting: one row per sex/age combination, 'total' holds
# the number of seed individuals with that combination (five in total).
df_in = pd.DataFrame(
        {
            'sex': [Sex.MALE, Sex.MALE, Sex.FEMALE, Sex.FEMALE],
            'age': [Age.AGE0_50, Age.AGE50_100, Age.AGE0_50, Age.AGE50_100],
            'total': [2, 1, 1, 1]
        }
    )
df_in

In [None]:
# One Series of marginal totals per dimension. The group-by is only used to
# obtain Series with the right index; the seed sums are overwritten below
# with the population statistics that the fitted table has to reproduce.
xip = df_in.groupby('sex')['total'].sum()
xpj = df_in.groupby('age')['total'].sum()

# Label-based assignment via `.loc`; the `.ix` indexer has been removed from pandas.
xip.loc[Sex.MALE] = averages[Sex.MALE]
xip.loc[Sex.FEMALE] = averages[Sex.FEMALE]

xpj.loc[Age.AGE0_50] = averages[Age.AGE0_50]
xpj.loc[Age.AGE50_100] = averages[Age.AGE50_100]

# Target marginals and the dimension each of them belongs to, in matching order.
aggregates = [xip, xpj]
dimensions = [['sex'], ['age']]

In [None]:
from ipfn import *

# Fit the seed table to the target marginals using the third-party `ipfn` package.
# NOTE(review): relies on `from ipfn import *` exposing the `ipfn` submodule -- confirm
# against the installed ipfn version.
IPF = ipfn.ipfn(
    df_in,
    aggregates, 
    dimensions
)
df_out = IPF.iteration()
df_out

In [None]:
# Fitted totals per sex. 'total' is selected explicitly because the remaining
# 'age' column holds enum members, which cannot be summed.
df_out.groupby('sex')[['total']].sum()

In [None]:
# Fitted totals per age category. 'total' is selected explicitly because the
# remaining 'sex' column holds enum members, which cannot be summed.
df_out.groupby('age')[['total']].sum()

These numbers correctly mimic the population statistics.

Next, let's create a synthetic population based on these numbers. Assuming the fitted table can be understood as a joint probability mass function, we can draw 100 individuals from it with 100 Monte Carlo draws.

In [None]:
import random

def create_individual(df):
    """Draws one individual from a weighted table of attribute combinations.

    Each row of `df` is one combination of 'sex' and 'age'; its 'total' value
    is the weight of that combination. A row is picked with a probability
    proportional to its weight.

    Parameters:
        df: DataFrame with the columns 'sex', 'age' and 'total'.

    Returns:
        Array holding the 'sex' and the 'age' value of the drawn row, in that order.

    Raises:
        ValueError: if `df` is empty or its weights do not add up to a positive number.
    """
    weights = df['total']
    total_weight = weights.sum()
    if not total_weight > 0:
        raise ValueError("cannot draw an individual: 'total' holds no positive weights")

    # Scale the draw to the actual sum of weights instead of assuming it is exactly 100.
    random_number = random.uniform(0, total_weight)
    summed_probability = 0
    for label, weight in weights.items():
        summed_probability += weight
        if random_number < summed_probability:
            return df.loc[label, ['sex', 'age']].values

    # Floating point rounding can leave the running sum marginally below the
    # draw; in that case the draw belongs to the last row with a positive weight.
    last_label = weights[weights > 0].index[-1]
    return df.loc[last_label, ['sex', 'age']].values

In [None]:
create_individual(df_out)

In [None]:
# A fixed seed makes the drawn population reproducible.
random.seed('syntheticpopulation')

# create_individual() returns the values of each individual in the order
# ['sex', 'age'], so the columns have to be labelled in that same order.
synthetic_population = pd.DataFrame(
    data=[create_individual(df_out) for _ in range(100)],
    columns=['sex', 'age']
)

In [None]:
synthetic_population

In [None]:
synthetic_population.describe()

That's the synthetic population!

These numbers diverge slightly from the given population statistics but that's due to the nondeterministic drawing.

In [None]:
# TODO: consider zones
# TODO: consider households

## Demographic Data

### Read Usual Resident Data

In [None]:
def read_ward_population_data(url):
    """Reads census 2011 demographic data on ward level from the London data store.

    The dataset is reduced to Haringey and ward resolution. All other data is discarded.

    Parameters:
        url: address of the Excel workbook holding the '2011 Census' sheet.

    Returns:
        A DataFrame indexed by 'Ward Name' with the remaining columns of the
        sheet (the age bands, judging by the column labels used in this notebook).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if an expected column is missing from the sheet.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of parsing an error page as a workbook.
    response.raise_for_status()
    df = pd.read_excel(
        io.BytesIO(response.content),
        sheet_name='2011 Census',
        skiprows=[0],
        header=[0]
    )
    # Positional cut: keep the first 23 columns.
    df = df.iloc[:, :23] # only totals, cut sex specifics
    df = df[df['Borough'] == 'Haringey']
    # Build a new frame instead of deleting columns from a filtered slice in place.
    df = df.drop(
        ['Borough', 'Persons: All Ages', 'Ward Code'], # 'Persons: All Ages': cut totals
        axis=1
    )
    return df.set_index('Ward Name')

In [None]:
# Ward level population counts for Haringey; the columns are displayed for reference.
usual_residents = read_ward_population_data(LONDON_CENSUS_WARD_POPULATION_URL)
usual_residents.columns

In [None]:
# Sanity check: the counts of all wards and columns must add up to 254926
# (presumably the 2011 census total of usual residents in Haringey -- TODO confirm).
assert usual_residents.sum().sum() == 254926

### Read total number of fifteen-year-olds

The number of fifteen-year-olds is important because it is used to divide adults from youth in the census (see later on). The ward population data set is cut into five-year age bands and hence does not cut between youth and adults. For reference, the total number of fifteen-year-olds is read in here.

In [None]:
def read_borough_population_data(url):
    """Reads census 2011 demographic data on borough level from the London data store.

    Parameters:
        url: address of the Excel workbook holding the 'Persons' sheet.

    Returns:
        A DataFrame of 16 bit integers, indexed by the values of the column
        'Unnamed: 1' (the index is named 'ward').

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of parsing an error page as a workbook.
    response.raise_for_status()
    df = pd.read_excel(
        io.BytesIO(response.content),
        sheet_name='Persons',
        skiprows=[0],
        header=[0]
    )
    # Remove the first and the third column as well as the listed rows by label.
    # NOTE(review): presumably codes, totals and non-borough rows -- confirm against the workbook.
    df = df.drop(df.columns[[0, 2]], axis=1)
    df = df.drop([0, 34, 35, 36, 37, 38], axis=0)
    df = df.rename(columns={'Unnamed: 1': 'ward'}).set_index('ward')
    # NOTE(review): int16 only holds values up to 32767 -- confirm that no remaining count exceeds that.
    return df.astype(np.int16)

In [None]:
# Number of fifteen-year-olds in Haringey, looked up by label with `.loc`
# (the `.ix` indexer has been removed from pandas).
# NOTE(review): assumes the single-year age columns are labelled with integers,
# so that 15 is the label of the age 15 column -- confirm against the workbook.
total_fifteen_haringey = read_borough_population_data(LONDON_CENSUS_BOROUGH_POPULATION_URL).loc['Haringey', 15]
total_fifteen_haringey

### Read Economic Data

In [None]:
# Qualification counts per Haringey ward, read from the qualifications archive.
qualification_data = read_census_file(LONDON_CENSUS_QUALIFICATION_URL, QUALIFICATION_FILE_PATH)

In [None]:
qualification_data.columns

In [None]:
# The qualification counts must equal the residents in the age bands from
# '15 to 19' upwards, minus the fifteen-year-olds.
# `.loc` does the label-based column slice; `.ix` has been removed from pandas.
assert qualification_data.sum().sum() == usual_residents.loc[:, '15 to 19':].sum().sum() - total_fifteen_haringey

Qualification data is available for every usual resident starting from age 16. Unfortunately, the ward population data set does not tell how many residents are younger or older than 16; only the borough-wide number of fifteen-year-olds is known. However, we can infer the number of residents below 16 in each ward from the qualification data.

In [None]:
# Per ward: all usual residents minus those covered by the qualification data.
younger_than_sixteen = usual_residents.sum(axis=1) - qualification_data.sum(axis=1)
younger_than_sixteen.name = 'usual residents below 16'
younger_than_sixteen.sum()

In [None]:
# Per ward: residents below 16 minus the residents in the age bands up to
# '10 to 14', which leaves the fifteen-year-olds.
# `.loc` does the label-based column slice; `.ix` has been removed from pandas.
usual_residents_age_fifteen = younger_than_sixteen - usual_residents.loc[:, :'10 to 14'].sum(axis=1)
# The series holds fifteen-year-olds, so it is named accordingly.
usual_residents_age_fifteen.name = 'usual residents age 15'
assert usual_residents_age_fifteen.sum() == total_fifteen_haringey

In [None]:
# Labour counts per Haringey ward, read from the labour archive.
labour_data = read_census_file(LONDON_CENSUS_LABOUR_URL, LABOUR_FILE_PATH)
# Keep only the first ten columns; everything from the eleventh column on is dropped.
labour_data.drop(labour_data.columns[10:], axis=1, inplace=True)
labour_data.columns

In [None]:
# The labour counts must equal the residents in the age bands '15 to 19'
# through '70 to 74', minus the fifteen-year-olds.
# `.loc` does the label-based column slice; `.ix` has been removed from pandas.
assert labour_data.sum().sum() == usual_residents.loc[:, '15 to 19':'70 to 74'].sum().sum() - total_fifteen_haringey

Labour data is available for every usual resident between age 16 and 74.