# Synthetic Population

In [None]:
import pandas as pd
import numpy as np

## Example not considering households

Two attributes with two categories each:

* age: 0-50yrs, 50-100yrs
* sex: m, f

In [None]:
from enum import Enum

class OrderedEnum(Enum):
    """Enum base class whose members are ordered by their ``value``.

    Ordering is defined only between members of the very same enum class.
    For any other operand the comparison methods return ``NotImplemented``,
    leaving it to Python to try the reflected operation or raise
    ``TypeError``.
    """

    def __lt__(self, other):
        """Return ``self.value < other.value`` for members of the same class."""
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value < other.value

    def __le__(self, other):
        """Return ``self.value <= other.value`` for members of the same class."""
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value <= other.value

    def __gt__(self, other):
        """Return ``self.value > other.value`` for members of the same class."""
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value > other.value

    def __ge__(self, other):
        """Return ``self.value >= other.value`` for members of the same class."""
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value >= other.value

class Age(OrderedEnum):
    """Age bracket of an individual; members compare by their integer value."""
    AGE0_50 = 1  # the "0-50yrs" category
    AGE50_100 = 2  # the "50-100yrs" category
    
class Sex(OrderedEnum):
    """Sex of an individual; members compare by their integer value."""
    MALE = 1  # the "m" category
    FEMALE = 2  # the "f" category
        

Let's create some microdata, the seed for the algorithm.

In [None]:
# Seed microdata: one (age, sex) tuple per sampled individual.
# Cross-tabulated, these five records give the counts used in `df_in`:
# 2 young males, 1 old male, 1 young female, 1 old female.
p1 = (Age.AGE0_50, Sex.MALE)
p2 = (Age.AGE0_50, Sex.MALE)
p3 = (Age.AGE50_100, Sex.MALE)
p4 = (Age.AGE0_50, Sex.FEMALE)
p5 = (Age.AGE50_100, Sex.FEMALE)

Let's make up some statistics about the entire population.

In [None]:
# Target marginal totals for the whole population, keyed by enum member.
# The categories of each attribute sum to 100 (75 + 25 for age, 65 + 35 for sex).
averages = {
    Age.AGE0_50: 75,
    Age.AGE50_100: 25,
    Sex.MALE: 65,
    Sex.FEMALE: 35
}

## Iterative Proportional Fitting

In [None]:
# Seed contingency table: one row per (sex, age) combination, with the number
# of seed individuals in that cell stored in `total`.
df_in = pd.DataFrame(dict(
    sex=[Sex.MALE, Sex.MALE, Sex.FEMALE, Sex.FEMALE],
    age=[Age.AGE0_50, Age.AGE50_100, Age.AGE0_50, Age.AGE50_100],
    total=[2, 1, 1, 1],
))
df_in

In [None]:
# Take the seed's marginal sums first so each Series gets the right index
# (one entry per category); the values are then overwritten with the targets.
xip = df_in.groupby('sex')['total'].sum()
xpj = df_in.groupby('age')['total'].sum()

# Label-based assignment via `.loc`; the former `.ix` indexer was removed
# in pandas 1.0.
xip.loc[Sex.MALE] = averages[Sex.MALE]
xip.loc[Sex.FEMALE] = averages[Sex.FEMALE]

xpj.loc[Age.AGE0_50] = averages[Age.AGE0_50]
xpj.loc[Age.AGE50_100] = averages[Age.AGE50_100]

# `aggregates[k]` is the target marginal over the columns named in
# `dimensions[k]`; the two lists must stay in the same order.
aggregates = [xip, xpj]
dimensions = [['sex'], ['age']]

In [None]:
# Import the submodule by name instead of `from ipfn import *`, so it is
# explicit where `ipfn` comes from and nothing else leaks into the namespace.
from ipfn import ipfn

# Fit the seed table to the target marginals.
IPF = ipfn.ipfn(df_in, aggregates, dimensions)
# NOTE(review): `iteration()` presumably returns the fitted table with the
# same columns as `df_in` -- confirm against the installed ipfn version.
df_out = IPF.iteration()
df_out

In [None]:
# Sum only the weight column: aggregating the whole frame would also try to
# add up the enum-valued `age` column, which raises on pandas >= 2.0.
df_out.groupby('sex')[['total']].sum()

In [None]:
# Sum only the weight column: aggregating the whole frame would also try to
# add up the enum-valued `sex` column, which raises on pandas >= 2.0.
df_out.groupby('age')[['total']].sum()

These numbers correctly mimic the population statistics.

Next, based on these numbers let's create a synthetic population. The fitted totals sum to 100, so after dividing by 100 the result can be understood as a joint probability mass function over the (sex, age) combinations. We can then run 100 Monte Carlo draws to draw 100 individuals from this function.

In [None]:
import random

def create_individual(df):
    """Draw one individual from a weighted table of attribute combinations.

    Every row of ``df`` is one (sex, age) combination and its ``total`` value
    is that row's weight. A row is selected with probability proportional to
    its weight by inverse-CDF sampling: a uniform number is drawn in
    ``[0, sum of weights)`` and the first row whose cumulative weight exceeds
    it is returned. The weights do not need to sum to 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``'sex'``, ``'age'`` and ``'total'``.

    Returns
    -------
    numpy.ndarray
        The ``[sex, age]`` values of the selected row, in that order.

    Raises
    ------
    ValueError
        If ``df`` has no rows or its weights do not sum to a positive number.
    """
    weights = df['total'].values
    attributes = df[['sex', 'age']].values
    if len(weights) == 0:
        raise ValueError('cannot draw an individual from an empty table')
    weight_sum = weights.sum()
    if not weight_sum > 0:
        raise ValueError('cannot draw an individual: weights must sum to a positive number')

    # random.random() lies in [0, 1), so the threshold is always strictly
    # below the total weight and a zero-weight row can never be selected first.
    threshold = random.random() * weight_sum
    cumulative = 0
    for position, weight in enumerate(weights):
        cumulative += weight
        if threshold < cumulative:
            return attributes[position]
    # Floating-point rounding can leave the running sum marginally below
    # `weight_sum`; such a draw belongs to the last row.
    return attributes[-1]

In [None]:
# Single draw as a smoke test. It runs before `random.seed(...)` is called
# further down, so (unless the generator was seeded elsewhere) the result
# differs from run to run.
create_individual(df_out)

In [None]:
# Seed the generator so the 100 draws below are reproducible.
random.seed('syntheticpopulation')

synthetic_population = pd.DataFrame(
    data=[create_individual(df_out) for _ in range(100)],
    # create_individual returns [sex, age], so the labels must follow that
    # order; ['age', 'sex'] would swap the meaning of both columns.
    columns=['sex', 'age']
)

In [None]:
# Display the drawn individuals (one row per draw).
synthetic_population

In [None]:
# Summary statistics of the drawn population, for comparison with the
# target marginals in `averages`.
synthetic_population.describe()

That's the synthetic population!

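These numbers diverge slightly from the given population statistics, but that's due to random sampling: 100 draws only approximate the underlying distribution. (Because the random generator is seeded, the draw itself is reproducible.)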