In [None]:
#pip install -r requirements.txt

# Doppelganger! (Simple)
Welcome to the simplified Doppelganger example. If you have not already done so, please see the [README document](https://github.com/sidewalklabs/doppelganger/blob/master/README.md) for installation instructions and information on what Doppelganger is doing under the hood. For a more thorough walkthrough, take a look at [doppelganger_example_full](./doppelganger_example_full.ipynb).

# Getting Started
Doppelganger lets you configure which census fields you use, the relationships among these fields (network structure), and the data preprocessing. We'll begin by loading the necessary packages, and then load a simple configuration file.

In [19]:
import pandas as pd
import pickle

from doppelganger import (
    allocation,
    inputs,
    Configuration,
    HouseholdAllocator,
    PumsData,
    SegmentedData,
    BayesianNetworkModel,
    Population,
    Preprocessor,
    Marginals
)

configuration = Configuration.from_file('sample_data/config.json')

# Loading and Cleaning Data
The following loads our data and cleans it according to the configuration. Reusing the same preprocessor ensures all data is cleaned consistently. We'll use California's PUMA `00106` for our demonstration.

In [23]:
PUMA = '00106'

preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

# Take pums fields from the config and from the default fields needed for
# the household allocation process.
household_fields = tuple(set(
    field.name for field in allocation.DEFAULT_HOUSEHOLD_FIELDS).union(
        set(configuration.household_fields)
))

households_data = PumsData.from_csv('sample_data/households_00106_dirty.csv').clean(
    household_fields, preprocessor, puma=PUMA
)

persons_fields = tuple(set(
    field.name for field in allocation.DEFAULT_PERSON_FIELDS).union(
        set(configuration.person_fields)
))
persons_data = PumsData.from_csv('sample_data/persons_00106_dirty.csv').clean(
    persons_fields, preprocessor, puma=PUMA
)

In [24]:
households_data.data

Unnamed: 0,puma,household_weight,serial_number,num_vehicles,state,household_income,num_people
0,00106,100,1000481,0,06,<=40000,1
1,00106,112,1002740,3+,06,40000+,4+
2,00106,78,1004323,2,06,40000+,3
3,00106,53,1006982,3+,06,40000+,4+
4,00106,63,1010099,2,06,40000+,2
5,00106,42,1011294,2,06,40000+,2
6,00106,255,1011919,0,06,<=40000,0
7,00106,72,1012613,2,06,<=40000,4+
8,00106,128,1015561,3+,06,40000+,3
9,00106,76,1016255,2,06,40000+,2


In [25]:
persons_data.data

Unnamed: 0,puma,person_weight,serial_number,sex,state,individual_income,age
0,00106,100,1000481,M,06,<=0,65+
1,00106,122,1002740,M,06,20000-40000,18-34
2,00106,122,1002740,M,06,0-20000,18-34
3,00106,107,1002740,F,06,20000-40000,65+
4,00106,113,1002740,M,06,40000-80000,65+
5,00106,76,1004323,M,06,<=0,18-34
6,00106,78,1004323,F,06,40000-80000,35-64
7,00106,68,1004323,M,06,20000-40000,35-64
8,00106,53,1006982,M,06,<=0,65+
9,00106,76,1006982,M,06,<=0,0-17


# Household Allocation
Now we will allocate persons and households to tracts to align with census controls.  First, load our controls based on ACS marginals.

In [26]:
controls = Marginals.from_csv('sample_data/marginals_00106.csv')

In [27]:
controls.data

Unnamed: 0.1,Unnamed: 0,STATEFP,COUNTYFP,PUMA5CE,TRACTCE,num_people_count,num_people_1,num_people_3,num_people_2,num_people_4+,num_vehicles_1,num_vehicles_0,num_vehicles_2,num_vehicles_3+,age_0-17,age_18-34,age_65+,age_35-64
0,0,6,1,106,430101,2217,305,356,648,908,270,45,1057,2076,1756,1161,671,3383
1,1,6,1,106,430102,863,158,174,407,124,127,8,450,398,354,210,553,1022
2,2,6,1,106,430200,2417,397,580,936,504,211,16,1380,1466,1310,911,1114,3324
3,3,6,1,106,430300,1239,222,194,466,357,118,59,548,906,845,492,672,1597
4,4,6,1,106,430400,752,136,150,294,172,181,6,308,661,311,331,416,997
5,5,6,1,106,430500,2027,473,389,614,551,443,81,1283,873,1458,1264,582,2889
6,6,6,1,106,430600,2145,496,477,699,473,425,0,1010,1421,988,1120,876,2755
7,7,6,1,106,430700,1291,165,265,470,391,128,0,760,934,911,830,523,1649
8,8,6,1,106,430800,2116,367,328,793,628,522,128,1335,909,1406,1377,896,2412
9,9,6,1,106,430900,1822,499,356,589,378,724,40,893,467,1449,1186,587,1799


In [28]:
# # Serialization
# with open('controls.pkl', 'wb') as file:
#     pickle.dump(controls, file)
# with open('households_data.pkl', 'wb') as file:
#     pickle.dump(households_data, file)
# with open('persons_data.pkl', 'wb') as file:
#     pickle.dump(persons_data, file)

Now use `HouseholdAllocator` to generate household allocations.

In [30]:
allocator = HouseholdAllocator.from_cleaned_data(controls, households_data, persons_data)

  w = households[inputs.HOUSEHOLD_WEIGHT.name].as_matrix().T
  hh_table = households[hh_columns].as_matrix()
  A = tract_controls.data[hh_columns].as_matrix()
This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``multiply`` for elementwise multiplication.
This code path has been hit 4 times so far.



# Bayesian Network Generation
Let's create models to generate characteristics for the people and households we just allocated. We'll start by loading up our pums data.  Our model learns different probability distributions for each category of person.  The category can be whatever you want and is specified by passing a segmentation function when you load training data.


In [31]:
segmentation_function = lambda x: x[inputs.AGE.name]
person_training_data = SegmentedData.from_data(
    persons_data,
    list(configuration.person_fields),
    inputs.PERSON_WEIGHT.name,
    segmenter=segmentation_function
)
person_model = BayesianNetworkModel.train(
    person_training_data,
    configuration.person_structure,
    configuration.person_fields
)

household_segmenter = lambda x: x[inputs.NUM_PEOPLE.name]

household_training_data = SegmentedData.from_data(
    households_data,
    list(configuration.household_fields),
    inputs.HOUSEHOLD_WEIGHT.name,
    household_segmenter,
)
household_model = BayesianNetworkModel.train(
    household_training_data,
    configuration.household_structure,
    configuration.household_fields
)

  bayesian_network = BayesianNetwork.from_structure(data, structure)


# Population Synthesis
Now for the main event! We can synthesize a population by taking the household allocations
we produced above and filling out missing categories with our Bayesian Networks.


In [32]:
population = Population.generate(allocator, person_model, household_model)

We can access the people and households as Pandas DataFrames and work with them directly. Households and people are unique by household_id. We can also join them to create a fat table of individual and household attributes.

In [33]:
people = population.generated_people
households = population.generated_households

merge_cols = [inputs.HOUSEHOLD_ID.name]
combined = pd.merge(people, households, on=merge_cols)

combined

Unnamed: 0,household_id,tract_x,serial_number_x,repeat_index_x,age,sex,individual_income,tract_y,serial_number_y,repeat_index_y,num_people,household_income,num_vehicles
0,430101-1000481-0,430101,1000481,0,65+,M,<=0,430101,1000481,0,1,<=40000,0
1,430101-1000481-1,430101,1000481,1,65+,M,20000-40000,430101,1000481,1,1,<=40000,1
2,430101-1000481-2,430101,1000481,2,65+,M,<=0,430101,1000481,2,1,<=40000,2
3,430101-1000481-3,430101,1000481,3,65+,M,<=0,430101,1000481,3,1,<=40000,1
4,430102-1000481-0,430102,1000481,0,65+,M,<=0,430102,1000481,0,1,<=40000,1
5,430200-1000481-0,430200,1000481,0,65+,M,0-20000,430200,1000481,0,1,<=40000,1
6,430200-1000481-1,430200,1000481,1,65+,M,<=0,430200,1000481,1,1,<=40000,1
7,430200-1000481-2,430200,1000481,2,65+,M,<=0,430200,1000481,2,1,<=40000,1
8,430200-1000481-3,430200,1000481,3,65+,M,<=0,430200,1000481,3,1,<=40000,1
9,430300-1000481-0,430300,1000481,0,65+,M,<=0,430300,1000481,0,1,<=40000,1


We can easily save this population to a csv.


In [34]:
population.write('generated_people.csv', 'generated_households.csv')

We can additionally save any of our intermediary stages and load them up again
whenever we want.  For example, we could save our Bayesian Network and reuse them
again later with the same or different household allocations.


In [13]:
person_model.write('person_model.json')
person_model_reloaded = BayesianNetworkModel.from_file('person_model.json', segmenter=segmentation_function)

  type_to_network[type_] = BayesianNetwork.from_json(json.dumps(network_json))


# Customize by PUMA
To try this out on the PUMA of your choice and learn to make other customizations, take a look at [doppelganger_example_full](./doppelganger_example_full.ipynb).