# Haringey Study

This notebook performs an energy agents study for Haringey.

In [None]:
import os
import sys
from pathlib import Path
from collections import namedtuple
import datetime
from itertools import chain, count
import random
import math
import subprocess

import numpy as np
import pandas as pd
import sqlalchemy
import matplotlib.pyplot as plt
import pytz
import requests_cache
import ipywidgets
from IPython.display import display
%matplotlib inline

from pytus2000 import read_diary_file, diary, read_individual_file, individual, read_diary_file_as_timeseries
import pytus2000
import people as ppl
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
import ktp.census
import ktp.synthpop
import ktp.tus

In [None]:
# Folder for everything this notebook generates (caches and the SQLite databases below).
BUILD_FOLDER_PATH = Path('./build/')
# Folder holding the UK Time Use Survey 2000 files that are read below.
TUS_DATA_FOLDER_PATH = Path('./data/UKDA-4504-tab/')
# CSV file from which the dry-bulb temperature is read in the environment section.
MIDAS_DATABASE_PATH = Path('./data/Londhour.csv')

# Jar that run_simulation hands to `java -jar`.
PATH_TO_JAR = Path('../../energy-agents/target/energy-agents-1.0-SNAPSHOT-jar-with-dependencies.jar') # FIXME
# Input database (written by this notebook) and output database (passed to the simulation).
PATH_TO_INPUT_DB = (BUILD_FOLDER_PATH / ('haringey-scenario-lsoa.db')).absolute()
PATH_TO_OUTPUT_DB = (BUILD_FOLDER_PATH / ('haringey-scenario-lsoa-results.db')).absolute()
# Make sure the build folder exists before anything is written into it.
PATH_TO_INPUT_DB.parent.mkdir(parents=True, exist_ok=True)
# Names of the tables written to the input database.
MARKOV_CHAIN_INDEX_TABLE_NAME = 'markovChains'
DWELLINGS_TABLE_NAME = 'dwellings'
PEOPLE_TABLE_NAME = 'people'
ENVIRONMENT_TABLE_NAME = 'environment'
PARAMETERS_TABLE_NAME = 'parameters'

In [None]:
# Fixed seed for Python's `random` module, which drives the household sampling and
# the parameter sampling further down.
random.seed('haringey-scenario-lsoa')
# Keep the pytus2000 cache and the HTTP request cache inside the build folder.
pytus2000.set_cache_location(BUILD_FOLDER_PATH)
requests_cache.install_cache((BUILD_FOLDER_PATH / 'web-cache').as_posix())

In [None]:
def df_to_input_db(df, table_name, if_exists='fail'):
    """Write a pandas object into a table of the simulation input database.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to store; its index is written as well (``to_sql`` default).
    table_name : str
        Name of the table inside the SQLite file at ``PATH_TO_INPUT_DB``.
    if_exists : {'fail', 'replace', 'append'}, optional
        Forwarded to ``to_sql``. The default ``'fail'`` keeps the previous
        behaviour; pass ``'replace'`` to make a cell re-runnable against an
        existing build folder.

    Raises
    ------
    ValueError
        If the table already exists and ``if_exists`` is ``'fail'``.
    """
    disk_engine = sqlalchemy.create_engine('sqlite:///{}'.format(PATH_TO_INPUT_DB))
    try:
        df.to_sql(name=table_name, con=disk_engine, if_exists=if_exists)
    finally:
        # A fresh engine is created per call; release its pooled connections
        # instead of leaving them for the garbage collector.
        disk_engine.dispose()

In [None]:
def update_progress_bar(generator, progress_bar):
    """Pass the items of ``generator`` through while advancing a progress bar.

    For every item, ``progress_bar.value`` is increased by one just before the
    item is yielded. Iteration is lazy: nothing is consumed until the caller
    asks for the next item.
    """
    iterator = iter(generator)
    while True:
        try:
            item = next(iterator)
        except StopIteration:
            return
        progress_bar.value = progress_bar.value + 1
        yield item

In [None]:
def run_simulation(path_to_jar, path_to_input, path_to_output):
    """Run the simulation jar and echo its output line by line.

    Parameters
    ----------
    path_to_jar : str or pathlib.Path
        Jar file passed to ``java -jar``.
    path_to_input : str or pathlib.Path
        Passed to the jar via ``-i``.
    path_to_output : str or pathlib.Path
        Passed to the jar via ``-o``.

    Raises
    ------
    subprocess.CalledProcessError
        If the java process exits with a non-zero return code.
    """
    # Convert explicitly so that pathlib.Path arguments are accepted on every
    # Python version / platform combination.
    cmd = ['java', '-jar', str(path_to_jar), '-i', str(path_to_input), '-o', str(path_to_output)]
    # stderr is merged into stdout: a separate stderr pipe that nobody reads
    # blocks the child process as soon as the pipe buffer is full.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    ) as popen:
        for output_line in iter(popen.stdout.readline, ""):
            print(output_line, end="")
        return_code = popen.wait()
    if return_code:
        raise subprocess.CalledProcessError(return_code, cmd)

## Read, clean, and map all data

In [None]:
# Reference totals; the census tables and the synthetic population are checked
# against them with assertions further down.
NUMBER_HOUSEHOLDS_HARINGEY = 101955
NUMBER_USUAL_RESIDENTS_HARINGEY = 254926
# Step size used for the markov chains, the environment data, and the simulation parameters.
TIME_STEP_SIZE = datetime.timedelta(minutes=10)
# Start of the simulated period; also used to pick every citizen's initial activity.
START_TIME = datetime.datetime(2005, 1, 1, 0, 0)

### Participants

First off, let's define the group of people we are using from the UK Time Use Survey 2000 as seed for the synthetic population. 

In [None]:
individual_data = read_individual_file(TUS_DATA_FOLDER_PATH / 'tab' / 'individual_data_5.tab')
## TODO filter city population
# One row per survey individual, holding the attributes used for the synthetic population.
seed = pd.DataFrame(index=individual_data.index, columns=['labour', 'qualification', 'age', 'hhtype'])
seed['labour'] = individual_data.ECONACT2.map(ktp.tus.LABOUR_MAP)
# Everyone older than 74 gets the dedicated labour category. A single `.loc`
# assignment is used instead of chained indexing (`seed.labour[mask] = ...`),
# which is not guaranteed to write into `seed` itself.
seed.loc[individual_data.IAGE > 74, 'labour'] = ktp.census.Labour.ABOVE_74
seed['qualification'] = individual_data.HIQUAL4.map(ktp.tus.QUALIFICATION_MAP)
seed['age'] = individual_data.IAGE.copy()
seed['hhtype'] = individual_data.HHTYPE4.map(ktp.tus.HOUSEHOLDTYPE_MAP)
seed.head()

In [None]:
# Discard every individual for which at least one attribute could not be mapped.
seed.dropna(axis=0, how='any', inplace=True)
assert seed.notnull().all().all()

A household is invalid if the number of individuals we have for it does not match its household type. For example, a household of type "couple without children" must have exactly two individuals.

In [None]:
# Group individuals by household, i.e. by the first two levels of the seed index.
# The keys are passed as a list: pandas no longer interprets a tuple as several keys.
household_keys = [seed.index.get_level_values(0), seed.index.get_level_values(1)]
household_types = seed.groupby(household_keys).hhtype.first()
household_sizes = seed.groupby(household_keys).hhtype.count()

# One boolean condition per household type whose member count can be checked.
couples_children_invalid = (
    (household_types == ktp.census.HouseholdType.COUPLE_WITH_DEPENDENT_CHILDREN) & (household_sizes <= 2)
)
couples_no_children_invalid = (
    (household_types == ktp.census.HouseholdType.COUPLE_WITHOUT_DEPENDENT_CHILDREN) & (household_sizes != 2)
)
lone_parent_invalid = (
    (household_types == ktp.census.HouseholdType.LONE_PARENT_WITH_DEPENDENT_CHILDREN) & (household_sizes < 2)
)
multi_person_invalid = (
    (household_types == ktp.census.HouseholdType.MULTI_PERSON_HOUSEHOLD) & (household_sizes <= 2)
)

# Kept under their original names in case they are inspected elsewhere.
mask_couples_children = household_sizes[couples_children_invalid]
mask_couples_no_children = household_sizes[couples_no_children_invalid]

# Combine the boolean conditions first and select once. The previous version
# OR-ed integer Series with different indexes, relying on index alignment and
# bitwise OR of household sizes merely to obtain the union of the indexes.
invalids = household_sizes[
    couples_children_invalid |
    couples_no_children_invalid |
    lone_parent_invalid |
    multi_person_invalid
]

seed.drop(labels=invalids.index, level=None, inplace=True)


print("{} households are invalid and were removed.".format(invalids.count()))

#### Test all input

In [None]:
# Labour category, age and qualification must be consistent for every individual.
assert not ((seed.labour == ktp.census.Labour.ABOVE_74) & (seed.age <= 74)).any()
assert not ((seed.labour == ktp.census.Labour.BELOW_16) & (seed.age >= 16)).any()
assert not ((seed.labour == ktp.census.Labour.BELOW_16) & (seed.qualification != ktp.census.Qualification.BELOW_16)).any()
assert not ((seed.labour != ktp.census.Labour.BELOW_16) & (seed.qualification == ktp.census.Qualification.BELOW_16)).any()

# Recompute household types and sizes after the invalid households were removed.
# The grouping keys are passed as a list: pandas no longer interprets a tuple as several keys.
household_keys = [seed.index.get_level_values(0), seed.index.get_level_values(1)]
household_types = seed.groupby(household_keys).hhtype.first()
household_sizes = seed.groupby(household_keys).hhtype.count()
# The number of individuals must match the household type.
assert (household_sizes[household_types == ktp.census.HouseholdType.ONE_PERSON_HOUSEHOLD] == 1).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.COUPLE_WITH_DEPENDENT_CHILDREN] > 2).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.LONE_PARENT_WITH_DEPENDENT_CHILDREN] > 1).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.COUPLE_WITHOUT_DEPENDENT_CHILDREN] == 2).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.MULTI_PERSON_HOUSEHOLD] > 2).all()

### Create Markov Chains

Now that we have the participants, we can create a Markov chain for each type of citizen. A type is defined by two attributes:

* the current work status; 'labour' in the following
* the highest qualification received, 'qualification' in the following.

In [None]:
# Diary table; used further down to tell weekday diaries from weekend diaries (DDAYW2).
diary_data = read_diary_file(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')
# The same file read as a time series, reduced to the two columns needed here.
diary_data_ts = read_diary_file_as_timeseries(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')[['activity', 'location']]

In [None]:
# Translate the survey's location and activity entries via the lookup maps from ktp.tus.
simple_ts = pd.DataFrame({
    'location': diary_data_ts.location.map(ktp.tus.LOCATION_MAP),
    'activity': diary_data_ts.activity.map(ktp.tus.ACTIVITY_MAP)
})

####  Handle Unknowns

In [None]:
# Per column: did the mapping leave any NaN behind?
simple_ts.isnull().any()

There are no nans.

In [None]:
# Share of entries whose activity or location is UNKNOWN.
len(simple_ts[(simple_ts.activity == ktp.tus.Activity.UNKNOWN) | (simple_ts.location == ktp.tus.Location.UNKNOWN)]) / len(simple_ts)

5.5% of all entries are missing.

In [None]:
# Work on a copy so that simple_ts keeps its UNKNOWN markers.
filled_simple_ts = simple_ts.copy()

In [None]:
# Turn the UNKNOWN markers into NaN so that they can be forward filled below.
filled_simple_ts.replace(to_replace=[ktp.tus.Location.UNKNOWN, ktp.tus.Activity.UNKNOWN], value=np.nan, inplace=True)

In [None]:
# Summary of the null flags per column before filling.
filled_simple_ts.isnull().describe()

Unknowns will be filled by forward fill. That is, whenever an activity/location is unknown, the last known activity/location is assumed to still be valid.

When doing that, it is important not to forward fill across diaries (all diaries are stacked below each other in the table). Hence, the entries must be grouped by diary first and forward filled within each diary. As a consequence, not all unknowns can be filled (those at the beginning of a day remain), but that is intended.

In [None]:
# Forward fill within each diary only. A diary is identified by the first four
# index levels, so values never leak from one diary into the next.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
filled_simple_ts = filled_simple_ts.groupby(
    [filled_simple_ts.index.get_level_values(level) for level in range(4)]
).ffill()

In [None]:
# Summary of the null flags per column after filling.
filled_simple_ts.isnull().describe()

In [None]:
# TODO don't forward fill over too long durations, e.g. not more than 1-2h.

The remaining nans are filtered in the Filter section below.

#### Map to markov states.

In [None]:
markov_ts = ktp.tus.from_simplified_location_and_activity_to_people_model(filled_simple_ts)

#### Filter

In [None]:
# TODO

In [None]:
def filter_nan(markov_ts, diary_data):
    """Remove all diaries with at least one NaN.

    Parameters
    ----------
    markov_ts : pandas.Series or pandas.DataFrame
        Time series with a five-level index. The first four levels identify a
        diary; the fifth level distinguishes the entries within a diary.
    diary_data
        Unused; kept so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``markov_ts`` that belong to diaries without any NaN.
    """
    is_missing = markov_ts.isnull()
    if is_missing.ndim > 1:
        # DataFrame input: an entry counts as missing if any of its columns is missing.
        is_missing = is_missing.any(axis=1)
    # Level-based grouping replaces a Python lambda that was called once per row.
    nan_mask = is_missing.groupby(level=[0, 1, 2, 3]).any()
    complete_diaries = nan_mask[~nan_mask].index
    return pd.DataFrame(markov_ts)[markov_ts.index.droplevel(4).isin(complete_diaries)]

# Drop every diary that still contains a NaN after the forward fill.
markov_ts = filter_nan(markov_ts, diary_data)
assert not markov_ts.isnull().any().any()

#### Cluster by synth pop categories

In [None]:
# One group of seed individuals per (labour, qualification) combination.
seed_groups = seed.groupby(['labour', 'qualification'])

In [None]:
def markov_chain_for_group_of_people(markov_ts, group_of_people, weekdays, weekenddays):
    """Build a ``ppl.WeekMarkovChain`` from the diaries of one group of people.

    Only rows of ``markov_ts`` whose first three index levels appear in
    ``group_of_people.index`` are used. They are split into weekday and weekend
    diaries by matching the first four index levels against ``weekdays.index``
    and ``weekenddays.index``. Each part is unstacked over those four levels.
    """
    # restrict to the given people
    group_ts = pd.DataFrame(markov_ts)[markov_ts.index.droplevel([3, 4]).isin(group_of_people.index)]
    diary_keys = group_ts.index.droplevel([4])

    def diaries_on(days):
        # rows belonging to the diaries listed in `days`, one column per diary
        return group_ts[diary_keys.isin(days.index)].unstack(level=[0, 1, 2, 3])

    return ppl.WeekMarkovChain(
        weekday_time_series=diaries_on(weekdays),
        weekend_time_series=diaries_on(weekenddays),
        time_step_size=TIME_STEP_SIZE
    )


def people_group(labour, qualification):
    """Return the seed individuals with the given labour and qualification."""
    return seed_groups.get_group((labour, qualification))

In [None]:
# Diaries split by the day type recorded in DDAYW2.
weekdays = diary_data.loc[diary_data.DDAYW2 == diary.DDAYW2.WEEKDAY_MON___FRI]
weekenddays = diary_data.loc[diary_data.DDAYW2 == diary.DDAYW2.WEEKEND_DAY]

# One progress step per (labour, qualification) combination.
progress_bar = ipywidgets.IntProgress(
    value=0,
    min=0,
    max=len(ktp.census.Qualification) * len(ktp.census.Labour),
    step=1,
    description='Progress:',
    bar_style='',
    orientation='horizontal'
)
display(progress_bar)

# Combinations that do not occur in the seed get None instead of a markov chain.
markov_chains = {}
for labour in ktp.census.Labour:
    for qualification in update_progress_bar(ktp.census.Qualification, progress_bar):
        if (labour, qualification) in seed_groups.groups:
            markov_chains[(labour, qualification)] = markov_chain_for_group_of_people(
                markov_ts=markov_ts,
                group_of_people=people_group(labour, qualification),
                weekdays=weekdays,
                weekenddays=weekenddays
            )
        else:
            markov_chains[(labour, qualification)] = None

#### Amend seed by markov chain attribute

Now that we have calculated all markov chains, the id associated with the markov chain will be added as an attribute to the seed.

In [None]:
def markov_id(labour, qualification):
    """Return a unique integer id for a (labour, qualification) combination.

    Both arguments must provide an integer ``value`` attribute. The id is
    computed with the Cantor pairing function,
    http://stackoverflow.com/a/919661/1856079
    """
    x = labour.value
    y = qualification.value
    # (x + y) * (x + y + 1) is always even, so integer division is exact. The
    # previous float arithmetic (`1/2 * ...`) lost precision for large values.
    return (x + y) * (x + y + 1) // 2 + y

In [None]:
# Id of the markov chain that belongs to each individual's (labour, qualification) pair.
seed['markov_id'] = seed.apply(
    lambda row: markov_id(row.labour, row.qualification), 
    axis=1
)
# Initial activity: the first of the states the individual's markov chain
# reports as valid at START_TIME.
seed['initial_activity'] = seed.apply(
    lambda row: markov_chains[(row.labour, row.qualification)].valid_states(START_TIME)[0], 
    axis=1
)

In [None]:
# Preview of the seed with the two new columns.
seed.head()

### Synthetic Population

#### Read all census data

In [None]:
# TODO data should be retrieved using API
# Census tables, one CSV file per topic.
PATH_TO_USUAL_RESIDENTS = Path('./data/census/usual_residents.csv')
PATH_TO_ECONOMIC_ACTIVITY = Path('./data/census/economic_activity.csv')
PATH_TO_HOUSEHOLD_TYPE = Path('./data/census/household_type.csv')
PATH_TO_QUALIFICATION = Path('./data/census/qualification.csv')

In [None]:
# The first 8 and the last 5 lines of the file are skipped (presumably
# header/footer metadata -- confirm against the file); rows are indexed by 'mnemonic'.
usual_residents = pd.read_csv(PATH_TO_USUAL_RESIDENTS, skiprows=8, skipfooter=5, engine='python', index_col='mnemonic')
# Drop the area-name column; the index already identifies each row.
usual_residents.drop('2011 super output area - lower layer', axis=1, inplace=True)
# The table must add up to the known number of usual residents.
assert usual_residents.sum().sum() == NUMBER_USUAL_RESIDENTS_HARINGEY

In [None]:
# Same file layout as the usual residents table: skip 8 leading and 5 trailing lines.
household_data = pd.read_csv(PATH_TO_HOUSEHOLD_TYPE, skiprows=8, skipfooter=5, engine='python', index_col='mnemonic')
household_data.drop('2011 super output area - lower layer', axis=1, inplace=True)
# Rename the columns via HOUSEHOLDTYPE_MAP, then sum columns that ended up with the same name.
household_data = household_data.rename(columns=ktp.census.HOUSEHOLDTYPE_MAP).groupby(lambda x:x, axis=1).sum()
# The table must add up to the known number of households.
assert household_data.sum().sum() == NUMBER_HOUSEHOLDS_HARINGEY

In [None]:
# Maps the column names of the qualification CSV to ktp.census.Qualification members.
qualification_map = {
    'No qualifications': ktp.census.Qualification.NO_QUALIFICATIONS,
    'Highest level of qualification: Level 1 qualifications': ktp.census.Qualification.LEVEL_1,
    'Highest level of qualification: Level 2 qualifications': ktp.census.Qualification.LEVEL_2,
    'Highest level of qualification: Apprenticeship': ktp.census.Qualification.APPRENTICESHIP,
    'Highest level of qualification: Level 3 qualifications': ktp.census.Qualification.LEVEL_3,
    'Highest level of qualification: Level 4 qualifications and above': ktp.census.Qualification.LEVEL_45,
    'Highest level of qualification: Other qualifications': ktp.census.Qualification.OTHER_QUALIFICATION
}

In [None]:
# Same file layout as the other census tables: skip 8 leading and 5 trailing lines.
qualification_data = pd.read_csv(PATH_TO_QUALIFICATION, skiprows=8, skipfooter=5, engine='python', index_col='mnemonic')
qualification_data.drop('2011 super output area - lower layer', axis=1, inplace=True)
# Rename the columns via qualification_map, then sum columns that ended up with the same name.
qualification_data = qualification_data.rename(columns=qualification_map).groupby(lambda x:x, axis=1).sum()
# The table must cover exactly the residents in the age columns from 'Age 16 to 17' onwards.
# `.loc` is the label-based replacement for the `.ix` indexer, which was removed from pandas.
assert qualification_data.sum().sum() == usual_residents.loc[:, 'Age 16 to 17':].sum().sum()

Qualification data is available for every usual resident starting from age 16.

In [None]:
# Residents per region in the age columns up to and including 'Age 15'.
# `.loc` is the label-based replacement for the `.ix` indexer, which was removed from pandas.
younger_than_sixteen = usual_residents.loc[:, :'Age 15'].sum(axis=1)
qualification_data[ktp.census.Qualification.BELOW_16] = younger_than_sixteen
# With the additional category the table covers every usual resident.
assert qualification_data.sum().sum() == usual_residents.sum().sum()

In [None]:
# Same file layout as the other census tables: skip 8 leading and 5 trailing lines.
labour_data = pd.read_csv(PATH_TO_ECONOMIC_ACTIVITY, skiprows=8, skipfooter=5, engine='python', index_col='mnemonic')
labour_data.drop('2011 super output area - lower layer', axis=1, inplace=True)
# Rename the columns via LABOUR_MAP, then sum columns that ended up with the same name.
labour_data = labour_data.rename(columns=ktp.census.LABOUR_MAP).groupby(lambda x:x, axis=1).sum()
# The table must cover exactly the residents in the age columns 'Age 16 to 17' to 'Age 65 to 74'.
# `.loc` is the label-based replacement for the `.ix` indexer, which was removed from pandas.
assert labour_data.sum().sum() == usual_residents.loc[:, 'Age 16 to 17':'Age 65 to 74'].sum().sum()

Labour data is available for every usual resident between age 16 and 74.

In [None]:
# Add the two age-based categories that the economic activity table does not contain.
labour_data[ktp.census.Labour.BELOW_16] = younger_than_sixteen
# `.loc` is the label-based replacement for the `.ix` indexer, which was removed from pandas.
labour_data[ktp.census.Labour.ABOVE_74] = usual_residents.loc[:, 'Age 75 to 84':].sum(axis=1)
# With the additional categories the table covers every usual resident.
assert labour_data.sum().sum() == usual_residents.sum().sum()

#### Prepare seed

In [None]:
# prepare index
# The household id is built from the first two index levels (the third level is dropped).
# NOTE(review): this cell changes the index of `seed`, so it looks like it can
# only be run once per kernel session -- confirm before re-running it.
sn1_plus_sn2 = seed.index.droplevel(2)
seed = seed.copy()
seed['household_id'] = list(sn1_plus_sn2)
seed.reset_index(inplace=True)
# 'SN3' becomes the person id; 'SN1' and 'SN2' are replaced by household_id.
seed.rename(columns={'SN3': 'person_id'}, inplace=True)
seed.set_index(['household_id', 'person_id'], inplace=True)
seed.drop(['SN1', 'SN2'], axis=1, inplace=True)
seed.head()

#### Run the iterative proportional fitting and create synthetic population

In [None]:
# A synthetic dwelling: running id, id of the sampled seed household, household type, region.
Dwelling = namedtuple('Dwelling', ['id', 'seedId','householdType', 'region'])
# Shared counter so that dwelling ids are unique across all regions.
DWELLING_COUNTER = count(0)
# A synthetic citizen: the dwelling lived in, the markov chain followed, the initial activity.
Citizen = namedtuple('Citizen', ['dwellingId', 'markovId', 'initialActivity'])

In [None]:
def synthetic_dwelling_generator(household_weights, seed, number_households, region):
    """Yield ``number_households`` dwellings sampled from the seed households.

    Parameters
    ----------
    household_weights : pandas.Series
        Weight per seed household, indexed by household id. Households are
        drawn with a probability proportional to their weight.
    seed : pandas.DataFrame
        Seed individuals; the household type is read from the first row found
        for the sampled household id.
    number_households : int
        Number of dwellings to generate.
    region
        Stored unchanged in every generated ``Dwelling``.

    Yields
    ------
    Dwelling
        Dwelling ids are taken from the module-level ``DWELLING_COUNTER``.
    """
    norm_household_weights = household_weights / household_weights.sum()
    cum_norm_household_weights = norm_household_weights.cumsum()
    assert math.isclose(norm_household_weights.sum(), 1, abs_tol=0.001)
    for _ in range(number_households):
        household_id = sample_household(cum_norm_household_weights)
        # NOTE(review): `.ix` has been removed from recent pandas versions. It is
        # kept here because household ids appear to be tuples, so a switch to
        # `.loc` needs to be checked against the real data first.
        yield Dwelling(next(DWELLING_COUNTER), household_id, seed.ix[(household_id), :].iloc[0].hhtype, region)

        
def sample_household(cumulated_household_weights):
    """Draw one household id according to cumulated, normalised weights.

    Parameters
    ----------
    cumulated_household_weights : pandas.Series
        Cumulative sum of the normalised household weights, indexed by
        household id.

    Returns
    -------
    The index label of the first entry whose cumulated weight is greater than
    or equal to a uniformly drawn number between 0 and 1.
    """
    random_number = random.uniform(0, 1)
    reached = cumulated_household_weights.values >= random_number
    if not reached.any():
        # Rounding can leave the last cumulated weight marginally below the
        # drawn number; fall back to the last household instead of failing
        # with an IndexError.
        return cumulated_household_weights.index[-1]
    # argmax on a boolean array returns the position of the first True.
    return cumulated_household_weights.index[reached.argmax()]


def synthetic_population_generator(dwellings, seed):
    """Yield one ``Citizen`` per seed individual of every dwelling's seed household.

    For each dwelling, the rows of ``seed`` selected by the dwelling's ``seedId``
    are looked up; each row contributes its ``markov_id`` and
    ``initial_activity`` to a citizen living in that dwelling.
    """
    for home in dwellings:
        yield from (
            Citizen(
                dwellingId=home.id,
                markovId=person.markov_id,
                initialActivity=person.initial_activity
            )
            for _, person in seed.ix[home.seedId, :].iterrows()
        )

In [None]:
def hipf(region):
    """Fit household weights for one region and return its dwelling generator.

    The census tables ``household_data``, ``labour_data`` and
    ``qualification_data`` provide the controls for ``region``; the
    module-level ``seed`` is the reference sample.

    Parameters
    ----------
    region
        Index label of the region in the census tables.

    Returns
    -------
    generator of Dwelling
        Yields as many dwellings as the region has households.

    Raises
    ------
    AssertionError
        If the fitted weights contain nulls or do not add up to the number of
        households of the region.
    """
    # `.loc` is the label-based replacement for the `.ix` indexer, which was removed from pandas.
    household_controls = household_data.loc[region, :]
    # Plain int so that it can safely be used as a loop count.
    number_households = int(household_controls.sum())
    household_weights = ktp.synthpop.fit_hipf(
        reference_sample=seed,
        controls_households={'hhtype': household_controls.to_dict()},
        controls_individuals={
            'labour': labour_data.loc[region, :].to_dict(),
            'qualification': qualification_data.loc[region, :].to_dict()
        },
        residuals_tol=0.0001,
        weights_tol=0.0001,
        maxiter=100
    )
    # Two-sided check: without abs() weights summing to far MORE than the
    # number of households passed unnoticed.
    assert abs(number_households - household_weights.sum()) < 0.1
    assert not household_weights.isnull().any()
    return synthetic_dwelling_generator(
        household_weights=household_weights,
        seed=seed,
        number_households=number_households,
        region=region
    )

In [None]:
# Preview of the seed as it is handed to the fitting procedure.
seed.head()

In [None]:
# One progress step per region (row of household_data).
progress_bar = ipywidgets.IntProgress(
    value=0,
    min=0,
    max=len(household_data.index),
    step=1,
    description='Progress:',
    bar_style='',
    orientation='horizontal'
)
display(progress_bar)

# Fit every region and concatenate the generated dwellings of all regions.
dwellings = list(chain(*(hipf(region) for region in update_progress_bar(household_data.index, progress_bar))))
# Create the citizens living in the dwellings from the seed individuals.
citizens = list(synthetic_population_generator(dwellings, seed))

In [None]:
# The number of dwellings must match exactly; the number of citizens may
# deviate from the census total by less than 1000.
assert len(dwellings) == NUMBER_HOUSEHOLDS_HARINGEY
assert abs(len(citizens) - NUMBER_USUAL_RESIDENTS_HARINGEY) < 1000

#### Extend Synthetic Population with Parameters from UKBuildings

In [None]:
# TODO ; for the moment use random parameters

In [None]:
class UniformDistributedParameter:
    """A parameter that is uniformly distributed around an expected value.

    The largest deviation from the expected value is given as a percentage of
    the expected value.
    """

    def __init__(self, expected_value, variation_in_percent):
        self.__centre = expected_value
        self.__half_width = expected_value * variation_in_percent / 100

    def sample(self):
        """Return the expected value plus a uniformly drawn deviation."""
        deviation = random.uniform(-self.__half_width, self.__half_width)
        return self.__centre + deviation
   

# Placeholder dwelling parameters. A variation of 0.0 percent means that
# sample() always returns the expected value.
CONDITIONED_FLOOR_AREA = 100 # m^2
HEAT_MASS_CAPACITY = UniformDistributedParameter(165000 * CONDITIONED_FLOOR_AREA, 0.0)
HEAT_TRANSMISSION = UniformDistributedParameter(200, 0.0)
MAX_HEATING_POWER = 10000
MAX_COOLING_POWER = -10000
INITIAL_TEMPERATURE = UniformDistributedParameter(22, 0.0)
HEATING_CONTROL_STRATEGY = 'PRESENCE_TRIGGERED'

In [None]:
# One row per dwelling, indexed by dwelling id. Sampled parameters are drawn
# once per dwelling; scalar values are the same for every dwelling.
dwellings_df = pd.DataFrame(
    index=[dwelling.id for dwelling in dwellings],
    data={
        'heatMassCapacity': [HEAT_MASS_CAPACITY.sample() for unused in dwellings],
        'heatTransmission': [HEAT_TRANSMISSION.sample() for unused in dwellings],
        'maxHeatingPower': MAX_HEATING_POWER,
        'maxCoolingPower': MAX_COOLING_POWER,
        'initialTemperature': [INITIAL_TEMPERATURE.sample() for unused in dwellings],
        'conditionedFloorArea': CONDITIONED_FLOOR_AREA,
        'heatingControlStrategy': HEATING_CONTROL_STRATEGY,
        'region': [dwelling.region for dwelling in dwellings]
    }
)

In [None]:
# One row per citizen with a running integer index; the initial activity is
# stored as its string representation.
citizen_df = pd.DataFrame(
    index=list(range(len(citizens))),
    data={
        'markovChainId': [citizen.markovId for citizen in citizens],
        'dwellingId': [citizen.dwellingId for citizen in citizens],
        'initialActivity': [str(citizen.initialActivity) for citizen in citizens]
    }
)

### Write out to Database

In [None]:
# Index table: markov chain id -> name of the table holding that chain
# ("markov<id>"). Combinations without a markov chain (None) are left out.
markov_index = pd.Series(
    {
        markov_id(labour, qualification): "markov{}".format(markov_id(labour, qualification))
        for labour, qualification in markov_chains.keys() if markov_chains[(labour, qualification)] is not None
    }, 
    name='tablename'
)
df_to_input_db(markov_index, MARKOV_CHAIN_INDEX_TABLE_NAME)

In [None]:
# Write every existing markov chain into its own table, named as in markov_index.
for key, markov_chain in markov_chains.items():
    if markov_chain is None:
        continue
    labour, qualification = key
    df = markov_chain.to_dataframe()
    # Store the activities as strings.
    df.fromActivity = [str(x) for x in df.fromActivity]
    df.toActivity = [str(x) for x in df.toActivity]
    df_to_input_db(df, markov_index[markov_id(labour, qualification)])

In [None]:
# Write the dwellings table to the input database.
df_to_input_db(dwellings_df, DWELLINGS_TABLE_NAME)

In [None]:
# Write the people table to the input database.
df_to_input_db(citizen_df, PEOPLE_TABLE_NAME)

### Environment Data

In [None]:
def date_parser(date, time):
    """Combine a 'MM/DD/YYYY' date and an 'HH:MM' time into a datetime.

    The hour is reduced by one, so '01:00' maps to hour 0 and '24:00' maps to
    hour 23 of the same day.
    """
    month, day, year = map(int, date.split('/'))
    hour, minute = map(int, time.split(':'))
    return datetime.datetime(year=year, month=month, day=day, hour=hour - 1, minute=minute)

# Read the weather file: the first line is skipped, the next line holds the
# column names, and the date and time columns are combined by date_parser
# into the index.
temperature = pd.read_csv(
    MIDAS_DATABASE_PATH, 
    skiprows=[0], 
    header=0, 
    parse_dates=[['Date (MM/DD/YYYY)', 'Time (HH:MM)']], 
    date_parser=date_parser,
    index_col=[0]
)
temperature.rename(columns={'Dry-bulb (C)': 'temperature'}, inplace=True)
temperature.index.name = 'index'
# Resample to the simulation time step, repeating the last known value.
df_to_input_db(temperature['temperature'].resample(TIME_STEP_SIZE).ffill(), ENVIRONMENT_TABLE_NAME)

### Simulation Parameters

In [None]:
# Single-row table with the global simulation parameters.
df_to_input_db(
    table_name=PARAMETERS_TABLE_NAME,
    df=pd.DataFrame(
        index=[1],
        data={
            'initialDatetime': START_TIME,
            'timeStepSize_in_min': TIME_STEP_SIZE.total_seconds() / 60,
            # One simulated day. Derived from TIME_STEP_SIZE instead of the
            # hard-coded 6 * 24 so that it stays correct if the step size changes.
            'numberTimeSteps': datetime.timedelta(days=1) // TIME_STEP_SIZE,
            'randomSeed': 123456789,
            'setPointWhileHome': 22.0,
            'setPointWhileAsleep': 18.0,
            'wakeUpTime': datetime.time(7, 0),
            'leaveHomeTime': datetime.time(8, 30),
            'comeHomeTime': datetime.time(18, 0),
            'bedTime': datetime.time(22, 0)
        }
    )
)

## Run Simulation

In [None]:
# Run the simulation jar on the input database written above.
run_simulation(PATH_TO_JAR, PATH_TO_INPUT_DB, PATH_TO_OUTPUT_DB)