# Haringey Study

This notebook performs an energy agents study for Haringey.

In [None]:
import os
import sys
from pathlib import Path
from collections import namedtuple
import datetime
from itertools import chain, count, product
import random
import math
import subprocess
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import sqlalchemy
import pytz
import requests_cache
import ipywidgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
%matplotlib inline

from pytus2000 import read_diary_file, diary, read_individual_file, individual, read_diary_file_as_timeseries
import pytus2000
import people as ppl
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
import ktp.census
import ktp.synthpop
import ktp.tus
from ktp.synthpop import PeopleFeature, HouseholdFeature

In [None]:
# settings
# Everything a reader might want to tune for a scenario run lives in this cell.
# NOTE(review): NAME mentions "lsoa" while SPATIAL_RESOLUTION below is WARD -- confirm which is intended.
NAME ='haringey-scenario-lsoa-age'
JAVA_HEAP_SIZE = 12 # GB
NUMBER_PROCESSES = cpu_count() # max number of concurrent processes
TIME_STEP_SIZE = datetime.timedelta(minutes=10)
START_TIME = datetime.datetime(2005, 1, 3, 0, 0) # Monday
NUMBER_TIME_STEPS = 6 * 24 # 144 steps of 10 minutes, i.e. one simulated day
SET_POINT_WHILE_HOME = 22 # presumably degrees Celsius -- TODO confirm
SET_POINT_WHILE_ASLEEP = 18 # presumably degrees Celsius -- TODO confirm
METABOLIC_RATE_ACTIVE = 140 # presumably W per person -- TODO confirm
METABOLIC_RATE_PASSIVE = 70 # presumably W per person -- TODO confirm
# Multipliers applied to the two base rates above, depending on age group.
METABOLIC_ADULT_RATE = 1.0
METABOLIC_CHILD_RATE = 0.75
SPATIAL_RESOLUTION = ktp.census.GeographicalLayer.WARD
# Features used to build the seed and to fit the synthetic population.
PEOPLE_FEATURES = [PeopleFeature.AGE]
HOUSEHOLD_FEATURES = [HouseholdFeature.PSEUDO]

In [None]:
# File system layout: caches go to ./build/, scenario artefacts to ./build/<NAME>/.
CACHE_FOLDER_PATH = Path('./build/')
BUILD_FOLDER_PATH = Path('./build/') / NAME
BUILD_FOLDER_PATH.mkdir(parents=True, exist_ok=True)
# Input data sets (paths are relative to the notebook's working directory).
TUS_DATA_FOLDER_PATH = Path('./data/UKDA-4504-tab/')
MIDAS_DATABASE_PATH = Path('./data/Londhour.csv')

PATH_TO_JAR = Path('../../energy-agents/target/energy-agents-1.0-SNAPSHOT-jar-with-dependencies.jar') # FIXME
# SQLite files exchanged with the simulation: one for its input, one for its results.
PATH_TO_INPUT_DB = (BUILD_FOLDER_PATH / '{}.db'.format(NAME)).absolute()
PATH_TO_OUTPUT_DB = (BUILD_FOLDER_PATH / '{}-results.db'.format(NAME)).absolute()
# Names of the tables written to the input database.
MARKOV_CHAIN_INDEX_TABLE_NAME = 'markovChains'
DWELLINGS_TABLE_NAME = 'dwellings'
PEOPLE_TABLE_NAME = 'people'
ENVIRONMENT_TABLE_NAME = 'environment'
PARAMETERS_TABLE_NAME = 'parameters'
# Expected totals; the census data and the synthetic population are asserted against them.
NUMBER_HOUSEHOLDS_HARINGEY = 101955
NUMBER_USUAL_RESIDENTS_HARINGEY = 254926

In [None]:
# Fail early instead of writing into an input database left over from a previous run.
if PATH_TO_INPUT_DB.exists():
    raise IOError('Input database already exists. Please delete or choose different name.')

In [None]:
# Seed Python's `random` module so that draws made with it in this notebook are repeatable.
# NOTE(review): only the stdlib generator is seeded here -- confirm nothing relies on numpy's global RNG.
random.seed('haringey-scenario')
# Point the pytus2000 cache and the HTTP request cache at the cache folder.
pytus2000.set_cache_location(CACHE_FOLDER_PATH)
requests_cache.install_cache((CACHE_FOLDER_PATH / 'web-cache').as_posix())

In [None]:
def df_to_input_db(df, table_name):
    """Write a pandas object as a new table into the simulation input database.

    Parameters:
        df: DataFrame or Series to store; its index is written as well.
        table_name: name of the table to create in the SQLite file at
            PATH_TO_INPUT_DB.

    Raises:
        Whatever `to_sql` raises, e.g. when the table already exists.
    """
    disk_engine = sqlalchemy.create_engine('sqlite:///{}'.format(PATH_TO_INPUT_DB))
    try:
        df.to_sql(name=table_name, con=disk_engine)
    finally:
        # A fresh engine is created on every call; release its connection pool
        # so repeated calls do not accumulate open handles on the database file.
        disk_engine.dispose()

In [None]:
def update_progress_bar(generator, progress_bar):
    """Pass the elements of `generator` through while advancing `progress_bar`.

    For every element pulled from `generator`, `progress_bar.value` is
    incremented by one before the element is yielded unchanged.
    """
    iterator = iter(generator)
    while True:
        try:
            item = next(iterator)
        except StopIteration:
            return
        progress_bar.value = progress_bar.value + 1
        yield item

In [None]:
def run_simulation(path_to_jar, path_to_input, path_to_output):
    """Run the Java simulation as a subprocess and echo its output.

    Parameters:
        path_to_jar: path of the executable jar.
        path_to_input: path of the input database, passed via `-i`.
        path_to_output: path of the output database, passed via `-o`.

    The heap size and the number of workers are taken from JAVA_HEAP_SIZE
    and NUMBER_PROCESSES.

    Raises:
        subprocess.CalledProcessError: if the process exits with a non-zero
            return code.
    """
    cmd = ['java', '-jar', '-Xmx{}g'.format(JAVA_HEAP_SIZE), str(path_to_jar),
           '-i', str(path_to_input), '-o', str(path_to_output),
           '-w', str(NUMBER_PROCESSES)]
    # stderr is merged into stdout: a separate stderr pipe that is never read
    # can fill up and block the child process, and its content would be lost.
    # The context manager closes the pipe and waits for the process even when
    # reading is interrupted.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    ) as popen:
        for output_line in iter(popen.stdout.readline, ""):
            print(output_line, end="")
    return_code = popen.returncode
    if return_code:
        raise subprocess.CalledProcessError(return_code, cmd)

## Read, clean, and map all data

### Participants

First off, let's define the group of people we are using from the UK Time Use Survey 2000 as seed for the synthetic population. 

In [None]:
# Build the seed: one row per survey individual, one column per people/household feature.
individual_data = read_individual_file(TUS_DATA_FOLDER_PATH / 'tab' / 'individual_data_5.tab')
## TODO filter city population
## TODO remove invalid households
age = individual_data.IAGE
seed = pd.DataFrame(index=individual_data.index)
# People features are converted from the survey variable together with the individual's age ...
for feature in PEOPLE_FEATURES:
    seed[str(feature)] = feature.tus_value_to_ktp_value(individual_data[feature.tus_variable_name], age)
# ... household features from the survey variable alone.
for feature in HOUSEHOLD_FEATURES:
    seed[str(feature)] = feature.tus_value_to_ktp_value(individual_data[feature.tus_variable_name])
seed.head()

In [None]:
# Report how many seed rows are incomplete, then remove them in place.
print('{} individuals in the seed have at least one missing feature '
      'and will be removed.'.format(int(seed.isnull().any(axis=1).sum())))
seed.dropna(axis=0, how='any', inplace=True)
# After the cleanup no missing value may remain anywhere in the seed.
assert seed.notnull().all().all()

### Amend Seed by Metabolic Rate

In [None]:
# Scale the base metabolic rates by an age dependent factor. Individuals below
# 18 get the child factor, everybody aged 18 or above the adult factor, so that
# every individual with a known age ends up with both rates set (the previous
# `> 18` comparison left 18-year-olds without any rate).
seed.loc[individual_data.IAGE < 18, 'metabolic_rate_active'] = METABOLIC_RATE_ACTIVE * METABOLIC_CHILD_RATE
# The passive rate must be derived from the passive base rate, not the active one.
seed.loc[individual_data.IAGE < 18, 'metabolic_rate_passive'] = METABOLIC_RATE_PASSIVE * METABOLIC_CHILD_RATE
seed.loc[individual_data.IAGE >= 18, 'metabolic_rate_active'] = METABOLIC_RATE_ACTIVE * METABOLIC_ADULT_RATE
seed.loc[individual_data.IAGE >= 18, 'metabolic_rate_passive'] = METABOLIC_RATE_PASSIVE * METABOLIC_ADULT_RATE
# TODO add female metabolic rate

In [None]:
# Preview the seed including the metabolic rate columns.
seed.head()

### Create Markov Chains

Now that we have the participants, we can create a Markov chain for each type of citizen. A type is defined by the combination of all people features.

In [None]:
# The same diary file is read twice: once in its raw form and once as a time series,
# of which only the activity and location columns are kept.
diary_data = read_diary_file(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')
diary_data_ts = read_diary_file_as_timeseries(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')[['activity', 'location']]

In [None]:
# Translate the survey's location and activity codes into the simplified
# categories defined by the maps in ktp.tus.
simple_ts = pd.DataFrame(dict(
    location=diary_data_ts['location'].map(ktp.tus.LOCATION_MAP),
    activity=diary_data_ts['activity'].map(ktp.tus.ACTIVITY_MAP),
))

####  Handle Unknowns

In [None]:
# Per column: does the mapped time series contain any NaN?
simple_ts.isnull().any()

There are no NaNs.

In [None]:
# Share of time steps whose activity or location is UNKNOWN.
len(simple_ts[(simple_ts.activity == ktp.tus.Activity.UNKNOWN) | (simple_ts.location == ktp.tus.Location.UNKNOWN)]) / len(simple_ts)

5.5% of all entries are missing.

In [None]:
# Work on a copy so that simple_ts keeps the explicit UNKNOWN markers.
filled_simple_ts = simple_ts.copy()

In [None]:
# Turn the UNKNOWN markers into NaN so that they can be forward filled below.
filled_simple_ts.replace(to_replace=[ktp.tus.Location.UNKNOWN, ktp.tus.Activity.UNKNOWN], value=np.nan, inplace=True)

In [None]:
# Summary of the NaN mask before forward filling.
filled_simple_ts.isnull().describe()

Unknowns will be filled by forward fill. That is, whenever an activity/location is unknown, it is assumed that the last known activity/location is still valid.

When doing that, it is important not to forward fill between diaries (all diaries are stacked below each other). Hence, the data must first be grouped into diaries and then forward filled within each diary. As a consequence, not all unknowns can be filled (those at the beginning of a day remain), but that is intended.

In [None]:
# Group by the first four index levels (one group per diary) and forward fill
# within each group only, so values never leak from one diary into the next.
diary_keys = [filled_simple_ts.index.get_level_values(level) for level in range(4)]
filled_simple_ts = filled_simple_ts.groupby(diary_keys).fillna(method='ffill')

In [None]:
# Summary of the NaN mask after forward filling.
filled_simple_ts.isnull().describe()

In [None]:
# TODO don't forward fill over too long durations, e.g. not more than 1-2h.

The remaining nans are filtered in the Filter section below.

#### Map to markov states.

In [None]:
# Combine simplified location and activity into the states of the people model.
markov_ts = ktp.tus.from_simplified_location_and_activity_to_people_model(filled_simple_ts)

#### Filter

In [None]:
# TODO

In [None]:
def filter_nan(markov_ts, diary_data):
    """Remove all diaries with at least one NaN.

    A diary is identified by the first four levels of the index of
    `markov_ts`; the fifth level is dropped before matching rows to diaries.
    The result is returned as a DataFrame. `diary_data` is not used.
    """
    # One boolean per diary: True if any of its time steps is NaN.
    nan_mask = markov_ts.groupby(by=lambda index: (index[0], index[1], index[2], index[3])).apply(lambda values: values.isnull().any())
    # Keep only the rows that belong to a diary without NaN.
    return pd.DataFrame(markov_ts)[markov_ts.index.droplevel(4).isin(nan_mask[~nan_mask].index)]

# Drop incomplete diaries and verify that no NaN is left afterwards.
markov_ts = filter_nan(markov_ts, diary_data)
assert not markov_ts.isnull().any().any()

#### Cluster by synth pop categories

In [None]:
# One cluster per combination of people feature values found in the seed.
seed_groups = seed.groupby(list(map(str, PEOPLE_FEATURES)))
print("Dividing the seed into {} cluster.".format(len(seed_groups.groups)))

In [None]:
# Split the diaries by day type.
weekdays = diary_data[diary_data.DDAYW2 == diary.DDAYW2.WEEKDAY_MON___FRI]
weekenddays = diary_data[diary_data.DDAYW2 == diary.DDAYW2.WEEKEND_DAY]

# One progress step per cluster.
progress_bar = ipywidgets.IntProgress(
    value=0,
    min=0,
    max=len(seed_groups.groups.keys()),
    step=1,
    description='Progress:',
    bar_style='',
    orientation='horizontal'
)
display(progress_bar)

# Use the NUMBER_PROCESSES setting (as the population synthesis does) instead
# of querying cpu_count() again, so the configured limit is respected.
with Pool(NUMBER_PROCESSES) as pool:
    feature_combinations = seed_groups.groups.keys()
    all_parameters = ( # imap_unordered allows only one parameter, hence the tuple
        (markov_ts,
         seed_groups.get_group(features),
         features,
         weekdays,
         weekenddays,
         TIME_STEP_SIZE)
        for features in feature_combinations
    )
    # The results are fed into dict(), so the order in which workers finish
    # does not matter here.
    markov_chains = dict(update_progress_bar(
        pool.imap_unordered(ktp.tus.markov_chain_for_cluster, all_parameters),
        progress_bar)
    )

#### Amend seed by markov chain attribute

Now that we have calculated all markov chains, the id associated with the markov chain will be added as an attribute to the seed.

In [None]:
def markov_id(feature_values):
    """Return the integer id of the markov chain for a feature combination.

    A single feature value (anything that is not a tuple) maps to its own
    `.value`; a pair of feature values is folded into one integer with
    `pairing_function`.

    Raises:
        NotImplementedError: for tuples that do not have exactly two elements.
    """
    if isinstance(feature_values, tuple):
        if len(feature_values) != 2:
            raise NotImplementedError()
        first, second = feature_values
        return pairing_function(first.value, second.value)
    return feature_values.value  # single feature value


def pairing_function(x, y):
    """Map a pair of non-negative integers to a single unique integer.

    Implements the Cantor pairing function,
    http://stackoverflow.com/a/919661/1856079

    The product of two consecutive integers is always even, so the floor
    division is exact. Unlike multiplying by the float `1/2`, this keeps the
    result exact for arbitrarily large inputs.
    """
    return int((x + y) * (x + y + 1) // 2 + y)

In [None]:
# Start with id 0 everywhere, then fill in the values cluster by cluster.
seed['markov_id'] = pd.Series(0, index=seed.index, dtype=np.int32)
for feature_combination, index in seed_groups.groups.items():
    seed.loc[index, 'markov_id'] = markov_id(feature_combination)
    # The initial activity is the first state the cluster's chain reports as valid at START_TIME.
    first_valid_state = markov_chains[feature_combination].valid_states(START_TIME)[0]
    seed.loc[index, 'initial_activity'] = first_valid_state

In [None]:
# Preview the seed with the markov chain id and initial activity columns.
seed.head()

### Synthetic Population

#### Read all census data

In [None]:
# Census tables for every people feature; each must add up to the borough's
# number of usual residents.
census_data_ppl = dict(
    (feature, feature.read_census_data(SPATIAL_RESOLUTION))
    for feature in PEOPLE_FEATURES
)
for data in census_data_ppl.values():
    assert data.sum().sum() == NUMBER_USUAL_RESIDENTS_HARINGEY

In [None]:
# Census tables for every household feature; each must add up to the borough's
# number of households.
census_data_hh = dict(
    (feature, feature.read_census_data(SPATIAL_RESOLUTION))
    for feature in HOUSEHOLD_FEATURES
)
for data in census_data_hh.values():
    assert data.sum().sum() == NUMBER_HOUSEHOLDS_HARINGEY

#### Prepare seed

In [None]:
# prepare index
# The household id is the index without its third level (SN3); SN3 itself
# becomes the person id. SN1 and SN2 are dropped as separate columns.
sn1_plus_sn2 = seed.index.droplevel(2)
seed = (
    seed.copy()
    .assign(household_id=list(sn1_plus_sn2))
    .reset_index()
    .rename(columns={'SN3': 'person_id'})
    .set_index(['household_id', 'person_id'])
    .drop(['SN1', 'SN2'], axis=1)
)
seed.head()

#### Run the iterative proportional fitting and create synthetic population

In [None]:
# Any household feature table will do to enumerate the regions and to count
# the households per region.
random_hh_feature = list(census_data_hh.values())[0]
regions = list(random_hh_feature.index)
# Per region, the census marginals of every feature. Regions are index labels,
# hence label based `.loc` (`.ix` is deprecated and removed in pandas 1.0).
controls_hh = {region: {str(feature): census_data_hh[feature].loc[region, :] for feature in HOUSEHOLD_FEATURES}
               for region in regions}
controls_ppl = {region: {str(feature): census_data_ppl[feature].loc[region, :] for feature in PEOPLE_FEATURES}
                for region in regions}
number_households = {region: random_hh_feature.loc[region, :].sum() for region in regions}
# Household ids are consecutive across regions, starting at 1.
household_counter = count(start=1, step=1)
household_ids = {region: [next(household_counter) for _ in range(number_households[region])]
                 for region in regions}
# One random number per household, drawn up front in the main process.
random_numbers = {region: [random.uniform(0, 1) for _ in range(number_households[region])]
                  for region in regions}
# Roughly four chunks per worker; never less than one household per chunk,
# as the chunk size is later used as a range step and as a divisor.
hh_chunk_size = max(1, int(NUMBER_HOUSEHOLDS_HARINGEY / NUMBER_PROCESSES / 4))

In [None]:
def _new_progress_bar(description, maximum):
    """Create a horizontal integer progress bar counting from 0 to `maximum`."""
    return ipywidgets.IntProgress(
        value=0,
        min=0,
        max=maximum,
        step=1,
        description=description,
        bar_style='',
        orientation='horizontal'
    )


# Fitting and household sampling advance once per region, citizen sampling
# once per household chunk.
fitting_progress_bar = _new_progress_bar('Fitting:', len(regions))
households_progress_bar = _new_progress_bar('Househo:', len(regions))
citizen_progress_bar = _new_progress_bar(
    'Citizen:',
    math.ceil(NUMBER_HOUSEHOLDS_HARINGEY / hh_chunk_size)
)

In [None]:
# Reset the bars so that the cell can be re-run.
fitting_progress_bar.value = 0
households_progress_bar.value = 0
citizen_progress_bar.value = 0
display(fitting_progress_bar)
display(households_progress_bar)
display(citizen_progress_bar)

with Pool(NUMBER_PROCESSES) as pool:
    # Step 1: fit household weights per region. The results end up in a dict,
    # so the order in which workers finish is irrelevant.
    hipf_params = ((seed, controls_hh[region], controls_ppl[region], region) for region in regions)
    household_weights = dict(update_progress_bar(
        pool.imap_unordered(ktp.synthpop.run_hipf, hipf_params),
        fitting_progress_bar
    ))
    # Step 2: sample households per region. `imap` (not `imap_unordered`)
    # keeps the results in region order, so the list order does not depend on
    # worker scheduling.
    household_params = ((region, seed, household_weights[region], random_numbers[region], household_ids[region])
                        for region in regions)
    households = list(chain(*update_progress_bar(
        pool.imap(ktp.synthpop.sample_households, household_params),
        households_progress_bar
    )))
    # Step 3: sample the citizens chunk by chunk, again in input order, because
    # the position in `citizens` later becomes the index of the people table.
    household_chunks = [households[i:i + hh_chunk_size] for i in range(0, len(households), hh_chunk_size)]
    citizens = list(chain(*update_progress_bar(
        pool.imap(ktp.synthpop.sample_citizen, ((chunk, seed) for chunk in household_chunks)),
        citizen_progress_bar
    )))



In [None]:
# The number of households must match exactly; the number of citizens is only
# required to be within 1000 of the number of usual residents.
assert len(households) == NUMBER_HOUSEHOLDS_HARINGEY
assert abs(len(citizens) - NUMBER_USUAL_RESIDENTS_HARINGEY) < 1000

#### Extend Synthetic Population with Parameters from UKBuildings

In [None]:
# TODO ; for the moment use random parameters

In [None]:
class UniformDistributedParameter():
    """A parameter drawn uniformly from a band around an expected value.

    The band reaches `variation_in_percent` percent of the expected value
    below and above the expected value.
    """

    def __init__(self, expected_value, variation_in_percent):
        """Store the centre of the band and its half width."""
        self.__expected_value = expected_value
        self.__random_max = expected_value * variation_in_percent / 100

    def sample(self):
        """Return one draw, using the global `random` module generator."""
        half_width = self.__random_max
        deviation = random.uniform(-half_width, half_width)
        return self.__expected_value + deviation
   

# Placeholder building parameters, identical for all dwellings: a variation of
# 0.0 percent makes every sample equal to the expected value.
CONDITIONED_FLOOR_AREA = 100 # m^2
# Scales with the floor area; presumably J/K -- TODO confirm.
HEAT_MASS_CAPACITY = UniformDistributedParameter(165000 * CONDITIONED_FLOOR_AREA, 0.0)
HEAT_TRANSMISSION = UniformDistributedParameter(200, 0.0) # presumably W/K -- TODO confirm
MAX_HEATING_POWER = 10000 # presumably W -- TODO confirm
INITIAL_TEMPERATURE = UniformDistributedParameter(22, 0.0) # presumably degrees Celsius -- TODO confirm
HEATING_CONTROL_STRATEGY = 'PRESENCE_TRIGGERED'

In [None]:
# One row per sampled household, indexed by the household id. The sampled
# columns are evaluated in the order listed, which fixes the order of the
# random draws.
dwellings_df = pd.DataFrame(
    index=[household.id for household in households],
    data=dict(
        heatMassCapacity=[HEAT_MASS_CAPACITY.sample() for _ in households],
        heatTransmission=[HEAT_TRANSMISSION.sample() for _ in households],
        maxHeatingPower=MAX_HEATING_POWER,
        initialTemperature=[INITIAL_TEMPERATURE.sample() for _ in households],
        conditionedFloorArea=CONDITIONED_FLOOR_AREA,
        heatingControlStrategy=HEATING_CONTROL_STRATEGY,
        region=[household.region for household in households],
    )
)

In [None]:
# One row per sampled citizen, indexed by the position in `citizens`.
citizen_df = pd.DataFrame(
    index=list(range(len(citizens))),
    data=dict(
        markovChainId=[person.markovId for person in citizens],
        dwellingId=[person.householdId for person in citizens],
        # stored as text
        initialActivity=[str(person.initialActivity) for person in citizens],
        activeMetabolicRate=[person.activeMetabolicRate for person in citizens],
        passiveMetabolicRate=[person.passiveMetabolicRate for person in citizens],
        randomSeed=[person.randomSeed for person in citizens],
    )
)

### Write out to Database

In [None]:
# Index table: markov chain id -> name of the table holding that chain.
markov_index = pd.Series(
    dict(
        (chain_id, "markov{}".format(chain_id))
        for chain_id in map(markov_id, markov_chains.keys())
    ),
    name='tablename'
)
df_to_input_db(markov_index, MARKOV_CHAIN_INDEX_TABLE_NAME)

In [None]:
# Write every chain into its own table; activities are stored as text.
for feature_combination, markov_chain in markov_chains.items():
    df = markov_chain.to_dataframe()
    for activity_column in ('fromActivity', 'toActivity'):
        df[activity_column] = [str(x) for x in df[activity_column]]
    df_to_input_db(df, markov_index[markov_id(feature_combination)])

In [None]:
# Store the dwellings table in the input database.
df_to_input_db(dwellings_df, DWELLINGS_TABLE_NAME)

In [None]:
# Store the people table in the input database.
df_to_input_db(citizen_df, PEOPLE_TABLE_NAME)

### Environment Data

In [None]:
def date_parser(date, time):
    """Combine a 'MM/DD/YYYY' date and a 'HH:MM' time into a datetime.

    The hour is shifted back by one, so '01:00' becomes 00:00 and '24:00'
    becomes 23:00 of the given day.
    """
    month, day, year = map(int, date.split('/'))
    hour, minute = map(int, time.split(':'))
    return datetime.datetime(year=year, month=month, day=day, hour=hour - 1, minute=minute)

# Read the weather file: the first row is skipped, the next one is the header,
# and the date and time columns are merged by date_parser into the index.
temperature = pd.read_csv(
    MIDAS_DATABASE_PATH, 
    skiprows=[0], 
    header=0, 
    parse_dates=[['Date (MM/DD/YYYY)', 'Time (HH:MM)']], 
    date_parser=date_parser,
    index_col=[0]
)
temperature.rename(columns={'Dry-bulb (C)': 'temperature'}, inplace=True)
temperature.index.name = 'index'
# Resample to the simulation time step, repeating the last known value, and store it.
df_to_input_db(temperature['temperature'].resample(TIME_STEP_SIZE).ffill(), ENVIRONMENT_TABLE_NAME)

### Simulation Parameters

In [None]:
# A single row (index 1) holding the global simulation parameters.
df_to_input_db(
    pd.DataFrame(
        index=[1],
        data=dict(
            initialDatetime=START_TIME,
            timeStepSize_in_min=TIME_STEP_SIZE.total_seconds() / 60,
            numberTimeSteps=NUMBER_TIME_STEPS,
            setPointWhileHome=SET_POINT_WHILE_HOME,
            setPointWhileAsleep=SET_POINT_WHILE_ASLEEP,
            wakeUpTime=datetime.time(7, 0),
            leaveHomeTime=datetime.time(8, 30),
            comeHomeTime=datetime.time(18, 0),
            bedTime=datetime.time(22, 0),
        )
    ),
    PARAMETERS_TABLE_NAME
)

## Run Simulation

In [None]:
# Run the simulation on the input database written above; its output is echoed below.
run_simulation(PATH_TO_JAR, PATH_TO_INPUT_DB, PATH_TO_OUTPUT_DB)

## Result Analysis

In [None]:
# Engine for the results database; all queries below read from it.
disk_engine = sqlalchemy.create_engine('sqlite:///{}'.format(PATH_TO_OUTPUT_DB))

In [None]:
# Show the metadata table of the results database, indexed by key.
# NOTE(review): parse_dates is documented as a list or dict of column names; confirm that True has any effect here.
pd.read_sql_query('SELECT * FROM metadata', disk_engine, index_col='key', parse_dates=True)

### Read Data

In [None]:
# Load the thermal power results and reshape them to one column per id.
thermal_power = pd.read_sql_query('SELECT * FROM thermalPower', disk_engine, index_col='timestamp', parse_dates=True)
# Scale the stored timestamps by 10^6 before conversion -- assumes they are
# milliseconds since epoch and are then read as nanoseconds; TODO confirm.
thermal_power.index = pd.to_datetime(thermal_power.index * 1000 * 1000)
thermal_power.index.name = 'datetime'
# Wide format: rows are points in time, columns are the values of 'id'.
thermal_power = thermal_power.pivot(columns='id')
# Drop the outer column level so that only the ids remain as column labels.
thermal_power.columns = thermal_power.columns.droplevel(0)
thermal_power.name = 'thermal power'

In [None]:
# Read the dwellings and people tables back from the results database.
dwellings = pd.read_sql_query('SELECT * FROM dwellings', disk_engine, index_col='index')
people = pd.read_sql_query('SELECT * FROM people', disk_engine, index_col='index')
# Number of people per dwelling, aligned on the dwelling index.
dwellings['householdSize'] = people.groupby('dwellingId').size()
# Mean over time of every thermal power column, aligned on the dwelling index.
dwellings['average_power'] = thermal_power.mean()

In [None]:
# Geometry and census tables at the spatial resolution of the study.
geo_data = ktp.census.read_haringey_shape_file(SPATIAL_RESOLUTION)
householdTypes = ktp.census.read_household_type_data(SPATIAL_RESOLUTION)
age_structure = ktp.census.read_age_structure_data(SPATIAL_RESOLUTION)
qualification_data = ktp.census.read_qualification_level_data(SPATIAL_RESOLUTION)
economic_activity_data = ktp.census.read_economic_activity_data(SPATIAL_RESOLUTION)

In [None]:
# Representative age in years for every census age band, used by meanAge to
# weight the band counts. The open-ended top band is a placeholder, see FIXME.
AGE_MAP = {
    ktp.types.AgeStructure.AGE_0_TO_4: 2.5,
    ktp.types.AgeStructure.AGE_5_TO_7: 6.5,
    ktp.types.AgeStructure.AGE_8_TO_9: 9,
    ktp.types.AgeStructure.AGE_10_TO_14: 12.5,
    ktp.types.AgeStructure.AGE_15: 15.5,
    ktp.types.AgeStructure.AGE_16_TO_17: 17,
    ktp.types.AgeStructure.AGE_18_TO_19: 19,
    ktp.types.AgeStructure.AGE_20_TO_24: 22.5,
    ktp.types.AgeStructure.AGE_25_TO_29: 27.5,
    ktp.types.AgeStructure.AGE_30_TO_44: 37.5,
    ktp.types.AgeStructure.AGE_45_TO_59: 52.5,
    ktp.types.AgeStructure.AGE_60_TO_64: 62.5,
    ktp.types.AgeStructure.AGE_65_TO_74: 70,
    ktp.types.AgeStructure.AGE_75_TO_84: 80,
    ktp.types.AgeStructure.AGE_85_TO_89: 87.5,
    ktp.types.AgeStructure.AGE_90_AND_OVER: 95 # FIXME
}

def meanAge(age_structure):
    """Return the mean age per row of an age structure table.

    Every column of `age_structure` holds the count of one age band; the
    counts are weighted with the band's age from AGE_MAP and divided by the
    row total.
    """
    weighted = age_structure.copy()
    for age_band in age_structure.columns:
        weighted[age_band] = AGE_MAP[age_band] * age_structure[age_band]
    summed_ages = weighted.sum(axis=1)
    residents = age_structure.sum(axis=1)
    return summed_ages / residents
    
    
def percent_highest_qualification(qualification_data):
    """Return, per row, the share of the LEVEL_45 qualification column.

    The result is a fraction of the row total (it is not multiplied by 100).
    """
    top_level = qualification_data[ktp.types.Qualification.LEVEL_45]
    everyone = qualification_data.sum(axis=1)
    return top_level / everyone


def percent_economic_active(economic_activity_data):
    """Return, per row, the share of people in the economically active columns.

    Part time and full time employees, self employed and active full time
    students are counted. The result is a fraction of the row total (it is
    not multiplied by 100).
    """
    active_categories = [
        ktp.types.EconomicActivity.EMPLOYEE_PART_TIME,
        ktp.types.EconomicActivity.EMPLOYEE_FULL_TIME,
        ktp.types.EconomicActivity.SELF_EMPLOYED,
        ktp.types.EconomicActivity.ACTIVE_FULL_TIME_STUDENT,
    ]
    active_counts = economic_activity_data[active_categories].sum(axis=1)
    all_counts = economic_activity_data.sum(axis=1)
    return active_counts / all_counts

In [None]:
# Attach the simulation result and the census indicators to the geometries;
# every assignment is aligned on the index of geo_data.
geo_data['average_power'] = dwellings.groupby('region').average_power.mean()
geo_data['number_households'] = householdTypes.sum(axis=1)
geo_data['number citizens'] = age_structure.sum(axis=1)
geo_data['avg household size'] = age_structure.sum(axis=1)/householdTypes.sum(axis=1)
geo_data['avg age'] = meanAge(age_structure)
# Both "percent" columns hold fractions of the row total, see the helper functions.
geo_data['percent highest qual'] = percent_highest_qualification(qualification_data)
geo_data['percent economic act'] = percent_economic_active(economic_activity_data)

### Plot Data

In [None]:
# Average the dwelling columns per region and plot one line per region.
ax = (
    thermal_power
    .groupby(by=lambda dwelling_id: dwellings.loc[dwelling_id, 'region'], axis=1)
    .mean()
    .plot(figsize=(14, 7), legend=None)
)
_ = plt.ylabel('average thermal power per household [W]')
_ = plt.title('Average of thermal power per household in different {}'.format(SPATIAL_RESOLUTION))
fig = ax.get_figure()
fig.savefig((BUILD_FOLDER_PATH / 'thermal_power.png').as_posix())

In [None]:
# Distribution of the regional mean of the average thermal power.
fig = plt.figure(figsize=(7, 7))
sns.violinplot(data=dwellings.groupby('region').average_power.mean())
_ = plt.ylabel('average thermal power per household [W]')
_ = plt.xticks([])
_ = plt.title("Distribution of average thermal power per household among {}".format(SPATIAL_RESOLUTION))
fig.savefig((BUILD_FOLDER_PATH / "distributation-average-power.png").as_posix())

In [None]:
# NOTE(review): this is the only import outside the import cell at the top of the notebook.
import geopandasplotting as gpdplt

# Choropleth map of the regional average thermal power.
ax = gpdplt.plot_dataframe(
    geo_data,
    column='average_power',
    categorical=False, 
    linewidth=0.2, 
    legend=True,
    figsize=(14, 7),
    cmap='viridis'
)
_ = plt.title("Average Thermal Power per Household in different {} [W]".format(SPATIAL_RESOLUTION))
# Hide the coordinate ticks.
_ = plt.xticks([])
_ = plt.yticks([])
fig = ax.get_figure()
fig.savefig((BUILD_FOLDER_PATH / 'thermal_power_choropleth.png').as_posix())

In [None]:
# Ratio between the highest and the lowest regional mean of the average power.
max_power_region = dwellings.groupby('region').average_power.mean().max()
min_power_region = dwellings.groupby('region').average_power.mean().min()
print(max_power_region/min_power_region)

In [None]:
# Household size per region.
sns.barplot(data=dwellings, x='region', y='householdSize')

In [None]:
# Pairwise relationships between the average power and the census indicators;
# the object returned by pairplot is saved through its own savefig method.
fig = sns.pairplot(
    data=geo_data, 
    vars=['average_power', 'avg household size', 'avg age', 
            'percent highest qual', 'percent economic act'])
fig.savefig((BUILD_FOLDER_PATH / 'pairwise-distributions.png').as_posix())