In [None]:
import os
import sys
from pathlib import Path
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytz
import requests_cache
import seaborn as sns
sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
%matplotlib inline

from pytus2000 import read_diary_file, diary, read_individual_file, individual, read_diary_file_as_timeseries
import pytus2000
import people as ppl
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
import urbanoccupants as uo

In [None]:
# Output folder: used below for the pytus2000 cache, the web cache, and all saved figures.
BUILD_FOLDER_PATH = Path('./build/')
# Root folder of the time use survey data; the '.tab' files are read from its 'tab' subfolder.
TUS_DATA_FOLDER_PATH = Path('./data/UKDA-4504-tab/')

In [None]:
# Point both the pytus2000 cache and the requests cache at the build folder.
pytus2000.set_cache_location(BUILD_FOLDER_PATH)
requests_cache.install_cache((BUILD_FOLDER_PATH / 'web-cache').as_posix())

In [None]:
def filter_features_and_drop_nan(df, features):
    """Select the given feature column(s) and drop every row that has a missing value.

    `features` is either a single column label (the result is then a Series)
    or a tuple of column labels (the result is then a DataFrame).
    """
    # A tuple would be interpreted as a single key by pandas, so turn it into a list.
    columns = list(features) if isinstance(features, tuple) else features
    selection = df[columns]
    return selection.dropna(axis='index', how='any')

## Participants

In [None]:
# All people and household features whose association with the time series is analysed.
# The order matters: the 3 dimensional analysis further below only combines the first
# four entries (ALL_FEATURES[:4]).
ALL_FEATURES = [
    uo.synthpop.PeopleFeature.ECONOMIC_ACTIVITY,
    uo.synthpop.PeopleFeature.QUALIFICATION,
    uo.synthpop.PeopleFeature.AGE,
    uo.synthpop.HouseholdFeature.HOUSEHOLD_TYPE,
    uo.synthpop.HouseholdFeature.POPULATION_DENSITY,
    uo.synthpop.HouseholdFeature.REGION,
    uo.synthpop.PeopleFeature.CARER,
    uo.synthpop.PeopleFeature.PERSONAL_INCOME
]

In [None]:
# Load the individual data of the time use survey.
individual_data = read_individual_file(TUS_DATA_FOLDER_PATH / 'tab' / 'individual_data_5.tab')
# IAGE is handed to every feature conversion below as its second argument.
age = individual_data.IAGE
# Build the seed: same index as the individual data, one column per feature (named str(feature)),
# filled with the survey variable converted by the feature's own tus_value_to_uo_value.
seed = pd.DataFrame(index=individual_data.index)
for feature in ALL_FEATURES:
    seed[str(feature)] = feature.tus_value_to_uo_value(individual_data[feature.tus_variable_name], age)
seed.head()

In [None]:
# Column dtypes and non-null counts of the seed.
seed.info()

## Time Series

In [None]:
# The same diary file is read twice: once in its raw form (its DDAYW2 variable is used further
# below to tell weekdays from weekends) and once as a time series, of which only the
# 'activity' and 'location' columns are kept.
diary_data = read_diary_file(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')
diary_data_ts = read_diary_file_as_timeseries(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')[['activity', 'location']]

In [None]:
# Translate the survey's location and activity codes through the uo.tus lookup maps.
# Codes missing from a map would become NaN; the next cell checks for that.
simple_ts = pd.DataFrame({
    'location': diary_data_ts.location.map(uo.tus.LOCATION_MAP),
    'activity': diary_data_ts.activity.map(uo.tus.ACTIVITY_MAP)
})

### Handle Unknowns

In [None]:
# Per column: is there any NaN, i.e. any code that was not covered by the maps?
simple_ts.isnull().any()

There are no nans.

In [None]:
# Share of entries for which the activity or the location (or both) is unknown.
is_unknown = (simple_ts.activity == uo.tus.Activity.UNKNOWN) | (simple_ts.location == uo.tus.Location.UNKNOWN)
len(simple_ts[is_unknown]) / len(simple_ts)

5.5% of all entries are missing.

In [None]:
# Work on a copy so that simple_ts itself stays untouched by the filling below.
filled_simple_ts = simple_ts.copy()

In [None]:
# Turn the explicit UNKNOWN markers into NaN so that they can be forward filled afterwards.
unknown_markers = [uo.tus.Location.UNKNOWN, uo.tus.Activity.UNKNOWN]
filled_simple_ts = filled_simple_ts.replace(to_replace=unknown_markers, value=np.nan)

In [None]:
# Summary of the NaN mask before forward filling.
filled_simple_ts.isnull().describe()

Unknowns will be filled by forward fill. That is, whenever an activity/location is unknown, it is assumed that the last known activity/location is still valid.

When doing that, it is important not to forward fill across diaries (all diaries are stacked below each other). Hence, the entries must be grouped by diary first and then forward filled. As a consequence, not all unknowns can be filled (namely the ones at the beginning of the day), but that is intended.

In [None]:
# Forward fill within each diary only. A diary is identified by the first four index levels,
# so grouping by them prevents values from leaking from one diary into the next.
# `GroupBy.ffill()` replaces the deprecated `GroupBy.fillna(method='ffill')`.
filled_simple_ts = filled_simple_ts.groupby(level=[0, 1, 2, 3]).ffill()

In [None]:
# Summary of the NaN mask after forward filling; compare with the summary before filling.
filled_simple_ts.isnull().describe()

In [None]:
# TODO don't forward fill over too long durations, e.g. not more than 1-2h.

The remaining nans are filtered in the Filter section below.

### Map to markov states.

In [None]:
# Combine the simplified location and activity into the states of the people model (uo.tus helper).
markov_ts = uo.tus.from_simplified_location_and_activity_to_people_model(filled_simple_ts)

### Filter

In [None]:
# TODO

In [None]:
def filter_nan(markov_ts):
    """Remove all diaries with at least one NaN.

    A diary is identified by the first four index levels; the fifth level
    distinguishes the entries within a diary.

    Parameters:
        markov_ts: pandas Series (or DataFrame) with a five-level MultiIndex.

    Returns:
        A DataFrame with the rows of all diaries that contain no NaN, in
        their original order.
    """
    frame = pd.DataFrame(markov_ts)
    row_has_nan = frame.isnull().any(axis=1)
    # Group by index level (vectorised) rather than by a Python lambda evaluated per row,
    # and broadcast the per-diary verdict back onto every row of that diary.
    diary_has_nan = row_has_nan.groupby(level=[0, 1, 2, 3]).transform('any')
    return frame[~diary_has_nan.values.astype(bool)]

def remove_people_not_in_seed(markov_ts, seed):
    """Keep only the entries of people that appear in the index of the seed.

    A person is identified by the first three index levels of `markov_ts`
    (index levels 3 and 4 are dropped before the lookup).
    """
    person_of_entry = markov_ts.index.droplevel([3, 4])
    is_known_person = person_of_entry.isin(seed.index)
    return markov_ts[is_known_person]


def filter_all_people_for_which_less_than_two_diaries_exist(markov_ts):
    """Keep only people that have exactly 24 * 6 * 2 entries in the time series.

    A person is identified by the first three index levels. The expected
    number of entries presumably corresponds to two diary days of six
    entries per hour each.
    """
    expected_entries = 24 * 6 * 2
    entries_per_person = markov_ts.groupby(level=[0, 1, 2]).size()
    complete_people = entries_per_person[entries_per_person == expected_entries].index
    person_of_entry = markov_ts.index.droplevel([3, 4])
    return markov_ts[person_of_entry.isin(complete_people)]

In [None]:
# Drop every diary that still contains a NaN after forward filling, and verify that none is left.
markov_ts = filter_nan(markov_ts)
assert not markov_ts.isnull().any().any()

In [None]:
# Restrict the time series to people that are in the seed and have exactly 24 * 6 * 2 entries ...
markov_ts = remove_people_not_in_seed(markov_ts, seed)
markov_ts = filter_all_people_for_which_less_than_two_diaries_exist(markov_ts)
# ... and restrict the seed to the people that remain in the time series.
seed = seed[seed.index.isin(markov_ts.index.droplevel([3, 4]))]
# Seed and time series must now describe the same people, 2 * 24 * 6 entries each.
assert len(seed.index) * 2 * 24 * 6 == len(markov_ts.index)

### Add DayType

In [None]:
# Diaries recorded Monday to Friday, according to the DDAYW2 variable of the diary file.
weekdays = diary_data[diary_data.DDAYW2 == diary.DDAYW2.WEEKDAY_MON___FRI]
# Default every entry to 'weekend', then overwrite the entries whose diary is in `weekdays`.
markov_ts['daytype'] = 'weekend'
markov_ts.loc[markov_ts.index.droplevel(4).isin(weekdays.index), 'daytype'] = 'weekday'
# Move index levels 3 and 4 into columns and index by (daytype, time_of_day) instead;
# the former level 3 (SN4) is then no longer needed and dropped.
markov_ts = markov_ts.reset_index(level=[3, 4]).set_index(['daytype', 'time_of_day'], append=True)
markov_ts.drop('SN4', axis=1, inplace=True)
# NOTE(review): this cell rebinds and mutates markov_ts, so it is presumably not safe to
# re-run without re-running the cells above first — confirm.
markov_ts.head()

In [None]:
# Pivot the first three index levels into the columns; the remaining index levels
# (daytype, time_of_day) stay as rows.
markov_ts = markov_ts.unstack([0, 1, 2])
# Drop the outermost column level that the unstacking leaves in place.
markov_ts.columns = markov_ts.columns.droplevel(0)

## Analyse Features

In [None]:
# taken from http://stackoverflow.com/a/39266194/1856079
import scipy.stats as ss

def cramers_corrected_stat(confusion_matrix):
    """Calculate the bias-corrected Cramér's V for categorical-categorical association.

    Uses the correction from Wicher Bergsma,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Parameters:
        confusion_matrix: contingency table of observed counts, e.g. the
            result of `pd.crosstab`.

    Returns:
        The corrected Cramér's V, or NaN when the statistic is undefined:
        with fewer than two observations, or when the corrected table has
        only one effective row or column (e.g. a single-row table).
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    if n <= 1:
        # The bias correction divides by (n - 1).
        return np.nan
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # Degenerate table: return NaN explicitly instead of dividing by zero.
        return np.nan
    return np.sqrt(phi2corr / denominator)

In [None]:
# Association between every pair of features, measured on the people for whom both
# features are known. The pairs are materialised once so that index and data are built
# from the same sequence, and every pair is filtered only once.
feature_pairs = list(combinations(ALL_FEATURES, 2))
pair_associations = []
for first_feature, second_feature in feature_pairs:
    first_name, second_name = str(first_feature), str(second_feature)
    pair_data = filter_features_and_drop_nan(seed, (first_name, second_name))
    pair_associations.append(
        cramers_corrected_stat(pd.crosstab(pair_data[first_name], pair_data[second_name]))
    )
feature_correlation = pd.Series(index=feature_pairs, data=pair_associations)

In [None]:
# Horizontal bar chart of all pairwise feature associations, weakest first, saved to the build folder.
ax = feature_correlation.sort_values().plot.barh(figsize=(14, 7))
_ = plt.title('Cramer phi estimation of people feature association')
fig = ax.get_figure()
fig.savefig((BUILD_FOLDER_PATH / 'people-feature-association.png').as_posix())

In [None]:
# Same values as in the bar chart, strongest association first.
feature_correlation.sort_values(ascending=False)

## Select Variables

### One dimensional features

First, let's look at one dimensional features only.

In [None]:
# NOTE(review): `.ix` is deprecated since pandas 0.20 and removed in pandas 1.0. Whether 10 is
# meant as a label or as a position cannot be told from here, so the call is left unchanged;
# replace it with an explicit .loc/.iloc selection once the intent is confirmed.
markov_ts.ix['weekend', 10].map(lambda x: x.value).plot()
# Label the y axis with the activities; assumes ppl.Activity has five members with the
# values 1 to 5 — TODO confirm.
_ = plt.yticks([1, 2, 3, 4, 5], [x for x in ppl.Activity])

In [None]:
def pairing_function(x, y):
    """Map the pair (x, y) to a single number with the Cantor pairing function.

    See http://stackoverflow.com/a/919661/1856079. The mapping is unique for
    natural numbers. Integral inputs are computed with exact integer
    arithmetic, so that large ids do not lose precision through floating
    point rounding; non-integral inputs fall back to the floating point
    formula.

    Returns:
        The pairing of x and y as a Python int.
    """
    int_x, int_y = int(x), int(y)
    if int_x == x and int_y == y:
        total = int_x + int_y
        # total * (total + 1) is always even, hence the floor division is exact.
        return total * (total + 1) // 2 + int_y
    return int(1/2 * (x + y) * (x + y + 1) + y)


def feature_id(feature_values):
    """Map a feature value, or a combination of feature values, to a single id.

    Parameters:
        feature_values: either a single feature value, or a tuple or pandas
            Series of feature values. Enum members are represented by the
            int of their value; all other values are used as they are.

    Returns:
        The value itself for a single feature value (or a combination of
        length one); otherwise the values folded from left to right with
        `pairing_function`, i.e. pairing(pairing(v0, v1), v2) and so on.

    Raises:
        IndexError: if an empty tuple or Series is given.
    """
    def as_number(value):
        # transform enums to ints
        return int(value.value) if isinstance(value, Enum) else value

    if not isinstance(feature_values, (tuple, pd.Series)):  # single feature value
        return as_number(feature_values)

    # Iterating visits the values by position. That avoids `series[0]` / `series[-1]`,
    # which are label lookups on a Series whose index holds the feature names.
    numbers = [as_number(value) for value in feature_values]
    combined = numbers[0]
    for number in numbers[1:]:
        combined = pairing_function(combined, number)
    return combined

        
def cramers_phi_for_feature(feature):
    """Create a function that measures the association of a series with the given feature(s).

    `feature` is a pandas Series (one feature) or a pandas DataFrame (one
    column per feature). Every entry/row is reduced to a single id first; the
    returned function cross-tabulates the values of the series it is given
    against these ids and returns the corrected Cramér's V of that table.

    Raises a ValueError if `feature` is neither a Series nor a DataFrame.
    """
    if isinstance(feature, pd.DataFrame):  # 2D or more
        ids_per_person = feature.apply(feature_id, axis=1)
    elif isinstance(feature, pd.Series):  # 1D
        ids_per_person = feature.apply(feature_id)
    else:
        raise ValueError('Feature must be pandas series or dataframe.')

    def association_with_feature(series):
        contingency_table = pd.crosstab(series.values, ids_per_person)
        return cramers_corrected_stat(contingency_table)

    return association_with_feature


from enum import Enum


class TestFeature(Enum):
    # Minimal enum that is only used to check feature_id below.
    A = 1
    B = 2
    C = 3


# A pair maps to the pairing of its two values; a triple pairs that result with the third value.
_expected_pair_id = pairing_function(1, 2)
_expected_triple_id = pairing_function(_expected_pair_id, 3)

assert feature_id(TestFeature.A) == 1
# Tuples and Series must lead to the same ids.
for _container in (tuple, pd.Series):
    assert feature_id(_container([TestFeature.A, TestFeature.B])) == _expected_pair_id
    assert feature_id(_container([TestFeature.A, TestFeature.B, TestFeature.C])) == _expected_triple_id

In [None]:
def cramers_phi_for_features(markov_ts, seed, features):
    """Calculate, for every row of `markov_ts`, the association with the given feature(s).

    Only people for whom all given features are known are taken into account:
    the seed is reduced to them, and the matching columns of `markov_ts` are
    selected through the index of the reduced seed.
    """
    complete_seed = filter_features_and_drop_nan(seed, features)
    people_with_features = markov_ts.loc[:, complete_seed.index]
    association_with_features = cramers_phi_for_feature(complete_seed)
    return people_with_features.apply(association_with_features, axis=1)

In [None]:
# One column per single feature: its association with the time series, per row of markov_ts.
ts_corr_1d = pd.DataFrame({
    feature: cramers_phi_for_features(markov_ts, seed, feature)
    for feature in [str(feature) for feature in ALL_FEATURES]
})   

In [None]:
# Plot the association of every single feature over all time steps and save the figure.
ax = ts_corr_1d.plot(figsize=(14, 7))
_ = plt.title("Cramer's phi for each time step of people model")
_ = plt.ylabel("Cramer's phi")
_ = plt.xlabel("time step")
fig = ax.get_figure()
fig.savefig((BUILD_FOLDER_PATH / 'markov_ts_cramer_1d.png').as_posix())

In [None]:
# Summary statistics of the association per single feature.
ts_corr_1d.describe()

### 2 dimensional features

In [None]:
# One column per pair of features (keyed by the tuple of feature names): the association of
# the combined feature with the time series, per row of markov_ts.
ts_corr_2d = pd.DataFrame({
    features: cramers_phi_for_features(markov_ts, seed, features)
    for features in combinations([str(feature) for feature in ALL_FEATURES], 2)
})   

In [None]:
# Column(s) of the single feature with the highest mean association.
best_1d = ts_corr_1d.loc[:, ts_corr_1d.mean() == ts_corr_1d.mean().max()]
# Use the highest mean itself as the threshold. Converting `best_1d.mean()` with float() breaks
# as soon as two features tie for the highest mean, and the cell below uses this expression, too.
best_1d_mean = ts_corr_1d.mean().max()
# Plot only the feature pairs that beat the best single feature on average.
ax = ts_corr_2d.loc[:, ts_corr_2d.mean() > best_1d_mean].plot(figsize=(14, 7))
_ = plt.title("Cramer's phi for each time step of people model")
_ = plt.ylabel("Cramer's phi")
_ = plt.xlabel("time step")
fig = ax.get_figure()
fig.savefig((BUILD_FOLDER_PATH / 'markov_ts_cramer_2d.png').as_posix())

In [None]:
# Summary statistics of the feature pairs that beat the best single feature on average.
ts_corr_2d.loc[:, ts_corr_2d.mean() > ts_corr_1d.mean().max()].describe()

In [None]:
# Summary statistics of all feature pairs, ordered by their mean association (highest first).
ts_corr_2d.describe().sort_values(by='mean', axis=1, ascending=False)

### 3 dimensional features

In [None]:
# One column per triple of features. Only the first four entries of ALL_FEATURES are combined
# here, not all of them.
ts_corr_3d = pd.DataFrame({
    features: cramers_phi_for_features(markov_ts, seed, features)
    for features in combinations([str(feature) for feature in ALL_FEATURES[:4]], 3)
})   

In [None]:
# Plot only the feature triples that beat the best feature pair on average and save the figure.
ax = ts_corr_3d.loc[:, ts_corr_3d.mean() > ts_corr_2d.mean().max()].plot(figsize=(14, 7))
_ = plt.title("Cramer's phi for each time step of people model")
_ = plt.ylabel("Cramer's phi")
_ = plt.xlabel("time step")
fig = ax.get_figure()
fig.savefig((BUILD_FOLDER_PATH / 'markov_ts_cramer_3d.png').as_posix())

In [None]:
# Summary statistics of the feature triples that beat the best feature pair on average.
ts_corr_3d.loc[:, ts_corr_3d.mean() > ts_corr_2d.mean().max()].describe()

## Visual Inspection of Clustering

In [None]:
from matplotlib.colors import ListedColormap
# Import from `scipy.ndimage` directly; the `scipy.ndimage.filters` namespace is deprecated.
from scipy.ndimage import gaussian_filter

# Grey scale for the heat maps: a slice of seaborn's light "black" palette.
GREY_COLORMAP = ListedColormap(sns.light_palette("black", 30)[2:20])
# Standard deviation of the Gaussian kernel that smooths the heat maps before plotting.
GAUSSIAN_SIGMA = 0.7

In [None]:
# Shade of every activity in the heat maps: 0 for everything away from the own home,
# 0.5 for sleeping at home, and 1.0 for being at home.
ACTIVITY_SHADES = [
    (ppl.Activity.NOT_AT_HOME, 0),
    (ppl.Activity.SLEEP_AT_OTHER_HOME, 0),
    (ppl.Activity.OTHER_HOME, 0),
    (ppl.Activity.SLEEP_AT_HOME, 0.5),
    (ppl.Activity.HOME, 1.0),
]
color_markov_ts = markov_ts.copy()
for activity, shade in ACTIVITY_SHADES:
    color_markov_ts = color_markov_ts.replace(to_replace=activity, value=shade)
# Make the result numeric explicitly instead of relying on `replace` to infer the dtype;
# the Gaussian filter applied before plotting needs a numeric array.
color_markov_ts = color_markov_ts.astype(float)

### Original data set

In [None]:
# Heat map of all diaries in their original column order, slightly smoothed, without ticks.
fig = plt.figure(figsize=(14, 7))
sns.heatmap(gaussian_filter(color_markov_ts, sigma=GAUSSIAN_SIGMA), cmap=GREY_COLORMAP, cbar=False)
_ = plt.xticks([])
_ = plt.yticks([])
_ = plt.xlabel('people')
_ = plt.ylabel('time of day')
fig.savefig((BUILD_FOLDER_PATH / 'all-diaries-original.png').as_posix())

### Clustered by economic activity

In [None]:
# Order the people by economic activity so that every group forms one contiguous block.
sorted_seed = seed.sort_values(by='economicActivity')
tranposed_markov_ts = color_markov_ts.transpose()
# NOTE(review): an earlier cell already drops the outermost column level of markov_ts;
# confirm that dropping another level here still leaves an index matching sorted_seed.index.
tranposed_markov_ts.index = tranposed_markov_ts.index.droplevel(0)
# Positional view of the sorted seed. Computed once, as it is the same for every group.
sorted_people = sorted_seed.reset_index()
last_entries_in_group = sorted_people.groupby('economicActivity').last()[['SN1', 'SN2', 'SN3']]
# Position of the last person of every group within the sorted seed.
cluster_boundaries = [sorted_people[(sorted_people.SN1 == last_entries_in_group.iloc[i, 0]) &
                                    (sorted_people.SN2 == last_entries_in_group.iloc[i, 1]) &
                                    (sorted_people.SN3 == last_entries_in_group.iloc[i, 2])].index.values[0]
                      for i in range(len(last_entries_in_group))]
cluster_boundaries = pd.Series(cluster_boundaries)
# Labels go halfway between the previous boundary (0 for the first group) and the group's own.
label_locations = cluster_boundaries.shift().fillna(0) + cluster_boundaries.diff().fillna(cluster_boundaries[0]) / 2
# int64 rather than int16, so that positions beyond 32767 cannot overflow.
label_locations = label_locations.astype(np.int64)
# NOTE(review): dropna() removes people with missing entries, which may shift the columns
# relative to the boundaries computed on the full sorted seed — confirm.
sorted_markov_ts = tranposed_markov_ts.reindex(sorted_seed.index).dropna().transpose()

In [None]:
# Heat map of all diaries with the people ordered by economic activity.
fig = plt.figure(figsize=(14, 7))
sns.heatmap(gaussian_filter(sorted_markov_ts, sigma=GAUSSIAN_SIGMA), cmap=GREY_COLORMAP, cbar=False)
# Vertical line at the last person of every group.
# NOTE(review): ymax=288 is hard-coded; presumably the number of rows of sorted_markov_ts — confirm.
plt.vlines(cluster_boundaries, ymin=0, ymax=288, color='black', linewidth=2)
# One tick per group, labelled with the economic activity of the group's last person.
_ = plt.xticks(label_locations.values, sorted_seed.economicActivity.iloc[cluster_boundaries].values)
_ = plt.yticks([])
_ = plt.xlabel('people')
_ = plt.ylabel('time of day')
fig.autofmt_xdate()
fig.savefig((BUILD_FOLDER_PATH / 'all-diaries-economic-activity-cluster.png').as_posix())

### Clustered by age group

In [None]:
# Order the people by age group so that every group forms one contiguous block.
sorted_seed = seed.sort_values(by='ageGroup')
transposed_markov_ts = color_markov_ts.transpose()
# NOTE(review): an earlier cell already drops the outermost column level of markov_ts;
# confirm that dropping another level here still leaves an index matching sorted_seed.index.
transposed_markov_ts.index = transposed_markov_ts.index.droplevel(0)
# Positional view of the sorted seed. Computed once, as it is the same for every group.
sorted_people = sorted_seed.reset_index()
last_entries_in_group = sorted_people.groupby('ageGroup').last()[['SN1', 'SN2', 'SN3']]
# Position of the last person of every group within the sorted seed.
cluster_boundaries = [sorted_people[(sorted_people.SN1 == last_entries_in_group.iloc[i, 0]) &
                                    (sorted_people.SN2 == last_entries_in_group.iloc[i, 1]) &
                                    (sorted_people.SN3 == last_entries_in_group.iloc[i, 2])].index.values[0]
                      for i in range(len(last_entries_in_group))]
cluster_boundaries = pd.Series(cluster_boundaries)
# Labels go halfway between the previous boundary (0 for the first group) and the group's own.
label_locations = cluster_boundaries.shift().fillna(0) + cluster_boundaries.diff().fillna(cluster_boundaries[0]) / 2
# int64 rather than int16, so that positions beyond 32767 cannot overflow.
label_locations = label_locations.astype(np.int64)
# NOTE(review): dropna() removes people with missing entries, which may shift the columns
# relative to the boundaries computed on the full sorted seed — confirm.
sorted_markov_ts = transposed_markov_ts.reindex(sorted_seed.index).dropna().transpose()

In [None]:
# Heat map of all diaries with the people ordered by age group.
fig = plt.figure(figsize=(14, 7))
sns.heatmap(gaussian_filter(sorted_markov_ts, sigma=GAUSSIAN_SIGMA), cmap=GREY_COLORMAP, cbar=False)
# Vertical line at the last person of every group.
# NOTE(review): ymax=288 is hard-coded; presumably the number of rows of sorted_markov_ts — confirm.
plt.vlines(cluster_boundaries, ymin=0, ymax=288, color='black', linewidth=2)
# One tick per group, labelled with the age group of the group's last person.
_ = plt.xticks(label_locations.values, sorted_seed.ageGroup.iloc[cluster_boundaries])
_ = plt.yticks([])
_ = plt.xlabel('people')
_ = plt.ylabel('time of day')
fig.autofmt_xdate()
fig.savefig((BUILD_FOLDER_PATH / 'all-diaries-age-group-cluster.png').as_posix())