In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytz
import requests_cache
import seaborn as sns
sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
%matplotlib inline

from pytus2000 import read_diary_file, diary, read_individual_file, individual, read_diary_file_as_timeseries
import pytus2000
import people as ppl
# Put the parent directory on sys.path (once) before the `ktp` imports that follow.
# NOTE(review): presumably the local `ktp` package lives in that directory - confirm.
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
import ktp.tus
import ktp.census

In [None]:
# Folder for generated artefacts: this notebook puts the pytus2000 cache, the web cache
# and the saved figure there.
BUILD_FOLDER_PATH = Path('./build/')
# Folder containing the time use survey files that this notebook reads.
TUS_DATA_FOLDER_PATH = Path('./data/UKDA-4504-tab/')

In [None]:
# Point the pytus2000 cache at the build folder and install the requests cache
# under the name 'web-cache' in the same folder.
pytus2000.set_cache_location(BUILD_FOLDER_PATH)
requests_cache.install_cache((BUILD_FOLDER_PATH / 'web-cache').as_posix())

## Participants

In [None]:
individual_data = read_individual_file(TUS_DATA_FOLDER_PATH / 'tab' / 'individual_data_5.tab')
## TODO filter city population
# Build the seed table: one row per entry of the individual file, with the survey
# codes translated through the ktp.tus maps.
seed = pd.DataFrame(index=individual_data.index, columns=['labour', 'qualification', 'age', 'hhtype'])
seed['labour'] = individual_data.ECONACT2.map(ktp.tus.ECONOMIC_ACTIVITY_MAP)
# Everybody older than 74 gets the dedicated ABOVE_74 category. Assign through .loc:
# the chained form `seed.labour[mask] = ...` may write to a temporary copy and is a
# no-op when pandas copy-on-write is active.
seed.loc[individual_data.IAGE > 74, 'labour'] = ktp.census.EconomicActivity.ABOVE_74
seed['qualification'] = individual_data.HIQUAL4.map(ktp.tus.QUALIFICATION_MAP)
seed['age'] = individual_data.IAGE.copy()
seed['hhtype'] = individual_data.HHTYPE4.map(ktp.tus.HOUSEHOLDTYPE_MAP)
## TODO add more feature, all that are in census data as well
seed.head()

In [None]:
# Keep only the rows for which every seed feature is known.
seed = seed.dropna(axis='index', how='any')
assert seed.notnull().all().all()

In [None]:
# Group by the first two index levels (used here as the household key). The keys must
# be passed as a list: pandas interprets a tuple as one single key.
household_keys = [seed.index.get_level_values(0), seed.index.get_level_values(1)]
household_types = seed.groupby(household_keys).hhtype.first()
household_sizes = seed.groupby(household_keys).hhtype.count()

# A household is invalid when the number of its members present in the seed contradicts
# its household type. The conditions are combined as boolean masks sharing one index,
# instead of OR-ing differently indexed integer Series.
# NOTE(review): ONE_PERSON_HOUSEHOLD is not filtered here although the test cell asserts
# a size of exactly 1 for it - confirm this is intended.
invalid_mask = (
    ((household_types == ktp.census.HouseholdType.COUPLE_WITH_DEPENDENT_CHILDREN) & (household_sizes <= 2)) |
    ((household_types == ktp.census.HouseholdType.COUPLE_WITHOUT_DEPENDENT_CHILDREN) & (household_sizes != 2)) |
    ((household_types == ktp.census.HouseholdType.LONE_PARENT_WITH_DEPENDENT_CHILDREN) & (household_sizes < 2)) |
    ((household_types == ktp.census.HouseholdType.MULTI_PERSON_HOUSEHOLD) & (household_sizes <= 2))
)
invalids = household_sizes[invalid_mask]

# The labels are household keys, i.e. partial keys of the seed index; every row of a
# matching household is dropped.
seed.drop(labels=invalids.index, level=None, inplace=True)


print("{} households are invalid and were removed.".format(invalids.count()))

### Test

In [None]:
# Age, labour and qualification categories must be mutually consistent.
assert not ((seed.labour == ktp.census.EconomicActivity.ABOVE_74) & (seed.age <= 74)).any()
assert not ((seed.labour == ktp.census.EconomicActivity.BELOW_16) & (seed.age >= 16)).any()
assert not ((seed.labour == ktp.census.EconomicActivity.BELOW_16) & (seed.qualification != ktp.census.Qualification.BELOW_16)).any()
assert not ((seed.labour != ktp.census.EconomicActivity.BELOW_16) & (seed.qualification == ktp.census.Qualification.BELOW_16)).any()

# Household sizes must match the household type. The grouping keys are passed as a list
# because pandas interprets a tuple as one single key.
household_keys = [seed.index.get_level_values(0), seed.index.get_level_values(1)]
household_types = seed.groupby(household_keys).hhtype.first()
household_sizes = seed.groupby(household_keys).hhtype.count()
assert (household_sizes[household_types == ktp.census.HouseholdType.ONE_PERSON_HOUSEHOLD] == 1).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.COUPLE_WITH_DEPENDENT_CHILDREN] > 2).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.LONE_PARENT_WITH_DEPENDENT_CHILDREN] > 1).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.COUPLE_WITHOUT_DEPENDENT_CHILDREN] == 2).all()
assert (household_sizes[household_types == ktp.census.HouseholdType.MULTI_PERSON_HOUSEHOLD] > 2).all()

## Time Series

In [None]:
# Read the diary file twice: once in its raw form (used further down to derive the day
# type) and once as a time series, of which only activity and location are kept.
diary_data = read_diary_file(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')
diary_data_ts = read_diary_file_as_timeseries(TUS_DATA_FOLDER_PATH / 'tab' / 'diary_data_8.tab')[['activity', 'location']]

In [None]:
# Translate the raw location and activity codes through the ktp.tus maps.
simple_ts = pd.DataFrame({
    column: diary_data_ts[column].map(mapping)
    for column, mapping in [('location', ktp.tus.LOCATION_MAP),
                            ('activity', ktp.tus.ACTIVITY_MAP)]
})

### Handle Unknowns

In [None]:
# Per column: does the mapped time series contain any missing value?
simple_ts.isnull().any()

There are no NaNs.

In [None]:
# Fraction of entries whose activity or location is UNKNOWN.
len(simple_ts[(simple_ts.activity == ktp.tus.Activity.UNKNOWN) | (simple_ts.location == ktp.tus.Location.UNKNOWN)]) / len(simple_ts)

About 5.5% of all entries have an unknown activity or location.

In [None]:
# Work on a copy so that simple_ts itself stays untouched.
filled_simple_ts = simple_ts.copy()

In [None]:
# Turn the UNKNOWN markers into NaN so that they are treated as missing values.
filled_simple_ts.replace(to_replace=[ktp.tus.Location.UNKNOWN, ktp.tus.Activity.UNKNOWN], value=np.nan, inplace=True)

In [None]:
# Summarise, per column, how many entries are missing.
filled_simple_ts.isnull().describe()

Unknowns will be filled by forward fill. That is, whenever an activity/location is unknown, the last known activity/location is assumed to still be valid.

When doing that, it is important not to forward fill across diaries (all diaries are stacked below each other). Hence, the entries must be grouped by diary first and then forward filled. As a consequence, not all unknowns can be filled (those at the beginning of a diary remain), but that is intended.

In [None]:
# Forward fill within each group of the first four index levels only (one group per
# diary), so that values never leak from one diary into the next.
# GroupBy.ffill() replaces groupby(...).fillna(method='ffill'), which is deprecated.
filled_simple_ts = filled_simple_ts.groupby(level=[0, 1, 2, 3]).ffill()

In [None]:
# Summarise, per column, how many entries are still missing.
filled_simple_ts.isnull().describe()

In [None]:
# TODO don't forward fill over too long durations, e.g. not more than 1-2h.

The remaining NaNs are filtered in the Filter section below.

### Map to markov states.

In [None]:
# Convert the simplified location/activity time series with the ktp.tus helper.
# NOTE(review): presumably this yields one people-model state per time step - confirm.
markov_ts = ktp.tus.from_simplified_location_and_activity_to_people_model(filled_simple_ts)

### Filter

In [None]:
# TODO

In [None]:
def filter_nan(markov_ts, diary_data):
    """Remove all diaries with at least one NaN.

    A diary is identified by the first four levels of the index; the fifth level
    distinguishes the entries within one diary.

    Parameters:
        markov_ts: Series or DataFrame with a five-level index.
        diary_data: unused; kept so that existing calls keep working.

    Returns:
        A DataFrame with the rows of all diaries that contain no NaN at all.
    """
    frame = pd.DataFrame(markov_ts)
    # Reduce to one flag per row first, so that Series and DataFrame input behave alike.
    # A per-column result would not filter anything when used as a mask.
    row_has_nan = frame.isnull().any(axis=1)
    diary_has_nan = row_has_nan.groupby(level=[0, 1, 2, 3]).any()
    complete_diaries = diary_has_nan[~diary_has_nan].index
    return frame[frame.index.droplevel(4).isin(complete_diaries)]

def remove_people_not_in_seed(markov_ts, seed):
    """Keep only the rows whose first three index levels occur in the index of seed."""
    person_index = markov_ts.index.droplevel([3, 4])
    return markov_ts[person_index.isin(seed.index)]


def filter_all_people_for_which_less_than_two_diaries_exist(markov_ts):
    """Keep only people that have exactly 24 * 6 * 2 rows in markov_ts.

    A person is identified by the first three index levels.
    """
    person_keys = [markov_ts.index.get_level_values(level) for level in range(3)]
    rows_per_person = markov_ts.groupby(person_keys).size()
    complete_people = rows_per_person[rows_per_person == 24 * 6 * 2].index
    return markov_ts[markov_ts.index.droplevel([3, 4]).isin(complete_people)]

In [None]:
# Drop every diary that still contains a NaN and verify that none is left.
markov_ts = filter_nan(markov_ts, diary_data)
assert not markov_ts.isnull().any().any()

In [None]:
# Restrict the time series to people that are in the seed and pass the diary-count filter ...
markov_ts = remove_people_not_in_seed(markov_ts, seed)
markov_ts = filter_all_people_for_which_less_than_two_diaries_exist(markov_ts)
# ... then restrict the seed to the people that are left in the time series.
seed = seed[seed.index.isin(markov_ts.index.droplevel([3, 4]))]
# Every remaining person must contribute exactly 2 * 24 * 6 time steps.
assert len(seed.index) * 2 * 24 * 6 == len(markov_ts.index)

In [None]:
## TODO exchange SN4 with day type

### Add DayType

In [None]:
# Label every row with a day type: rows of diaries whose DDAYW2 is the Monday-Friday
# category become 'weekday', all other rows stay 'weekend'.
weekdays = diary_data[diary_data.DDAYW2 == diary.DDAYW2.WEEKDAY_MON___FRI]
markov_ts['daytype'] = 'weekend'
markov_ts.loc[markov_ts.index.droplevel(4).isin(weekdays.index), 'daytype'] = 'weekday'
# Move index levels 3 and 4 into columns, re-append daytype and time_of_day as index
# levels, and discard the remaining SN4 column.
markov_ts = markov_ts.reset_index(level=[3, 4]).set_index(['daytype', 'time_of_day'], append=True)
markov_ts.drop('SN4', axis=1, inplace=True)
markov_ts.head()

In [None]:
# Pivot the first three index levels into the columns; the rows are then indexed by
# the remaining levels (daytype, time_of_day).
markov_ts = markov_ts.unstack([0, 1, 2])

## Analyse Features

In [None]:
# taken from http://stackoverflow.com/a/39266194/1856079
import scipy.stats as ss

def cramers_corrected_stat(confusion_matrix):
    """Return the bias-corrected Cramér's V of a contingency table.

    Measures the association between two categorical variables, using the
    correction from Bergsma (2013), Journal of the Korean Statistical
    Society 42: 323-328.

    Parameters:
        confusion_matrix: two-dimensional table of counts (e.g. from pd.crosstab).

    Returns:
        The corrected statistic as a float.
    """
    chi2_statistic = ss.chi2_contingency(confusion_matrix)[0]
    sample_size = confusion_matrix.sum().sum()
    n_rows, n_cols = confusion_matrix.shape
    # Bias-corrected phi squared, clipped at zero.
    phi2_corrected = max(0, chi2_statistic / sample_size - ((n_cols - 1) * (n_rows - 1)) / (sample_size - 1))
    # Bias-corrected table dimensions.
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (sample_size - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (sample_size - 1)
    denominator = min((cols_corrected - 1), (rows_corrected - 1))
    return np.sqrt(phi2_corrected / denominator)

In [None]:
# Association between the labour and qualification features of the seed.
cramers_corrected_stat(pd.crosstab(seed['labour'], seed['qualification']))

In [None]:
# Correlation between the dummy-encoded household types.
sns.heatmap(pd.get_dummies(seed['hhtype'], columns=['hhtype']).corr())

In [None]:
# Correlation between all seed features, with the categorical ones dummy encoded.
fig = plt.figure(figsize=(14, 7))
sns.heatmap(pd.get_dummies(seed, columns=['labour', 'hhtype', 'qualification']).corr())

## Select Variables

In [None]:
# The `.ix` indexer was removed in pandas 1.0; select by label and by position explicitly.
# NOTE(review): this assumes the original call meant the rows labelled 'weekend' and the
# column at position 10 - confirm.
markov_ts.loc['weekend'].iloc[:, 10].map(lambda x: x.value).plot()
_ = plt.yticks([1, 2, 3, 4, 5], [x for x in ppl.Activity])

In [None]:
def timestep_corr(index, markov_ts, feature):
    """Return the corrected Cramér's V between a feature and one row of markov_ts.

    The row at position `index` is paired with `feature` by position: its index
    is overwritten with the index of `feature` before cross-tabulating.
    """
    states = markov_ts.iloc[index, :]
    states.index = feature.index
    contingency_table = pd.crosstab(feature, states)
    return cramers_corrected_stat(contingency_table)

In [None]:
def pairing_function(x, y):
    """Encode the pair (x, y) of non-negative integers as one unique integer.

    Uses the Cantor pairing function, http://stackoverflow.com/a/919661/1856079.
    """
    # (x + y) * (x + y + 1) is always even, so integer division is exact. Staying in
    # integer arithmetic avoids float rounding for large inputs.
    return int((x + y) * (x + y + 1) // 2 + y)

In [None]:
# Features to correlate with the time series: the three single seed features and their
# pairwise combinations. Each combination is encoded once, up front, as a single integer
# per row instead of being recomputed for every time step.
corr_features = {
    'qualification': seed.qualification,
    'labour': seed.labour,
    'hhtype': seed.hhtype,
    'q+l': seed.apply(lambda row: pairing_function(row.qualification.value, row.labour.value), axis=1),
    'q+h': seed.apply(lambda row: pairing_function(row.qualification.value, row.hhtype.value), axis=1),
    'l+h': seed.apply(lambda row: pairing_function(row.labour.value, row.hhtype.value), axis=1),
}
# One correlation value per row position of markov_ts.
# NOTE(review): 144 * 2 presumably means 144 time steps per day for each of the two day types - confirm.
time_steps = range(144 * 2)
ts_corr = pd.DataFrame({
    name: [timestep_corr(i, markov_ts, feature) for i in time_steps]
    for name, feature in corr_features.items()
})

In [None]:
# Plot the statistic of every feature (combination) over the time steps and save the figure.
ax = ts_corr.plot(figsize=(14, 7))
ax.set_title("Cramer's phi for each time step of people model")
ax.set_ylabel("Cramer's phi")
ax.set_xlabel("time step")
fig = ax.get_figure()
fig.savefig((BUILD_FOLDER_PATH / 'markov_ts_cramer.png').as_posix())