In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_seq_items = 2000

# import python modules
%load_ext autoreload
%autoreload 2
import os
import sys

# Make the project's `src` directory importable without a machine-specific
# absolute path. The location is resolved relative to the notebook's working
# directory, the same convention the '../data/...' paths in this notebook use.
# NOTE(review): assumes `src` sits next to `data` in the project root — confirm.
SRC_DIR = os.path.abspath(os.path.join('..', 'src'))
if SRC_DIR not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(SRC_DIR)
from data_prep import journey_data_aggregation as agg

# Data Aggregation and Analysis

### Hourly Data Aggregation

In [19]:
# Load the cleaned, feature-engineered journey data, one CSV per year (2016-2019).
# Both date columns are parsed as datetimes; the first CSV column becomes the index.
journey_16, journey_17, journey_18, journey_19 = (
    pd.read_csv(csv_path, parse_dates=['end_date', 'start_date'], index_col=0)
    for csv_path in (
        '../data/processed/journey_data_clean_featureeng/journey_data_2016.csv',
        '../data/processed/journey_data_clean_featureeng/journey_data_2017.csv',
        '../data/processed/journey_data_clean_featureeng/journey_data_2018.csv',
        '../data/processed/journey_data_clean_featureeng/journey_data_2019.csv',
    )
)

  mask |= (ar1 == a)


In [22]:
# Hourly aggregation: run agg.aggregate_demand with the 'H' argument on each
# year's journeys, in chronological order.
agg_journey_16, agg_journey_17, agg_journey_18, agg_journey_19 = (
    agg.aggregate_demand(yearly_journeys, 'H')
    for yearly_journeys in (journey_16, journey_17, journey_18, journey_19)
)

In [26]:
# Hours in which no journey was started are missing from the aggregated data.
# agg.clean_aggregated_df_hourly adds rows for them so that every hour is
# represented, including those with a demand of 0.
borough_df = pd.read_csv('../data/interim/borough_data_featureeng.csv', index_col=0)

# Temporal and weather columns handed to the cleaning step.
temporal_weather_features = [
    'hour', 'part_of_day', 'day_of_week', 'day_of_month', 'day_of_year',
    'is_weekend', 'month', 'season', 'bank_holiday',
    'temp', 'feelslike', 'dew', 'humidity', 'precip',
    'windgust', 'windspeed', 'cloudcover', 'visibility', 'uvindex',
]

(agg_journey_16_added_0_demand,
 agg_journey_17_added_0_demand,
 agg_journey_18_added_0_demand,
 agg_journey_19_added_0_demand) = (
    agg.clean_aggregated_df_hourly(hourly_demand, borough_df, temporal_weather_features)
    for hourly_demand in (agg_journey_16, agg_journey_17, agg_journey_18, agg_journey_19)
)

In [27]:
# Print the total number of NaN values per year. Labels are derived from the
# year key so each count is reported under the year of the frame it belongs to
# (the 2018 frame was previously printed with a '2017: ' label).
frames_by_year = {
    2016: agg_journey_16_added_0_demand,
    2017: agg_journey_17_added_0_demand,
    2018: agg_journey_18_added_0_demand,
    2019: agg_journey_19_added_0_demand,
}
for year, yearly_frame in frames_by_year.items():
    # isna().sum() counts NaNs per column; the second sum() totals all columns.
    print(f'{year}: ', yearly_frame.isna().sum().sum())

2016:  0
2017:  0
2017:  0
2019:  0


In [30]:
# Write each year's hourly aggregate (with zero-demand hours added) to its own CSV.
hourly_outputs = {
    '../data/processed/aggregated_journey_data/agg_journey_data_hourly_2016.csv': agg_journey_16_added_0_demand,
    '../data/processed/aggregated_journey_data/agg_journey_data_hourly_2017.csv': agg_journey_17_added_0_demand,
    '../data/processed/aggregated_journey_data/agg_journey_data_hourly_2018.csv': agg_journey_18_added_0_demand,
    '../data/processed/aggregated_journey_data/agg_journey_data_hourly_2019.csv': agg_journey_19_added_0_demand,
}
for out_path, hourly_frame in hourly_outputs.items():
    hourly_frame.to_csv(out_path)

### Daily Data Aggregation

In [17]:
# Load the per-year journey data from the '*_dailyweather' CSVs (2016-2019).
# NOTE: this rebinds journey_16 ... journey_19, the same names the hourly
# section assigns; after this cell runs they refer to the dailyweather files.
journey_16, journey_17, journey_18, journey_19 = (
    pd.read_csv(csv_path, parse_dates=['end_date', 'start_date'], index_col=0)
    for csv_path in (
        '../data/processed/journey_data_clean_featureeng/journey_data_2016_dailyweather.csv',
        '../data/processed/journey_data_clean_featureeng/journey_data_2017_dailyweather.csv',
        '../data/processed/journey_data_clean_featureeng/journey_data_2018_dailyweather.csv',
        '../data/processed/journey_data_clean_featureeng/journey_data_2019_dailyweather.csv',
    )
)

  mask |= (ar1 == a)


In [18]:
# Daily aggregation: run agg.aggregate_demand with the 'D' argument on each
# year's journeys, in chronological order.
(agg_journey_daily_16,
 agg_journey_daily_17,
 agg_journey_daily_18,
 agg_journey_daily_19) = (
    agg.aggregate_demand(yearly_journeys, 'D')
    for yearly_journeys in (journey_16, journey_17, journey_18, journey_19)
)

In [19]:
# Clean each year's daily aggregate with agg.clean_aggregated_df_daily.
(agg_journey_daily_16_cleaned,
 agg_journey_daily_17_cleaned,
 agg_journey_daily_18_cleaned,
 agg_journey_daily_19_cleaned) = (
    agg.clean_aggregated_df_daily(daily_demand)
    for daily_demand in (
        agg_journey_daily_16,
        agg_journey_daily_17,
        agg_journey_daily_18,
        agg_journey_daily_19,
    )
)

In [20]:
# Write each year's cleaned daily aggregate to its own CSV.
daily_outputs = {
    '../data/processed/aggregated_journey_data/agg_journey_data_daily_2016.csv': agg_journey_daily_16_cleaned,
    '../data/processed/aggregated_journey_data/agg_journey_data_daily_2017.csv': agg_journey_daily_17_cleaned,
    '../data/processed/aggregated_journey_data/agg_journey_data_daily_2018.csv': agg_journey_daily_18_cleaned,
    '../data/processed/aggregated_journey_data/agg_journey_data_daily_2019.csv': agg_journey_daily_19_cleaned,
}
for out_path, daily_frame in daily_outputs.items():
    daily_frame.to_csv(out_path)

### Hourly Aggregated Demand Analysis: Detailed Weekly Demand 2019 and Yearly Comparisons 

In [None]:

# Boroughs for which the weekly demand plots are produced.
boroughs = [
    'Westminster',
    'Tower Hamlets',
    'Kensington and Chelsea',
    'Camden',
    'Hammersmith and Fulham',
    'Lambeth',
    'Wandsworth',
    'Southwark',
    'Hackney',
    'City of London',
    'Islington',
    'Newham',
]

# Results of agg.plot_demand_by_week_borough, keyed 'weekly_demand_<borough>_<year>'.
borough_data = {}

for borough in boroughs:
    # The helper receives the hourly frames newest-first (2019 -> 2016) and its
    # four return values are unpacked in that same order.
    weekly_19, weekly_18, weekly_17, weekly_16 = agg.plot_demand_by_week_borough(
        agg_journey_19_added_0_demand,
        agg_journey_18_added_0_demand,
        agg_journey_17_added_0_demand,
        agg_journey_16_added_0_demand,
        borough,
    )
    borough_data.update({
        f'weekly_demand_{borough}_2019': weekly_19,
        f'weekly_demand_{borough}_2018': weekly_18,
        f'weekly_demand_{borough}_2017': weekly_17,
        f'weekly_demand_{borough}_2016': weekly_16,
    })

Hours 0–23 of the weekly cycle correspond to Monday. Data from 01.01.2017, which was a Sunday, therefore appears towards the end of the weekly cycle in the visualization, at hours 144–167. This may seem counter-intuitive if you are used to calendars in which Sunday is the first day of the week.

In [110]:
# Inspect specific days: City of London, 2018, week 23, weekly hours 144-159.
df_inspect = borough_data['weekly_demand_City of London_2018']

# Build the two row filters separately, then apply them together.
in_week_23 = df_inspect['week_of_year'] == 23
in_hour_window = df_inspect['hour_of_week'].isin(range(144, 160))
df_inspect = df_inspect[in_week_23 & in_hour_window]

# Last expression of the cell: display only the columns of interest.
columns_to_show = ['demand', 'start_date_hour', 'hour', 'hour_of_week', 'week_of_year', 'day_of_week']
df_inspect[columns_to_show]


Unnamed: 0,demand,start_date_hour,hour,hour_of_week,week_of_year,day_of_week
44525,29,2018-06-10 00:00:00,0,144,23,6
44537,18,2018-06-10 01:00:00,1,145,23,6
44549,2,2018-06-10 02:00:00,2,146,23,6
44560,11,2018-06-10 03:00:00,3,147,23,6
44571,4,2018-06-10 04:00:00,4,148,23,6
44583,2,2018-06-10 05:00:00,5,149,23,6
44594,2,2018-06-10 06:00:00,6,150,23,6
44605,13,2018-06-10 07:00:00,7,151,23,6
44617,13,2018-06-10 08:00:00,8,152,23,6
44629,25,2018-06-10 09:00:00,9,153,23,6
