In [1]:
import pandas as pd
import os
from pathlib import Path
import logging
import warnings
import itertools

from utils import *

warnings.filterwarnings("ignore")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
)

In [2]:
MODE_MAPPING = {
    "car": "pv",
    "car_passenger": "pv",
    "bus": "pt",
    "rail": "pt",
    "ferry": "pt",
    "walk": "active",
    "bike": "active",
    "freight": "freight",
    "lightrail": "pt",
    "rail": "pt",
    "train": "pt",
}

def read_trips(scenario_path) -> pd.DataFrame:
    df = pd.read_csv(os.path.join(scenario_path, "trips.csv"), low_memory=False)
    persons = pd.read_csv(os.path.join(scenario_path, "synthetic_persons.csv"), low_memory=False)
    persons = persons[['person_id', 'household_id', 'home_region', 'sa2', 'age', 'gender', 
                       'labour_force_status', 'student_status', 'hh_size', 'car_availability', 
                       'is_working_from_home', 'income_group']] #select important columns
    persons['age_group'] = persons["age"].apply(get_age_group)
    persons = persons.rename(columns={'sa2': 'home_sa2'})
    
    df['person'] = df['person'].astype(str)
    persons['person_id'] = persons['person_id'].astype(str)

    df = df.merge(persons, left_on='person', right_on = 'person_id', how='left')
    distance_bins = range(0, 40, 1)

    df["dep_time"] = (df["dep_time"].apply(get_sec)) / 3600
    df["trav_time"] = (df["trav_time"].apply(get_sec)) / 3600
    df["period"] = df["dep_time"].apply(get_time_period)
    df["traveled_distance"] = (df["traveled_distance"].map(int)) / 1000  # Down to km

    df["traveled_distance_bin"] = pd.cut(df["traveled_distance"] / 1000, distance_bins)
    df.loc[df["person"].str.contains("hgv"), "longest_distance_mode"] = "freight"
    df["longest_distance_mode"] = df["longest_distance_mode"].map(MODE_MAPPING)

    df["od_activities"] = df["start_activity_type"].str.split("_").str[0] + "-" + df["end_activity_type"].str.split("_").str[0]
    df['trip_purpose'] = df['od_activities'].apply(get_trip_purpose)

    return df

In [4]:
scenario_path = '/Users/tszchun.chow/Documents/GitHub/monty-data-visualization/pt_variations/baseline'
trip_df = read_trips(scenario_path)

In [5]:
trip_df.columns

Index(['person', 'trip_number', 'trip_id', 'dep_time', 'trav_time',
       'wait_time', 'traveled_distance', 'euclidean_distance', 'main_mode',
       'longest_distance_mode', 'modes', 'start_activity_type',
       'end_activity_type', 'start_facility_id', 'start_link', 'start_x',
       'start_y', 'end_facility_id', 'end_link', 'end_x', 'end_y',
       'first_pt_boarding_stop', 'last_pt_egress_stop', 'start_sa2', 'end_sa2',
       'start_regional_council', 'end_regional_council', 'person_id',
       'household_id', 'home_region', 'home_sa2', 'age', 'gender',
       'labour_force_status', 'student_status', 'hh_size', 'car_availability',
       'is_working_from_home', 'income_group', 'age_group', 'period',
       'traveled_distance_bin', 'od_activities', 'trip_purpose'],
      dtype='object')

In [9]:
DEMOGRAPHIC_TYPE = ["gender", "age_group", "income_group", 
                   "labour_force_status", "student_status", "car_availability"] 

In [26]:
groups = ['period', 'longest_distance_mode', 'trip_purpose', 'od', 'demo_type']
trips_metrics_dfs = []

def metrics_groupby(trip_df, subset_cols):
    logging.info(f"Grouping metrics for {str(subset_cols)}")
    trip_metrics_df = trip_df.groupby(subset_cols).agg(
        {'trip_id':'count', 'trav_time': 'mean', 'traveled_distance': 'mean'}).reset_index()
    
    demo_type = [i for i in subset_cols if i in DEMOGRAPHIC_TYPE]
    if len(demo_type) > 0:
        trip_metrics_df['demographic_type'] = demo_type[0]
        trip_metrics_df = trip_metrics_df.rename(columns={demo_type[0]: 'demographic_group'})

    return trip_metrics_df

for L in range(len(groups) + 1):
    for subset in itertools.combinations(groups, L):
        subset = [col for col in subset]
        if not subset:
             break 
        if ('demo_type' in subset) and ('od' in subset):
            for demo_type in DEMOGRAPHIC_TYPE:
                temp_subset_cols = subset.copy()
                temp_subset_cols[temp_subset_cols.index('demo_type')] = demo_type
                for od in ['start', 'end']:
                    subset_cols = temp_subset_cols.copy()
                    subset_cols[subset_cols.index('od')] = od+'_regional_council'

                    trips_metrics_dfs.append(metrics_groupby(trip_df, subset_cols))

        elif ('demo_type' in subset):
            for demo_type in DEMOGRAPHIC_TYPE:
                subset_cols = subset.copy()
                subset_cols[subset_cols.index('demo_type')] = demo_type

                trips_metrics_dfs.append(metrics_groupby(trip_df, subset_cols))

        elif ('od' in subset):
            for od in ['start', 'end']:
                subset_cols = subset.copy()
                subset_cols[subset_cols.index('od')] = od+'_regional_council'

                trips_metrics_dfs.append(metrics_groupby(trip_df, subset_cols))
                
        else:
            subset_cols = subset.copy()
            trips_metrics_dfs.append(metrics_groupby(trip_df, subset_cols))
        
        

2023-12-12 12:14:43,002 - INFO - [3456935546.py:5] - Grouping metrics for ['period']
2023-12-12 12:14:43,178 - INFO - [3456935546.py:5] - Grouping metrics for ['longest_distance_mode']
2023-12-12 12:14:43,419 - INFO - [3456935546.py:5] - Grouping metrics for ['trip_purpose']
2023-12-12 12:14:43,610 - INFO - [3456935546.py:5] - Grouping metrics for ['start_regional_council']
2023-12-12 12:14:43,811 - INFO - [3456935546.py:5] - Grouping metrics for ['end_regional_council']
2023-12-12 12:14:44,009 - INFO - [3456935546.py:5] - Grouping metrics for ['gender']
2023-12-12 12:14:44,175 - INFO - [3456935546.py:5] - Grouping metrics for ['age_group']
2023-12-12 12:14:44,342 - INFO - [3456935546.py:5] - Grouping metrics for ['income_group']
2023-12-12 12:14:44,483 - INFO - [3456935546.py:5] - Grouping metrics for ['labour_force_status']
2023-12-12 12:14:44,656 - INFO - [3456935546.py:5] - Grouping metrics for ['student_status']
2023-12-12 12:14:44,834 - INFO - [3456935546.py:5] - Grouping metrics

In [31]:
trips_metrics_summary= pd.concat(trips_metrics_dfs, axis = 0, join='outer'
                            ).rename(columns={'trip_id': 'trip_count',
                                            'longest_distance_mode': 'mode',
                                            'trav_time': 'travel_time'
                                            }
                            )

category_cols = ['period', 'mode', 'trip_purpose', 'start_regional_council', 'end_regional_council', 'demographic_type', 'demographic_group']
#category_cols = list(set(trips_metrics_summary.columns) - set(['trip_count', 'travel_time', 'traveled_distance']))

trips_metrics_summary = trips_metrics_summary.fillna("All")
trips_metrics_summary['period'] = trips_metrics_summary['period'].fillna("Daily")
trips_metrics_summary.melt(id_vars=category_cols, 
                            var_name="Metric", 
                            value_name="Value")
#trips_metrics_summary.to_csv('tripmetrics.csv', index=None)

Unnamed: 0,period,mode,trip_purpose,start_regional_council,end_regional_council,demographic_type,demographic_group,Metric,Value
0,AM,All,All,All,All,All,All,trip_count,262987.000000
1,IP,All,All,All,All,All,All,trip_count,763372.000000
2,OP,All,All,All,All,All,All,trip_count,295565.000000
3,PM,All,All,All,All,All,All,trip_count,426503.000000
4,All,active,All,All,All,All,All,trip_count,203951.000000
...,...,...,...,...,...,...,...,...,...
215248,PM,pv,non home-based,All,Waikato Region,car_availability,Never,traveled_distance,7.225890
215249,PM,pv,non home-based,All,Wellington Region,car_availability,Always,traveled_distance,4.567347
215250,PM,pv,non home-based,All,Wellington Region,car_availability,Never,traveled_distance,4.975395
215251,PM,pv,non home-based,All,West Coast Region,car_availability,Always,traveled_distance,6.429346
