### Imports

In [1]:
#Spark Imports
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

#Python Standard Libs Imports
import json
import urllib2
import sys
from datetime import datetime
from os.path import isfile, join, splitext
from glob import glob

#Imports to enable visualizations
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Functions

#### Basic Functions

In [2]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_folders(path, sqlContext, sc, initial_date, final_date, folder_suffix):
    extension = splitext(path)[1]

    if extension == "":
        path_pattern = path + "/*/part-*"
        if "hdfs" in path:
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

            hdfs = "/".join(path_pattern.split("/")[:3])
            dir = "/" + "/".join(path_pattern.split("/")[3:])

            fs = FileSystem.get(URI(hdfs), Configuration())

            status = fs.globStatus(Path(dir))

            files = map(lambda file_status: str(file_status.getPath()), status)

        else:
            files = glob(path_pattern)

        #print initial_date, final_date
        #print datetime.strptime(files[0].split('/')[-2],('%Y_%m_%d' + folder_suffix))

        files = filter(lambda f: initial_date <= datetime.strptime(f.split("/")[-2], ('%Y_%m_%d' + folder_suffix)) <=
                                 final_date, files)
        
        #print len(files)
        #print files
        if folder_suffix == '_od':
            return reduce(lambda df1, df2: df1.unionAll(df2),
                      map(lambda f: read_hdfs_folder(sqlContext,f), files))
        else:
            return reduce(lambda df1, df2: df1.unionAll(df2),
                      map(lambda f: read_buste_data_v3(sqlContext,f), files))
    else:
        return read_file(path, sqlContext)

def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath, header=True,
                                     inferSchema=True,nullValue="-")
    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    data_frame = read_hdfs_folder(sqlContext,folderpath)
    data_frame = data_frame.withColumn("date", F.unix_timestamp(F.col("date"),'yyyy_MM_dd'))
    
    return data_frame

def printdf(df,l=10):
    return df.limit(l).toPandas()

def get_timestamp_in_tz(unixtime_timestamp,ts_format,tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format),tz)


#### Analysis Functions

In [3]:
def get_otp_itineraries(otp_url,o_lat,o_lon,d_lat,d_lon,date,time,verbose=False):
    otp_http_request = 'routers/ctba/plan?fromPlace={},{}&toPlace={},{}&mode=TRANSIT,WALK&date={}&time={}'
    otp_request_url = otp_url + otp_http_request.format(o_lat,o_lon,d_lat,d_lon,date,time)
    if verbose:
        print otp_request_url
    return json.loads(urllib2.urlopen(otp_request_url).read())

def get_otp_suggested_trips(od_matrix,otp_url):
    trips_otp_response = {}
    counter = 0
    for row in od_matrix.collect():
        id=long(row['user_trip_id'])
        start_time = row['o_base_datetime'].split(' ')[1]
        trip_plan = get_otp_itineraries(otp_url,row['o_shape_lat'], row['o_shape_lon'], row['shapeLat'], row['shapeLon'],row['date'],start_time)
        trips_otp_response[id] = trip_plan
        counter+=1

    return trips_otp_response

def advance_od_matrix_start_time(od_matrix,extra_seconds):
    return od_matrix.withColumn('o_datetime', F.concat(F.col('date'), F.lit(' '), F.col('o_timestamp'))) \
                    .withColumn('d_datetime', F.concat(F.col('date'), F.lit(' '), F.col('timestamp'))) \
                    .withColumn('executed_duration', (F.unix_timestamp('d_datetime') - F.unix_timestamp('o_datetime'))/60) \
                    .withColumn('o_base_datetime', F.from_unixtime(F.unix_timestamp(F.col('o_datetime'),'yyyy-MM-dd HH:mm:ss') - extra_seconds, 'yyyy-MM-dd HH:mm:ss')) \

def extract_otp_trips_legs(otp_trips):
    trips_legs = []

    for trip in otp_trips.keys():
        if 'plan' in otp_trips[trip]:
            itinerary_id = 1
            for itinerary in otp_trips[trip]['plan']['itineraries']:
                date = otp_trips[trip]['plan']['date']/1000
                leg_id = 1
                for leg in itinerary['legs']:
                    route = leg['route'] if leg['route'] != '' else None
                    fromStopId = leg['from']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                    toStopId = leg['to']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                    start_time = long(leg['startTime'])/1000
                    end_time = long(leg['endTime'])/1000
                    duration = (end_time - start_time)/60
                    trips_legs.append((date,trip,itinerary_id,leg_id,start_time,end_time,leg['mode'],route,fromStopId,toStopId, duration))
                    leg_id += 1
                itinerary_id += 1
    return trips_legs

def prepare_otp_legs_df(otp_legs_list):
    labels=['date','user_trip_id','itinerary_id','leg_id','otp_start_time','otp_end_time','mode','route','from_stop_id','to_stop_id','otp_duration_mins']
    otp_legs_df = sqlContext.createDataFrame(otp_legs_list, labels) \
                        .withColumn('date',F.from_unixtime(F.col('date'),'yyyy-MM-dd')) \
                        .withColumn('otp_duration_mins',((F.col('otp_end_time') - F.col('otp_start_time'))/60)) \
                        .withColumn('otp_start_time',F.from_unixtime(F.col('otp_start_time'),'yyyy-MM-dd HH:mm:ss').astype('timestamp')) \
                        .withColumn('otp_end_time',F.from_unixtime(F.col('otp_end_time'),'yyyy-MM-dd HH:mm:ss').astype('timestamp')) \
                        .withColumn('route', F.col('route').astype('integer')) \
                        .withColumn('from_stop_id', F.col('from_stop_id').astype('integer')) \
                        .withColumn('to_stop_id', F.col('to_stop_id').astype('integer')) \
                        .orderBy(['date','user_trip_id','itinerary_id','otp_start_time'])

    return otp_legs_df

def get_df_stats(df,filtered_df,df_label,filtered_df_label):
    df_size = df.count()
    filtered_df_size = filtered_df.count()
    print "Total", df_label,":", df_size
    print "Total", filtered_df_label, ":", filtered_df_size, "(", 100*(filtered_df_size/float(df_size)), "%)"

def get_filtered_df_stats(filtered_df,full_df_size,filtered_df_label,full_df_label):
    filtered_df_size = filtered_df.count()
    print filtered_df_label, "in Total", full_df_label, ":", filtered_df_size, "(", 100*(filtered_df_size/float(full_df_size)), "%)"

def clean_buste_data(buste_data):
    return buste_data.select(["date","route","busCode","tripNum","stopPointId","timestamp"]) \
        .na.drop(subset=["date","route","busCode","tripNum","stopPointId","timestamp"]) \
        .dropDuplicates(['date','route','busCode','tripNum','stopPointId']) \
        .withColumn('route',F.col('route').astype('float')) \
        .withColumn('date',F.from_unixtime(F.col('date'),'yyyy-MM-dd')) \
        .withColumn('timestamp',F.from_unixtime(F.unix_timestamp(F.concat(F.col('date'),F.lit(' '),F.col('timestamp')), 'yyyy-MM-dd HH:mm:ss')))

def find_otp_bus_legs_actual_start_time(otp_legs_df,clean_bus_trips_df):
    w = Window.partitionBy(['date','user_trip_id','itinerary_id','route','from_stop_id']).orderBy(['timediff'])
    return otp_legs_df \
        .withColumn('stopPointId', F.col('from_stop_id')) \
        .join(clean_bus_trips_df, ['date','route','stopPointId'], how='inner') \
        .na.drop(subset=['timestamp']) \
        .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('otp_start_time')))) \
        .drop('otp_duration') \
        .withColumn('rn', F.row_number().over(w)) \
        .where(F.col('rn') == 1) \
        .select(['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','otp_start_time','timestamp','to_stop_id','otp_end_time']) \
        .withColumnRenamed('timestamp','from_timestamp')

def find_otp_bus_legs_actual_end_time(otp_legs_st,clean_bus_trips):
    return otp_legs_st \
                .withColumnRenamed('to_stop_id','stopPointId') \
                .join(clean_bus_trips, ['date','route','busCode','tripNum','stopPointId'], how='inner') \
                .na.drop(subset=['timestamp']) \
                .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('otp_end_time')))) \
                .withColumnRenamed('timestamp', 'to_timestamp') \
                .withColumnRenamed('stopPointId','to_stop_id') \
                .orderBy(['date','route','stopPointId','timediff'])

def clean_otp_legs_actual_time_df(otp_legs_st_end_df):
    return otp_legs_start_end \
                .select(['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','from_timestamp','to_stop_id','to_timestamp']) \
                .withColumn('actual_duration_mins', (F.unix_timestamp(F.col('to_timestamp')) - F.unix_timestamp(F.col('from_timestamp')))/60) \
                .orderBy(['date','user_trip_id','itinerary_id','leg_id']) \
                .filter('actual_duration_mins > 0')

def combine_otp_suggestions_with_bus_legs_actual_time(otp_suggestions,bus_legs_actual_time):
    return otp_legs_df \
                .join(clean_otp_legs_actual_time, on=['date','user_trip_id','itinerary_id','leg_id', 'route', 'from_stop_id','to_stop_id'], how='left_outer') \
                .withColumn('considered_duration_mins', F.when(F.col('mode') == F.lit('BUS'), F.col('actual_duration_mins')).otherwise(F.col('otp_duration_mins'))) \
                .withColumn('considered_start_time', F.when(F.col('mode') == F.lit('BUS'), F.col('from_timestamp')).otherwise(F.col('otp_start_time')))

def select_itineraries_fully_identified(otp_itineraries_legs):
    itineraries_not_fully_identified = otp_itineraries_legs \
                                        .filter((otp_itineraries_legs.mode == 'BUS') & (otp_itineraries_legs.busCode.isNull())) \
                                        .select(['date','user_trip_id','itinerary_id']).distinct()
    itineraries_fully_identified = otp_itineraries_legs.select(['date','user_trip_id','itinerary_id']).subtract(itineraries_not_fully_identified)
    return otp_itineraries_legs.join(itineraries_fully_identified, on=['date','user_trip_id','itinerary_id'], how='inner')

def rank_otp_itineraries_by_actual_duration(trips_itineraries):
    itineraries_window = Window.partitionBy(['date','user_trip_id']).orderBy(['actual_duration_mins'])
    return trips_itineraries.withColumn('rank', F.row_number().over(itineraries_window))

def get_trips_itineraries_pool(trips_otp_alternatives,od_mat):
    return trips_otp_alternatives \
                .union(od_mat \
                .withColumn('itinerary_id', F.lit(0)) \
                .withColumnRenamed('executed_duration','duration') \
                .withColumnRenamed('o_datetime', 'alt_start_time') \
                .select(['date','user_trip_id','itinerary_id','duration','alt_start_time'])) \
                .orderBy(['date','user_trip_id','itinerary_id'])

def determining_trips_alternatives_feasibility(otp_itineraries_legs,od_mat):
    trips_itineraries_possibilities = otp_itineraries_legs \
                        .groupBy(['date', 'user_trip_id', 'itinerary_id']) \
                        .agg(F.sum('considered_duration_mins').alias('duration'), \
                             F.first('considered_start_time').alias('alt_start_time')) \
                        .orderBy(['date','user_trip_id','itinerary_id']) \
            .join(od_mat \
                        .withColumnRenamed('o_datetime','exec_start_time') \
                        .select(['date','user_trip_id','exec_start_time']),
                on=['date','user_trip_id']) \
            .withColumn('start_diff', (F.abs(F.unix_timestamp(F.col('exec_start_time')) - F.unix_timestamp(F.col('alt_start_time')))/60))

    filtered_trips_possibilities = trips_itineraries_possibilities \
                                        .filter(F.col('start_diff') <= 20) \
                                        .drop('exec_start_time', 'start_diff')

    return (trips_itineraries_possibilities,filtered_trips_possibilities)

def select_best_trip_itineraries(itineraries_pool):
    return rank_otp_itineraries_by_actual_duration(itineraries_pool).filter('rank == 1') \
                                    .drop('rank')

def compute_improvement_capacity(best_itineraries,od_mat):
    return  od_mat \
                .withColumnRenamed('o_datetime','exec_start_time') \
                .select(['date','user_trip_id','cardNum','birthdate','gender','exec_start_time','executed_duration']) \
            .join(best_itineraries, on=['date','user_trip_id']) \
            .withColumn('imp_capacity', F.col('executed_duration') - F.col('duration'))


### Main Code

#### Reading Input Variables

In [4]:
initial_date = datetime.strptime('2017-05-09', '%Y-%m-%d')
final_date = datetime.strptime('2017-05-09', '%Y-%m-%d')
od_matrix_folderpath = '/local/tarciso/masters/data/bus_trips/test/single-day-test/2017_05_09/od/trips_od/'
buste_data_folderpath = '/local/tarciso/masters/data/bus_trips/test/single-day-test/2017_05_09/buste/'
otp_server_url = 'http://150.165.85.4:10402/otp/'
results_folderpath = '/local/tarciso/masters/data/bus_trips/single-day-test/'

#### Settting Up Environment

In [5]:
#Get Spark Session
spark  = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.crossJoin.enabled', 'true')

sc = spark.sparkContext
sc.setLogLevel("ERROR")
sqlContext = pyspark.SQLContext(sc)

In [6]:
sc._conf.getAll()

[(u'hive.metastore.warehouse.dir',
  u'file:/local/tarciso/workspace/trips-optimality-exp/spark-warehouse/'),
 (u'spark.driver.host', u'10.30.0.113'),
 (u'spark.app.id', u'local-1521639691833'),
 (u'spark.driver.port', u'38409'),
 (u'spark.driver.memory', u'4g'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.master', u'local[*]'),
 (u'spark.executor.id', u'driver'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.app.name', u'PySparkShell')]

#### Reading OD Matrix

In [7]:
print "Got Spark Context"

print "Reading OD-Matrix Data..."
od_matrix_day_folderpath = od_matrix_folderpath + '/' + initial_date.strftime('%Y_%m_%d') + '_od'
od_matrix = read_hdfs_folder(sqlContext, od_matrix_day_folderpath) \
                .withColumnRenamed('date','date_in_secs') \
                .withColumn('date', F.from_unixtime(F.col('date_in_secs'), 'yyyy-MM-dd')) \
                .withColumnRenamed('o_boarding_id','user_trip_id')
        
print "Preprocessing Data..."
od_matrix = advance_od_matrix_start_time(od_matrix,120)

Got Spark Context
Reading OD-Matrix Data...
Preprocessing Data...


In [8]:
printdf(od_matrix)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,o_unixtimestamp,next_o_unixtimestamp,leg_duration,dist,rn,date,o_datetime,d_datetime,executed_duration,o_base_datetime
0,370,4,2789,5510497,-25.436972,-49.277444,3768.768,BC032,,-25.437973,...,46949,73805,447.6,0.583099,1,2017-05-09,2017-05-09 10:02:29,2017-05-09 10:45:35,43.1,2017-05-09 10:00:29
1,380,6,4127,6448424,-25.427902,-49.263242,6213.697,LC302,,-25.428101,...,65029,75415,173.1,0.020988,1,2017-05-09,2017-05-09 15:03:49,2017-05-09 15:56:29,52.666667,2017-05-09 15:01:49
2,712,2,2170,6657911,-25.484518,-49.333416,4717.637,JA012,,-25.484293,...,32680,45992,221.866667,0.214177,1,2017-05-09,2017-05-09 06:04:40,2017-05-09 06:24:55,20.25,2017-05-09 06:02:40
3,211,7,1776,6633394,-25.402759,-49.213098,0.0,BA037,,-25.402765,...,76931,39309,-1.0,0.974592,1,2017-05-09,2017-05-09 18:22:11,2017-05-09 19:08:20,46.15,2017-05-09 18:20:11
4,628,7,2935,6035542,-25.495496,-49.302459,13282.36,HA025,,-25.495243,...,57933,72766,247.216667,0.368644,1,2017-05-09,2017-05-09 13:05:33,2017-05-09 13:38:32,32.983333,2017-05-09 13:03:33
5,183,7,1753,6096567,-25.428575,-49.271158,10934.301,BC311,,-25.428583,...,60800,62505,28.416667,0.798145,1,2017-05-09,2017-05-09 13:53:20,2017-05-09 14:28:34,35.233333,2017-05-09 13:51:20
6,876,1,2937,6065893,-25.422933,-49.304968,4090.977,BC300,,-25.42285,...,32943,64125,519.7,0.904173,1,2017-05-09,2017-05-09 06:09:03,2017-05-09 06:50:39,41.6,2017-05-09 06:07:03
7,463,7,2846,6348437,-25.479636,-49.193971,0.0,DC087,,-25.479658,...,32479,50924,307.416667,0.0,1,2017-05-09,2017-05-09 06:01:19,2017-05-09 13:12:29,431.166667,2017-05-09 05:59:19
8,777,2,2195,4300046,-25.461139,-49.325868,7161.699,JC012,,-25.461108,...,38155,31834,-1.0,0.121724,1,2017-05-09,2017-05-09 07:35:55,2017-05-09 07:45:22,9.45,2017-05-09 07:33:55
9,462,5,3102,5855535,-25.434773,-49.272324,12450.673,DC296,,-25.434785,...,52149,49923,-1.0,0.114577,1,2017-05-09,2017-05-09 11:29:09,2017-05-09 11:59:30,30.35,2017-05-09 11:27:09


In [9]:
raw_od_matrix = od_matrix

In [10]:
od_matrix = raw_od_matrix.limit(100)

#### Getting OTP suggested itineraries

In [11]:
print "Getting OTP suggested itineraries..."
otp_suggestions = get_otp_suggested_trips(od_matrix,otp_server_url)

print "Extracting OTP Legs info..."
otp_legs_df = prepare_otp_legs_df(extract_otp_trips_legs(otp_suggestions))
#otp_legs_df.write.csv(path=results_folderpath+'/trip_plans',header=True, mode='append')

otp_suggestions = None


Getting OTP suggested itineraries...
Extracting OTP Legs info...


#### Printing Itineraries Stats

In [12]:
total_num_itineraries = otp_legs_df.select('user_trip_id','itinerary_id').distinct().count()
total_num_legs = otp_legs_df.count()
num_bus_legs = otp_legs_df.filter('mode == \'BUS\'').count()

print "Total num itineraries:", total_num_itineraries
print "Total num legs:", total_num_legs
print "Total num bus legs:", num_bus_legs, '(', 100*(num_bus_legs/float(total_num_legs)), '%)'


Total num itineraries: 296
Total num legs: 864
Total num bus legs: 292 ( 33.7962962963 %)


#### Matching OTP Bus Legs Origins with BUSTE Data

In [13]:
print "Reading BUSTE data..."
bus_trips_data = read_folders(buste_data_folderpath, sqlContext, sc, initial_date, final_date,'_veiculos')
clean_bus_trips_data = clean_buste_data(bus_trips_data)

print "Finding OTP Bus Legs Actual Start Times in Bus Trips Data..."
otp_legs_st = find_otp_bus_legs_actual_start_time(otp_legs_df,clean_bus_trips_data)


Reading BUSTE data...
Finding OTP Bus Legs Actual Start Times in Bus Trips Data...


In [14]:
printdf(clean_bus_trips_data)

Unnamed: 0,date,route,busCode,tripNum,stopPointId,timestamp
0,2017-05-09,1.0,BN997,10,31454,2017-05-09 10:41:35
1,2017-05-09,1.0,BN997,23,30748,2017-05-09 15:38:20
2,2017-05-09,1.0,BN998,28,30749,2017-05-09 17:51:42
3,2017-05-09,2.0,DN027,5,26550,2017-05-09 09:20:36
4,2017-05-09,2.0,DN028,13,29080,2017-05-09 14:56:07
5,2017-05-09,2.0,DN028,13,40026,2017-05-09 14:59:19
6,2017-05-09,10.0,BB001,4,33161,2017-05-09 09:10:51
7,2017-05-09,10.0,BB302,3,32842,2017-05-09 09:40:43
8,2017-05-09,10.0,BB304,3,33167,2017-05-09 09:48:20
9,2017-05-09,11.0,BB002,8,31743,2017-05-09 15:37:41


In [15]:
printdf(otp_legs_df)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-05-09,25769803948,1,1,2017-05-09 18:16:42,2017-05-09 18:16:45,WALK,,,,0.05
1,2017-05-09,25769803948,1,2,2017-05-09 18:16:46,2017-05-09 18:23:38,BUS,11.0,32801.0,29938.0,6.866667
2,2017-05-09,25769803948,1,3,2017-05-09 18:23:39,2017-05-09 18:23:59,WALK,,,,0.333333
3,2017-05-09,25769803948,2,1,2017-05-09 18:35:25,2017-05-09 18:35:28,WALK,,,,0.05
4,2017-05-09,25769803948,2,2,2017-05-09 18:35:29,2017-05-09 18:42:56,BUS,11.0,32801.0,29938.0,7.45
5,2017-05-09,25769803948,2,3,2017-05-09 18:42:57,2017-05-09 18:43:17,WALK,,,,0.333333
6,2017-05-09,25769803948,3,1,2017-05-09 18:56:25,2017-05-09 18:56:28,WALK,,,,0.05
7,2017-05-09,25769803948,3,2,2017-05-09 18:56:29,2017-05-09 19:03:56,BUS,11.0,32801.0,29938.0,7.45
8,2017-05-09,25769803948,3,3,2017-05-09 19:03:57,2017-05-09 19:04:17,WALK,,,,0.333333
9,2017-05-09,51539607858,1,1,2017-05-09 13:25:20,2017-05-09 13:25:49,WALK,,,,0.483333


#### Printing Matched OTP Bus Legs Origins Stats

In [16]:
num_bus_legs_st = otp_legs_st.count()
print "Num Bus Legs whose start was found:", num_bus_legs_st, '(', 100*(num_bus_legs_st/float(num_bus_legs)), '%)'


Num Bus Legs whose start was found: 291 ( 99.6575342466 %)


#### Cleaning Memory

In [17]:
#Clean memory
#otp_legs_df.unpersist(blocking=True)
#bus_trips_data.unpersist(blocking=True)
#clean_bus_trips_data.unpersist(blocking=True)

#### Matching OTP Bus Legs Destinations with BUSTE Data

In [18]:
print "Reading BUSTE data again..."
bus_trips_data2 = read_folders(buste_data_folderpath, sqlContext, sc, initial_date, final_date,'_veiculos')
clean_bus_trips_data2 = clean_buste_data(bus_trips_data2)

print "Finding OTP Bus Legs Actual End Times in Bus Trips Data..."
otp_legs_start_end = find_otp_bus_legs_actual_end_time(otp_legs_st,clean_bus_trips_data2)
clean_otp_legs_actual_time = clean_otp_legs_actual_time_df(otp_legs_start_end)


Reading BUSTE data again...
Finding OTP Bus Legs Actual End Times in Bus Trips Data...


In [19]:
num_matched_bus_legs_st = clean_otp_legs_actual_time.count()
print "Num Bus Legs whose end was found:", num_matched_bus_legs_st, '(', 100*(num_matched_bus_legs_st/float(num_bus_legs)), '%)'


Num Bus Legs whose end was found: 263 ( 90.0684931507 %)


#### Cleaning Memory

In [20]:
#Clean Memory
#otp_legs_st.unpersist(blocking=True)
#bus_trips_data2.unpersist(blocking=True)
#clean_bus_trips_data2.unpersist(blocking=True)
#otp_legs_start_end.unpersist(blocking=True)


#### Enriching OTP suggestions legs with actual time data

In [21]:
print "Enriching OTP suggestions legs with actual time data..."
all_legs_actual_time = combine_otp_suggestions_with_bus_legs_actual_time(otp_legs_df,clean_otp_legs_actual_time)


Enriching OTP suggestions legs with actual time data...


#### Filtering out itineraries with bus legs not identified in bus data

In [22]:
print "Filtering out itineraries with bus legs not identified in bus data..."
clean_legs_actual_time = select_itineraries_fully_identified(all_legs_actual_time)


Filtering out itineraries with bus legs not identified in bus data...


In [23]:
printdf(clean_legs_actual_time)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,otp_start_time,otp_end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp,actual_duration_mins,considered_duration_mins,considered_start_time
0,2017-05-09,979252543572,2,1,,,,2017-05-09 06:50:38,2017-05-09 06:50:40,WALK,0.033333,,,,,,0.033333,2017-05-09 06:50:38
1,2017-05-09,979252543572,2,2,655.0,38518.0,36238.0,2017-05-09 06:50:41,2017-05-09 06:57:00,BUS,6.316667,HA283,1.0,2017-05-09 06:48:37,2017-05-09 06:57:02,8.416667,8.416667,2017-05-09 06:48:37
2,2017-05-09,979252543572,2,3,,,,2017-05-09 06:57:01,2017-05-09 06:57:02,WALK,0.016667,,,,,,0.016667,2017-05-09 06:57:01
3,2017-05-09,1005022347340,2,1,,,,2017-05-09 18:15:27,2017-05-09 18:15:36,WALK,0.15,,,,,,0.15,2017-05-09 18:15:27
4,2017-05-09,1005022347340,2,3,,,,2017-05-09 18:29:56,2017-05-09 18:30:11,WALK,0.25,,,,,,0.25,2017-05-09 18:29:56
5,2017-05-09,1005022347340,2,2,243.0,27513.0,31108.0,2017-05-09 18:15:37,2017-05-09 18:29:55,BUS,14.3,BA124,21.0,2017-05-09 17:57:09,2017-05-09 18:08:24,11.25,11.25,2017-05-09 17:57:09
6,2017-05-09,1374389535030,1,2,380.0,29867.0,29889.0,2017-05-09 15:06:54,2017-05-09 15:20:41,BUS,13.783333,LC302,6.0,2017-05-09 15:02:56,2017-05-09 15:23:43,20.783333,20.783333,2017-05-09 15:02:56
7,2017-05-09,1374389535030,1,3,,,,2017-05-09 15:20:42,2017-05-09 15:23:52,WALK,3.166667,,,,,,3.166667,2017-05-09 15:20:42
8,2017-05-09,1374389535030,1,1,,,,2017-05-09 15:06:51,2017-05-09 15:06:53,WALK,0.033333,,,,,,0.033333,2017-05-09 15:06:51
9,2017-05-09,1400159338669,2,3,,,,2017-05-09 13:20:01,2017-05-09 13:21:38,WALK,1.616667,,,,,,1.616667,2017-05-09 13:20:01


In [24]:
printdf(clean_legs_actual_time.filter(clean_legs_actual_time['mode'] == 'BUS') \
        .select(['otp_start_time','otp_end_time','otp_duration_mins','from_timestamp','to_timestamp','actual_duration_mins']))

Unnamed: 0,otp_start_time,otp_end_time,otp_duration_mins,from_timestamp,to_timestamp,actual_duration_mins
0,2017-05-09 06:50:41,2017-05-09 06:57:00,6.316667,2017-05-09 06:48:37,2017-05-09 06:57:02,8.416667
1,2017-05-09 18:15:37,2017-05-09 18:29:55,14.3,2017-05-09 17:57:09,2017-05-09 18:08:24,11.25
2,2017-05-09 15:06:54,2017-05-09 15:20:41,13.783333,2017-05-09 15:02:56,2017-05-09 15:23:43,20.783333
3,2017-05-09 13:06:23,2017-05-09 13:20:00,13.616667,2017-05-09 12:45:33,2017-05-09 12:57:50,12.283333
4,2017-05-09 08:00:29,2017-05-09 08:18:00,17.516667,2017-05-09 07:56:31,2017-05-09 08:15:39,19.133333
5,2017-05-09 11:44:55,2017-05-09 11:50:00,5.083333,2017-05-09 11:37:26,2017-05-09 11:52:16,14.833333
6,2017-05-09 14:02:35,2017-05-09 14:17:00,14.416667,2017-05-09 14:07:44,2017-05-09 14:20:05,12.35
7,2017-05-09 15:29:35,2017-05-09 15:35:17,5.7,2017-05-09 15:23:49,2017-05-09 15:30:59,7.166667
8,2017-05-09 10:15:10,2017-05-09 10:31:26,16.266667,2017-05-09 10:13:57,2017-05-09 10:23:10,9.216667
9,2017-05-09 14:39:49,2017-05-09 15:09:00,29.183333,2017-05-09 14:42:15,2017-05-09 15:08:46,26.516667


#### Printing Fully Identified OTP Itineraries Stats

In [25]:
num_itineraries_fully_identified = clean_legs_actual_time.select('user_trip_id','itinerary_id').distinct().count()
print "Num Itineraries fully identified in BUSTE data:", num_itineraries_fully_identified, '(', 100*(num_itineraries_fully_identified/float(total_num_itineraries)), '%)'

Num Itineraries fully identified in BUSTE data: 267 ( 90.2027027027 %)


In [26]:
print "Writing OTP suggested itineraries legs with actual time to file..."
#clean_legs_actual_time.write.csv(path=results_folderpath+'/otp_legs_matched',header=True, mode='append')


Writing OTP suggested itineraries legs with actual time to file...


In [27]:
#Clean Memory
#clean_otp_legs_actual_time.unpersist(blocking=True)
#all_legs_actual_time.unpersist(blocking=True)

In [28]:
printdf(od_matrix)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,o_unixtimestamp,next_o_unixtimestamp,leg_duration,dist,rn,date,o_datetime,d_datetime,executed_duration,o_base_datetime
0,370,4,2789,5510497,-25.436972,-49.277444,3768.768,BC032,,-25.437973,...,46949,73805,447.6,0.583099,1,2017-05-09,2017-05-09 10:02:29,2017-05-09 10:45:35,43.1,2017-05-09 10:00:29
1,380,6,4127,6448424,-25.427902,-49.263242,6213.697,LC302,,-25.428101,...,65029,75415,173.1,0.020988,1,2017-05-09,2017-05-09 15:03:49,2017-05-09 15:56:29,52.666667,2017-05-09 15:01:49
2,712,2,2170,6657911,-25.484518,-49.333416,4717.637,JA012,,-25.484293,...,32680,45992,221.866667,0.214177,1,2017-05-09,2017-05-09 06:04:40,2017-05-09 06:24:55,20.25,2017-05-09 06:02:40
3,211,7,1776,6633394,-25.402759,-49.213098,0.0,BA037,,-25.402765,...,76931,39309,-1.0,0.974592,1,2017-05-09,2017-05-09 18:22:11,2017-05-09 19:08:20,46.15,2017-05-09 18:20:11
4,628,7,2935,6035542,-25.495496,-49.302459,13282.36,HA025,,-25.495243,...,57933,72766,247.216667,0.368644,1,2017-05-09,2017-05-09 13:05:33,2017-05-09 13:38:32,32.983333,2017-05-09 13:03:33
5,183,7,1753,6096567,-25.428575,-49.271158,10934.301,BC311,,-25.428583,...,60800,62505,28.416667,0.798145,1,2017-05-09,2017-05-09 13:53:20,2017-05-09 14:28:34,35.233333,2017-05-09 13:51:20
6,876,1,2937,6065893,-25.422933,-49.304968,4090.977,BC300,,-25.42285,...,32943,64125,519.7,0.904173,1,2017-05-09,2017-05-09 06:09:03,2017-05-09 06:50:39,41.6,2017-05-09 06:07:03
7,463,7,2846,6348437,-25.479636,-49.193971,0.0,DC087,,-25.479658,...,32479,50924,307.416667,0.0,1,2017-05-09,2017-05-09 06:01:19,2017-05-09 13:12:29,431.166667,2017-05-09 05:59:19
8,777,2,2195,4300046,-25.461139,-49.325868,7161.699,JC012,,-25.461108,...,38155,31834,-1.0,0.121724,1,2017-05-09,2017-05-09 07:35:55,2017-05-09 07:45:22,9.45,2017-05-09 07:33:55
9,462,5,3102,5855535,-25.434773,-49.272324,12450.673,DC296,,-25.434785,...,52149,49923,-1.0,0.114577,1,2017-05-09,2017-05-09 11:29:09,2017-05-09 11:59:30,30.35,2017-05-09 11:27:09


In [29]:
clean_legs_actual_time.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- route: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- otp_start_time: timestamp (nullable = true)
 |-- otp_end_time: timestamp (nullable = true)
 |-- mode: string (nullable = true)
 |-- otp_duration_mins: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- to_timestamp: string (nullable = true)
 |-- actual_duration_mins: double (nullable = true)
 |-- considered_duration_mins: double (nullable = true)
 |-- considered_start_time: string (nullable = true)



In [30]:
printdf(clean_legs_actual_time \
           .orderBy(['date','user_trip_id','itinerary_id','leg_id']))

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,otp_start_time,otp_end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp,actual_duration_mins,considered_duration_mins,considered_start_time
0,2017-05-09,25769803948,1,1,,,,2017-05-09 18:16:42,2017-05-09 18:16:45,WALK,0.05,,,,,,0.05,2017-05-09 18:16:42
1,2017-05-09,25769803948,1,2,11.0,32801.0,29938.0,2017-05-09 18:16:46,2017-05-09 18:23:38,BUS,6.866667,JB302,10.0,2017-05-09 18:21:07,2017-05-09 18:36:11,15.066667,15.066667,2017-05-09 18:21:07
2,2017-05-09,25769803948,1,3,,,,2017-05-09 18:23:39,2017-05-09 18:23:59,WALK,0.333333,,,,,,0.333333,2017-05-09 18:23:39
3,2017-05-09,25769803948,2,1,,,,2017-05-09 18:35:25,2017-05-09 18:35:28,WALK,0.05,,,,,,0.05,2017-05-09 18:35:25
4,2017-05-09,25769803948,2,2,11.0,32801.0,29938.0,2017-05-09 18:35:29,2017-05-09 18:42:56,BUS,7.45,JB302,10.0,2017-05-09 18:21:07,2017-05-09 18:36:11,15.066667,15.066667,2017-05-09 18:21:07
5,2017-05-09,25769803948,2,3,,,,2017-05-09 18:42:57,2017-05-09 18:43:17,WALK,0.333333,,,,,,0.333333,2017-05-09 18:42:57
6,2017-05-09,51539607858,1,1,,,,2017-05-09 13:25:20,2017-05-09 13:25:49,WALK,0.483333,,,,,,0.483333,2017-05-09 13:25:20
7,2017-05-09,51539607858,1,2,628.0,35188.0,6521.0,2017-05-09 13:25:50,2017-05-09 14:03:27,BUS,37.616667,HA016,7.0,2017-05-09 13:29:50,2017-05-09 14:11:15,41.416667,41.416667,2017-05-09 13:29:50
8,2017-05-09,51539607858,1,3,,,,2017-05-09 14:03:28,2017-05-09 14:03:57,WALK,0.483333,,,,,,0.483333,2017-05-09 14:03:28
9,2017-05-09,51539607858,2,1,,,,2017-05-09 13:53:20,2017-05-09 13:53:49,WALK,0.483333,,,,,,0.483333,2017-05-09 13:53:20


#### Gather all trips alternative/executed itineraries info

In [31]:
first_boarding_time = clean_legs_actual_time \
                        .filter('mode == \'BUS\'') \
                        .groupby(['date', 'user_trip_id', 'itinerary_id']) \
                        .agg(F.first('otp_start_time').alias('planned_start_time'), \
                             F.first('considered_start_time').alias('actual_start_time')) \
                        .orderBy(['date','user_trip_id','itinerary_id'])        
                
printdf(first_boarding_time)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_start_time,actual_start_time
0,2017-05-09,25769803948,1,2017-05-09 18:16:46,2017-05-09 18:21:07
1,2017-05-09,25769803948,2,2017-05-09 18:35:29,2017-05-09 18:21:07
2,2017-05-09,51539607858,1,2017-05-09 13:25:50,2017-05-09 13:29:50
3,2017-05-09,51539607858,2,2017-05-09 13:53:50,2017-05-09 14:03:36
4,2017-05-09,51539607858,3,2017-05-09 14:21:50,2017-05-09 14:25:12
5,2017-05-09,68719476891,1,2017-05-09 18:35:38,2017-05-09 18:45:08
6,2017-05-09,68719476891,2,2017-05-09 18:55:24,2017-05-09 18:57:18
7,2017-05-09,68719476891,3,2017-05-09 18:50:20,2017-05-09 18:52:44
8,2017-05-09,111669149831,1,2017-05-09 09:42:40,2017-05-09 09:36:49
9,2017-05-09,111669149831,2,2017-05-09 09:59:40,2017-05-09 09:56:08


In [32]:
user_trips_time_info = od_matrix \
                        .withColumnRenamed('executed_duration','exec_duration_mins') \
                        .withColumnRenamed('o_datetime','exec_start_time') \
                        .select(['date','user_trip_id','exec_duration_mins','exec_start_time'])

printdf(user_trips_time_info)

Unnamed: 0,date,user_trip_id,exec_duration_mins,exec_start_time
0,2017-05-09,721554505729,43.1,2017-05-09 10:02:29
1,2017-05-09,1374389535030,52.666667,2017-05-09 15:03:49
2,2017-05-09,420906795093,20.25,2017-05-09 06:04:40
3,2017-05-09,970662608985,46.15,2017-05-09 18:22:11
4,2017-05-09,51539607858,32.983333,2017-05-09 13:05:33
5,2017-05-09,1013612282010,35.233333,2017-05-09 13:53:20
6,2017-05-09,1434519076987,41.6,2017-05-09 06:09:03
7,2017-05-09,592705487116,431.166667,2017-05-09 06:01:19
8,2017-05-09,1262720385130,9.45,2017-05-09 07:35:55
9,2017-05-09,1563368095850,30.35,2017-05-09 11:29:09


In [33]:
matched_otp_legs = clean_legs_actual_time \
                            .groupBy(['date', 'user_trip_id', 'itinerary_id']) \
                            .agg(F.sum('otp_duration_mins').alias('planned_duration_mins'), \
                                 F.sum('considered_duration_mins').alias('actual_duration_mins')) \
                        .join(first_boarding_time, on=['date','user_trip_id','itinerary_id']) \
                        .join(user_trips_time_info, on=['date','user_trip_id'], how='inner') \
                        .orderBy(['date','user_trip_id','itinerary_id'])
                 
printdf(matched_otp_legs)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time
0,2017-05-09,25769803948,1,7.25,15.45,2017-05-09 18:16:46,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17
1,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17
2,2017-05-09,51539607858,1,38.583333,42.383333,2017-05-09 13:25:50,2017-05-09 13:29:50,32.983333,2017-05-09 13:05:33
3,2017-05-09,51539607858,2,38.583333,31.666667,2017-05-09 13:53:50,2017-05-09 14:03:36,32.983333,2017-05-09 13:05:33
4,2017-05-09,51539607858,3,38.583333,32.333333,2017-05-09 14:21:50,2017-05-09 14:25:12,32.983333,2017-05-09 13:05:33
5,2017-05-09,68719476891,1,24.933333,26.683333,2017-05-09 18:35:38,2017-05-09 18:45:08,20.133333,2017-05-09 18:29:06
6,2017-05-09,68719476891,2,23.666667,26.266667,2017-05-09 18:55:24,2017-05-09 18:57:18,20.133333,2017-05-09 18:29:06
7,2017-05-09,68719476891,3,22.583333,20.75,2017-05-09 18:50:20,2017-05-09 18:52:44,20.133333,2017-05-09 18:29:06
8,2017-05-09,111669149831,1,9.633333,14.8,2017-05-09 09:42:40,2017-05-09 09:36:49,13.766667,2017-05-09 09:37:41
9,2017-05-09,111669149831,2,9.633333,13.216667,2017-05-09 09:59:40,2017-05-09 09:56:08,13.766667,2017-05-09 09:37:41


In [34]:
executed_trips_with_sugestions_matched = matched_otp_legs.select('user_trip_id')\
                                            .drop_duplicates()
printdf(executed_trips_with_sugestions_matched)

Unnamed: 0,user_trip_id
0,111669149962
1,858993459446
2,68719476891
3,1692217114674
4,781684047908
5,910533066968
6,1400159338669
7,1443109011601
8,893353197709
9,1314259992758


In [35]:
all_trips_alternatives = matched_otp_legs \
                .union(od_matrix \
                        .join(executed_trips_with_sugestions_matched, on='user_trip_id',how='inner') \
                            .withColumn('itinerary_id', F.lit(0)) \
                            .withColumn('planned_duration_mins',F.lit(None)) \
                            .withColumnRenamed('executed_duration','actual_duration_mins') \
                            .withColumn('planned_start_time', F.lit(None)) \
                            .withColumnRenamed('o_datetime', 'actual_start_time') \
                            .withColumn('exec_duration_mins', F.col('actual_duration_mins')) \
                            .withColumn('exec_start_time', F.col('actual_start_time')) \
                            .select(['date','user_trip_id','itinerary_id','planned_duration_mins','actual_duration_mins','planned_start_time','actual_start_time','exec_duration_mins','exec_start_time'])) \
                .orderBy(['date','user_trip_id','itinerary_id'])
printdf(all_trips_alternatives)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time
0,2017-05-09,25769803948,0,,15.9,NaT,2017-05-09 18:09:17,15.9,2017-05-09 18:09:17
1,2017-05-09,25769803948,1,7.25,15.45,2017-05-09 18:16:46,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17
2,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17
3,2017-05-09,51539607858,0,,32.983333,NaT,2017-05-09 13:05:33,32.983333,2017-05-09 13:05:33
4,2017-05-09,51539607858,1,38.583333,42.383333,2017-05-09 13:25:50,2017-05-09 13:29:50,32.983333,2017-05-09 13:05:33
5,2017-05-09,51539607858,2,38.583333,31.666667,2017-05-09 13:53:50,2017-05-09 14:03:36,32.983333,2017-05-09 13:05:33
6,2017-05-09,51539607858,3,38.583333,32.333333,2017-05-09 14:21:50,2017-05-09 14:25:12,32.983333,2017-05-09 13:05:33
7,2017-05-09,68719476891,0,,20.133333,NaT,2017-05-09 18:29:06,20.133333,2017-05-09 18:29:06
8,2017-05-09,68719476891,1,24.933333,26.683333,2017-05-09 18:35:38,2017-05-09 18:45:08,20.133333,2017-05-09 18:29:06
9,2017-05-09,68719476891,2,23.666667,26.266667,2017-05-09 18:55:24,2017-05-09 18:57:18,20.133333,2017-05-09 18:29:06


In [75]:
all_trips_alternatives.write.csv(path=results_folderpath+'/all_itineraries',header=True)

In [36]:
printdf(all_trips_alternatives.groupby(['date','user_trip_id']) \
                        .count() \
                        .orderBy('count'))

Unnamed: 0,date,user_trip_id,count
0,2017-05-09,300647710724,2
1,2017-05-09,1563368095803,2
2,2017-05-09,1245540516041,2
3,2017-05-09,412316860483,2
4,2017-05-09,1529008357586,2
5,2017-05-09,223338299433,2
6,2017-05-09,1022202216484,3
7,2017-05-09,721554505729,3
8,2017-05-09,163208757335,3
9,2017-05-09,1700807049484,3


In [None]:
filtered_trips_alternatives = 

### Compute Inefficiency Metrics

#### Given: 
- U - user trip time
- O - otp suggested trip time
- E (executed itineraries) = { Ue, {Oie, 0 < i < n, n = num_otp_alt}}
- P (planned itineraries) = { Up, {Oip, 0 < i < n, n = num_otp_alt}}

#### We can compute:

$$
\begin{equation*}
    \frac{Ue - fastest(E)}{Ue} \text{  User choice actual inefficiency}.
\end{equation*}
$$

$$
\begin{equation*}
    \frac{fastest(E) - executed(fastest(Oe))}{fastest(E)} \text{ System recommendation inefficiency I}.
\end{equation*}
$$

$$
\begin{equation*}
    \frac{fastest(E) - fastest(P))}{fastest(E)} \text{ System recommendation inefficiency II}.
\end{equation*}
$$

$$
\begin{equation*}
    \frac{Up - fastest(P))}{Up} \text{ User choice plan inefficiency}.
\end{equation*}
$$

$$
\begin{equation*}
    {Oe - Op} \text{ System Schedule Deviation}.
\end{equation*}
$$

$$
\begin{equation*}
    {Ue - Up} \text{ User Trip Schedule Deviation}.
\end{equation*}
$$

$$
\begin{equation*}
    {start(Ue) - start(Up)} \text{ User stop waiting time offset}.
\end{equation*}
$$

In [44]:
def filter_trips_alternatives(trips_alternatives):
    min_trip_dur = 10
    max_trip_dur = 50
    max_trip_start_diff = 20
    
    return trips_alternatives[(trips_alternatives['exec_duration_mins'] >= min_trip_dur) & (trips_alternatives['exec_duration_mins'] <= max_trip_dur)] \
                                    .withColumn('start_diff',F.abs(F.unix_timestamp(F.col('exec_start_time')) - F.unix_timestamp(F.col('actual_start_time')))/60) \
                                    .filter('start_diff <= 20')

In [46]:
#Filter trips whose planned start time is too far away from the executed start time
filtered_trips_itineraries = filter_trips_alternatives(all_trips_alternatives)

In [51]:
printdf(filtered_trips_itineraries)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time,start_diff
0,2017-05-09,25769803948,0,,15.9,NaT,2017-05-09 18:09:17,15.9,2017-05-09 18:09:17,0.0
1,2017-05-09,25769803948,1,7.25,15.45,2017-05-09 18:16:46,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333
2,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333
3,2017-05-09,51539607858,0,,32.983333,NaT,2017-05-09 13:05:33,32.983333,2017-05-09 13:05:33,0.0
4,2017-05-09,68719476891,0,,20.133333,NaT,2017-05-09 18:29:06,20.133333,2017-05-09 18:29:06,0.0
5,2017-05-09,68719476891,1,24.933333,26.683333,2017-05-09 18:35:38,2017-05-09 18:45:08,20.133333,2017-05-09 18:29:06,16.033333
6,2017-05-09,111669149831,0,,13.766667,NaT,2017-05-09 09:37:41,13.766667,2017-05-09 09:37:41,0.0
7,2017-05-09,111669149831,1,9.633333,14.8,2017-05-09 09:42:40,2017-05-09 09:36:49,13.766667,2017-05-09 09:37:41,0.866667
8,2017-05-09,111669149831,2,9.633333,13.216667,2017-05-09 09:59:40,2017-05-09 09:56:08,13.766667,2017-05-09 09:37:41,18.45
9,2017-05-09,137438953719,0,,20.0,NaT,2017-05-09 14:19:14,20.0,2017-05-09 14:19:14,0.0


#### User choice actual inefficiency

$$
\begin{equation*}
    \frac{Ue - fastest(E)}{Ue}.
\end{equation*}
$$

In [47]:
#Choose best itinerary for each trip by selecting the ones with lower actual duration
best_trips_itineraries = select_best_trip_itineraries(filtered_trips_itineraries)

In [48]:
printdf(best_trips_itineraries)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time,start_diff
0,2017-05-09,515396075621,0,,17.083333,NaT,2017-05-09 07:38:27,17.083333,2017-05-09 07:38:27,0.0
1,2017-05-09,163208757335,3,3.8,4.266667,2017-05-09 08:41:52,2017-05-09 08:36:39,27.166667,2017-05-09 08:37:33,0.9
2,2017-05-09,790273982609,2,24.816667,20.033333,2017-05-09 07:24:43,2017-05-09 07:26:59,33.516667,2017-05-09 07:10:50,16.15
3,2017-05-09,1700807049484,0,,14.933333,NaT,2017-05-09 17:39:37,14.933333,2017-05-09 17:39:37,0.0
4,2017-05-09,721554505729,1,6.616667,6.466667,2017-05-09 10:04:00,2017-05-09 10:12:19,43.1,2017-05-09 10:02:29,9.833333
5,2017-05-09,489626271850,0,,12.316667,NaT,2017-05-09 13:27:40,12.316667,2017-05-09 13:27:40,0.0
6,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333
7,2017-05-09,1443109011501,0,,17.166667,NaT,2017-05-09 05:22:54,17.166667,2017-05-09 05:22:54,0.0
8,2017-05-09,1005022347340,0,,11.116667,NaT,2017-05-09 17:57:17,11.116667,2017-05-09 17:57:17,0.0
9,2017-05-09,1194000908297,3,6.15,6.366667,2017-05-09 06:57:27,2017-05-09 07:04:51,36.733333,2017-05-09 06:48:22,16.483333


In [49]:
trips_inefficiency = best_trips_itineraries \
                        .withColumn('dur_diff',(F.col('exec_duration_mins') - F.col('actual_duration_mins'))) \
                        .withColumn('imp_capacity', F.col('dur_diff')/F.col('exec_duration_mins'))

In [50]:
printdf(trips_inefficiency)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time,start_diff,dur_diff,imp_capacity
0,2017-05-09,515396075621,0,,17.083333,NaT,2017-05-09 07:38:27,17.083333,2017-05-09 07:38:27,0.0,0.0,0.0
1,2017-05-09,163208757335,3,3.8,4.266667,2017-05-09 08:41:52,2017-05-09 08:36:39,27.166667,2017-05-09 08:37:33,0.9,22.9,0.842945
2,2017-05-09,790273982609,2,24.816667,20.033333,2017-05-09 07:24:43,2017-05-09 07:26:59,33.516667,2017-05-09 07:10:50,16.15,13.483333,0.402287
3,2017-05-09,1700807049484,0,,14.933333,NaT,2017-05-09 17:39:37,14.933333,2017-05-09 17:39:37,0.0,0.0,0.0
4,2017-05-09,721554505729,1,6.616667,6.466667,2017-05-09 10:04:00,2017-05-09 10:12:19,43.1,2017-05-09 10:02:29,9.833333,36.633333,0.849961
5,2017-05-09,489626271850,0,,12.316667,NaT,2017-05-09 13:27:40,12.316667,2017-05-09 13:27:40,0.0,0.0,0.0
6,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333,0.45,0.028302
7,2017-05-09,1443109011501,0,,17.166667,NaT,2017-05-09 05:22:54,17.166667,2017-05-09 05:22:54,0.0,0.0,0.0
8,2017-05-09,1005022347340,0,,11.116667,NaT,2017-05-09 17:57:17,11.116667,2017-05-09 17:57:17,0.0,0.0,0.0
9,2017-05-09,1194000908297,3,6.15,6.366667,2017-05-09 06:57:27,2017-05-09 07:04:51,36.733333,2017-05-09 06:48:22,16.483333,30.366667,0.826679


#### System Recommendation Inefficiency I
$$
\begin{equation*}
    \frac{fastest(E) - executed(fastest(Oe))}{fastest(E)}
\end{equation*}
$$

In [73]:
w_rec_inef_i = Window().partitionBy(['date','user_trip_id']).orderBy(F.col('planned_duration_mins'))

shortest_planned_actual_duration = filtered_trips_itineraries \
                            .filter('itinerary_id > 0') \
                            .withColumn("rn", F.row_number().over(w_rec_inef_i)) \
                            .where(F.col("rn") == 1) \
                            .select('date','user_trip_id','planned_duration_mins','actual_duration_mins') \
                            .withColumnRenamed('planned_duration_mins','shortest_OTP_planned_duration') \
                            .withColumnRenamed('actual_duration_mins','shortest_OTP_actual_duration')

printdf(shortest_planned_actual_duration)

Unnamed: 0,date,user_trip_id,shortest_OTP_planned_duration,shortest_OTP_actual_duration
0,2017-05-09,515396075621,17.566667,17.25
1,2017-05-09,163208757335,3.8,4.266667
2,2017-05-09,790273982609,24.6,21.666667
3,2017-05-09,1700807049484,17.3,26.1
4,2017-05-09,721554505729,6.616667,6.466667
5,2017-05-09,489626271850,12.966667,12.983333
6,2017-05-09,25769803948,7.25,15.45
7,2017-05-09,1443109011501,26.483333,17.966667
8,2017-05-09,1005022347340,14.7,11.65
9,2017-05-09,1194000908297,6.15,6.366667


In [74]:
rec_inef_i = filtered_trips_itineraries \
                .groupBy(['date','user_trip_id']) \
                .agg(F.min(F.col('actual_duration_mins')).alias('shortest_actual_duration')) \
                .join(shortest_planned_actual_duration, on=['date','user_trip_id'], how='inner') \
                .withColumn('rec_inef',(F.col('shortest_OTP_actual_duration') - F.col('shortest_actual_duration'))/F.col('shortest_OTP_actual_duration'))

printdf(rec_inef_i_features)

Unnamed: 0,date,user_trip_id,shortest_actual_duration,shortest_planned_OTP_duration,rec_inef
0,2017-05-09,515396075621,17.083333,17.566667,-0.028293
1,2017-05-09,163208757335,4.266667,3.8,0.109375
2,2017-05-09,790273982609,20.033333,24.6,-0.227953
3,2017-05-09,1700807049484,14.933333,17.3,-0.158482
4,2017-05-09,721554505729,6.466667,6.616667,-0.023196
5,2017-05-09,489626271850,12.316667,12.966667,-0.052774
6,2017-05-09,25769803948,15.45,7.25,0.530744
7,2017-05-09,1443109011501,17.166667,26.483333,-0.542718
8,2017-05-09,1005022347340,11.116667,14.7,-0.322339
9,2017-05-09,1194000908297,6.366667,6.15,0.034031


#### System Recommendation Inefficiency II
$$
\begin{equation*}
    \frac{fastest(E) - fastest(P))}{fastest(E)}
\end{equation*}
$$

#### User choice plan inefficiency
$$
\begin{equation*}
    \frac{Up - fastest(P))}{Up}
\end{equation*}
$$

#### System Schedule Deviation
$$
\begin{equation*}
    {Oe - Op}
\end{equation*}
$$

#### User Trip Schedule Deviation
$$
\begin{equation*}
    {Ue - Up}
\end{equation*}
$$

#### User stop waiting time offset
$$
\begin{equation*}
    {start(Ue) - start(Up)}
\end{equation*}
$$

In [70]:
print "Identifying itinerary alternatives which are feasible..."
trips_itineraries_possibilities, filtered_trips_possibilities = determining_trips_alternatives_feasibility(clean_legs_actual_time,od_matrix)

Identifying itinerary alternatives which are feasible...


In [72]:
printdf(od_matrix.select('date'))

Unnamed: 0,date
0,2017-05-09
1,2017-05-09
2,2017-05-09
3,2017-05-09
4,2017-05-09
5,2017-05-09
6,2017-05-09
7,2017-05-09
8,2017-05-09
9,2017-05-09


In [73]:
printdf(trips_itineraries_possibilities)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time,exec_start_time,start_diff
0,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42,2017-05-09 18:09:17,7.416667
1,2017-05-09,25769803948,2,15.45,2017-05-09 18:42:57,2017-05-09 18:09:17,33.666667
2,2017-05-09,51539607858,1,42.383333,2017-05-09 13:29:50,2017-05-09 13:05:33,24.283333
3,2017-05-09,51539607858,2,31.666667,2017-05-09 13:53:20,2017-05-09 13:05:33,47.783333
4,2017-05-09,51539607858,3,32.333333,2017-05-09 14:59:28,2017-05-09 13:05:33,113.916667
5,2017-05-09,68719476891,1,26.683333,2017-05-09 18:45:08,2017-05-09 18:29:06,16.033333
6,2017-05-09,68719476891,2,26.266667,2017-05-09 18:57:18,2017-05-09 18:29:06,28.2
7,2017-05-09,68719476891,3,20.75,2017-05-09 18:50:04,2017-05-09 18:29:06,20.966667
8,2017-05-09,111669149831,1,14.8,2017-05-09 09:42:32,2017-05-09 09:37:41,4.85
9,2017-05-09,111669149831,2,13.216667,2017-05-09 10:09:09,2017-05-09 09:37:41,31.466667


In [74]:
printdf(filtered_trips_possibilities)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time
0,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42
1,2017-05-09,68719476891,1,26.683333,2017-05-09 18:45:08
2,2017-05-09,111669149831,1,14.8,2017-05-09 09:42:32
3,2017-05-09,111669149962,1,47.516667,2017-05-09 17:53:38
4,2017-05-09,111669149962,2,39.266667,2017-05-09 18:10:19
5,2017-05-09,120259084497,1,4.833333,2017-05-09 13:13:57
6,2017-05-09,120259084497,2,3.966667,2017-05-09 13:25:17
7,2017-05-09,120259084497,3,3.75,2017-05-09 13:30:06
8,2017-05-09,137438953719,1,31.333333,2017-05-09 14:34:35
9,2017-05-09,163208757335,1,4.016667,2017-05-09 08:35:33


In [75]:
print "Writing itineraries possibilities with feasibility to file..."
#trips_itineraries_possibilities.write.csv(path=results_folderpath+'/itineraries_alternatives',header=True, mode='append')


Writing itineraries possibilities with feasibility to file...


In [79]:
print "Adding executed trips to the pool of itinerary possibilities..."
trips_itineraries_pool = get_trips_itineraries_pool(filtered_trips_possibilities,od_matrix)


Adding executed trips to the pool of itinerary possibilities...


In [80]:
printdf(trips_itineraries_pool,l=10)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time
0,2017-05-09,25769803948,0,15.9,2017-05-09 18:09:17
1,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42
2,2017-05-09,51539607858,0,32.983333,2017-05-09 13:05:33
3,2017-05-09,68719476891,0,20.133333,2017-05-09 18:29:06
4,2017-05-09,68719476891,1,26.683333,2017-05-09 18:45:08
5,2017-05-09,111669149831,0,13.766667,2017-05-09 09:37:41
6,2017-05-09,111669149831,1,14.8,2017-05-09 09:42:32
7,2017-05-09,111669149962,0,61.233333,2017-05-09 17:54:34
8,2017-05-09,111669149962,1,47.516667,2017-05-09 17:53:38
9,2017-05-09,111669149962,2,39.266667,2017-05-09 18:10:19


In [81]:
print "Selecting best otp itineraries by actual duration..."
best_trips_itineraries = select_best_trip_itineraries(trips_itineraries_pool)


Selecting best otp itineraries by actual duration...


In [82]:
printdf(best_trips_itineraries,l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time
0,2017-05-09,515396075621,0,17.083333,2017-05-09 07:38:27
1,2017-05-09,206158430497,0,4.75,2017-05-09 16:41:46
2,2017-05-09,1022202216484,0,1.533333,2017-05-09 06:44:51
3,2017-05-09,163208757335,1,4.016667,2017-05-09 08:35:33
4,2017-05-09,790273982609,2,20.033333,2017-05-09 07:22:11
5,2017-05-09,1700807049484,0,14.933333,2017-05-09 17:39:37
6,2017-05-09,412316860483,1,5.633333,2017-05-09 11:57:46
7,2017-05-09,721554505729,1,6.466667,2017-05-09 10:02:15
8,2017-05-09,489626271850,0,12.316667,2017-05-09 13:27:40
9,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42


In [83]:
#Clean Memory
#clean_legs_actual_time.unpersist(blocking=True)
#trips_itineraries_possibilities.unpersist(blocking=True)
#filtered_trips_possibilities.unpersist(blocking=True)
#trips_itineraries_pool.unpersist(blocking=True)


In [85]:
printdf(duration_improvement_capacity)

Unnamed: 0,date,user_trip_id,cardNum,birthdate,gender,exec_start_time,executed_duration,itinerary_id,duration,alt_start_time,imp_capacity
0,2017-05-09,515396075621,2642531,28/09/95,F,2017-05-09 07:38:27,17.083333,0,17.083333,2017-05-09 07:38:27,0.0
1,2017-05-09,206158430497,1757419,,,2017-05-09 16:41:46,4.75,0,4.75,2017-05-09 16:41:46,0.0
2,2017-05-09,1022202216484,10090017,,,2017-05-09 06:44:51,1.533333,0,1.533333,2017-05-09 06:44:51,0.0
3,2017-05-09,163208757335,2782068,,,2017-05-09 08:37:33,27.166667,1,4.016667,2017-05-09 08:35:33,23.15
4,2017-05-09,790273982609,3645964,,,2017-05-09 07:10:50,33.516667,2,20.033333,2017-05-09 07:22:11,13.483333
5,2017-05-09,1700807049484,2658794,30/03/78,M,2017-05-09 17:39:37,14.933333,0,14.933333,2017-05-09 17:39:37,0.0
6,2017-05-09,412316860483,3541708,,,2017-05-09 11:59:46,65.033333,1,5.633333,2017-05-09 11:57:46,59.4
7,2017-05-09,721554505729,643873,,,2017-05-09 10:02:29,43.1,1,6.466667,2017-05-09 10:02:15,36.633333
8,2017-05-09,489626271850,3709654,,,2017-05-09 13:27:40,12.316667,0,12.316667,2017-05-09 13:27:40,0.0
9,2017-05-09,25769803948,3377631,21/06/00,F,2017-05-09 18:09:17,15.9,1,15.45,2017-05-09 18:16:42,0.45


In [84]:
print "Computing Improvement Capacity..."
duration_improvement_capacity = compute_improvement_capacity(best_trips_itineraries,od_matrix)


Computing Improvement Capacity...


In [53]:
#best_trips_itineraries.unpersist(blocking=True)
#od_matrix.unpersist(blocking=True)

In [None]:
print "Writing duration improvement capacity to file..."
#duration_improvement_capacity.write.csv(path=results_folderpath+'/duration_improvement_capacity',header=True, mode='append')


In [None]:
print "Finishing Script..."

In [None]:
sc.stop()
