### Imports

In [15]:
import pandas as pd
import numpy as np

#Python Standard Libs Imports
import json
import urllib2
import sys
from datetime import datetime
from os.path import isfile, join, splitext
from glob import glob
import os
#from distributed import Executor, hdfs

### Functions

#### Basic Functions

In [2]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_folders(path, sqlContext, sc, initial_date, final_date, folder_suffix):
    extension = splitext(path)[1]

    if extension == "":
        path_pattern = path + "/*/part-*"
        if "hdfs" in path:
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

            hdfs = "/".join(path_pattern.split("/")[:3])
            dir = "/" + "/".join(path_pattern.split("/")[3:])

            fs = FileSystem.get(URI(hdfs), Configuration())

            status = fs.globStatus(Path(dir))

            files = map(lambda file_status: str(file_status.getPath()), status)

        else:
            files = glob(path_pattern)

        #print initial_date, final_date
        #print datetime.strptime(files[0].split('/')[-2],('%Y_%m_%d' + folder_suffix))

        files = filter(lambda f: initial_date <= datetime.strptime(f.split("/")[-2], ('%Y_%m_%d' + folder_suffix)) <=
                                 final_date, files)
        
        #print len(files)
        #print files
        if folder_suffix == '_od':
            return reduce(lambda df1, df2: df1.unionAll(df2),
                      map(lambda f: read_hdfs_folder(sqlContext,f), files))
        else:
            return reduce(lambda df1, df2: df1.unionAll(df2),
                      map(lambda f: read_buste_data_v3(sqlContext,f), files))
    else:
        return read_file(path, sqlContext)

def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath, header=True,
                                     inferSchema=True,nullValue="-")
    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    data_frame = read_hdfs_folder(sqlContext,folderpath)
    data_frame = data_frame.withColumn("date", F.unix_timestamp(F.col("date"),'yyyy_MM_dd'))
    
    return data_frame

def printdf(df,l=10):
    return df.limit(l).toPandas()

def get_timestamp_in_tz(unixtime_timestamp,ts_format,tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format),tz)

#### OTP Functions

#### Analysis Functions

In [69]:
def get_df_stats(df,filtered_df,df_label,filtered_df_label):
    df_size = df.count()
    filtered_df_size = filtered_df.count()
    print "Total", df_label,":", df_size
    print "Total", filtered_df_label, ":", filtered_df_size, "(", 100*(filtered_df_size/float(df_size)), "%)"

def get_filtered_df_stats(filtered_df,full_df_size,filtered_df_label,full_df_label):
    filtered_df_size = filtered_df.count()
    print filtered_df_label, "in Total", full_df_label, ":", filtered_df_size, "(", 100*(filtered_df_size/float(full_df_size)), "%)"

def combine_otp_suggestions_with_bus_legs_actual_time(otp_suggestions,bus_legs_actual_time):
    return otp_legs_df \
                .join(clean_otp_legs_actual_time, on=['date','user_trip_id','itinerary_id','leg_id', 'route', 'from_stop_id','to_stop_id'], how='left') \
                .withColumn('considered_duration_mins', F.when(F.col('mode') == F.lit('BUS'), F.col('actual_duration_mins')).otherwise(F.col('otp_duration_mins'))) \
                .withColumn('considered_start_time', F.when(F.col('mode') == F.lit('BUS'), F.col('from_timestamp')).otherwise(F.col('otp_start_time')))

def select_itineraries_fully_identified(otp_itineraries_legs):
    itineraries_not_fully_identified = otp_itineraries_legs \
                                        .filter((otp_itineraries_legs.mode == 'BUS') & (otp_itineraries_legs.busCode.isNull())) \
                                        .select(['date','user_trip_id','itinerary_id']).distinct()
    itineraries_fully_identified = otp_itineraries_legs.select(['date','user_trip_id','itinerary_id']).subtract(itineraries_not_fully_identified)
    return otp_itineraries_legs.join(itineraries_fully_identified, on=['date','user_trip_id','itinerary_id'], how='inner')

def filter_itineraries_without_bus_legs(otp_itineraries_legs):
    itineraries_with_bus_legs = otp_itineraries_legs \
                                    .filter((otp_itineraries_legs.mode == 'BUS')) \
                                    .select(['date','user_trip_id','itinerary_id']).distinct()
    return otp_itineraries_legs.join(itineraries_with_bus_legs, on=['date','user_trip_id','itinerary_id'], how='inner')


### Main Code

#### Reading Input Variables

In [4]:
#initial_date = datetime.strptime('2017-07-01', '%Y-%m-%d')
#final_date = datetime.strptime('2017-07-01', '%Y-%m-%d')
#od_matrix_folderpath = '/local/tarciso/masters/data/bus_trips/test/single-day-test/2017_07_01/od/trips_od/'
#buste_data_folderpath = '/local/tarciso/masters/data/bus_trips/test/single-day-test/2017_07_01/buste/'
#otp_server_url = 'http://150.165.85.4:10402/otp/'
#results_folderpath = '/local/tarciso/masters/data/bus_trips/test/single-day-test/2017_07_01'

In [45]:
initial_date = datetime.strptime('2017-05-08', '%Y-%m-%d')
final_date = datetime.strptime('2017-05-08', '%Y-%m-%d')
od_matrix_folderpath = '/local/tarciso/data/trips-optimality-exp/single-day-test/2017_05_08/od/trips_od/'
buste_data_folderpath = '/local/tarciso/data/trips-optimality-exp/single-day-test/2017_05_08/buste'
otp_server_url = 'http://150.165.85.4:10402/otp/'
results_folderpath = '/local/tarciso/data/trips-optimality-exp/single-day-test/itineraries'

#### Reading OD Matrix

In [220]:
print "Reading OD-Matrix Data..."
od_matrix_day_folderpath = od_matrix_folderpath + '/' + initial_date.strftime('%Y_%m_%d') + '_od'
od_matrix = pd.read_csv(od_matrix_day_folderpath + os.sep + 'part-00000')

Reading OD-Matrix Data...


In [221]:
od_matrix[['date']].head(4)

Unnamed: 0,date
0,1494201600
1,1494201600
2,1494201600
3,1494201600


In [222]:
print "Fixing OD Matrix dates due to bug on date saving on cluster..."
SECONDS_OFFSET = 10800
od_matrix.loc[:,'date'] = pd.to_datetime(od_matrix['date'], unit='s').dt.strftime('%Y-%m-%d')
od_matrix['user_trip_id'] = od_matrix['o_boarding_id']

Fixing OD Matrix dates due to bug on date saving on cluster...


In [223]:
od_matrix[['date']].head(4)

Unnamed: 0,date
0,2017-05-08
1,2017-05-08
2,2017-05-08
3,2017-05-08


In [224]:
od_matrix.dtypes

route                      int64
tripNum                    int64
shapeId                    int64
shapeSequence              int64
shapeLat                 float64
shapeLon                 float64
distanceTraveledShape    float64
busCode                   object
gpsPointId               float64
gpsLat                   float64
gpsLon                   float64
distanceToShapePoint     float64
timestamp                 object
stopPointId                int64
problem                   object
birthdate                 object
cardTimestamp             object
lineName                  object
gender                    object
date                      object
id                         int64
o_route                    int64
o_bus_code                object
o_date                     int64
o_tripNum                  int64
o_timestamp               object
o_shape_id                 int64
o_shape_seq                int64
o_shape_lat              float64
o_shape_lon              float64
o_stop_id 

In [225]:
print "Preprocessing Data..."

def advance_od_matrix_start_time(od_matrix,extra_seconds):
    od_matrix.loc[:,'o_datetime'] = pd.to_datetime(od_matrix['date'] + ' ' + od_matrix['o_timestamp'])
    od_matrix.loc[:,'d_datetime'] = pd.to_datetime(od_matrix['date'] + ' ' + od_matrix['timestamp'])
    od_matrix.loc[:,'executed_duration'] = (od_matrix['d_datetime'] - od_matrix['o_datetime']) / pd.Timedelta(minutes=1)
    od_matrix.loc[:,'o_base_datetime'] = pd.to_datetime(od_matrix['o_datetime']) - pd.Timedelta(minutes=2)
    
    return od_matrix

od_matrix = advance_od_matrix_start_time(od_matrix,120)


Preprocessing Data...


In [226]:
raw_od_matrix = od_matrix.copy()

In [227]:
od_matrix = raw_od_matrix.head(50)

In [228]:
len(od_matrix)

50

#### Getting OTP suggested itineraries

In [229]:
def get_router_id(query_date):
    INTERMEDIATE_OTP_DATE = datetime.strptime("2017-06-30", "%Y-%m-%d")
    
    router_id = ''
    date_timestamp = datetime.strptime(query_date, "%Y-%m-%d")   
    
    if (date_timestamp <= INTERMEDIATE_OTP_DATE):
        return 'ctba-2017-1'
    else:
        return 'ctba-2017-2'

def get_otp_itineraries(otp_url,o_lat,o_lon,d_lat,d_lon,date,time,verbose=False):
    otp_http_request = 'routers/{}/plan?fromPlace={},{}&toPlace={},{}&mode=TRANSIT,WALK&date={}&time={}'
	
    router_id = get_router_id(date)
    otp_request_url = otp_url + otp_http_request.format(router_id,o_lat,o_lon,d_lat,d_lon,date,time)
    if verbose:
        print otp_request_url
    return json.loads(urllib2.urlopen(otp_request_url).read())

def get_otp_suggested_trips(od_matrix,otp_url):
    trips_otp_response = {}
    counter = 0
    for index, row in od_matrix.iterrows():
        id=long(row['user_trip_id'])
        start_time = row['o_timestamp']
        trip_plan = get_otp_itineraries(otp_url,row['o_shape_lat'], row['o_shape_lon'], row['shapeLat'], row['shapeLon'],row['date'],start_time)
        trips_otp_response[id] = trip_plan
        counter+=1

    return trips_otp_response

def get_otp_scheduled_trips(od_matrix,otp_url):
    trips_otp_response = {}
    counter = 0
    for index, row in od_matrix.iterrows():
        id=long(row['user_trip_id'])
        start_time = row['o_timestamp']
        trip_plan = get_executed_trip_schedule(otp_url,row['o_shape_lat'], row['o_shape_lon'], row['shapeLat'], row['shapeLon'],
                                               row['date'],start_time,row['route'],row['o_stop_id'])
        trips_otp_response[id] = trip_plan
        counter+=1

    return trips_otp_response

print "Getting OTP suggested itineraries..."
otp_suggestions = get_otp_suggested_trips(od_matrix,otp_server_url)

Getting OTP suggested itineraries...


In [267]:
def extract_otp_trips_legs(otp_trips):
    trips_legs = []

    for trip in otp_trips.keys():
        if 'plan' in otp_trips[trip]:
            itinerary_id = 1
            for itinerary in otp_trips[trip]['plan']['itineraries']:
                date = otp_trips[trip]['plan']['date']/1000
                leg_id = 1
                for leg in itinerary['legs']:
                    route = leg['route'] if leg['route'] != '' else None
                    fromStopId = leg['from']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                    toStopId = leg['to']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                    start_time = long(leg['startTime'])/1000
                    end_time = long(leg['endTime'])/1000
                    duration = (end_time - start_time)/60
                    trips_legs.append((date,trip,itinerary_id,leg_id,start_time,end_time,leg['mode'],route,fromStopId,toStopId, duration))
                    leg_id += 1
                itinerary_id += 1
    return trips_legs

def prepare_otp_legs_df(otp_legs_list):
    labels=['date','user_trip_id','itinerary_id','leg_id','otp_start_time','otp_end_time','mode','route','from_stop_id','to_stop_id','otp_duration_mins']
    otp_legs_df = pd.DataFrame.from_records(data=otp_legs_list, columns=labels)
    otp_legs_df.loc[:,'date'] = pd.to_datetime(otp_legs_df['date'],unit='s').dt.strftime('%Y-%m-%d')
    otp_legs_df.loc[:,'otp_duration_mins'] = (otp_legs_df['otp_end_time'] - otp_legs_df['otp_start_time'])/60
    otp_legs_df.loc[:,'otp_start_time'] = pd.to_datetime(otp_legs_df['otp_start_time'], unit='s')
    otp_legs_df.loc[:,'otp_end_time'] = pd.to_datetime(otp_legs_df['otp_end_time'], unit='s')
    otp_legs_df.loc[:,'route'] = pd.to_numeric(otp_legs_df['route'],errors='coerce')
    otp_legs_df.loc[:,'from_stop_id'] = pd.to_numeric(otp_legs_df['from_stop_id'],errors='coerce')
    otp_legs_df.loc[:,'to_stop_id'] = pd.to_numeric(otp_legs_df['to_stop_id'],errors='coerce')
    otp_legs_df = otp_legs_df.sort_values(by=['date','user_trip_id','itinerary_id','otp_start_time'])
    
    return otp_legs_df

print "Extracting OTP Legs info..."
otp_legs_df = prepare_otp_legs_df(extract_otp_trips_legs(otp_suggestions))

#otp_suggestions = None

Extracting OTP Legs info...


In [268]:
otp_legs_df.head(5)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
92,2017-05-08,17179869196,1,1,2017-05-08 11:07:58,2017-05-08 11:07:59,WALK,,,,0.016667
93,2017-05-08,17179869196,1,2,2017-05-08 11:08:00,2017-05-08 11:31:20,BUS,901.0,29598.0,33654.0,23.333333
94,2017-05-08,17179869196,1,3,2017-05-08 11:31:21,2017-05-08 11:31:42,WALK,,,,0.35
95,2017-05-08,17179869196,2,1,2017-05-08 11:16:58,2017-05-08 11:16:59,WALK,,,,0.016667
96,2017-05-08,17179869196,2,2,2017-05-08 11:17:00,2017-05-08 11:40:20,BUS,901.0,29598.0,33654.0,23.333333


In [303]:
otp_legs_df.dtypes

date                         object
user_trip_id                  int64
itinerary_id                  int64
leg_id                        int64
otp_start_time       datetime64[ns]
otp_end_time         datetime64[ns]
mode                         object
route                       float64
from_stop_id                float64
to_stop_id                  float64
otp_duration_mins           float64
dtype: object

#### Gathering schedule info for executed trips

In [269]:
def get_executed_trip_schedule(otp_url,o_lat,o_lon,d_lat,d_lon,date,time,route,start_stop_id,verbose=False):
    DEF_AGENCY_NAME = 'URBS'
    DEF_AGENCY_ID = 1
    otp_http_request = 'routers/{}/plan?fromPlace={},{}&toPlace={},{}&mode=TRANSIT,WALK&date={}&time={}&numItineraries=1&preferredRoutes={}_{}&startTransitStopId={}_{}&maxWalkingDistance=150&maxTransfers=0'
    
    router_id = get_router_id(date)
    otp_request_url = otp_url + otp_http_request.format(router_id,o_lat,o_lon,d_lat,d_lon,date,time,DEF_AGENCY_NAME,route,DEF_AGENCY_ID,start_stop_id)
    if verbose:
        print otp_request_url
    return json.loads(urllib2.urlopen(otp_request_url).read())

print "Getting OTP schedule info for executed trips..."
executed_trips_schedule = get_otp_scheduled_trips(od_matrix,otp_server_url)

Getting OTP schedule info for executed trips...


In [270]:
print "Extracting OTP Legs info..."
executed_trips_schedule_df = prepare_otp_legs_df(extract_otp_trips_legs(executed_trips_schedule))
executed_trips_schedule_df = executed_trips_schedule_df[executed_trips_schedule_df['mode'] == 'BUS']
executed_trips_schedule_df.loc[:,'itinerary_id'] = 0

#executed_trips_schedule = None

Extracting OTP Legs info...


In [271]:
executed_trips_schedule_df.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
30,2017-05-08,17179869196,0,2,2017-05-08 11:08:00,2017-05-08 11:31:20,BUS,901.0,29598.0,33654.0,23.333333
77,2017-05-08,94489280525,0,2,2017-05-08 21:28:00,2017-05-08 21:50:13,BUS,972.0,29599.0,33375.0,22.216667
103,2017-05-08,163208757264,0,2,2017-05-08 11:33:43,2017-05-08 11:38:56,BUS,821.0,34147.0,33342.0,5.216667
93,2017-05-08,377957122060,0,2,2017-05-08 14:57:24,2017-05-08 15:09:51,BUS,924.0,26811.0,38141.0,12.45
53,2017-05-08,455266533398,0,2,2017-05-08 08:52:23,2017-05-08 09:23:55,BUS,924.0,38142.0,27373.0,31.533333


In [272]:
executed_trips_schedule_df.sort_values(['user_trip_id']).head(5).dtypes

date                         object
user_trip_id                  int64
itinerary_id                  int64
leg_id                        int64
otp_start_time       datetime64[ns]
otp_end_time         datetime64[ns]
mode                         object
route                       float64
from_stop_id                float64
to_stop_id                  float64
otp_duration_mins           float64
dtype: object

In [273]:
od_matrix[['date','user_trip_id','o_base_datetime','route','o_stop_id','stopPointId','next_o_stop_id']].sort_values(['user_trip_id']).head(5)

Unnamed: 0,date,user_trip_id,o_base_datetime,route,o_stop_id,stopPointId,next_o_stop_id
28,2017-05-08,17179869196,2017-05-08 07:57:32,901,29598,33654,33653
44,2017-05-08,94489280525,2017-05-08 18:02:31,972,29599,33375,33308
33,2017-05-08,111669149696,2017-05-08 07:18:56,822,33801,33784,33801
13,2017-05-08,163208757264,2017-05-08 08:31:15,40,34147,33342,33348
35,2017-05-08,377957122060,2017-05-08 11:49:30,924,26758,38141,38141


In [274]:
od_matrix \
        .rename(index=str, columns={'o_stop_id':'from_stop_id', 'stopPointId':'to_stop_id'}) \
        [['date','user_trip_id','o_base_datetime','route','from_stop_id','to_stop_id','next_o_stop_id']] \
        .sort_values('user_trip_id').dtypes

date                       object
user_trip_id                int64
o_base_datetime    datetime64[ns]
route                       int64
from_stop_id                int64
to_stop_id                  int64
next_o_stop_id              int64
dtype: object

In [275]:
matched_executed_trips = od_matrix \
                            .rename(index=str, columns={'o_stop_id':'from_stop_id', 'stopPointId':'to_stop_id'}) \
                            .merge(executed_trips_schedule_df, 
                                 on=['date','user_trip_id','from_stop_id','to_stop_id'], how='inner') \
                            [['date','user_trip_id','itinerary_id','otp_duration_mins','otp_start_time']] \
                            .rename(index=str, columns={'otp_duration_mins':'planned_duration_mins', 'otp_start_time':'planned_start_time'})

In [276]:
matched_executed_trips.head()

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,planned_start_time
0,2017-05-08,1451698946054,0,15.983333,2017-05-08 13:31:16
1,2017-05-08,867583393796,0,5.15,2017-05-08 13:30:00
2,2017-05-08,816043786247,0,14.366667,2017-05-08 09:11:36
3,2017-05-08,996432412675,0,7.083333,2017-05-08 10:41:53
4,2017-05-08,1477468749834,0,20.75,2017-05-08 10:08:30


In [277]:
len(matched_executed_trips)

27

In [278]:
len(od_matrix)

50

In [279]:
len(executed_trips_schedule_df)

40

#### Printing Itineraries Stats

In [280]:
total_num_itineraries = len(otp_legs_df[['user_trip_id','itinerary_id']].drop_duplicates())
total_num_legs = len(otp_legs_df)
num_bus_legs = len(otp_legs_df[otp_legs_df['mode'] == 'BUS'])

print "Total num itineraries:", total_num_itineraries
print "Total num legs:", total_num_legs
print "Total num bus legs:", num_bus_legs, '(', 100*(num_bus_legs/float(total_num_legs)), '%)'


Total num itineraries: 148
Total num legs: 420
Total num bus legs: 138 ( 32.8571428571 %)


#### Matching OTP Bus Legs Origins with BUSTE Data

In [289]:
print "Reading BUSTE data..."
buste_day_folderpath = buste_data_folderpath + '/' + initial_date.strftime('%Y_%m_%d')
bus_trips_data = pd.read_csv(buste_day_folderpath + os.sep + 'part-00000')
bus_trips_data['tripNum'] = pd.to_numeric(bus_trips_data['tripNum'], errors='coerce')

Reading BUSTE data...


In [290]:
bus_trips_data.head()

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,stopPointId,problem,birthdate,cardTimestamp,lineName,cardNum,gender,date
0,901,1.0,2257,6191075,-25.402164,-49.327866,298.503,MC076,-,-,...,-,06:09:23,32369,BETWEEN,-,-,-,-,-,2017_05_08
1,901,1.0,2257,6191091,-25.403712,-49.328166,588.304,MC076,-,-,...,-,06:09:52,33651,BETWEEN,-,-,-,-,-,2017_05_08
2,901,1.0,2257,6191099,-25.404864,-49.327253,817.779,MC076,-,-25.404913,...,5.7547846,06:10:42,33653,NO_PROBLEM,-,-,-,-,-,2017_05_08
3,901,1.0,2257,6191127,-25.408142,-49.324268,1300.149,MC076,-,-,...,-,06:12:01,33657,BETWEEN,-,-,-,-,-,2017_05_08
4,901,1.0,2257,6191158,-25.411098,-49.320234,1880.64,MC076,-,-25.411081,...,6.2103405,06:12:42,33660,NO_PROBLEM,-,-,-,-,-,2017_05_08


In [296]:
bus_trips_data.dtypes

route                      int64
tripNum                  float64
shapeId                    int64
shapeSequence              int64
shapeLat                 float64
shapeLon                 float64
distanceTraveledShape    float64
busCode                   object
gpsPointId                object
gpsLat                    object
gpsLon                    object
distanceToShapePoint      object
timestamp                 object
stopPointId                int64
problem                   object
birthdate                 object
cardTimestamp             object
lineName                  object
cardNum                   object
gender                    object
date                      object
dtype: object

In [394]:
def clean_buste_data(buste_data):
    clean_buste_data = buste_data[["date","route","busCode","tripNum","stopPointId","timestamp"]] \
                        .dropna(subset=["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                        .drop_duplicates(subset=['date','route','busCode','tripNum','stopPointId'])
    clean_buste_data.loc[:,'route'] = pd.to_numeric(clean_buste_data['route'], errors='coerce')
    clean_buste_data.loc[:,'date'] = clean_buste_data['date'].str.replace('_','-')
    clean_buste_data.loc[:,'timestamp'] = pd.to_datetime(clean_buste_data['date'] + ' ' + clean_buste_data['timestamp'])
    
    return clean_buste_data

clean_bus_trips_data = clean_buste_data(bus_trips_data)

In [395]:
clean_bus_trips_data.dtypes

date                   object
route                   int64
busCode                object
tripNum               float64
stopPointId             int64
timestamp      datetime64[ns]
dtype: object

In [396]:
clean_bus_trips_data[['route','date','timestamp']].head()

Unnamed: 0,route,date,timestamp
0,901,2017-05-08,2017-05-08 06:09:23
1,901,2017-05-08,2017-05-08 06:09:52
2,901,2017-05-08,2017-05-08 06:10:42
3,901,2017-05-08,2017-05-08 06:12:01
4,901,2017-05-08,2017-05-08 06:12:42


In [397]:
def find_otp_bus_legs_actual_start_time(otp_legs_df,clean_bus_trips_df):
    legs_buste_match = otp_legs_df.assign(stopPointId = otp_legs_df['from_stop_id']) \
            .merge(clean_bus_trips_df, on=['date','route','stopPointId'], how='inner') \
            .dropna(subset=['timestamp'])
    legs_buste_match.loc[:,'timediff'] = np.absolute(legs_buste_match['timestamp'] - legs_buste_match['otp_start_time'])
    legs_buste_match.drop('otp_duration_mins', axis=1)
    
    earliest_legs_start_times = legs_buste_match.groupby(['date','user_trip_id','itinerary_id','route','from_stop_id']) \
                                .timediff.min().reset_index()
        
    legs_st_time = legs_buste_match.merge(earliest_legs_start_times, on=['date','user_trip_id','itinerary_id','route','from_stop_id','timediff'], how='inner') \
                [['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','otp_start_time','timestamp','to_stop_id','otp_end_time']] \
                .rename(index=str, columns={'timestamp':'from_timestamp'})
    
    legs_st_time.loc[:,'route'] = pd.to_numeric(legs_st_time['route'])
    
    return legs_st_time
            
    
print "Finding OTP Bus Legs Actual Start Times in Bus Trips Data..."
otp_legs_st = find_otp_bus_legs_actual_start_time(otp_legs_df,clean_bus_trips_data)

Finding OTP Bus Legs Actual Start Times in Bus Trips Data...


In [398]:
otp_legs_st.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,otp_start_time,from_timestamp,to_stop_id,otp_end_time
0,2017-05-08,17179869196,1,2,901.0,MC305,7.0,29598.0,2017-05-08 11:08:00,2017-05-08 11:12:05,33654.0,2017-05-08 11:31:20
1,2017-05-08,17179869196,2,2,901.0,MC305,7.0,29598.0,2017-05-08 11:17:00,2017-05-08 11:12:05,33654.0,2017-05-08 11:40:20
2,2017-05-08,17179869196,3,2,901.0,MC303,5.0,29598.0,2017-05-08 11:27:00,2017-05-08 11:24:52,33654.0,2017-05-08 11:51:16
3,2017-05-08,386547056643,2,2,901.0,MC302,5.0,29598.0,2017-05-08 10:39:00,2017-05-08 10:42:05,26720.0,2017-05-08 10:40:20
4,2017-05-08,755914244105,2,2,901.0,MC080,19.0,29598.0,2017-05-08 22:41:00,2017-05-08 22:36:34,33681.0,2017-05-08 22:50:33


#### Printing Matched OTP Bus Legs Origins Stats

In [399]:
num_bus_legs_st = len(otp_legs_st)
print "Num Bus Legs whose start was found:", num_bus_legs_st, '(', 100*(num_bus_legs_st/float(num_bus_legs)), '%)'


Num Bus Legs whose start was found: 134 ( 97.1014492754 %)


#### Cleaning Memory

In [400]:
#Clean memory
#otp_legs_df.unpersist(blocking=True)
#bus_trips_data.unpersist(blocking=True)
#clean_bus_trips_data.unpersist(blocking=True)

#### Matching OTP Bus Legs Destinations with BUSTE Data

In [428]:
def find_otp_bus_legs_actual_end_time(otp_legs_st,clean_bus_trips):
    legs_buste_match = otp_legs_st.assign(stopPointId = otp_legs_st['to_stop_id']) \
            .merge(clean_bus_trips_data, on=['date','route','busCode','tripNum','stopPointId'], how='inner') \
            .dropna(subset=['timestamp'])
    legs_buste_match.loc[:,'timediff'] = np.absolute(legs_buste_match['timestamp'] - legs_buste_match['otp_end_time'])
    legs_buste_match = legs_buste_match.rename(index=str,columns={'timestamp':'to_timestamp'}) \
                    .sort_values(by=['date','route','to_stop_id','timediff'])
    return legs_buste_match

print "Finding OTP Bus Legs Actual End Times in Bus Trips Data..."
otp_legs_start_end = find_otp_bus_legs_actual_end_time(otp_legs_st,clean_bus_trips_data)

Finding OTP Bus Legs Actual End Times in Bus Trips Data...


In [429]:
otp_legs_start_end.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,otp_start_time,from_timestamp,to_stop_id,otp_end_time,stopPointId,to_timestamp,timediff
21,2017-05-08,163208757264,3,2,40,MB301,3.0,34147.0,2017-05-08 11:49:25,2017-05-08 12:17:47,33342.0,2017-05-08 11:54:55,33342,2017-05-08 12:24:04,00:29:09
20,2017-05-08,163208757264,2,2,40,MB301,3.0,34147.0,2017-05-08 11:35:16,2017-05-08 12:17:47,33342.0,2017-05-08 11:40:53,33342,2017-05-08 12:24:04,00:43:11
88,2017-05-08,936302870530,1,2,150,MN611,5.0,26796.0,2017-05-08 10:55:56,2017-05-08 10:56:44,32795.0,2017-05-08 11:07:37,32795,2017-05-08 11:04:58,00:02:39
56,2017-05-08,558345748483,3,1,226,BA123,15.0,34487.0,2017-05-08 15:57:57,2017-05-08 15:59:59,1899.0,2017-05-08 16:01:00,1899,2017-05-08 15:26:19,00:34:41
55,2017-05-08,558345748483,2,1,226,BA112,14.0,34487.0,2017-05-08 15:38:57,2017-05-08 15:39:18,1899.0,2017-05-08 15:42:00,1899,2017-05-08 15:04:12,00:37:48


In [436]:
def clean_otp_legs_actual_time_df(otp_legs_st_end_df):
    clean_legs_time = otp_legs_start_end[['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','from_timestamp','to_stop_id','to_timestamp']] \
                .assign(actual_duration_mins = ((otp_legs_start_end['to_timestamp'] - otp_legs_start_end['from_timestamp'])/pd.Timedelta(minutes=1))) \
                .sort_values(by=['date','user_trip_id','itinerary_id','leg_id'])
            
    clean_legs_time = clean_legs_time[clean_legs_time['actual_duration_mins'] > 0]
    
    return clean_legs_time

clean_otp_legs_actual_time = clean_otp_legs_actual_time_df(otp_legs_start_end)

In [437]:
clean_otp_legs_actual_time.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,from_timestamp,to_stop_id,to_timestamp,actual_duration_mins
0,2017-05-08,17179869196,1,2,901,MC305,7.0,29598.0,2017-05-08 11:12:05,33654.0,2017-05-08 11:38:21,26.266667
1,2017-05-08,17179869196,2,2,901,MC305,7.0,29598.0,2017-05-08 11:12:05,33654.0,2017-05-08 11:38:21,26.266667
2,2017-05-08,17179869196,3,2,901,MC303,5.0,29598.0,2017-05-08 11:24:52,33654.0,2017-05-08 11:47:43,22.85
9,2017-05-08,94489280525,1,2,972,MC491,14.0,29599.0,2017-05-08 21:12:39,33375.0,2017-05-08 21:32:05,19.433333
16,2017-05-08,94489280525,2,2,965,MN606,3.0,26720.0,2017-05-08 21:33:33,33375.0,2017-05-08 21:50:12,16.65


In [33]:
print "Reading BUSTE data again..."
bus_trips_data2 = read_folders(buste_data_folderpath, sqlContext, sc, initial_date, final_date,'')
clean_bus_trips_data2 = clean_buste_data(bus_trips_data2)

print "Finding OTP Bus Legs Actual End Times in Bus Trips Data..."
otp_legs_start_end = find_otp_bus_legs_actual_end_time(otp_legs_st,clean_bus_trips_data2)
clean_otp_legs_actual_time = clean_otp_legs_actual_time_df(otp_legs_start_end)


Reading BUSTE data again...
Finding OTP Bus Legs Actual End Times in Bus Trips Data...


In [34]:
num_matched_bus_legs_st = clean_otp_legs_actual_time.count()
print "Num Bus Legs whose end was found:", num_matched_bus_legs_st, '(', 100*(num_matched_bus_legs_st/float(num_bus_legs)), '%)'


Num Bus Legs whose end was found: 5 ( 45.4545454545 %)


#### Cleaning Memory

In [35]:
#Clean Memory
#otp_legs_st.unpersist(blocking=True)
#bus_trips_data2.unpersist(blocking=True)
#clean_bus_trips_data2.unpersist(blocking=True)
#otp_legs_start_end.unpersist(blocking=True)


#### Enriching OTP suggestions legs with actual time data

In [36]:
print "Enriching OTP suggestions legs with actual time data..."
all_legs_actual_time = combine_otp_suggestions_with_bus_legs_actual_time(otp_legs_df,clean_otp_legs_actual_time)


Enriching OTP suggestions legs with actual time data...


#### Filtering out itineraries with bus legs not identified in bus data

In [37]:
print "Filtering out itineraries with bus legs not identified in bus data..."
clean_legs_actual_time = select_itineraries_fully_identified(all_legs_actual_time)

Filtering out itineraries with bus legs not identified in bus data...


In [38]:
print "Filtering out itineraries without bus legs..."
clean_legs_actual_time = filter_itineraries_without_bus_legs(clean_legs_actual_time)

Filtering out itineraries without bus legs...


In [39]:
printdf(clean_legs_actual_time)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,otp_start_time,otp_end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp,actual_duration_mins,considered_duration_mins,considered_start_time
0,2017-07-01,34359738511,1,2,500.0,25471.0,27560.0,2017-07-01 06:07:23,2017-07-01 06:15:00,BUS,7.616667,GE716,1.0,2017-07-01 06:05:05,2017-07-01 06:15:34,10.483333,10.483333,2017-07-01 06:05:05
1,2017-07-01,34359738511,1,3,,,,2017-07-01 06:15:01,2017-07-01 06:18:39,WALK,3.633333,,,,,,3.633333,2017-07-01 06:15:01
2,2017-07-01,34359738511,1,1,,,,2017-07-01 05:46:59,2017-07-01 06:07:22,WALK,20.383333,,,,,,20.383333,2017-07-01 05:46:59
3,2017-07-01,128849018906,2,2,500.0,25515.0,27560.0,2017-07-01 14:46:33,2017-07-01 14:48:00,BUS,1.45,GE700,11.0,2017-07-01 14:42:20,2017-07-01 14:46:52,4.533333,4.533333,2017-07-01 14:42:20
4,2017-07-01,128849018906,2,3,,,,2017-07-01 14:48:01,2017-07-01 15:01:37,WALK,13.6,,,,,,13.6,2017-07-01 14:48:01
5,2017-07-01,128849018906,2,1,,,,2017-07-01 14:33:01,2017-07-01 14:46:32,WALK,13.516667,,,,,,13.516667,2017-07-01 14:33:01
6,2017-07-01,34359738511,3,1,,,,2017-07-01 06:16:59,2017-07-01 06:37:22,WALK,20.383333,,,,,,20.383333,2017-07-01 06:16:59
7,2017-07-01,34359738511,3,2,500.0,25471.0,27560.0,2017-07-01 06:37:23,2017-07-01 06:45:00,BUS,7.616667,GE702,1.0,2017-07-01 06:35:20,2017-07-01 06:44:01,8.683333,8.683333,2017-07-01 06:35:20
8,2017-07-01,34359738511,3,3,,,,2017-07-01 06:45:01,2017-07-01 06:48:39,WALK,3.633333,,,,,,3.633333,2017-07-01 06:45:01
9,2017-07-01,34359738511,2,3,,,,2017-07-01 06:35:01,2017-07-01 06:38:39,WALK,3.633333,,,,,,3.633333,2017-07-01 06:35:01


In [40]:
printdf(clean_legs_actual_time.filter(clean_legs_actual_time['mode'] == 'BUS') \
        .select(['otp_start_time','otp_end_time','otp_duration_mins','from_timestamp','to_timestamp','actual_duration_mins']))

Unnamed: 0,otp_start_time,otp_end_time,otp_duration_mins,from_timestamp,to_timestamp,actual_duration_mins
0,2017-07-01 06:07:23,2017-07-01 06:15:00,7.616667,2017-07-01 06:05:05,2017-07-01 06:15:34,10.483333
1,2017-07-01 14:46:33,2017-07-01 14:48:00,1.45,2017-07-01 14:42:20,2017-07-01 14:46:52,4.533333
2,2017-07-01 06:37:23,2017-07-01 06:45:00,7.616667,2017-07-01 06:35:20,2017-07-01 06:44:01,8.683333
3,2017-07-01 06:27:23,2017-07-01 06:35:00,7.616667,2017-07-01 06:23:54,2017-07-01 06:32:59,9.083333
4,2017-07-01 14:30:33,2017-07-01 14:32:00,1.45,2017-07-01 14:28:47,2017-07-01 14:33:30,4.716667


#### Printing Fully Identified OTP Itineraries Stats

In [41]:
num_itineraries_fully_identified = clean_legs_actual_time.select('user_trip_id','itinerary_id').distinct().count()
print "Num Itineraries fully identified in BUSTE data:", num_itineraries_fully_identified, '(', 100*(num_itineraries_fully_identified/float(total_num_itineraries)), '%)'

Num Itineraries fully identified in BUSTE data: 5 ( 17.8571428571 %)


In [42]:
print "Writing OTP suggested itineraries legs with actual time to file..."
#clean_legs_actual_time.write.csv(path=results_folderpath+'/otp_legs_matched',header=True, mode='append')


Writing OTP suggested itineraries legs with actual time to file...


In [43]:
#Clean Memory
#clean_otp_legs_actual_time.unpersist(blocking=True)
#all_legs_actual_time.unpersist(blocking=True)

In [38]:
printdf(od_matrix)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,o_unixtimestamp,next_o_unixtimestamp,leg_duration,dist,rn,date,o_datetime,d_datetime,executed_duration,o_base_datetime
0,370,4,2789,5510497,-25.436972,-49.277444,3768.768,BC032,,-25.437973,...,46949,73805,447.6,0.583099,1,2017-05-09,2017-05-09 10:02:29,2017-05-09 10:45:35,43.1,2017-05-09 10:00:29
1,380,6,4127,6448424,-25.427902,-49.263242,6213.697,LC302,,-25.428101,...,65029,75415,173.1,0.020988,1,2017-05-09,2017-05-09 15:03:49,2017-05-09 15:56:29,52.666667,2017-05-09 15:01:49
2,712,2,2170,6657911,-25.484518,-49.333416,4717.637,JA012,,-25.484293,...,32680,45992,221.866667,0.214177,1,2017-05-09,2017-05-09 06:04:40,2017-05-09 06:24:55,20.25,2017-05-09 06:02:40
3,211,7,1776,6633394,-25.402759,-49.213098,0.0,BA037,,-25.402765,...,76931,39309,-1.0,0.974592,1,2017-05-09,2017-05-09 18:22:11,2017-05-09 19:08:20,46.15,2017-05-09 18:20:11
4,628,7,2935,6035542,-25.495496,-49.302459,13282.36,HA025,,-25.495243,...,57933,72766,247.216667,0.368644,1,2017-05-09,2017-05-09 13:05:33,2017-05-09 13:38:32,32.983333,2017-05-09 13:03:33
5,183,7,1753,6096567,-25.428575,-49.271158,10934.301,BC311,,-25.428583,...,60800,62505,28.416667,0.798145,1,2017-05-09,2017-05-09 13:53:20,2017-05-09 14:28:34,35.233333,2017-05-09 13:51:20
6,876,1,2937,6065893,-25.422933,-49.304968,4090.977,BC300,,-25.42285,...,32943,64125,519.7,0.904173,1,2017-05-09,2017-05-09 06:09:03,2017-05-09 06:50:39,41.6,2017-05-09 06:07:03
7,463,7,2846,6348437,-25.479636,-49.193971,0.0,DC087,,-25.479658,...,32479,50924,307.416667,0.0,1,2017-05-09,2017-05-09 06:01:19,2017-05-09 13:12:29,431.166667,2017-05-09 05:59:19
8,777,2,2195,4300046,-25.461139,-49.325868,7161.699,JC012,,-25.461108,...,38155,31834,-1.0,0.121724,1,2017-05-09,2017-05-09 07:35:55,2017-05-09 07:45:22,9.45,2017-05-09 07:33:55
9,462,5,3102,5855535,-25.434773,-49.272324,12450.673,DC296,,-25.434785,...,52149,49923,-1.0,0.114577,1,2017-05-09,2017-05-09 11:29:09,2017-05-09 11:59:30,30.35,2017-05-09 11:27:09


In [39]:
clean_legs_actual_time.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- route: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- otp_start_time: timestamp (nullable = true)
 |-- otp_end_time: timestamp (nullable = true)
 |-- mode: string (nullable = true)
 |-- otp_duration_mins: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- to_timestamp: string (nullable = true)
 |-- actual_duration_mins: double (nullable = true)
 |-- considered_duration_mins: double (nullable = true)
 |-- considered_start_time: string (nullable = true)



In [40]:
printdf(clean_legs_actual_time \
           .orderBy(['date','user_trip_id','itinerary_id','leg_id']))

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,otp_start_time,otp_end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp,actual_duration_mins,considered_duration_mins,considered_start_time
0,2017-05-09,7,1,1,,,,2017-05-09 14:43:08,2017-05-09 14:43:18,WALK,0.166667,,,,,,0.166667,2017-05-09 14:43:08
1,2017-05-09,7,1,2,703.0,35271.0,26631.0,2017-05-09 14:43:19,2017-05-09 15:18:32,BUS,35.216667,JC310,6.0,2017-05-09 14:40:13,2017-05-09 15:10:07,29.9,29.9,2017-05-09 14:40:13
2,2017-05-09,7,1,3,,,,2017-05-09 15:18:33,2017-05-09 15:18:35,WALK,0.033333,,,,,,0.033333,2017-05-09 15:18:33
3,2017-05-09,7,2,1,,,,2017-05-09 14:56:08,2017-05-09 14:56:18,WALK,0.166667,,,,,,0.166667,2017-05-09 14:56:08
4,2017-05-09,7,2,2,703.0,35271.0,26631.0,2017-05-09 14:56:19,2017-05-09 15:31:32,BUS,35.216667,JC304,7.0,2017-05-09 15:07:46,2017-05-09 15:32:38,24.866667,24.866667,2017-05-09 15:07:46
5,2017-05-09,7,2,3,,,,2017-05-09 15:31:33,2017-05-09 15:31:35,WALK,0.033333,,,,,,0.033333,2017-05-09 15:31:33
6,2017-05-09,7,3,1,,,,2017-05-09 15:09:08,2017-05-09 15:09:18,WALK,0.166667,,,,,,0.166667,2017-05-09 15:09:08
7,2017-05-09,7,3,2,703.0,35271.0,26631.0,2017-05-09 15:09:19,2017-05-09 15:44:32,BUS,35.216667,JC304,7.0,2017-05-09 15:07:46,2017-05-09 15:32:38,24.866667,24.866667,2017-05-09 15:07:46
8,2017-05-09,7,3,3,,,,2017-05-09 15:44:33,2017-05-09 15:44:35,WALK,0.033333,,,,,,0.033333,2017-05-09 15:44:33
9,2017-05-09,8,1,1,,,,2017-05-09 06:30:42,2017-05-09 06:41:50,WALK,11.133333,,,,,,11.133333,2017-05-09 06:30:42


#### Gather all trips alternative/executed itineraries info

In [44]:
first_boarding_time = clean_legs_actual_time \
                        .filter('mode == \'BUS\'') \
                        .groupby(['date', 'user_trip_id', 'itinerary_id']) \
                        .agg(F.first('otp_start_time').alias('planned_start_time'), \
                             F.first('considered_start_time').alias('actual_start_time')) \
                        .orderBy(['date','user_trip_id','itinerary_id'])        
                
printdf(first_boarding_time)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_start_time,actual_start_time
0,2017-07-01,34359738511,1,2017-07-01 06:07:23,2017-07-01 06:05:05
1,2017-07-01,34359738511,2,2017-07-01 06:27:23,2017-07-01 06:23:54
2,2017-07-01,34359738511,3,2017-07-01 06:37:23,2017-07-01 06:35:20
3,2017-07-01,128849018906,1,2017-07-01 14:30:33,2017-07-01 14:28:47
4,2017-07-01,128849018906,2,2017-07-01 14:46:33,2017-07-01 14:42:20


In [45]:
user_trips_time_info = od_matrix \
                        .withColumnRenamed('executed_duration','exec_duration_mins') \
                        .withColumnRenamed('o_datetime','exec_start_time') \
                        .select(['date','user_trip_id','exec_duration_mins','exec_start_time'])

printdf(user_trips_time_info)

Unnamed: 0,date,user_trip_id,exec_duration_mins,exec_start_time
0,2017-07-01,240518168663,5.033333,2017-07-01 21:42:00
1,2017-07-01,34359738511,450.433333,2017-07-01 05:41:47
2,2017-07-01,661424963588,28.266667,2017-07-01 14:52:03
3,2017-07-01,1709396983854,11.6,2017-07-01 13:46:52
4,2017-07-01,1047972020291,64.533333,2017-07-01 19:03:53
5,2017-07-01,343597383815,58.1,2017-07-01 05:38:22
6,2017-07-01,85899345938,19.766667,2017-07-01 05:51:53
7,2017-07-01,901943132161,22.166667,2017-07-01 19:03:41
8,2017-07-01,1073741824016,23.95,2017-07-01 19:47:50
9,2017-07-01,420906795142,7.716667,2017-07-01 09:32:49


In [46]:
printdf(matched_executed_trips)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,planned_start_time


In [47]:
executed_legs = user_trips_time_info \
            .join(matched_executed_trips, on=['date','user_trip_id'], how='left') \
            .withColumn('actual_duration_mins',F.col('exec_duration_mins')) \
            .withColumn('actual_start_time',F.col('exec_start_time')) \
            .withColumn('itinerary_id',F.lit(0)) \
            .select(['date','user_trip_id','itinerary_id','planned_duration_mins','actual_duration_mins','exec_duration_mins',
                     'planned_start_time','actual_start_time','exec_start_time'])
            
printdf(executed_legs)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,exec_duration_mins,planned_start_time,actual_start_time,exec_start_time
0,2017-07-01,274877907051,0,,25.166667,25.166667,,2017-07-01 08:02:00,2017-07-01 08:02:00
1,2017-07-01,1425929142383,0,,107.933333,107.933333,,2017-07-01 14:06:38,2017-07-01 14:06:38
2,2017-07-01,867583393861,0,,24.083333,24.083333,,2017-07-01 18:21:55,2017-07-01 18:21:55
3,2017-07-01,42949672972,0,,12.0,12.0,,2017-07-01 20:35:27,2017-07-01 20:35:27
4,2017-07-01,1314259992746,0,,24.6,24.6,,2017-07-01 13:18:37,2017-07-01 13:18:37
5,2017-07-01,128849018906,0,,13.833333,13.833333,,2017-07-01 14:09:05,2017-07-01 14:09:05
6,2017-07-01,609885356211,0,,4.95,4.95,,2017-07-01 20:19:59,2017-07-01 20:19:59
7,2017-07-01,858993459358,0,,5.983333,5.983333,,2017-07-01 18:09:32,2017-07-01 18:09:32
8,2017-07-01,1236950581425,0,,7.533333,7.533333,,2017-07-01 15:28:01,2017-07-01 15:28:01
9,2017-07-01,1580547964991,0,,28.65,28.65,,2017-07-01 22:33:38,2017-07-01 22:33:38


In [48]:
matched_otp_legs = clean_legs_actual_time \
                            .groupBy(['date', 'user_trip_id', 'itinerary_id']) \
                            .agg(F.sum('otp_duration_mins').alias('planned_duration_mins'), \
                                 F.sum('considered_duration_mins').alias('actual_duration_mins')) \
                        .join(first_boarding_time, on=['date','user_trip_id','itinerary_id']) \
                        .join(user_trips_time_info, on=['date','user_trip_id'], how='inner') \
                        .orderBy(['date','user_trip_id','itinerary_id']) \
                        .select(['date','user_trip_id','itinerary_id','planned_duration_mins','actual_duration_mins','exec_duration_mins',
                     'planned_start_time','actual_start_time','exec_start_time'])
                 
printdf(matched_otp_legs)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,exec_duration_mins,planned_start_time,actual_start_time,exec_start_time
0,2017-07-01,34359738511,1,31.633333,34.5,450.433333,2017-07-01 06:07:23,2017-07-01 06:05:05,2017-07-01 05:41:47
1,2017-07-01,34359738511,2,31.633333,33.1,450.433333,2017-07-01 06:27:23,2017-07-01 06:23:54,2017-07-01 05:41:47
2,2017-07-01,34359738511,3,31.633333,32.7,450.433333,2017-07-01 06:37:23,2017-07-01 06:35:20,2017-07-01 05:41:47
3,2017-07-01,128849018906,1,28.566667,31.833333,13.833333,2017-07-01 14:30:33,2017-07-01 14:28:47,2017-07-01 14:09:05
4,2017-07-01,128849018906,2,28.566667,31.65,13.833333,2017-07-01 14:46:33,2017-07-01 14:42:20,2017-07-01 14:09:05


In [50]:
executed_trips_with_sugestions_matched = matched_otp_legs.select('user_trip_id')\
                                            .drop_duplicates()
printdf(executed_trips_with_sugestions_matched)

Py4JJavaError: An error occurred while calling o3264.collectToPython.
: java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3332)
	at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:124)
	at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:448)
	at java.lang.StringBuilder.append(StringBuilder.java:136)
	at java.lang.StringBuilder.append(StringBuilder.java:131)
	at scala.StringContext.standardInterpolator(StringContext.scala:125)
	at scala.StringContext.s(StringContext.scala:95)
	at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:225)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:54)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2765)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2742)
	at sun.reflect.GeneratedMethodAccessor121.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
all_trips_alternatives = matched_otp_legs \
                .union(executed_legs) \
                .join(executed_trips_with_sugestions_matched, on='user_trip_id',how='inner') \
                .orderBy(['date','user_trip_id','itinerary_id'])
printdf(all_trips_alternatives)

In [48]:
all_trips_alternatives.write.csv(path=results_folderpath+'/all_itineraries',header=True)

In [49]:
print all_trips_alternatives.count(), od_matrix.count(), executed_trips_with_sugestions_matched.count()

19772 5752 5563


In [50]:
executed_trips_with_user_plan_matched = matched_executed_trips.select('user_trip_id')\
                                            .drop_duplicates()
printdf(executed_trips_with_sugestions_matched)

Unnamed: 0,user_trip_id
0,42949673263
1,68719476787
2,94489280644
3,300647710918
4,455266533408
5,455266533670
6,472446402873
7,498216206460
8,558345748598
9,644245094648


In [51]:
all_executed_trips_alternatives = matched_otp_legs \
                .union(executed_legs) \
                .join(executed_trips_with_user_plan_matched, on='user_trip_id',how='inner') \
                .orderBy(['date','user_trip_id','itinerary_id'])
printdf(all_executed_trips_alternatives)

Unnamed: 0,user_trip_id,date,itinerary_id,planned_duration_mins,actual_duration_mins,exec_duration_mins,planned_start_time,actual_start_time,exec_start_time
0,7,2017-05-09,0,39.5,29.733333,29.733333,2017-05-09 15:03:44,2017-05-09 14:40:23,2017-05-09 14:40:23
1,7,2017-05-09,1,35.416667,30.1,29.733333,2017-05-09 14:43:19,2017-05-09 14:40:13,2017-05-09 14:40:23
2,7,2017-05-09,2,35.416667,25.066667,29.733333,2017-05-09 14:56:19,2017-05-09 15:07:46,2017-05-09 14:40:23
3,7,2017-05-09,3,35.416667,25.066667,29.733333,2017-05-09 15:09:19,2017-05-09 15:07:46,2017-05-09 14:40:23
4,8,2017-05-09,0,33.833333,28.3,28.3,2017-05-09 06:31:38,2017-05-09 06:32:25,2017-05-09 06:32:25
5,8,2017-05-09,1,38.9,34.333333,28.3,2017-05-09 06:41:51,2017-05-09 06:41:03,2017-05-09 06:32:25
6,8,2017-05-09,2,33.983333,35.6,28.3,2017-05-09 06:43:38,2017-05-09 06:49:27,2017-05-09 06:32:25
7,8,2017-05-09,3,33.983333,44.6,28.3,2017-05-09 07:07:38,2017-05-09 07:07:45,2017-05-09 06:32:25
8,9,2017-05-09,0,23.433333,21.4,21.4,2017-05-09 10:53:34,2017-05-09 10:44:40,2017-05-09 10:44:40
9,9,2017-05-09,1,24.333333,24.85,21.4,2017-05-09 10:52:51,2017-05-09 10:54:29,2017-05-09 10:44:40


In [52]:
print all_executed_trips_alternatives.count(), od_matrix.count(), executed_trips_with_user_plan_matched.count()

11769 5752 3345


In [None]:
all_executed_trips_alternatives.write.csv(path=results_folderpath+'/fully_matched_itineraries',header=True)

In [36]:
printdf(all_trips_alternatives.groupby(['date','user_trip_id']) \
                        .count() \
                        .orderBy('count'))

Unnamed: 0,date,user_trip_id,count
0,2017-05-09,300647710724,2
1,2017-05-09,1563368095803,2
2,2017-05-09,1245540516041,2
3,2017-05-09,412316860483,2
4,2017-05-09,1529008357586,2
5,2017-05-09,223338299433,2
6,2017-05-09,1022202216484,3
7,2017-05-09,721554505729,3
8,2017-05-09,163208757335,3
9,2017-05-09,1700807049484,3


In [None]:
filtered_trips_alternatives = 

### Compute Inefficiency Metrics

#### Given: 
- U - user trip time
- O - otp suggested trip time
- E (executed itineraries) = { Ue, {Oie, 0 < i < n, n = num_otp_alt}}
- P (planned itineraries) = { Up, {Oip, 0 < i < n, n = num_otp_alt}}

#### We can compute:

$$
\begin{equation*}
    \frac{Ue - fastest(E)}{Ue} \text{  User choice actual inefficiency}.
\end{equation*}
$$

$$
\begin{equation*}
    \frac{fastest(E) - executed(fastest(Oe))}{fastest(E)} \text{ System recommendation inefficiency I}.
\end{equation*}
$$

$$
\begin{equation*}
    \frac{fastest(E) - fastest(P))}{fastest(E)} \text{ System recommendation inefficiency II}.
\end{equation*}
$$

$$
\begin{equation*}
    \frac{Up - fastest(P))}{Up} \text{ User choice plan inefficiency}.
\end{equation*}
$$

$$
\begin{equation*}
    {Oe - Op} \text{ System Schedule Deviation}.
\end{equation*}
$$

$$
\begin{equation*}
    {Ue - Up} \text{ User Trip Schedule Deviation}.
\end{equation*}
$$

$$
\begin{equation*}
    {start(Ue) - start(Up)} \text{ User stop waiting time offset}.
\end{equation*}
$$

In [44]:
def filter_trips_alternatives(trips_alternatives):
    min_trip_dur = 10
    max_trip_dur = 50
    max_trip_start_diff = 20
    
    return trips_alternatives[(trips_alternatives['exec_duration_mins'] >= min_trip_dur) & (trips_alternatives['exec_duration_mins'] <= max_trip_dur)] \
                                    .withColumn('start_diff',F.abs(F.unix_timestamp(F.col('exec_start_time')) - F.unix_timestamp(F.col('actual_start_time')))/60) \
                                    .filter('start_diff <= 20')

In [46]:
#Filter trips whose planned start time is too far away from the executed start time
filtered_trips_itineraries = filter_trips_alternatives(all_trips_alternatives)

In [51]:
printdf(filtered_trips_itineraries)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time,start_diff
0,2017-05-09,25769803948,0,,15.9,NaT,2017-05-09 18:09:17,15.9,2017-05-09 18:09:17,0.0
1,2017-05-09,25769803948,1,7.25,15.45,2017-05-09 18:16:46,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333
2,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333
3,2017-05-09,51539607858,0,,32.983333,NaT,2017-05-09 13:05:33,32.983333,2017-05-09 13:05:33,0.0
4,2017-05-09,68719476891,0,,20.133333,NaT,2017-05-09 18:29:06,20.133333,2017-05-09 18:29:06,0.0
5,2017-05-09,68719476891,1,24.933333,26.683333,2017-05-09 18:35:38,2017-05-09 18:45:08,20.133333,2017-05-09 18:29:06,16.033333
6,2017-05-09,111669149831,0,,13.766667,NaT,2017-05-09 09:37:41,13.766667,2017-05-09 09:37:41,0.0
7,2017-05-09,111669149831,1,9.633333,14.8,2017-05-09 09:42:40,2017-05-09 09:36:49,13.766667,2017-05-09 09:37:41,0.866667
8,2017-05-09,111669149831,2,9.633333,13.216667,2017-05-09 09:59:40,2017-05-09 09:56:08,13.766667,2017-05-09 09:37:41,18.45
9,2017-05-09,137438953719,0,,20.0,NaT,2017-05-09 14:19:14,20.0,2017-05-09 14:19:14,0.0


#### User choice actual inefficiency

$$
\begin{equation*}
    \frac{Ue - fastest(E)}{Ue}.
\end{equation*}
$$

In [47]:
#Choose best itinerary for each trip by selecting the ones with lower actual duration
best_trips_itineraries = select_best_trip_itineraries(filtered_trips_itineraries)

In [48]:
printdf(best_trips_itineraries)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time,start_diff
0,2017-05-09,515396075621,0,,17.083333,NaT,2017-05-09 07:38:27,17.083333,2017-05-09 07:38:27,0.0
1,2017-05-09,163208757335,3,3.8,4.266667,2017-05-09 08:41:52,2017-05-09 08:36:39,27.166667,2017-05-09 08:37:33,0.9
2,2017-05-09,790273982609,2,24.816667,20.033333,2017-05-09 07:24:43,2017-05-09 07:26:59,33.516667,2017-05-09 07:10:50,16.15
3,2017-05-09,1700807049484,0,,14.933333,NaT,2017-05-09 17:39:37,14.933333,2017-05-09 17:39:37,0.0
4,2017-05-09,721554505729,1,6.616667,6.466667,2017-05-09 10:04:00,2017-05-09 10:12:19,43.1,2017-05-09 10:02:29,9.833333
5,2017-05-09,489626271850,0,,12.316667,NaT,2017-05-09 13:27:40,12.316667,2017-05-09 13:27:40,0.0
6,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333
7,2017-05-09,1443109011501,0,,17.166667,NaT,2017-05-09 05:22:54,17.166667,2017-05-09 05:22:54,0.0
8,2017-05-09,1005022347340,0,,11.116667,NaT,2017-05-09 17:57:17,11.116667,2017-05-09 17:57:17,0.0
9,2017-05-09,1194000908297,3,6.15,6.366667,2017-05-09 06:57:27,2017-05-09 07:04:51,36.733333,2017-05-09 06:48:22,16.483333


In [49]:
trips_inefficiency = best_trips_itineraries \
                        .withColumn('dur_diff',(F.col('exec_duration_mins') - F.col('actual_duration_mins'))) \
                        .withColumn('imp_capacity', F.col('dur_diff')/F.col('exec_duration_mins'))

In [50]:
printdf(trips_inefficiency)

Unnamed: 0,date,user_trip_id,itinerary_id,planned_duration_mins,actual_duration_mins,planned_start_time,actual_start_time,exec_duration_mins,exec_start_time,start_diff,dur_diff,imp_capacity
0,2017-05-09,515396075621,0,,17.083333,NaT,2017-05-09 07:38:27,17.083333,2017-05-09 07:38:27,0.0,0.0,0.0
1,2017-05-09,163208757335,3,3.8,4.266667,2017-05-09 08:41:52,2017-05-09 08:36:39,27.166667,2017-05-09 08:37:33,0.9,22.9,0.842945
2,2017-05-09,790273982609,2,24.816667,20.033333,2017-05-09 07:24:43,2017-05-09 07:26:59,33.516667,2017-05-09 07:10:50,16.15,13.483333,0.402287
3,2017-05-09,1700807049484,0,,14.933333,NaT,2017-05-09 17:39:37,14.933333,2017-05-09 17:39:37,0.0,0.0,0.0
4,2017-05-09,721554505729,1,6.616667,6.466667,2017-05-09 10:04:00,2017-05-09 10:12:19,43.1,2017-05-09 10:02:29,9.833333,36.633333,0.849961
5,2017-05-09,489626271850,0,,12.316667,NaT,2017-05-09 13:27:40,12.316667,2017-05-09 13:27:40,0.0,0.0,0.0
6,2017-05-09,25769803948,2,7.833333,15.45,2017-05-09 18:35:29,2017-05-09 18:21:07,15.9,2017-05-09 18:09:17,11.833333,0.45,0.028302
7,2017-05-09,1443109011501,0,,17.166667,NaT,2017-05-09 05:22:54,17.166667,2017-05-09 05:22:54,0.0,0.0,0.0
8,2017-05-09,1005022347340,0,,11.116667,NaT,2017-05-09 17:57:17,11.116667,2017-05-09 17:57:17,0.0,0.0,0.0
9,2017-05-09,1194000908297,3,6.15,6.366667,2017-05-09 06:57:27,2017-05-09 07:04:51,36.733333,2017-05-09 06:48:22,16.483333,30.366667,0.826679


#### System Recommendation Inefficiency I
$$
\begin{equation*}
    \frac{fastest(E) - executed(fastest(Oe))}{fastest(E)}
\end{equation*}
$$

In [73]:
w_rec_inef_i = Window().partitionBy(['date','user_trip_id']).orderBy(F.col('planned_duration_mins'))

shortest_planned_actual_duration = filtered_trips_itineraries \
                            .filter('itinerary_id > 0') \
                            .withColumn("rn", F.row_number().over(w_rec_inef_i)) \
                            .where(F.col("rn") == 1) \
                            .select('date','user_trip_id','planned_duration_mins','actual_duration_mins') \
                            .withColumnRenamed('planned_duration_mins','shortest_OTP_planned_duration') \
                            .withColumnRenamed('actual_duration_mins','shortest_OTP_actual_duration')

printdf(shortest_planned_actual_duration)

Unnamed: 0,date,user_trip_id,shortest_OTP_planned_duration,shortest_OTP_actual_duration
0,2017-05-09,515396075621,17.566667,17.25
1,2017-05-09,163208757335,3.8,4.266667
2,2017-05-09,790273982609,24.6,21.666667
3,2017-05-09,1700807049484,17.3,26.1
4,2017-05-09,721554505729,6.616667,6.466667
5,2017-05-09,489626271850,12.966667,12.983333
6,2017-05-09,25769803948,7.25,15.45
7,2017-05-09,1443109011501,26.483333,17.966667
8,2017-05-09,1005022347340,14.7,11.65
9,2017-05-09,1194000908297,6.15,6.366667


In [74]:
rec_inef_i = filtered_trips_itineraries \
                .groupBy(['date','user_trip_id']) \
                .agg(F.min(F.col('actual_duration_mins')).alias('shortest_actual_duration')) \
                .join(shortest_planned_actual_duration, on=['date','user_trip_id'], how='inner') \
                .withColumn('rec_inef',(F.col('shortest_OTP_actual_duration') - F.col('shortest_actual_duration'))/F.col('shortest_OTP_actual_duration'))

printdf(rec_inef_i_features)

Unnamed: 0,date,user_trip_id,shortest_actual_duration,shortest_planned_OTP_duration,rec_inef
0,2017-05-09,515396075621,17.083333,17.566667,-0.028293
1,2017-05-09,163208757335,4.266667,3.8,0.109375
2,2017-05-09,790273982609,20.033333,24.6,-0.227953
3,2017-05-09,1700807049484,14.933333,17.3,-0.158482
4,2017-05-09,721554505729,6.466667,6.616667,-0.023196
5,2017-05-09,489626271850,12.316667,12.966667,-0.052774
6,2017-05-09,25769803948,15.45,7.25,0.530744
7,2017-05-09,1443109011501,17.166667,26.483333,-0.542718
8,2017-05-09,1005022347340,11.116667,14.7,-0.322339
9,2017-05-09,1194000908297,6.366667,6.15,0.034031


#### System Recommendation Inefficiency II
$$
\begin{equation*}
    \frac{fastest(E) - fastest(P))}{fastest(E)}
\end{equation*}
$$

#### User choice plan inefficiency
$$
\begin{equation*}
    \frac{Up - fastest(P))}{Up}
\end{equation*}
$$

#### System Schedule Deviation
$$
\begin{equation*}
    {Oe - Op}
\end{equation*}
$$

#### User Trip Schedule Deviation
$$
\begin{equation*}
    {Ue - Up}
\end{equation*}
$$

#### User stop waiting time offset
$$
\begin{equation*}
    {start(Ue) - start(Up)}
\end{equation*}
$$

In [70]:
print "Identifying itinerary alternatives which are feasible..."
trips_itineraries_possibilities, filtered_trips_possibilities = determining_trips_alternatives_feasibility(clean_legs_actual_time,od_matrix)

Identifying itinerary alternatives which are feasible...


In [72]:
printdf(od_matrix.select('date'))

Unnamed: 0,date
0,2017-05-09
1,2017-05-09
2,2017-05-09
3,2017-05-09
4,2017-05-09
5,2017-05-09
6,2017-05-09
7,2017-05-09
8,2017-05-09
9,2017-05-09


In [73]:
printdf(trips_itineraries_possibilities)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time,exec_start_time,start_diff
0,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42,2017-05-09 18:09:17,7.416667
1,2017-05-09,25769803948,2,15.45,2017-05-09 18:42:57,2017-05-09 18:09:17,33.666667
2,2017-05-09,51539607858,1,42.383333,2017-05-09 13:29:50,2017-05-09 13:05:33,24.283333
3,2017-05-09,51539607858,2,31.666667,2017-05-09 13:53:20,2017-05-09 13:05:33,47.783333
4,2017-05-09,51539607858,3,32.333333,2017-05-09 14:59:28,2017-05-09 13:05:33,113.916667
5,2017-05-09,68719476891,1,26.683333,2017-05-09 18:45:08,2017-05-09 18:29:06,16.033333
6,2017-05-09,68719476891,2,26.266667,2017-05-09 18:57:18,2017-05-09 18:29:06,28.2
7,2017-05-09,68719476891,3,20.75,2017-05-09 18:50:04,2017-05-09 18:29:06,20.966667
8,2017-05-09,111669149831,1,14.8,2017-05-09 09:42:32,2017-05-09 09:37:41,4.85
9,2017-05-09,111669149831,2,13.216667,2017-05-09 10:09:09,2017-05-09 09:37:41,31.466667


In [74]:
printdf(filtered_trips_possibilities)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time
0,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42
1,2017-05-09,68719476891,1,26.683333,2017-05-09 18:45:08
2,2017-05-09,111669149831,1,14.8,2017-05-09 09:42:32
3,2017-05-09,111669149962,1,47.516667,2017-05-09 17:53:38
4,2017-05-09,111669149962,2,39.266667,2017-05-09 18:10:19
5,2017-05-09,120259084497,1,4.833333,2017-05-09 13:13:57
6,2017-05-09,120259084497,2,3.966667,2017-05-09 13:25:17
7,2017-05-09,120259084497,3,3.75,2017-05-09 13:30:06
8,2017-05-09,137438953719,1,31.333333,2017-05-09 14:34:35
9,2017-05-09,163208757335,1,4.016667,2017-05-09 08:35:33


In [75]:
print "Writing itineraries possibilities with feasibility to file..."
#trips_itineraries_possibilities.write.csv(path=results_folderpath+'/itineraries_alternatives',header=True, mode='append')


Writing itineraries possibilities with feasibility to file...


In [79]:
print "Adding executed trips to the pool of itinerary possibilities..."
trips_itineraries_pool = get_trips_itineraries_pool(filtered_trips_possibilities,od_matrix)


Adding executed trips to the pool of itinerary possibilities...


In [80]:
printdf(trips_itineraries_pool,l=10)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time
0,2017-05-09,25769803948,0,15.9,2017-05-09 18:09:17
1,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42
2,2017-05-09,51539607858,0,32.983333,2017-05-09 13:05:33
3,2017-05-09,68719476891,0,20.133333,2017-05-09 18:29:06
4,2017-05-09,68719476891,1,26.683333,2017-05-09 18:45:08
5,2017-05-09,111669149831,0,13.766667,2017-05-09 09:37:41
6,2017-05-09,111669149831,1,14.8,2017-05-09 09:42:32
7,2017-05-09,111669149962,0,61.233333,2017-05-09 17:54:34
8,2017-05-09,111669149962,1,47.516667,2017-05-09 17:53:38
9,2017-05-09,111669149962,2,39.266667,2017-05-09 18:10:19


In [81]:
print "Selecting best otp itineraries by actual duration..."
best_trips_itineraries = select_best_trip_itineraries(trips_itineraries_pool)


Selecting best otp itineraries by actual duration...


In [82]:
printdf(best_trips_itineraries,l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,alt_start_time
0,2017-05-09,515396075621,0,17.083333,2017-05-09 07:38:27
1,2017-05-09,206158430497,0,4.75,2017-05-09 16:41:46
2,2017-05-09,1022202216484,0,1.533333,2017-05-09 06:44:51
3,2017-05-09,163208757335,1,4.016667,2017-05-09 08:35:33
4,2017-05-09,790273982609,2,20.033333,2017-05-09 07:22:11
5,2017-05-09,1700807049484,0,14.933333,2017-05-09 17:39:37
6,2017-05-09,412316860483,1,5.633333,2017-05-09 11:57:46
7,2017-05-09,721554505729,1,6.466667,2017-05-09 10:02:15
8,2017-05-09,489626271850,0,12.316667,2017-05-09 13:27:40
9,2017-05-09,25769803948,1,15.45,2017-05-09 18:16:42


In [83]:
#Clean Memory
#clean_legs_actual_time.unpersist(blocking=True)
#trips_itineraries_possibilities.unpersist(blocking=True)
#filtered_trips_possibilities.unpersist(blocking=True)
#trips_itineraries_pool.unpersist(blocking=True)


In [85]:
printdf(duration_improvement_capacity)

Unnamed: 0,date,user_trip_id,cardNum,birthdate,gender,exec_start_time,executed_duration,itinerary_id,duration,alt_start_time,imp_capacity
0,2017-05-09,515396075621,2642531,28/09/95,F,2017-05-09 07:38:27,17.083333,0,17.083333,2017-05-09 07:38:27,0.0
1,2017-05-09,206158430497,1757419,,,2017-05-09 16:41:46,4.75,0,4.75,2017-05-09 16:41:46,0.0
2,2017-05-09,1022202216484,10090017,,,2017-05-09 06:44:51,1.533333,0,1.533333,2017-05-09 06:44:51,0.0
3,2017-05-09,163208757335,2782068,,,2017-05-09 08:37:33,27.166667,1,4.016667,2017-05-09 08:35:33,23.15
4,2017-05-09,790273982609,3645964,,,2017-05-09 07:10:50,33.516667,2,20.033333,2017-05-09 07:22:11,13.483333
5,2017-05-09,1700807049484,2658794,30/03/78,M,2017-05-09 17:39:37,14.933333,0,14.933333,2017-05-09 17:39:37,0.0
6,2017-05-09,412316860483,3541708,,,2017-05-09 11:59:46,65.033333,1,5.633333,2017-05-09 11:57:46,59.4
7,2017-05-09,721554505729,643873,,,2017-05-09 10:02:29,43.1,1,6.466667,2017-05-09 10:02:15,36.633333
8,2017-05-09,489626271850,3709654,,,2017-05-09 13:27:40,12.316667,0,12.316667,2017-05-09 13:27:40,0.0
9,2017-05-09,25769803948,3377631,21/06/00,F,2017-05-09 18:09:17,15.9,1,15.45,2017-05-09 18:16:42,0.45


In [84]:
print "Computing Improvement Capacity..."
duration_improvement_capacity = compute_improvement_capacity(best_trips_itineraries,od_matrix)


Computing Improvement Capacity...


In [53]:
#best_trips_itineraries.unpersist(blocking=True)
#od_matrix.unpersist(blocking=True)

In [None]:
print "Writing duration improvement capacity to file..."
#duration_improvement_capacity.write.csv(path=results_folderpath+'/duration_improvement_capacity',header=True, mode='append')


In [None]:
print "Finishing Script..."

In [None]:
sc.stop()
