In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import urllib2

In [2]:
OTP_SERVER_URL = 'http://localhost:5601/otp/'

def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath, header=True,
                                     inferSchema=True,nullValue="-")
    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    data_frame = read_hdfs_folder(sqlContext,folderpath)
    
    date = "-".join(folderpath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.date_sub(F.lit(date),1))
    #data_frame = data_frame.withColumn("date", F.unix_timestamp(F.col("date"),1),'yyyy-MM-dd'))
    
    return data_frame

def printdf(df,l=10):
    return df.limit(l).toPandas()

def get_timestamp_in_tz(unixtime_timestamp,ts_format,tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format),tz)

In [3]:
spark  = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.crossJoin.enabled', 'true')

sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc)

In [5]:
#base_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a/
base_folder_path = '/local/tarciso/data/'
od_matrix = read_hdfs_folder(sqlContext,base_folder_path + 'od_matrix/')

In [6]:
od_matrix.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- o_route: integer (nullable = true)
 |-- o_bus_code: string (nullable = true)
 |-- o_date: timestamp (nullable = true)
 |-- o_

In [7]:
printdf(od_matrix)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration,dist,rn
0,860,2,2240,4494816,-25.440235,-49.277592,840.222,LC011,,-25.440471,...,6448950,-25.434527,-49.280126,29874,438086664359,32969,78541,759.533333,0.683846,1
1,870,10,2926,5386395,-25.431688,-49.276556,518.881,BC023,,-25.43163,...,5162628,-25.428355,-49.272707,28429,1262720385319,70255,52607,-1.0,0.535484,1
2,30,2,1715,6293026,-25.410378,-49.204718,25103.643,BB608,,-25.410268,...,5846746,-25.410157,-49.204149,32508,206158430263,49275,71111,363.933333,0.062245,1
3,860,8,2241,4494781,-25.436792,-49.274447,12604.144,LC026,,-25.436816,...,5255201,-25.428123,-49.271896,28556,1666447310868,69745,45888,-1.0,0.997384,1
4,826,11,2230,6062927,-25.511831,-49.324408,12069.86,JA001,,-25.511798,...,6416861,-25.510229,-49.32615,35840,1563368095816,76640,36968,-1.0,0.249584,1
5,777,2,2195,4299860,-25.435179,-49.273374,0.0,JC004,,-25.435188,...,6222433,-25.428604,-49.270626,26358,798863917103,34748,47558,213.5,0.781424,1
6,30,3,1715,6292678,-25.489034,-49.226245,11197.387,GR123,,-25.489063,...,6358078,-25.490593,-49.222306,30062,1022202216489,55367,80712,422.416667,0.431719,1
7,393,5,1907,6350328,-25.434318,-49.274526,11645.587,DN600,,-25.434348,...,6360100,-25.43537,-49.271665,26180,481036337331,64512,77787,221.25,0.310218,1
8,778,3,2196,5603972,-25.435144,-49.273284,12300.302,JC007,,-25.43519,...,6594723,-25.430341,-49.267154,26376,1589137899766,39360,35167,-1.0,0.815047,1
9,467,14,2818,5136062,-25.444211,-49.267533,1036.662,DN606,,,...,6405216,-25.444187,-49.267545,26584,627065225481,81022,40917,-1.0,0.002895,1


In [8]:
printdf(od_matrix.select(['date','route','o_stop_id','o_timestamp','stopPointId','timestamp','o_boarding_id']))

Unnamed: 0,date,route,o_stop_id,o_timestamp,stopPointId,timestamp,o_boarding_id
0,2017-05-10,860,26163,06:09:29,30633,06:14:13,403726926010
1,2017-05-10,870,26314,16:30:55,28604,16:35:43,635655159957
2,2017-05-10,30,32612,10:41:15,32508,12:10:57,231928234038
3,2017-05-10,860,3280,16:22:25,26163,16:42:40,463856468060
4,2017-05-10,826,33180,18:17:20,30432,18:54:05,549755814128
5,2017-05-10,777,33632,06:39:08,26149,07:00:22,979252543558
6,2017-05-10,30,32600,12:22:47,32577,12:38:45,1219770712102
7,2017-05-10,393,30182,14:55:12,28592,15:05:38,755914244190
8,2017-05-10,778,33533,07:56:00,26146,08:23:16,1606317768917
9,2017-05-10,467,26181,19:30:22,26584,19:34:08,42949673032


In [9]:
od_matrix = od_matrix.withColumn('date_in_ms', F.unix_timestamp(F.col('date'),'yyyy-MM-dd')) \
                        .withColumn('o_time_in_ms', F.unix_timestamp(F.col('o_timestamp'),'HH:mm:ss')) \
                        .withColumn('o_datetime_in_ms', F.col('date_in_ms') + F.col('o_time_in_ms')) \
                        .withColumn('o_datetime',get_timestamp_in_tz(F.col('o_datetime_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3'))
                        

In [10]:
printdf(od_matrix.select(['date_in_ms','o_time_in_ms','o_datetime_in_ms','date','o_timestamp','o_datetime']))

Unnamed: 0,date_in_ms,o_time_in_ms,o_datetime_in_ms,date,o_timestamp,o_datetime
0,1494385200,32969,1494418169,2017-05-10,06:09:29,2017-05-10 06:09:29
1,1494385200,70255,1494455455,2017-05-10,16:30:55,2017-05-10 16:30:55
2,1494385200,49275,1494434475,2017-05-10,10:41:15,2017-05-10 10:41:15
3,1494385200,69745,1494454945,2017-05-10,16:22:25,2017-05-10 16:22:25
4,1494385200,76640,1494461840,2017-05-10,18:17:20,2017-05-10 18:17:20
5,1494385200,34748,1494419948,2017-05-10,06:39:08,2017-05-10 06:39:08
6,1494385200,55367,1494440567,2017-05-10,12:22:47,2017-05-10 12:22:47
7,1494385200,64512,1494449712,2017-05-10,14:55:12,2017-05-10 14:55:12
8,1494385200,39360,1494424560,2017-05-10,07:56:00,2017-05-10 07:56:00
9,1494385200,81022,1494466222,2017-05-10,19:30:22,2017-05-10 19:30:22


In [11]:
od_matrix = od_matrix.withColumn('o_base_timestamp_in_ms', F.col('o_datetime_in_ms') - 60 * 2) \
                        .withColumn('o_base_timestamp',get_timestamp_in_tz(F.col('o_base_timestamp_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3')) \
                        .withColumn('o_base_time',F.split(F.col('o_base_timestamp'),'\s+')[1]) \
                        .withColumn('o_base_date',F.split(F.col('o_base_timestamp'),'\s+')[0]) \


In [12]:
printdf(od_matrix.select('o_datetime_in_ms','o_base_timestamp_in_ms','o_datetime','o_base_timestamp','o_base_date','o_base_time'))

Unnamed: 0,o_datetime_in_ms,o_base_timestamp_in_ms,o_datetime,o_base_timestamp,o_base_date,o_base_time
0,1494418169,1494418049,2017-05-10 06:09:29,2017-05-10 06:07:29,2017-05-10,06:07:29
1,1494455455,1494455335,2017-05-10 16:30:55,2017-05-10 16:28:55,2017-05-10,16:28:55
2,1494434475,1494434355,2017-05-10 10:41:15,2017-05-10 10:39:15,2017-05-10,10:39:15
3,1494454945,1494454825,2017-05-10 16:22:25,2017-05-10 16:20:25,2017-05-10,16:20:25
4,1494461840,1494461720,2017-05-10 18:17:20,2017-05-10 18:15:20,2017-05-10,18:15:20
5,1494419948,1494419828,2017-05-10 06:39:08,2017-05-10 06:37:08,2017-05-10,06:37:08
6,1494440567,1494440447,2017-05-10 12:22:47,2017-05-10 12:20:47,2017-05-10,12:20:47
7,1494449712,1494449592,2017-05-10 14:55:12,2017-05-10 14:53:12,2017-05-10,14:53:12
8,1494424560,1494424440,2017-05-10 07:56:00,2017-05-10 07:54:00,2017-05-10,07:54:00
9,1494466222,1494466102,2017-05-10 19:30:22,2017-05-10 19:28:22,2017-05-10,19:28:22


In [124]:
def get_otp_itineraries(o_lat,o_lon,d_lat,d_lon,date,time):
    otp_http_request = 'routers/ctba/plan?fromPlace={},{}&toPlace={},{}&mode=TRANSIT,WALK&date={}&time={}'
    otp_request_url = OTP_SERVER_URL + otp_http_request.format(o_lat,o_lon,d_lat,d_lon,date,time)
    #print otp_request_url
    return json.loads(urllib2.urlopen(otp_request_url).read())

In [125]:
test_itinerary = get_otp_itineraries(-25.413083,-49.229020,-25.428322,-49.266739,'05-10-2017','7:19:00')

print test_itinerary

{u'elevationMetadata': {u'geoidElevation': False, u'ellipsoidToGeoidDifference': 3.686111287840708}, u'plan': {u'date': 1494411540000, u'to': {u'lat': -25.428322, u'vertexType': u'NORMAL', u'lon': -49.266739, u'name': u'Destination', u'orig': u''}, u'itineraries': [{u'walkTime': 2, u'legs': [{u'distance': 0.151, u'from': {u'vertexType': u'NORMAL', u'name': u'Origin', u'lon': -49.22902, u'departure': 1494411566000, u'lat': -25.413083, u'orig': u''}, u'interlineWithPreviousLeg': False, u'transitLeg': False, u'realTime': False, u'route': u'', u'departureDelay': 0, u'agencyTimeZoneOffset': -10800000, u'to': {u'arrival': 1494411567000, u'vertexType': u'TRANSIT', u'name': u'Rua Fagundes Varela, 1295 - Jardim Social', u'stopSequence': 12, u'lon': -49.229029895014, u'departure': 1494411568000, u'stopId': u'1:31957', u'stopCode': u'130279', u'lat': -25.413059468751, u'stopIndex': 11}, u'rentedBike': False, u'arrivalDelay': 0, u'mode': u'WALK', u'startTime': 1494411566000, u'duration': 1.0, u'st

### New Approach

In [126]:
trips_otp_response = {}
counter = 0
for row in od_matrix.collect():
        id=long(row['o_boarding_id'])
        trip_plan = get_otp_itineraries(row['o_shape_lat'], row['o_shape_lon'], row['shapeLat'], row['shapeLon'],row['o_base_date'],row['o_base_time'])
        trips_otp_response[id] = trip_plan
        counter+=1

In [179]:
#max_ctr = 1
trips_plans = []

for trip in trips_otp_response.keys():
    #if max_ctr == 0:
    #    break
    if 'plan' in trips_otp_response[trip]:
        itinerary_id = 1
        for itinerary in trips_otp_response[trip]['plan']['itineraries']:
            date = trips_otp_response[trip]['plan']['date']/1000
            leg_id = 1
            for leg in itinerary['legs']:
                route = leg['route'] if leg['route'] != '' else None
                fromStopId = leg['from']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                toStopId = leg['to']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                start_time = long(leg['startTime'])/1000
                end_time = long(leg['endTime'])/1000
                duration = (end_time - start_time)/60
                #date_str = start_time.strftime('%Y-%m-%d')
                #start_time_str = start_time.strftime('%H:%M:%S')
                #end_time_str = end_time.strftime('%H:%M:%S')
                trips_plans.append((date,trip,itinerary_id,leg_id,start_time,end_time,leg['mode'],route,fromStopId,toStopId, duration))
                leg_id += 1
            itinerary_id += 1
    #max_ctr -= 1

labels=['date','user_trip_id','itinerary_id','leg_id','start_time','end_time','mode','route','from_stop_id','to_stop_id','otp_duration_mins']
trips_plans_df = sqlContext.createDataFrame(trips_plans, labels) \
                    .withColumn('date',F.from_unixtime(F.col('date'),'yyyy-MM-dd')) \
                    .withColumn('start_time',F.from_unixtime(F.col('start_time'),'yyyy-MM-dd HH:mm:ss').astype('timestamp')) \
                    .withColumn('end_time',F.from_unixtime(F.col('end_time'),'yyyy-MM-dd HH:mm:ss').astype('timestamp')) \
                    .withColumn('otp_duration_mins',(F.unix_timestamp(F.col('end_time')) - F.unix_timestamp(F.col('start_time')))/60) \
                    .withColumn('route', F.col('route').astype('integer')) \
                    .withColumn('from_stop_id', F.col('from_stop_id').astype('integer')) \
                    .withColumn('to_stop_id', F.col('to_stop_id').astype('integer')) \
                    .orderBy(['date','user_trip_id','itinerary_id','start_time'])

In [180]:
trips_plans_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- mode: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- otp_duration_mins: double (nullable = true)



In [181]:
trips_plans_df.count()

42656

In [182]:
printdf(trips_plans_df,l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-05-10,2,1,1,2017-05-10 08:56:13,2017-05-10 08:56:14,WALK,,,,0.016667
1,2017-05-10,2,1,2,2017-05-10 08:56:15,2017-05-10 09:28:03,BUS,50.0,31638.0,31748.0,31.8
2,2017-05-10,2,1,3,2017-05-10 09:28:04,2017-05-10 09:28:11,WALK,,,,0.116667
3,2017-05-10,2,2,1,2017-05-10 09:14:48,2017-05-10 09:14:49,WALK,,,,0.016667
4,2017-05-10,2,2,2,2017-05-10 09:14:50,2017-05-10 09:48:00,BUS,50.0,31638.0,31748.0,33.166667
5,2017-05-10,2,2,3,2017-05-10 09:48:01,2017-05-10 09:48:08,WALK,,,,0.116667
6,2017-05-10,2,3,1,2017-05-10 09:34:38,2017-05-10 09:34:39,WALK,,,,0.016667
7,2017-05-10,2,3,2,2017-05-10 09:34:40,2017-05-10 10:07:00,BUS,50.0,31638.0,31748.0,32.333333
8,2017-05-10,2,3,3,2017-05-10 10:07:01,2017-05-10 10:07:08,WALK,,,,0.116667
9,2017-05-10,9,1,1,2017-05-10 18:04:09,2017-05-10 18:04:19,WALK,,,,0.166667


In [183]:
simple_od_matrix = od_matrix.select(['date','route','o_stop_id','stopPointId','o_boarding_id']) \
                        .withColumnRenamed('o_stop_id','from_stop_id') \
                        .withColumnRenamed('stopPointId','to_stop_id')

In [184]:
simple_od_matrix.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- route: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- o_boarding_id: long (nullable = true)



In [185]:
printdf(simple_od_matrix)

Unnamed: 0,date,route,from_stop_id,to_stop_id,o_boarding_id
0,2017-05-10,860,26163,30633,403726926010
1,2017-05-10,870,26314,28604,635655159957
2,2017-05-10,30,32612,32508,231928234038
3,2017-05-10,860,3280,26163,463856468060
4,2017-05-10,826,33180,30432,549755814128
5,2017-05-10,777,33632,26149,979252543558
6,2017-05-10,30,32600,32577,1219770712102
7,2017-05-10,393,30182,28592,755914244190
8,2017-05-10,778,33533,26146,1606317768917
9,2017-05-10,467,26181,26584,42949673032


In [186]:
labelled_trips_plans = trips_plans_df.join(simple_od_matrix, on=['date','route','from_stop_id','to_stop_id'], how='inner')

In [187]:
printdf(labelled_trips_plans)

Unnamed: 0,date,route,from_stop_id,to_stop_id,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,otp_duration_mins,o_boarding_id
0,2017-05-10,50,31638,31748,2,1,2,2017-05-10 08:56:15,2017-05-10 09:28:03,BUS,31.8,2
1,2017-05-10,50,31638,31748,2,2,2,2017-05-10 09:14:50,2017-05-10 09:48:00,BUS,33.166667,2
2,2017-05-10,50,31638,31748,2,3,2,2017-05-10 09:34:40,2017-05-10 10:07:00,BUS,32.333333,2
3,2017-05-10,860,30221,30219,9,3,2,2017-05-10 18:15:55,2017-05-10 18:17:00,BUS,1.083333,9
4,2017-05-10,777,33626,26149,16,1,2,2017-05-10 07:11:47,2017-05-10 07:45:00,BUS,33.216667,16
5,2017-05-10,777,33626,26149,16,1,2,2017-05-10 07:11:47,2017-05-10 07:45:00,BUS,33.216667,532575944785
6,2017-05-10,777,33626,26149,16,1,2,2017-05-10 07:11:47,2017-05-10 07:45:00,BUS,33.216667,979252543649
7,2017-05-10,777,33626,26149,16,2,2,2017-05-10 07:21:48,2017-05-10 07:56:00,BUS,34.2,16
8,2017-05-10,777,33626,26149,16,2,2,2017-05-10 07:21:48,2017-05-10 07:56:00,BUS,34.2,532575944785
9,2017-05-10,777,33626,26149,16,2,2,2017-05-10 07:21:48,2017-05-10 07:56:00,BUS,34.2,979252543649


### Read Bus Data

In [135]:
bus_trips_data = read_buste_data_v3(sqlContext,base_folder_path + '2017_05_11_veiculos.csv/')

In [136]:
bus_trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: date (nullable = true)



In [137]:
clean_bus_trips_data = bus_trips_data.select(["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .na.drop(subset=["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .dropDuplicates(['route','busCode','tripNum','stopPointId']) \
                    .withColumn('route',F.col('route').astype('float')) \
                    .withColumn('date',F.col('date').astype('string')) \
                    .withColumn('timestamp',F.from_unixtime(F.unix_timestamp(F.concat(F.col('date'),F.lit(' '),F.col('timestamp')), 'yyyy-MM-dd HH:mm:ss')))

In [138]:
bus_trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: date (nullable = true)



In [139]:
printdf(clean_bus_trips_data)

Unnamed: 0,date,route,busCode,tripNum,stopPointId,timestamp
0,2017-05-10,1.0,BN997,13,31454,2017-05-10 11:25:09
1,2017-05-10,1.0,BN997,15,35219,2017-05-10 12:20:11
2,2017-05-10,1.0,BN997,24,29420,2017-05-10 15:48:08
3,2017-05-10,1.0,BN997,27,35216,2017-05-10 16:47:51
4,2017-05-10,1.0,BN997,29,29420,2017-05-10 17:35:33
5,2017-05-10,1.0,BN998,2,29420,2017-05-10 07:41:43
6,2017-05-10,2.0,DN027,3,10899,2017-05-10 08:17:19
7,2017-05-10,2.0,DN027,14,30225,2017-05-10 15:39:21
8,2017-05-10,10.0,BB001,8,33148,2017-05-10 09:49:56
9,2017-05-10,10.0,BB001,14,33172,2017-05-10 16:58:07


### Finding Real User Trip Beginning Time

In [188]:
trips_plans_df_start = trips_plans_df.withColumn('stopPointId', F.col('from_stop_id'))
trip_plans_start = trips_plans_df_start.join(clean_bus_trips_data, ['date','route','stopPointId'], how='inner') \
                        .na.drop(subset=['timestamp']) \
                        .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('start_time')))) \
                        .drop('otp_duration')

In [189]:
printdf(trip_plans_start.select(['date','route','from_stop_id','tripNum','timestamp','start_time','timediff']) \
                .orderBy(['date','route','from_stop_id','timediff']))

Unnamed: 0,date,route,from_stop_id,tripNum,timestamp,start_time,timediff
0,2017-05-10,1,26166,13,2017-05-10 11:30:17,2017-05-10 11:32:01,104
1,2017-05-10,1,26166,12,2017-05-10 11:46:41,2017-05-10 11:43:01,220
2,2017-05-10,1,26166,11,2017-05-10 11:22:34,2017-05-10 11:32:01,567
3,2017-05-10,1,26166,13,2017-05-10 11:30:17,2017-05-10 11:43:01,764
4,2017-05-10,1,26166,14,2017-05-10 11:57:21,2017-05-10 11:43:01,860
5,2017-05-10,1,26166,12,2017-05-10 11:46:41,2017-05-10 11:32:01,880
6,2017-05-10,1,26166,11,2017-05-10 11:22:34,2017-05-10 11:43:01,1227
7,2017-05-10,1,26166,12,2017-05-10 11:07:52,2017-05-10 11:32:01,1449
8,2017-05-10,1,26166,14,2017-05-10 11:57:21,2017-05-10 11:32:01,1520
9,2017-05-10,1,26166,13,2017-05-10 12:10:44,2017-05-10 11:43:01,1663


In [190]:
trip_plans_start.printSchema()

root
 |-- date: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- mode: string (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- otp_duration_mins: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timediff: long (nullable = true)



In [191]:
printdf(trip_plans_start)

Unnamed: 0,date,route,stopPointId,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,from_stop_id,to_stop_id,otp_duration_mins,busCode,tripNum,timestamp,timediff
0,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN997,13,2017-05-10 11:25:09,6058
1,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN998,10,2017-05-10 10:51:07,8100
2,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN997,21,2017-05-10 14:28:42,4955
3,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN997,22,2017-05-10 14:50:47,6280
4,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN998,2,2017-05-10 07:29:54,20173
5,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN997,17,2017-05-10 12:52:24,823
6,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN998,7,2017-05-10 09:34:41,12686
7,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN998,13,2017-05-10 12:05:46,3621
8,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN997,1,2017-05-10 06:22:44,24203
9,2017-05-10,1,31454,188978561190,1,2,2017-05-10 13:06:07,2017-05-10 13:18:48,BUS,31454,29082,12.683333,BN997,2,2017-05-10 06:49:45,22582


In [192]:
w = Window.partitionBy(['date','user_trip_id','itinerary_id','route','from_stop_id']).orderBy(['timediff'])

trip_plans_start = trip_plans_start.withColumn('rn', F.row_number().over(w)) \
                    .where(F.col('rn') == 1)

In [193]:
printdf(trip_plans_start.select('route','from_stop_id','timestamp','start_time'))

Unnamed: 0,route,from_stop_id,timestamp,start_time
0,169,27290,2017-05-10 17:56:51,2017-05-10 17:39:05
1,168,26877,2017-05-10 07:15:06,2017-05-10 07:18:45
2,680,31034,2017-05-10 07:29:53,2017-05-10 07:26:00
3,860,34113,2017-05-10 06:17:47,2017-05-10 06:18:26
4,561,31988,2017-05-10 07:34:26,2017-05-10 07:32:59
5,380,33018,2017-05-10 07:11:08,2017-05-10 07:10:46
6,40,34140,2017-05-10 17:57:00,2017-05-10 17:56:55
7,777,33630,2017-05-10 07:41:22,2017-05-10 07:42:43
8,40,34127,2017-05-10 09:33:38,2017-05-10 09:37:51
9,160,26635,2017-05-10 11:31:17,2017-05-10 11:30:04


In [194]:
trip_plans_start = trip_plans_start \
        .select(['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','start_time','timestamp','to_stop_id','end_time']) \
        .withColumnRenamed('timestamp','from_timestamp')
printdf(trip_plans_start)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,start_time,from_timestamp,to_stop_id,end_time
0,2017-05-10,8589934685,3,2,169,BC002,16,27290,2017-05-10 17:39:05,2017-05-10 17:56:51,26769,2017-05-10 17:45:41
1,2017-05-10,8589934831,3,2,168,BC004,2,26877,2017-05-10 07:18:45,2017-05-10 07:15:06,28429,2017-05-10 07:48:00
2,2017-05-10,8589934845,1,2,680,HR411,3,31034,2017-05-10 07:26:00,2017-05-10 07:29:53,38528,2017-05-10 07:48:47
3,2017-05-10,17179869217,2,2,860,LC010,1,34113,2017-05-10 06:18:26,2017-05-10 06:17:47,29924,2017-05-10 06:48:56
4,2017-05-10,51539607749,2,2,561,EC002,3,31988,2017-05-10 07:32:59,2017-05-10 07:34:26,26152,2017-05-10 07:44:00
5,2017-05-10,60129542244,1,2,380,LC301,3,33018,2017-05-10 07:10:46,2017-05-10 07:11:08,29914,2017-05-10 07:29:09
6,2017-05-10,68719476860,1,2,40,LB603,5,34140,2017-05-10 17:56:55,2017-05-10 17:57:00,34147,2017-05-10 17:58:33
7,2017-05-10,103079215268,3,2,777,LC027,2,33630,2017-05-10 07:42:43,2017-05-10 07:41:22,26149,2017-05-10 08:13:00
8,2017-05-10,111669149950,2,2,40,BB615,2,34127,2017-05-10 09:37:51,2017-05-10 09:33:38,34169,2017-05-10 09:56:58
9,2017-05-10,146028888332,2,2,160,BC022,5,26635,2017-05-10 11:30:04,2017-05-10 11:31:17,26904,2017-05-10 11:32:01


In [195]:
trip_plans_start.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- end_time: timestamp (nullable = true)



### Finding Real User Trip End Time

In [148]:
bus_trips_data2 = read_buste_data_v3(sqlContext,base_folder_path + '2017_05_11_veiculos.csv/')

In [149]:
clean_bus_trips_data2 = bus_trips_data2.select(["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .na.drop(subset=["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .dropDuplicates(['route','busCode','tripNum','stopPointId']) \
                    .withColumn('route',F.col('route').astype('integer')) \
                    .withColumn('date',F.col('date').astype('string')) \
                    .withColumn('timestamp',F.from_unixtime(F.unix_timestamp(F.concat(F.col('date'),F.lit(' '),F.col('timestamp')), 'yyyy-MM-dd HH:mm:ss')))

In [150]:
clean_bus_trips_data2.printSchema()

root
 |-- date: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- timestamp: string (nullable = true)



In [151]:
trip_plans_start.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- end_time: timestamp (nullable = true)



In [196]:
trip_plans_df_end = trip_plans_start.withColumnRenamed('to_stop_id','stopPointId')
trip_plans_start_end = trip_plans_df_end.join(clean_bus_trips_data2, ['date','route','busCode','tripNum','stopPointId'], how='inner') \
                        .na.drop(subset=['timestamp']) \
                        .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('end_time'))))

In [197]:
trip_plans_start_end = trip_plans_start_end.withColumnRenamed('timestamp', 'to_timestamp') \
                .withColumnRenamed('stopPointId','to_stop_id') \
                .orderBy(['date','route','stopPointId','timediff'])
                
printdf(trip_plans_start_end.select(['date','route','to_stop_id','tripNum','to_timestamp','end_time','timediff']))

Unnamed: 0,date,route,to_stop_id,tripNum,to_timestamp,end_time,timediff
0,2017-05-10,1,26360,22,2017-05-10 14:40:18,2017-05-10 15:10:00,1782
1,2017-05-10,1,26375,14,2017-05-10 12:35:06,2017-05-10 12:36:19,73
2,2017-05-10,1,26375,14,2017-05-10 12:35:06,2017-05-10 12:36:19,73
3,2017-05-10,1,26375,11,2017-05-10 10:53:01,2017-05-10 10:54:26,85
4,2017-05-10,1,26375,16,2017-05-10 13:22:57,2017-05-10 13:20:26,151
5,2017-05-10,1,26375,20,2017-05-10 14:51:17,2017-05-10 14:55:12,235
6,2017-05-10,1,26375,28,2017-05-10 17:20:06,2017-05-10 17:15:19,287
7,2017-05-10,1,26375,22,2017-05-10 15:02:00,2017-05-10 15:07:12,312
8,2017-05-10,1,26375,19,2017-05-10 13:49:55,2017-05-10 13:55:12,317
9,2017-05-10,1,26375,29,2017-05-10 17:36:49,2017-05-10 18:04:06,1637


In [198]:
trip_plans_start_end.printSchema()

root
 |-- date: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- to_timestamp: string (nullable = true)
 |-- timediff: long (nullable = true)



In [199]:
trips_actual_time = trip_plans_start_end.select(['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','from_timestamp','to_stop_id','to_timestamp']) \
                        .orderBy(['date','user_trip_id','itinerary_id','leg_id'])

In [200]:
printdf(trips_actual_time)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,from_timestamp,to_stop_id,to_timestamp
0,2017-05-10,2,1,2,50,DR103,2,31638,2017-05-10 08:57:43,31748,2017-05-10 09:25:21
1,2017-05-10,2,2,2,50,LA054,2,31638,2017-05-10 09:13:53,31748,2017-05-10 09:46:32
2,2017-05-10,2,3,2,50,JB605,3,31638,2017-05-10 09:31:30,31748,2017-05-10 10:04:05
3,2017-05-10,9,1,2,370,LC009,8,30221,2017-05-10 18:15:45,30219,2017-05-10 18:26:50
4,2017-05-10,9,3,2,860,LC011,10,30221,2017-05-10 18:24:56,30219,2017-05-10 18:27:05
5,2017-05-10,16,1,2,777,JC003,3,33626,2017-05-10 07:11:58,26149,2017-05-10 07:47:04
6,2017-05-10,16,2,2,777,JC006,2,33626,2017-05-10 07:23:01,26149,2017-05-10 07:54:36
7,2017-05-10,16,3,2,777,LC027,2,33626,2017-05-10 07:40:14,26149,2017-05-10 08:12:45
8,2017-05-10,21,2,2,40,LB603,1,35564,2017-05-10 07:02:02,35565,2017-05-10 07:03:11
9,2017-05-10,21,3,2,40,MB604,3,35564,2017-05-10 07:07:31,35565,2017-05-10 07:08:40


### Joining Walk and Bus legs into a single Dataframe

In [201]:
printdf(trips_plans_df,l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-05-10,2,1,1,2017-05-10 08:56:13,2017-05-10 08:56:14,WALK,,,,0.016667
1,2017-05-10,2,1,2,2017-05-10 08:56:15,2017-05-10 09:28:03,BUS,50.0,31638.0,31748.0,31.8
2,2017-05-10,2,1,3,2017-05-10 09:28:04,2017-05-10 09:28:11,WALK,,,,0.116667
3,2017-05-10,2,2,1,2017-05-10 09:14:48,2017-05-10 09:14:49,WALK,,,,0.016667
4,2017-05-10,2,2,2,2017-05-10 09:14:50,2017-05-10 09:48:00,BUS,50.0,31638.0,31748.0,33.166667
5,2017-05-10,2,2,3,2017-05-10 09:48:01,2017-05-10 09:48:08,WALK,,,,0.116667
6,2017-05-10,2,3,1,2017-05-10 09:34:38,2017-05-10 09:34:39,WALK,,,,0.016667
7,2017-05-10,2,3,2,2017-05-10 09:34:40,2017-05-10 10:07:00,BUS,50.0,31638.0,31748.0,32.333333
8,2017-05-10,2,3,3,2017-05-10 10:07:01,2017-05-10 10:07:08,WALK,,,,0.116667
9,2017-05-10,9,1,1,2017-05-10 18:04:09,2017-05-10 18:04:19,WALK,,,,0.166667


In [202]:
trips_actual_itineraries = trips_plans_df.join(trips_actual_time, on=['date','user_trip_id','itinerary_id','leg_id', 'route', 'from_stop_id','to_stop_id'], how='left_outer')

In [203]:
printdf(trips_actual_itineraries.orderBy(['itinerary_id','leg_id','start_time']), l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,start_time,end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp
0,2017-05-10,1666447311024,1,1,,,,2017-05-10 04:43:01,2017-05-10 04:43:48,WALK,0.783333,,,,
1,2017-05-10,1606317768783,1,1,,,,2017-05-10 04:55:58,2017-05-10 04:56:47,WALK,0.816667,,,,
2,2017-05-10,1571958030485,1,1,,,,2017-05-10 05:00:07,2017-05-10 05:13:48,WALK,13.683333,,,,
3,2017-05-10,498216206418,1,1,,,,2017-05-10 05:01:44,2017-05-10 05:01:58,WALK,0.233333,,,,
4,2017-05-10,1606317768844,1,1,,,,2017-05-10 05:02:19,2017-05-10 05:02:22,WALK,0.05,,,,
5,2017-05-10,25769803795,1,1,,,,2017-05-10 05:03:16,2017-05-10 05:10:07,WALK,6.85,,,,
6,2017-05-10,1030792151170,1,1,,,,2017-05-10 05:05:32,2017-05-10 05:05:46,WALK,0.233333,,,,
7,2017-05-10,541165879454,1,1,,,,2017-05-10 05:06:10,2017-05-10 05:06:12,WALK,0.033333,,,,
8,2017-05-10,1168231104745,1,1,,,,2017-05-10 05:08:40,2017-05-10 05:08:48,WALK,0.133333,,,,
9,2017-05-10,979252543648,1,1,,,,2017-05-10 05:12:01,2017-05-10 05:12:14,WALK,0.216667,,,,


In [204]:
trips_actual_itineraries = trips_actual_itineraries.withColumn('actual_duration_mins', (F.unix_timestamp(F.col('to_timestamp')) - F.unix_timestamp(F.col('from_timestamp')))/60)

In [205]:
printdf(trips_actual_itineraries, l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,start_time,end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp,actual_duration_mins
0,2017-05-10,37,1,3,,,,2017-05-10 06:52:24,2017-05-10 06:52:36,WALK,0.2,,,,,
1,2017-05-10,198,2,3,,,,2017-05-10 10:06:03,2017-05-10 10:06:23,WALK,0.333333,,,,,
2,2017-05-10,8589934599,2,1,,,,2017-05-10 09:42:31,2017-05-10 09:42:36,WALK,0.083333,,,,,
3,2017-05-10,8589934650,2,4,204.0,26252.0,26239.0,2017-05-10 07:03:00,2017-05-10 07:14:13,BUS,11.216667,BL072,1.0,2017-05-10 07:04:26,2017-05-10 07:18:11,13.75
4,2017-05-10,8589934775,2,3,,,,2017-05-10 07:14:01,2017-05-10 07:16:27,WALK,2.433333,,,,,
5,2017-05-10,8589934845,1,2,680.0,31034.0,38528.0,2017-05-10 07:26:00,2017-05-10 07:48:47,BUS,22.783333,,,,,
6,2017-05-10,17179869205,2,2,372.0,29922.0,30193.0,2017-05-10 13:52:59,2017-05-10 14:07:49,BUS,14.833333,BC323,7.0,2017-05-10 13:49:38,2017-05-10 14:05:16,15.633333
7,2017-05-10,25769804036,3,1,,,,2017-05-10 06:55:40,2017-05-10 06:55:42,WALK,0.033333,,,,,
8,2017-05-10,42949672966,1,1,,,,2017-05-10 09:29:29,2017-05-10 09:29:39,WALK,0.166667,,,,,
9,2017-05-10,42949673220,2,2,612.0,32208.0,31800.0,2017-05-10 08:27:17,2017-05-10 08:34:02,BUS,6.75,HA024,10.0,2017-05-10 08:23:36,2017-05-10 08:29:38,6.033333


In [206]:
trips_actual_itineraries = trips_actual_itineraries.withColumn('considered_duration_mins', F.when(F.col('mode') == F.lit('BUS'), F.col('actual_duration_mins')).otherwise(F.col('otp_duration_mins')))

In [207]:
printdf(trips_actual_itineraries, l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,start_time,end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp,actual_duration_mins,considered_duration_mins
0,2017-05-10,37,1,3,,,,2017-05-10 06:52:24,2017-05-10 06:52:36,WALK,0.2,,,,,,0.2
1,2017-05-10,198,2,3,,,,2017-05-10 10:06:03,2017-05-10 10:06:23,WALK,0.333333,,,,,,0.333333
2,2017-05-10,8589934599,2,1,,,,2017-05-10 09:42:31,2017-05-10 09:42:36,WALK,0.083333,,,,,,0.083333
3,2017-05-10,8589934650,2,4,204.0,26252.0,26239.0,2017-05-10 07:03:00,2017-05-10 07:14:13,BUS,11.216667,BL072,1.0,2017-05-10 07:04:26,2017-05-10 07:18:11,13.75,13.75
4,2017-05-10,8589934775,2,3,,,,2017-05-10 07:14:01,2017-05-10 07:16:27,WALK,2.433333,,,,,,2.433333
5,2017-05-10,8589934845,1,2,680.0,31034.0,38528.0,2017-05-10 07:26:00,2017-05-10 07:48:47,BUS,22.783333,,,,,,
6,2017-05-10,17179869205,2,2,372.0,29922.0,30193.0,2017-05-10 13:52:59,2017-05-10 14:07:49,BUS,14.833333,BC323,7.0,2017-05-10 13:49:38,2017-05-10 14:05:16,15.633333,15.633333
7,2017-05-10,25769804036,3,1,,,,2017-05-10 06:55:40,2017-05-10 06:55:42,WALK,0.033333,,,,,,0.033333
8,2017-05-10,42949672966,1,1,,,,2017-05-10 09:29:29,2017-05-10 09:29:39,WALK,0.166667,,,,,,0.166667
9,2017-05-10,42949673220,2,2,612.0,32208.0,31800.0,2017-05-10 08:27:17,2017-05-10 08:34:02,BUS,6.75,HA024,10.0,2017-05-10 08:23:36,2017-05-10 08:29:38,6.033333,6.033333


### Aggregating Itineraries Legs to compute metrics on them

In [208]:
trips_itineraries_duration = trips_actual_itineraries.groupBy(['date', 'user_trip_id', 'itinerary_id']).agg({'considered_duration_mins':'sum'}).orderBy(['date','user_trip_id','itinerary_id'])

In [209]:
printdf(trips_itineraries_duration)

Unnamed: 0,date,user_trip_id,itinerary_id,sum(considered_duration_mins)
0,2017-05-10,2,1,27.766667
1,2017-05-10,2,2,32.783333
2,2017-05-10,2,3,32.716667
3,2017-05-10,9,1,11.3
4,2017-05-10,9,2,3.383333
5,2017-05-10,9,3,2.366667
6,2017-05-10,16,1,36.35
7,2017-05-10,16,2,32.833333
8,2017-05-10,16,3,33.766667
9,2017-05-10,21,1,5.366667


### Appending Real Trips to OTP-Suggested Trips for overall analysis

In [210]:
od_matrix_itineraries = od_matrix \
                            .withColumnRenamed('o_boarding_id','user_trip_id') \
                            .withColumn('itinerary_id',F.lit(0)) \
                            .withColumn('leg_id',F.lit(1)) \
                            .withColumnRenamed('o_stop_id','from_stop_id') \
                            .withColumnRenamed('stopPointId','to_stop_id') \
                            .withColumn('start_time', F.lit(None)) \
                            .withColumn('end_time', F.lit(None)) \
                            .withColumn('mode', F.lit('BUS')) \
                            .withColumn('otp_duration_mins', F.lit(None)) \
                            .withColumn('date', F.from_unixtime(F.unix_timestamp(F.col('date')), 'yyyy-MM-dd')) \
                            .withColumn('from_timestamp', F.concat(F.col('date'),F.lit(' '),F.col('o_timestamp'))) \
                            .withColumn('to_timestamp', F.concat(F.col('date'),F.lit(' '),F.col('timestamp'))) \
                            .withColumn('actual_duration_mins', (F.unix_timestamp('to_timestamp') - F.unix_timestamp('from_timestamp'))/60) \
                            .withColumn('considered_duration_mins', F.col('actual_duration_mins')) \
                            .select(['date','user_trip_id','itinerary_id','leg_id','route','from_stop_id','to_stop_id','start_time','end_time','mode','otp_duration_mins','busCode','tripNum','from_timestamp','to_timestamp','actual_duration_mins','considered_duration_mins'])

In [211]:
printdf(od_matrix_itineraries)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,start_time,end_time,mode,otp_duration_mins,busCode,tripNum,from_timestamp,to_timestamp,actual_duration_mins,considered_duration_mins
0,2017-05-10,403726926010,0,1,860,26163,30633,,,BUS,,LC011,2,2017-05-10 06:09:29,2017-05-10 06:14:13,4.733333,4.733333
1,2017-05-10,635655159957,0,1,870,26314,28604,,,BUS,,BC023,10,2017-05-10 16:30:55,2017-05-10 16:35:43,4.8,4.8
2,2017-05-10,231928234038,0,1,30,32612,32508,,,BUS,,BB608,2,2017-05-10 10:41:15,2017-05-10 12:10:57,89.7,89.7
3,2017-05-10,463856468060,0,1,860,3280,26163,,,BUS,,LC026,8,2017-05-10 16:22:25,2017-05-10 16:42:40,20.25,20.25
4,2017-05-10,549755814128,0,1,826,33180,30432,,,BUS,,JA001,11,2017-05-10 18:17:20,2017-05-10 18:54:05,36.75,36.75
5,2017-05-10,979252543558,0,1,777,33632,26149,,,BUS,,JC004,2,2017-05-10 06:39:08,2017-05-10 07:00:22,21.233333,21.233333
6,2017-05-10,1219770712102,0,1,30,32600,32577,,,BUS,,GR123,3,2017-05-10 12:22:47,2017-05-10 12:38:45,15.966667,15.966667
7,2017-05-10,755914244190,0,1,393,30182,28592,,,BUS,,DN600,5,2017-05-10 14:55:12,2017-05-10 15:05:38,10.433333,10.433333
8,2017-05-10,1606317768917,0,1,778,33533,26146,,,BUS,,JC007,3,2017-05-10 07:56:00,2017-05-10 08:23:16,27.266667,27.266667
9,2017-05-10,42949673032,0,1,467,26181,26584,,,BUS,,DN606,14,2017-05-10 19:30:22,2017-05-10 19:34:08,3.766667,3.766667


In [212]:
overall_itineraries_ranking = trips_actual_itineraries.union(od_matrix_itineraries) \
    .orderBy('date','user_trip_id','itinerary_id','leg_id')

### Aggregating Itineraries Legs to compute metrics on them

In [213]:
overall_trips_itineraries_duration = overall_itineraries_ranking.groupBy(['date', 'user_trip_id', 'itinerary_id']).agg(F.sum('considered_duration_mins').alias('duration')).orderBy(['date','user_trip_id','duration'])

In [214]:
printdf(overall_trips_itineraries_duration)

Unnamed: 0,date,user_trip_id,itinerary_id,duration
0,2017-05-10,2,0,27.433333
1,2017-05-10,2,1,27.766667
2,2017-05-10,2,3,32.716667
3,2017-05-10,2,2,32.783333
4,2017-05-10,9,0,0.9
5,2017-05-10,9,3,2.366667
6,2017-05-10,9,2,3.383333
7,2017-05-10,9,1,11.3
8,2017-05-10,16,0,30.216667
9,2017-05-10,16,2,32.833333


In [215]:
itineraries_window = Window.partitionBy(['date','user_trip_id']).orderBy(['duration'])
user_trips_ranks = overall_trips_itineraries_duration.withColumn('rank', F.row_number().over(itineraries_window))

In [217]:
printdf(user_trips_ranks)

Unnamed: 0,date,user_trip_id,itinerary_id,duration,rank
0,2017-05-10,317827580172,3,3.033333,1
1,2017-05-10,317827580172,2,3.183333,2
2,2017-05-10,317827580172,1,5.916667,3
3,2017-05-10,317827580172,0,41.65,4
4,2017-05-10,377957122336,2,-36.733333,1
5,2017-05-10,377957122336,1,9.05,2
6,2017-05-10,377957122336,3,9.75,3
7,2017-05-10,377957122336,0,30.483333,4
8,2017-05-10,395136991379,1,1.183333,1
9,2017-05-10,395136991379,2,1.183333,2


In [174]:
user_real_trip_rank = user_trips_ranks.filter(user_trips_ranks.itinerary_id == 0)

In [175]:
printdf(user_real_trip_rank.filter(overall_trips_itineraries_duration.user_trip_id == 231928234038))

Unnamed: 0,date,user_trip_id,itinerary_id,duration,rank
0,2017-05-10,231928234038,0,89.7,1


In [176]:
printdf(od_matrix.filter(od_matrix['o_boarding_id'] == 231928234038) \
       .select(['date','route','o_stop_id','o_timestamp','stopPointId','timestamp','o_boarding_id','cardNum']))

Unnamed: 0,date,route,o_stop_id,o_timestamp,stopPointId,timestamp,o_boarding_id,cardNum
0,2017-05-10,30,32612,10:41:15,32508,12:10:57,231928234038,1809130


In [177]:
od_matrix.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- o_route: integer (nullable = true)
 |-- o_bus_code: string (nullable = true)
 |-- o_date: timestamp (nullable = true)
 |-- o_