In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import urllib2

In [3]:
OTP_SERVER_URL = 'http://localhost:5601/otp/'

def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath, header=True,
                                     inferSchema=True,nullValue="-")
    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    data_frame = read_hdfs_folder(sqlContext,folderpath)
    
    date = "-".join(folderpath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.date_sub(F.lit(date),1))
    #data_frame = data_frame.withColumn("date", F.unix_timestamp(F.col("date"),1),'yyyy-MM-dd'))
    
    return data_frame

def printdf(df,l=10):
    return df.limit(l).toPandas()

def get_timestamp_in_tz(unixtime_timestamp,ts_format,tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format),tz)

In [4]:
spark  = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.crossJoin.enabled', 'true')

sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc)

In [5]:
od_matrix = read_hdfs_folder(sqlContext,'/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a/od_matrix/')

In [6]:
od_matrix.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- o_route: integer (nullable = true)
 |-- o_bus_code: string (nullable = true)
 |-- o_date: timestamp (nullable = true)
 |-- o_

In [7]:
printdf(od_matrix)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration,dist,rn
0,860,2,2240,4494816,-25.440235,-49.277592,840.222,LC011,,-25.440471,...,6448950,-25.434527,-49.280126,29874,438086664359,32969,78541,759.533333,0.683846,1
1,870,10,2926,5386395,-25.431688,-49.276556,518.881,BC023,,-25.43163,...,5162628,-25.428355,-49.272707,28429,1262720385319,70255,52607,-1.0,0.535484,1
2,30,2,1715,6293026,-25.410378,-49.204718,25103.643,BB608,,-25.410268,...,5846746,-25.410157,-49.204149,32508,206158430263,49275,71111,363.933333,0.062245,1
3,860,8,2241,4494781,-25.436792,-49.274447,12604.144,LC026,,-25.436816,...,5255201,-25.428123,-49.271896,28556,1666447310868,69745,45888,-1.0,0.997384,1
4,826,11,2230,6062927,-25.511831,-49.324408,12069.86,JA001,,-25.511798,...,6416861,-25.510229,-49.32615,35840,1563368095816,76640,36968,-1.0,0.249584,1
5,777,2,2195,4299860,-25.435179,-49.273374,0.0,JC004,,-25.435188,...,6222433,-25.428604,-49.270626,26358,798863917103,34748,47558,213.5,0.781424,1
6,30,3,1715,6292678,-25.489034,-49.226245,11197.387,GR123,,-25.489063,...,6358078,-25.490593,-49.222306,30062,1022202216489,55367,80712,422.416667,0.431719,1
7,393,5,1907,6350328,-25.434318,-49.274526,11645.587,DN600,,-25.434348,...,6360100,-25.43537,-49.271665,26180,481036337331,64512,77787,221.25,0.310218,1
8,778,3,2196,5603972,-25.435144,-49.273284,12300.302,JC007,,-25.43519,...,6594723,-25.430341,-49.267154,26376,1589137899766,39360,35167,-1.0,0.815047,1
9,467,14,2818,5136062,-25.444211,-49.267533,1036.662,DN606,,,...,6405216,-25.444187,-49.267545,26584,627065225481,81022,40917,-1.0,0.002895,1


In [11]:
printdf(od_matrix.select(['date','route','o_stop_id','o_timestamp','stopPointId','timestamp','o_boarding_id']))

Unnamed: 0,date,route,o_stop_id,o_timestamp,stopPointId,timestamp,o_boarding_id
0,2017-05-10,860,26163,06:09:29,30633,06:14:13,403726926010
1,2017-05-10,870,26314,16:30:55,28604,16:35:43,635655159957
2,2017-05-10,30,32612,10:41:15,32508,12:10:57,231928234038
3,2017-05-10,860,3280,16:22:25,26163,16:42:40,463856468060
4,2017-05-10,826,33180,18:17:20,30432,18:54:05,549755814128
5,2017-05-10,777,33632,06:39:08,26149,07:00:22,979252543558
6,2017-05-10,30,32600,12:22:47,32577,12:38:45,1219770712102
7,2017-05-10,393,30182,14:55:12,28592,15:05:38,755914244190
8,2017-05-10,778,33533,07:56:00,26146,08:23:16,1606317768917
9,2017-05-10,467,26181,19:30:22,26584,19:34:08,42949673032


In [12]:
od_matrix = od_matrix.withColumn('date_in_ms', F.unix_timestamp(F.col('date'),'yyyy-MM-dd')) \
                        .withColumn('o_time_in_ms', F.unix_timestamp(F.col('o_timestamp'),'HH:mm:ss')) \
                        .withColumn('o_datetime_in_ms', F.col('date_in_ms') + F.col('o_time_in_ms')) \
                        .withColumn('o_datetime',get_timestamp_in_tz(F.col('o_datetime_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3'))
                        

In [13]:
printdf(od_matrix.select(['date_in_ms','o_time_in_ms','o_datetime_in_ms','date','o_timestamp','o_datetime']))

Unnamed: 0,date_in_ms,o_time_in_ms,o_datetime_in_ms,date,o_timestamp,o_datetime
0,1494385200,32969,1494418169,2017-05-10,06:09:29,2017-05-10 06:09:29
1,1494385200,70255,1494455455,2017-05-10,16:30:55,2017-05-10 16:30:55
2,1494385200,49275,1494434475,2017-05-10,10:41:15,2017-05-10 10:41:15
3,1494385200,69745,1494454945,2017-05-10,16:22:25,2017-05-10 16:22:25
4,1494385200,76640,1494461840,2017-05-10,18:17:20,2017-05-10 18:17:20
5,1494385200,34748,1494419948,2017-05-10,06:39:08,2017-05-10 06:39:08
6,1494385200,55367,1494440567,2017-05-10,12:22:47,2017-05-10 12:22:47
7,1494385200,64512,1494449712,2017-05-10,14:55:12,2017-05-10 14:55:12
8,1494385200,39360,1494424560,2017-05-10,07:56:00,2017-05-10 07:56:00
9,1494385200,81022,1494466222,2017-05-10,19:30:22,2017-05-10 19:30:22


In [14]:
od_matrix = od_matrix.withColumn('o_base_timestamp_in_ms', F.col('o_datetime_in_ms') - 60 * 2) \
                        .withColumn('o_base_timestamp',get_timestamp_in_tz(F.col('o_base_timestamp_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3')) \
                        .withColumn('o_base_time',F.split(F.col('o_base_timestamp'),'\s+')[1]) \
                        .withColumn('o_base_date',F.split(F.col('o_base_timestamp'),'\s+')[0]) \


In [15]:
printdf(od_matrix.select('o_datetime_in_ms','o_base_timestamp_in_ms','o_datetime','o_base_timestamp','o_base_date','o_base_time'))

Unnamed: 0,o_datetime_in_ms,o_base_timestamp_in_ms,o_datetime,o_base_timestamp,o_base_date,o_base_time
0,1494418169,1494418049,2017-05-10 06:09:29,2017-05-10 06:07:29,2017-05-10,06:07:29
1,1494455455,1494455335,2017-05-10 16:30:55,2017-05-10 16:28:55,2017-05-10,16:28:55
2,1494434475,1494434355,2017-05-10 10:41:15,2017-05-10 10:39:15,2017-05-10,10:39:15
3,1494454945,1494454825,2017-05-10 16:22:25,2017-05-10 16:20:25,2017-05-10,16:20:25
4,1494461840,1494461720,2017-05-10 18:17:20,2017-05-10 18:15:20,2017-05-10,18:15:20
5,1494419948,1494419828,2017-05-10 06:39:08,2017-05-10 06:37:08,2017-05-10,06:37:08
6,1494440567,1494440447,2017-05-10 12:22:47,2017-05-10 12:20:47,2017-05-10,12:20:47
7,1494449712,1494449592,2017-05-10 14:55:12,2017-05-10 14:53:12,2017-05-10,14:53:12
8,1494424560,1494424440,2017-05-10 07:56:00,2017-05-10 07:54:00,2017-05-10,07:54:00
9,1494466222,1494466102,2017-05-10 19:30:22,2017-05-10 19:28:22,2017-05-10,19:28:22


In [16]:
def get_otp_itineraries(o_lat,o_lon,d_lat,d_lon,date,time):
    otp_http_request = 'routers/ctba/plan?fromPlace={},{}&toPlace={},{}&mode=TRANSIT,WALK&date={}&time={}'
    otp_request_url = OTP_SERVER_URL + otp_http_request.format(o_lat,o_lon,d_lat,d_lon,date,time)
    print otp_request_url
    return json.loads(urllib2.urlopen(otp_request_url).read())

In [17]:
test_itinerary = get_otp_itineraries(-25.413083,-49.229020,-25.428322,-49.266739,'05-10-2017','7:19:00')

print test_itinerary

http://localhost:5601/otp/routers/ctba/plan?fromPlace=-25.413083,-49.22902&toPlace=-25.428322,-49.266739&mode=TRANSIT,WALK&date=05-10-2017&time=7:19:00
{u'elevationMetadata': {u'geoidElevation': False, u'ellipsoidToGeoidDifference': 3.686111287840708}, u'plan': {u'date': 1494411540000, u'to': {u'lat': -25.428322, u'vertexType': u'NORMAL', u'lon': -49.266739, u'name': u'Destination', u'orig': u''}, u'itineraries': [{u'walkTime': 2, u'legs': [{u'distance': 0.151, u'from': {u'vertexType': u'NORMAL', u'name': u'Origin', u'lon': -49.22902, u'departure': 1494411566000, u'lat': -25.413083, u'orig': u''}, u'interlineWithPreviousLeg': False, u'transitLeg': False, u'realTime': False, u'route': u'', u'departureDelay': 0, u'agencyTimeZoneOffset': -10800000, u'to': {u'arrival': 1494411567000, u'vertexType': u'TRANSIT', u'name': u'Rua Fagundes Varela, 1295 - Jardim Social', u'stopSequence': 12, u'lon': -49.229029895014, u'departure': 1494411568000, u'stopId': u'1:31957', u'stopCode': u'130279', u'la

### New Approach

In [77]:
trips_otp_response = {}
counter = 0
for row in od_matrix.filter(od_matrix['o_boarding_id'] == 231928234038).collect():
        id=long(row['o_boarding_id'])
        trip_plan = get_otp_itineraries(row['o_shape_lat'], row['o_shape_lon'], row['shapeLat'], row['shapeLon'],row['o_base_date'],row['o_base_time'])
        trips_otp_response[id] = trip_plan
        counter+=1

http://localhost:5601/otp/routers/ctba/plan?fromPlace=-25.5142200773,-49.271038192&toPlace=-25.4103783466,-49.2047176841&mode=TRANSIT,WALK&date=2017-05-10&time=10:39:15


In [80]:
max_ctr = 1
trips_plans = []

for trip in trips_otp_response.keys():
    if max_ctr == 0:
        break
    if 'plan' in trips_otp_response[trip]:
        itinerary_id = 1
        for itinerary in trips_otp_response[trip]['plan']['itineraries']:
            date = trips_otp_response[trip]['plan']['date']/1000
            leg_id = 1
            for leg in itinerary['legs']:
                route = leg['route'] if leg['route'] != '' else None
                fromStopId = leg['from']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                toStopId = leg['to']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                start_time = long(leg['startTime'])/1000
                end_time = long(leg['endTime'])/1000
                duration = (end_time - start_time)/60
                #date_str = start_time.strftime('%Y-%m-%d')
                #start_time_str = start_time.strftime('%H:%M:%S')
                #end_time_str = end_time.strftime('%H:%M:%S')
                trips_plans.append((date,trip,itinerary_id,leg_id,start_time,end_time,leg['mode'],route,fromStopId,toStopId, duration))
                leg_id += 1
            itinerary_id += 1
    max_ctr -= 1

labels=['date','user_trip_id','itinerary_id','leg_id','start_time','end_time','mode','route','from_stop_id','to_stop_id','otp_duration_mins']
trips_plans_df = sqlContext.createDataFrame(trips_plans, labels) \
                    .withColumn('date',F.from_unixtime(F.col('date'),'yyyy-MM-dd')) \
                    .withColumn('start_time',F.from_unixtime(F.col('start_time'),'yyyy-MM-dd HH:mm:ss').astype('timestamp')) \
                    .withColumn('end_time',F.from_unixtime(F.col('end_time'),'yyyy-MM-dd HH:mm:ss').astype('timestamp')) \
                    .withColumn('otp_duration_mins',(F.unix_timestamp(F.col('end_time')) - F.unix_timestamp(F.col('start_time')))/60) \
                    .withColumn('route', F.col('route').astype('integer')) \
                    .withColumn('from_stop_id', F.col('from_stop_id').astype('integer')) \
                    .withColumn('to_stop_id', F.col('to_stop_id').astype('integer')) \
                    .orderBy(['date','user_trip_id','itinerary_id','start_time'])

In [81]:
trips_plans_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- mode: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- otp_duration_mins: double (nullable = true)



In [82]:
printdf(trips_plans_df,l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-05-10,231928234038,1,1,2017-05-10 10:40:42,2017-05-10 10:41:13,WALK,,,,0.516667
1,2017-05-10,231928234038,1,2,2017-05-10 10:41:14,2017-05-10 10:55:00,BUS,30.0,32612.0,27636.0,13.766667
2,2017-05-10,231928234038,1,3,2017-05-10 10:55:00,2017-05-10 10:55:33,WALK,,,,0.55
3,2017-05-10,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,607.0,26247.0,25894.0,25.5
4,2017-05-10,231928234038,1,5,2017-05-10 11:22:00,2017-05-10 11:22:45,WALK,,,,0.75
5,2017-05-10,231928234038,1,6,2017-05-10 11:23:00,2017-05-10 11:53:00,BUS,307.0,25896.0,26208.0,30.0
6,2017-05-10,231928234038,1,7,2017-05-10 11:53:01,2017-05-10 11:59:16,WALK,,,,6.25
7,2017-05-10,231928234038,2,1,2017-05-10 11:04:48,2017-05-10 11:10:00,WALK,,,,5.2
8,2017-05-10,231928234038,2,2,2017-05-10 11:10:01,2017-05-10 12:10:25,BUS,30.0,32610.0,32508.0,60.4
9,2017-05-10,231928234038,2,3,2017-05-10 12:10:26,2017-05-10 12:11:00,WALK,,,,0.566667


In [83]:
simple_od_matrix = od_matrix.select(['date','route','o_stop_id','stopPointId','o_boarding_id']) \
                        .withColumnRenamed('o_stop_id','from_stop_id') \
                        .withColumnRenamed('stopPointId','to_stop_id')

In [84]:
simple_od_matrix.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- route: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- o_boarding_id: long (nullable = true)



In [85]:
printdf(simple_od_matrix)

Unnamed: 0,date,route,from_stop_id,to_stop_id,o_boarding_id
0,2017-05-10,860,26163,30633,403726926010
1,2017-05-10,870,26314,28604,635655159957
2,2017-05-10,30,32612,32508,231928234038
3,2017-05-10,860,3280,26163,463856468060
4,2017-05-10,826,33180,30432,549755814128
5,2017-05-10,777,33632,26149,979252543558
6,2017-05-10,30,32600,32577,1219770712102
7,2017-05-10,393,30182,28592,755914244190
8,2017-05-10,778,33533,26146,1606317768917
9,2017-05-10,467,26181,26584,42949673032


In [86]:
labelled_trips_plans = trips_plans_df.join(simple_od_matrix, on=['date','route','from_stop_id','to_stop_id'], how='inner')

In [87]:
printdf(labelled_trips_plans)

Unnamed: 0,date,route,from_stop_id,to_stop_id,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,otp_duration_mins,o_boarding_id


### Read Bus Data

In [88]:
bus_trips_data = read_buste_data_v3(sqlContext,'/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a/bulma-output/2017_05_11_veiculos.csv/')

In [89]:
bus_trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: date (nullable = true)



In [95]:
clean_bus_trips_data = bus_trips_data.select(["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .na.drop(subset=["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .dropDuplicates(['route','busCode','tripNum','stopPointId']) \
                    .withColumn('route',F.col('route').astype('float')) \
                    .withColumn('date',F.col('date').astype('string')) \
                    .withColumn('timestamp',F.from_unixtime(F.unix_timestamp(F.concat(F.col('date'),F.lit(' '),F.col('timestamp')), 'yyyy-MM-dd HH:mm:ss')))

In [97]:
bus_trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: date (nullable = true)



In [96]:
printdf(clean_bus_trips_data)

Unnamed: 0,date,route,busCode,tripNum,stopPointId,timestamp
0,2017-05-10,1.0,BN997,13,31454,2017-05-10 11:25:09
1,2017-05-10,1.0,BN997,15,35219,2017-05-10 12:20:11
2,2017-05-10,1.0,BN997,24,29420,2017-05-10 15:48:08
3,2017-05-10,1.0,BN997,27,35216,2017-05-10 16:47:51
4,2017-05-10,1.0,BN997,29,29420,2017-05-10 17:35:33
5,2017-05-10,1.0,BN998,2,29420,2017-05-10 07:41:43
6,2017-05-10,2.0,DN027,3,10899,2017-05-10 08:17:19
7,2017-05-10,2.0,DN027,14,30225,2017-05-10 15:39:21
8,2017-05-10,10.0,BB001,8,33148,2017-05-10 09:49:56
9,2017-05-10,10.0,BB001,14,33172,2017-05-10 16:58:07


### Finding Real User Trip Beginning Time

In [139]:
trips_plans_df_start = trips_plans_df.withColumn('stopPointId', F.col('from_stop_id'))
trip_plans_start = trips_plans_df_start.join(clean_bus_trips_data, ['date','route','stopPointId'], how='inner') \
                        .na.drop(subset=['timestamp']) \
                        .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('start_time')))) \
                        .drop('otp_duration')

In [140]:
printdf(trip_plans_start.select(['date','route','from_stop_id','tripNum','timestamp','start_time','timediff']) \
                .orderBy(['date','route','from_stop_id','timediff']))

Unnamed: 0,date,route,from_stop_id,tripNum,timestamp,start_time,timediff
0,2017-05-10,30,32610,2,2017-05-10 11:09:02,2017-05-10 11:10:01,59
1,2017-05-10,30,32610,3,2017-05-10 11:32:13,2017-05-10 11:34:52,159
2,2017-05-10,30,32610,3,2017-05-10 11:53:23,2017-05-10 11:34:52,1111
3,2017-05-10,30,32610,3,2017-05-10 11:32:13,2017-05-10 11:10:01,1332
4,2017-05-10,30,32610,2,2017-05-10 10:44:46,2017-05-10 11:10:01,1515
5,2017-05-10,30,32610,2,2017-05-10 11:09:02,2017-05-10 11:34:52,1550
6,2017-05-10,30,32610,2,2017-05-10 10:29:16,2017-05-10 11:10:01,2445
7,2017-05-10,30,32610,3,2017-05-10 12:16:57,2017-05-10 11:34:52,2525
8,2017-05-10,30,32610,3,2017-05-10 11:53:23,2017-05-10 11:10:01,2602
9,2017-05-10,30,32610,2,2017-05-10 10:23:43,2017-05-10 11:10:01,2778


In [141]:
trip_plans_start.printSchema()

root
 |-- date: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- mode: string (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- otp_duration_mins: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timediff: long (nullable = true)



In [142]:
printdf(trip_plans_start)

Unnamed: 0,date,route,stopPointId,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,from_stop_id,to_stop_id,otp_duration_mins,busCode,tripNum,timestamp,timediff
0,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL322,7,2017-05-10 18:09:25,25975
1,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL321,6,2017-05-10 19:48:44,31934
2,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL309,6,2017-05-10 19:17:14,30044
3,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL303,2,2017-05-10 09:29:13,5237
4,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL324,3,2017-05-10 11:30:47,2057
5,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL326,5,2017-05-10 11:42:37,2767
6,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL322,1,2017-05-10 23:08:53,43943
7,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL306,1,2017-05-10 05:58:53,17857
8,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL314,2,2017-05-10 08:04:32,10318
9,2017-05-10,607,26247,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,26247,25894,25.5,HL302,2,2017-05-10 17:09:34,22384


In [143]:
w = Window.partitionBy(['date','user_trip_id','itinerary_id','route','from_stop_id']).orderBy(['timediff'])

trip_plans_start = trip_plans_start.withColumn('rn', F.row_number().over(w)) \
                    .where(F.col('rn') == 1)

In [144]:
printdf(trip_plans_start.select('route','from_stop_id','timestamp','start_time'))

Unnamed: 0,route,from_stop_id,timestamp,start_time
0,30,32610,2017-05-10 11:09:02,2017-05-10 11:10:01
1,307,25896,2017-05-10 11:21:54,2017-05-10 11:23:00
2,30,32610,2017-05-10 11:32:13,2017-05-10 11:34:52
3,607,26247,2017-05-10 10:51:41,2017-05-10 10:56:30
4,30,32612,2017-05-10 10:40:14,2017-05-10 10:41:14


In [145]:
trip_plans_start = trip_plans_start \
        .select(['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','start_time','timestamp','to_stop_id','end_time']) \
        .withColumnRenamed('timestamp','from_timestamp')
printdf(trip_plans_start)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,start_time,from_timestamp,to_stop_id,end_time
0,2017-05-10,231928234038,2,2,30,BB608,2,32610,2017-05-10 11:10:01,2017-05-10 11:09:02,32508,2017-05-10 12:10:25
1,2017-05-10,231928234038,1,6,307,CL306,4,25896,2017-05-10 11:23:00,2017-05-10 11:21:54,26208,2017-05-10 11:53:00
2,2017-05-10,231928234038,3,2,30,GR121,3,32610,2017-05-10 11:34:52,2017-05-10 11:32:13,32508,2017-05-10 12:33:57
3,2017-05-10,231928234038,1,4,607,HL306,3,26247,2017-05-10 10:56:30,2017-05-10 10:51:41,25894,2017-05-10 11:22:00
4,2017-05-10,231928234038,1,2,30,BB608,2,32612,2017-05-10 10:41:14,2017-05-10 10:40:14,27636,2017-05-10 10:55:00


In [146]:
trip_plans_start.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- end_time: timestamp (nullable = true)



### Finding Real User Trip End Time

In [147]:
bus_trips_data2 = read_buste_data_v3(sqlContext,'/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a/bulma-output/2017_05_11_veiculos.csv/')

In [148]:
clean_bus_trips_data2 = bus_trips_data2.select(["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .na.drop(subset=["date","route","busCode","tripNum","stopPointId","timestamp"]) \
                    .dropDuplicates(['route','busCode','tripNum','stopPointId']) \
                    .withColumn('route',F.col('route').astype('integer')) \
                    .withColumn('date',F.col('date').astype('string')) \
                    .withColumn('timestamp',F.from_unixtime(F.unix_timestamp(F.concat(F.col('date'),F.lit(' '),F.col('timestamp')), 'yyyy-MM-dd HH:mm:ss')))

In [149]:
clean_bus_trips_data2.printSchema()

root
 |-- date: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- timestamp: string (nullable = true)



In [150]:
trip_plans_start.printSchema()

root
 |-- date: string (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- end_time: timestamp (nullable = true)



In [151]:
trip_plans_df_end = trip_plans_start.withColumnRenamed('to_stop_id','stopPointId')
trip_plans_start_end = trip_plans_df_end.join(clean_bus_trips_data2, ['date','route','busCode','tripNum','stopPointId'], how='inner') \
                        .na.drop(subset=['timestamp']) \
                        .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('end_time'))))

In [152]:
trip_plans_start_end = trip_plans_start_end.withColumnRenamed('timestamp', 'to_timestamp') \
                .withColumnRenamed('stopPointId','to_stop_id') \
                .orderBy(['date','route','stopPointId','timediff'])
                
printdf(trip_plans_start_end.select(['date','route','to_stop_id','tripNum','to_timestamp','end_time','timediff']))

Unnamed: 0,date,route,to_stop_id,tripNum,to_timestamp,end_time,timediff
0,2017-05-10,30,27636,2,2017-05-10 10:51:21,2017-05-10 10:55:00,219
1,2017-05-10,30,32508,2,2017-05-10 12:10:57,2017-05-10 12:10:25,32
2,2017-05-10,30,32508,3,2017-05-10 12:35:28,2017-05-10 12:33:57,91
3,2017-05-10,307,26208,4,2017-05-10 11:47:21,2017-05-10 11:53:00,339
4,2017-05-10,607,25894,3,2017-05-10 11:17:08,2017-05-10 11:22:00,292


In [153]:
trip_plans_start_end.printSchema()

root
 |-- date: string (nullable = true)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- from_timestamp: string (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- to_timestamp: string (nullable = true)
 |-- timediff: long (nullable = true)



In [154]:
trips_actual_time = trip_plans_start_end.select(['date','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','busCode','from_stop_id','from_timestamp','to_stop_id','to_timestamp']) \
                        .orderBy(['date','user_trip_id','itinerary_id','leg_id'])

In [155]:
printdf(trips_actual_time)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,busCode.1,from_stop_id,from_timestamp,to_stop_id,to_timestamp
0,2017-05-10,231928234038,1,2,30,BB608,2,BB608,32612,2017-05-10 10:40:14,27636,2017-05-10 10:51:21
1,2017-05-10,231928234038,1,4,607,HL306,3,HL306,26247,2017-05-10 10:51:41,25894,2017-05-10 11:17:08
2,2017-05-10,231928234038,1,6,307,CL306,4,CL306,25896,2017-05-10 11:21:54,26208,2017-05-10 11:47:21
3,2017-05-10,231928234038,2,2,30,BB608,2,BB608,32610,2017-05-10 11:09:02,32508,2017-05-10 12:10:57
4,2017-05-10,231928234038,3,2,30,GR121,3,GR121,32610,2017-05-10 11:32:13,32508,2017-05-10 12:35:28


### Joining Walk and Bus legs into a single Dataframe

In [156]:
printdf(trips_plans_df,l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-05-10,231928234038,1,1,2017-05-10 10:40:42,2017-05-10 10:41:13,WALK,,,,0.516667
1,2017-05-10,231928234038,1,2,2017-05-10 10:41:14,2017-05-10 10:55:00,BUS,30.0,32612.0,27636.0,13.766667
2,2017-05-10,231928234038,1,3,2017-05-10 10:55:00,2017-05-10 10:55:33,WALK,,,,0.55
3,2017-05-10,231928234038,1,4,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,607.0,26247.0,25894.0,25.5
4,2017-05-10,231928234038,1,5,2017-05-10 11:22:00,2017-05-10 11:22:45,WALK,,,,0.75
5,2017-05-10,231928234038,1,6,2017-05-10 11:23:00,2017-05-10 11:53:00,BUS,307.0,25896.0,26208.0,30.0
6,2017-05-10,231928234038,1,7,2017-05-10 11:53:01,2017-05-10 11:59:16,WALK,,,,6.25
7,2017-05-10,231928234038,2,1,2017-05-10 11:04:48,2017-05-10 11:10:00,WALK,,,,5.2
8,2017-05-10,231928234038,2,2,2017-05-10 11:10:01,2017-05-10 12:10:25,BUS,30.0,32610.0,32508.0,60.4
9,2017-05-10,231928234038,2,3,2017-05-10 12:10:26,2017-05-10 12:11:00,WALK,,,,0.566667


In [157]:
trips_actual_itineraries = trips_plans_df.join(trips_actual_time, on=['date','user_trip_id','itinerary_id','leg_id', 'route', 'from_stop_id','to_stop_id'], how='left_outer')

In [158]:
printdf(trips_actual_itineraries.orderBy(['itinerary_id','leg_id','start_time']), l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,start_time,end_time,mode,otp_duration_mins,busCode,tripNum,busCode.1,from_timestamp,to_timestamp
0,2017-05-10,231928234038,1,1,,,,2017-05-10 10:40:42,2017-05-10 10:41:13,WALK,0.516667,,,,,
1,2017-05-10,231928234038,1,2,30.0,32612.0,27636.0,2017-05-10 10:41:14,2017-05-10 10:55:00,BUS,13.766667,BB608,2.0,BB608,2017-05-10 10:40:14,2017-05-10 10:51:21
2,2017-05-10,231928234038,1,3,,,,2017-05-10 10:55:00,2017-05-10 10:55:33,WALK,0.55,,,,,
3,2017-05-10,231928234038,1,4,607.0,26247.0,25894.0,2017-05-10 10:56:30,2017-05-10 11:22:00,BUS,25.5,HL306,3.0,HL306,2017-05-10 10:51:41,2017-05-10 11:17:08
4,2017-05-10,231928234038,1,5,,,,2017-05-10 11:22:00,2017-05-10 11:22:45,WALK,0.75,,,,,
5,2017-05-10,231928234038,1,6,307.0,25896.0,26208.0,2017-05-10 11:23:00,2017-05-10 11:53:00,BUS,30.0,CL306,4.0,CL306,2017-05-10 11:21:54,2017-05-10 11:47:21
6,2017-05-10,231928234038,1,7,,,,2017-05-10 11:53:01,2017-05-10 11:59:16,WALK,6.25,,,,,
7,2017-05-10,231928234038,2,1,,,,2017-05-10 11:04:48,2017-05-10 11:10:00,WALK,5.2,,,,,
8,2017-05-10,231928234038,2,2,30.0,32610.0,32508.0,2017-05-10 11:10:01,2017-05-10 12:10:25,BUS,60.4,BB608,2.0,BB608,2017-05-10 11:09:02,2017-05-10 12:10:57
9,2017-05-10,231928234038,2,3,,,,2017-05-10 12:10:26,2017-05-10 12:11:00,WALK,0.566667,,,,,


In [162]:
trips_actual_itineraries = trips_actual_itineraries.withColumn('actual_duration_mins', (F.unix_timestamp(F.col('to_timestamp')) - F.unix_timestamp(F.col('from_timestamp')))/60)

In [163]:
printdf(trips_actual_itineraries, l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,start_time,end_time,mode,otp_duration_mins,busCode,tripNum,busCode.1,from_timestamp,to_timestamp,actual_duration,actual_duration_mins
0,2017-05-10,231928234038,1,1,,,,2017-05-10 10:40:42,2017-05-10 10:41:13,WALK,0.516667,,,,,,,
1,2017-05-10,231928234038,1,7,,,,2017-05-10 11:53:01,2017-05-10 11:59:16,WALK,6.25,,,,,,,
2,2017-05-10,231928234038,1,5,,,,2017-05-10 11:22:00,2017-05-10 11:22:45,WALK,0.75,,,,,,,
3,2017-05-10,231928234038,3,2,30.0,32610.0,32508.0,2017-05-10 11:34:52,2017-05-10 12:33:57,BUS,59.083333,GR121,3.0,GR121,2017-05-10 11:32:13,2017-05-10 12:35:28,63.25,63.25
4,2017-05-10,231928234038,1,6,307.0,25896.0,26208.0,2017-05-10 11:23:00,2017-05-10 11:53:00,BUS,30.0,CL306,4.0,CL306,2017-05-10 11:21:54,2017-05-10 11:47:21,25.45,25.45
5,2017-05-10,231928234038,3,3,,,,2017-05-10 12:33:58,2017-05-10 12:34:32,WALK,0.566667,,,,,,,
6,2017-05-10,231928234038,1,2,30.0,32612.0,27636.0,2017-05-10 10:41:14,2017-05-10 10:55:00,BUS,13.766667,BB608,2.0,BB608,2017-05-10 10:40:14,2017-05-10 10:51:21,11.116667,11.116667
7,2017-05-10,231928234038,2,3,,,,2017-05-10 12:10:26,2017-05-10 12:11:00,WALK,0.566667,,,,,,,
8,2017-05-10,231928234038,3,1,,,,2017-05-10 11:29:39,2017-05-10 11:34:51,WALK,5.2,,,,,,,
9,2017-05-10,231928234038,1,3,,,,2017-05-10 10:55:00,2017-05-10 10:55:33,WALK,0.55,,,,,,,


In [164]:
trips_actual_itineraries = trips_actual_itineraries.withColumn('considered_duration_mins', F.when(F.col('mode') == F.lit('BUS'), F.col('actual_duration_mins')).otherwise(F.col('otp_duration_mins')))

In [165]:
printdf(trips_actual_itineraries, l=20)

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,route,from_stop_id,to_stop_id,start_time,end_time,mode,otp_duration_mins,busCode,tripNum,busCode.1,from_timestamp,to_timestamp,actual_duration,actual_duration_mins,considered_duration_mins
0,2017-05-10,231928234038,1,1,,,,2017-05-10 10:40:42,2017-05-10 10:41:13,WALK,0.516667,,,,,,,,0.516667
1,2017-05-10,231928234038,1,7,,,,2017-05-10 11:53:01,2017-05-10 11:59:16,WALK,6.25,,,,,,,,6.25
2,2017-05-10,231928234038,1,5,,,,2017-05-10 11:22:00,2017-05-10 11:22:45,WALK,0.75,,,,,,,,0.75
3,2017-05-10,231928234038,3,2,30.0,32610.0,32508.0,2017-05-10 11:34:52,2017-05-10 12:33:57,BUS,59.083333,GR121,3.0,GR121,2017-05-10 11:32:13,2017-05-10 12:35:28,63.25,63.25,63.25
4,2017-05-10,231928234038,1,6,307.0,25896.0,26208.0,2017-05-10 11:23:00,2017-05-10 11:53:00,BUS,30.0,CL306,4.0,CL306,2017-05-10 11:21:54,2017-05-10 11:47:21,25.45,25.45,25.45
5,2017-05-10,231928234038,3,3,,,,2017-05-10 12:33:58,2017-05-10 12:34:32,WALK,0.566667,,,,,,,,0.566667
6,2017-05-10,231928234038,1,2,30.0,32612.0,27636.0,2017-05-10 10:41:14,2017-05-10 10:55:00,BUS,13.766667,BB608,2.0,BB608,2017-05-10 10:40:14,2017-05-10 10:51:21,11.116667,11.116667,11.116667
7,2017-05-10,231928234038,2,3,,,,2017-05-10 12:10:26,2017-05-10 12:11:00,WALK,0.566667,,,,,,,,0.566667
8,2017-05-10,231928234038,3,1,,,,2017-05-10 11:29:39,2017-05-10 11:34:51,WALK,5.2,,,,,,,,5.2
9,2017-05-10,231928234038,1,3,,,,2017-05-10 10:55:00,2017-05-10 10:55:33,WALK,0.55,,,,,,,,0.55


In [167]:
trips_itineraries_duration = trips_actual_itineraries.groupBy(['date', 'user_trip_id', 'itinerary_id']).agg({'considered_duration_mins':'sum'}).orderBy(['date','user_trip_id','itinerary_id'])

In [168]:
printdf(trips_itineraries_duration)

Unnamed: 0,date,user_trip_id,itinerary_id,sum(considered_duration_mins)
0,2017-05-10,231928234038,1,70.083333
1,2017-05-10,231928234038,2,67.683333
2,2017-05-10,231928234038,3,69.016667


In [None]:
w = Window.partitionBy(['date','trip_id','route','to_stop_id']).orderBy(['timediff'])

real_trip_end = real_trip_end.withColumn('rn', F.row_number().over(w)) \
                    .where(F.col('rn') == 1)

In [None]:
printdf(real_trip_end.select('route','to_stop_id','timestamp','start_time') \
           .withColumn('start_time',F.from_unixtime(F.col('start_time'))))

In [None]:
trips_plans_df.head()

In [None]:
for index, row in trips_plans_df.iterrows():
    if row['mode'] == 'BUS':
        bus_trips_data.filter((bus_trips_data.route == row['route']) & 
                              (int(bus_trips_data.stopPointId) == int(row['from_stop_id']))
    