In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import urllib2
#import pandas as pd
#import numpy as np

In [2]:
OTP_SERVER_URL = 'http://localhost:5601/otp/'

def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath, header=True,
                                     inferSchema=True,nullValue="-")
    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    data_frame = read_hdfs_folder(sqlContext,folderpath)
    
    date = "-".join(folderpath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.unix_timestamp(F.date_sub(F.col("date"),1),'yyyy-MM-dd'))
    
    return data_frame

def printdf(df,l=10):
    return df.limit(l).toPandas()

def get_timestamp_in_tz(unixtime_timestamp,ts_format,tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format),tz)

In [3]:
spark  = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.crossJoin.enabled', 'true')

sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc)

In [4]:
od_matrix = read_hdfs_folder(sqlContext,'/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a/od_matrix/')

In [5]:
od_matrix.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- o_route: integer (nullable = true)
 |-- o_bus_code: string (nullable = true)
 |-- o_date: timestamp (nullable = true)
 |-- o_

In [6]:
printdf(od_matrix)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration,dist,rn
0,860,2,2240,4494816,-25.440235,-49.277592,840.222,LC011,,-25.440471,...,6448950,-25.434527,-49.280126,29874,438086664359,32969,78541,759.533333,0.683846,1
1,870,10,2926,5386395,-25.431688,-49.276556,518.881,BC023,,-25.43163,...,5162628,-25.428355,-49.272707,28429,1262720385319,70255,52607,-1.0,0.535484,1
2,30,2,1715,6293026,-25.410378,-49.204718,25103.643,BB608,,-25.410268,...,5846746,-25.410157,-49.204149,32508,206158430263,49275,71111,363.933333,0.062245,1
3,860,8,2241,4494781,-25.436792,-49.274447,12604.144,LC026,,-25.436816,...,5255201,-25.428123,-49.271896,28556,1666447310868,69745,45888,-1.0,0.997384,1
4,826,11,2230,6062927,-25.511831,-49.324408,12069.86,JA001,,-25.511798,...,6416861,-25.510229,-49.32615,35840,1563368095816,76640,36968,-1.0,0.249584,1
5,777,2,2195,4299860,-25.435179,-49.273374,0.0,JC004,,-25.435188,...,6222433,-25.428604,-49.270626,26358,798863917103,34748,47558,213.5,0.781424,1
6,30,3,1715,6292678,-25.489034,-49.226245,11197.387,GR123,,-25.489063,...,6358078,-25.490593,-49.222306,30062,1022202216489,55367,80712,422.416667,0.431719,1
7,393,5,1907,6350328,-25.434318,-49.274526,11645.587,DN600,,-25.434348,...,6360100,-25.43537,-49.271665,26180,481036337331,64512,77787,221.25,0.310218,1
8,778,3,2196,5603972,-25.435144,-49.273284,12300.302,JC007,,-25.43519,...,6594723,-25.430341,-49.267154,26376,1589137899766,39360,35167,-1.0,0.815047,1
9,467,14,2818,5136062,-25.444211,-49.267533,1036.662,DN606,,,...,6405216,-25.444187,-49.267545,26584,627065225481,81022,40917,-1.0,0.002895,1


In [7]:
printdf(od_matrix.select(['date','route','o_stop_id','o_timestamp','stopPointId','timestamp','o_boarding_id']))

Unnamed: 0,date,route,o_stop_id,o_timestamp,stopPointId,timestamp,o_boarding_id
0,2017-05-10,860,26163,06:09:29,30633,06:14:13,403726926010
1,2017-05-10,870,26314,16:30:55,28604,16:35:43,635655159957
2,2017-05-10,30,32612,10:41:15,32508,12:10:57,231928234038
3,2017-05-10,860,3280,16:22:25,26163,16:42:40,463856468060
4,2017-05-10,826,33180,18:17:20,30432,18:54:05,549755814128
5,2017-05-10,777,33632,06:39:08,26149,07:00:22,979252543558
6,2017-05-10,30,32600,12:22:47,32577,12:38:45,1219770712102
7,2017-05-10,393,30182,14:55:12,28592,15:05:38,755914244190
8,2017-05-10,778,33533,07:56:00,26146,08:23:16,1606317768917
9,2017-05-10,467,26181,19:30:22,26584,19:34:08,42949673032


In [8]:
od_matrix = od_matrix.withColumn('date_in_ms', F.unix_timestamp(F.col('date'),'yyyy-MM-dd')) \
                        .withColumn('o_time_in_ms', F.unix_timestamp(F.col('o_timestamp'),'HH:mm:ss')) \
                        .withColumn('o_datetime_in_ms', F.col('date_in_ms') + F.col('o_time_in_ms')) \
                        .withColumn('o_datetime',get_timestamp_in_tz(F.col('o_datetime_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3'))
                        

In [9]:
#printdf(od_matrix.select(['o_datetime_in_ms','o_datetime','date_in_ms','date','o_timestamp']))
printdf(od_matrix.select(['date_in_ms','o_time_in_ms','o_datetime_in_ms','date','o_timestamp','o_datetime']))

Unnamed: 0,date_in_ms,o_time_in_ms,o_datetime_in_ms,date,o_timestamp,o_datetime
0,1494385200,32969,1494418169,2017-05-10,06:09:29,2017-05-10 06:09:29
1,1494385200,70255,1494455455,2017-05-10,16:30:55,2017-05-10 16:30:55
2,1494385200,49275,1494434475,2017-05-10,10:41:15,2017-05-10 10:41:15
3,1494385200,69745,1494454945,2017-05-10,16:22:25,2017-05-10 16:22:25
4,1494385200,76640,1494461840,2017-05-10,18:17:20,2017-05-10 18:17:20
5,1494385200,34748,1494419948,2017-05-10,06:39:08,2017-05-10 06:39:08
6,1494385200,55367,1494440567,2017-05-10,12:22:47,2017-05-10 12:22:47
7,1494385200,64512,1494449712,2017-05-10,14:55:12,2017-05-10 14:55:12
8,1494385200,39360,1494424560,2017-05-10,07:56:00,2017-05-10 07:56:00
9,1494385200,81022,1494466222,2017-05-10,19:30:22,2017-05-10 19:30:22


In [10]:
od_matrix = od_matrix.withColumn('o_base_timestamp_in_ms', F.col('o_datetime_in_ms') - 60 * 2) \
                        .withColumn('o_base_timestamp',get_timestamp_in_tz(F.col('o_base_timestamp_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3')) \
                        .withColumn('o_base_time',F.split(F.col('o_base_timestamp'),'\s+')[1]) \
                        .withColumn('o_base_date',F.split(F.col('o_base_timestamp'),'\s+')[0]) \


In [11]:
printdf(od_matrix.select('o_datetime_in_ms','o_base_timestamp_in_ms','o_datetime','o_base_timestamp','o_base_date','o_base_time'))

Unnamed: 0,o_datetime_in_ms,o_base_timestamp_in_ms,o_datetime,o_base_timestamp,o_base_date,o_base_time
0,1494418169,1494418049,2017-05-10 06:09:29,2017-05-10 06:07:29,2017-05-10,06:07:29
1,1494455455,1494455335,2017-05-10 16:30:55,2017-05-10 16:28:55,2017-05-10,16:28:55
2,1494434475,1494434355,2017-05-10 10:41:15,2017-05-10 10:39:15,2017-05-10,10:39:15
3,1494454945,1494454825,2017-05-10 16:22:25,2017-05-10 16:20:25,2017-05-10,16:20:25
4,1494461840,1494461720,2017-05-10 18:17:20,2017-05-10 18:15:20,2017-05-10,18:15:20
5,1494419948,1494419828,2017-05-10 06:39:08,2017-05-10 06:37:08,2017-05-10,06:37:08
6,1494440567,1494440447,2017-05-10 12:22:47,2017-05-10 12:20:47,2017-05-10,12:20:47
7,1494449712,1494449592,2017-05-10 14:55:12,2017-05-10 14:53:12,2017-05-10,14:53:12
8,1494424560,1494424440,2017-05-10 07:56:00,2017-05-10 07:54:00,2017-05-10,07:54:00
9,1494466222,1494466102,2017-05-10 19:30:22,2017-05-10 19:28:22,2017-05-10,19:28:22


In [12]:
def get_otp_itineraries(o_lat,o_lon,d_lat,d_lon,date,time):
    otp_http_request = 'routers/ctba/plan?fromPlace={},{}&toPlace={},{}&mode=TRANSIT,WALK&date={}&time={}'
    otp_request_url = OTP_SERVER_URL + otp_http_request.format(o_lat,o_lon,d_lat,d_lon,date,time)
    print otp_request_url
    return json.loads(urllib2.urlopen(otp_request_url).read())

In [13]:
test_itinerary = get_otp_itineraries(-25.413083,-49.229020,-25.428322,-49.266739,'05-10-2017','7:19:00')

print test_itinerary

http://localhost:5601/otp/routers/ctba/plan?fromPlace=-25.413083,-49.22902&toPlace=-25.428322,-49.266739&mode=TRANSIT,WALK&date=05-10-2017&time=7:19:00
{u'elevationMetadata': {u'geoidElevation': False, u'ellipsoidToGeoidDifference': 3.686111287840708}, u'plan': {u'date': 1494411540000, u'to': {u'lat': -25.428322, u'vertexType': u'NORMAL', u'lon': -49.266739, u'name': u'Destination', u'orig': u''}, u'itineraries': [{u'walkTime': 2, u'legs': [{u'distance': 0.151, u'from': {u'vertexType': u'NORMAL', u'name': u'Origin', u'lon': -49.22902, u'departure': 1494411566000, u'lat': -25.413083, u'orig': u''}, u'interlineWithPreviousLeg': False, u'transitLeg': False, u'realTime': False, u'route': u'', u'departureDelay': 0, u'agencyTimeZoneOffset': -10800000, u'to': {u'arrival': 1494411567000, u'vertexType': u'TRANSIT', u'name': u'Rua Fagundes Varela, 1295 - Jardim Social', u'stopSequence': 12, u'lon': -49.229029895014, u'departure': 1494411568000, u'stopId': u'1:31957', u'stopCode': u'130279', u'la

### New Approach

In [14]:
trips_otp_response = {}
counter = 0
for row in od_matrix.filter(od_matrix['o_boarding_id'] == 231928234038).collect():
        id=long(row['o_boarding_id'])
        trip_plan = get_otp_itineraries(row['o_shape_lat'], row['o_shape_lon'], row['shapeLat'], row['shapeLon'],row['o_base_date'],row['o_base_time'])
        trips_otp_response[id] = trip_plan
        counter+=1

http://localhost:5601/otp/routers/ctba/plan?fromPlace=-25.5142200773,-49.271038192&toPlace=-25.4103783466,-49.2047176841&mode=TRANSIT,WALK&date=2017-05-10&time=10:39:15


In [16]:
max_ctr = 1
trips_plans = []

for trip in trips_otp_response.keys():
    if max_ctr == 0:
        break
    if 'plan' in trips_otp_response[trip]:
        itinerary_id = 1
        for itinerary in trips_otp_response[trip]['plan']['itineraries']:
            leg_id = 1
            for leg in itinerary['legs']:
                route = leg['route'] if leg['route'] != '' else None
                fromStopId = leg['from']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                toStopId = leg['to']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                start_time = long(leg['startTime'])/1000
                end_time = long(leg['endTime'])/1000
                duration = (end_time - start_time)/60
                #date_str = start_time.strftime('%Y-%m-%d')
                #start_time_str = start_time.strftime('%H:%M:%S')
                #end_time_str = end_time.strftime('%H:%M:%S')
                trips_plans.append((start_time,trip,itinerary_id,leg_id,start_time,end_time,leg['mode'],route,fromStopId,toStopId, duration))
                leg_id += 1
            itinerary_id += 1
    max_ctr -= 1

labels=['user_trip_date','user_trip_id','itinerary_id','leg_id','start_time','end_time','mode','user_trip_route','from_stop_id','to_stop_id','duration']
trips_plans_df = sqlContext.createDataFrame(trips_plans, labels) \
                     .withColumn('user_trip_date',get_timestamp_in_tz(F.col('user_trip_date'),'yyyy-MM-dd', 'GMT')) \
                     .withColumn('user_trip_route', F.col('user_trip_route').astype('float')) \
                     .withColumn('from_stop_id', F.col('from_stop_id').astype('integer')) \
                     .withColumn('to_stop_id', F.col('to_stop_id').astype('integer')) \
                     .orderBy(['user_trip_date','user_trip_id','itinerary_id','start_time'])
#trips_plans_df['route'] = trips_plans_df['route'].astype(float)
#trips_plans_df['from_stop_id'] = trips_plans_df['from_stop_id'].astype(float)
#trips_plans_df['to_stop_id'] = trips_plans_df['to_stop_id'].astype(float)

In [17]:
trips_plans_df.printSchema()

root
 |-- user_trip_date: timestamp (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- start_time: long (nullable = true)
 |-- end_time: long (nullable = true)
 |-- mode: string (nullable = true)
 |-- user_trip_route: float (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- duration: long (nullable = true)



In [18]:
printdf(trips_plans_df,l=20)

Unnamed: 0,user_trip_date,user_trip_id,itinerary_id,leg_id,start_time,end_time,mode,user_trip_route,from_stop_id,to_stop_id,duration
0,2017-05-10,231928234038,1,1,1494423642,1494423673,WALK,,,,0
1,2017-05-10,231928234038,1,2,1494423674,1494424500,BUS,30.0,32612.0,27636.0,13
2,2017-05-10,231928234038,1,3,1494424500,1494424533,WALK,,,,0
3,2017-05-10,231928234038,1,4,1494424590,1494426120,BUS,607.0,26247.0,25894.0,25
4,2017-05-10,231928234038,1,5,1494426120,1494426165,WALK,,,,0
5,2017-05-10,231928234038,1,6,1494426180,1494427980,BUS,307.0,25896.0,26208.0,30
6,2017-05-10,231928234038,1,7,1494427981,1494428356,WALK,,,,6
7,2017-05-10,231928234038,2,1,1494425088,1494425400,WALK,,,,5
8,2017-05-10,231928234038,2,2,1494425401,1494429025,BUS,30.0,32610.0,32508.0,60
9,2017-05-10,231928234038,2,3,1494429026,1494429060,WALK,,,,0


In [19]:
printdf(od_matrix.filter(od_matrix.o_boarding_id == 627065225216).select('o_boarding_id','route','o_timestamp','timestamp'))

Unnamed: 0,o_boarding_id,route,o_timestamp,timestamp
0,627065225216,373,14:54:45,14:56:07


### Read Bus Data

In [20]:
bus_trips_data = read_buste_data_v3(sqlContext,'/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a/bulma-output/2017_05_11_veiculos.csv/')

In [21]:
bus_trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: long (nullable = true)



In [22]:
printdf(bus_trips_data.select('date'))

Unnamed: 0,date
0,1494385200
1,1494385200
2,1494385200
3,1494385200
4,1494385200
5,1494385200
6,1494385200
7,1494385200
8,1494385200
9,1494385200


In [23]:
bus_trips_data = bus_trips_data.withColumn('date',F.from_unixtime(F.col('date'),'yyyy-MM-dd')) \
                    .withColumn('route',F.col('route').astype('float')) \
                    .withColumn('datetime',F.unix_timestamp(F.concat(F.col('date'),F.lit(' '),F.col('timestamp')), 'yyyy-MM-dd HH:mm:ss'))

### Finding Real User Trip Beginning Time

In [72]:
real_trip_beginning_join_cond = [trips_plans_df.user_trip_date == bus_trips_data.date,
                                 trips_plans_df.user_trip_route == bus_trips_data.route,
                                 trips_plans_df.from_stop_id == bus_trips_data.stopPointId]
real_trip_beginning = trips_plans_df.join(bus_trips_data, real_trip_beginning_join_cond, how='inner') \
                        .na.drop(subset=['datetime']) \
                        .withColumn('timediff',F.abs(F.col('datetime') - F.col('start_time')))

In [73]:
printdf(real_trip_beginning.select(['date','route','from_stop_id','tripNum','datetime','start_time','timediff']) \
                .orderBy(['date','route','from_stop_id','timediff']), l=200)

Unnamed: 0,date,route,from_stop_id,tripNum,datetime,start_time,timediff
0,2017-05-10,30.0,32610,2,1494425342,1494425401,59
1,2017-05-10,30.0,32610,3,1494426733,1494426892,159
2,2017-05-10,30.0,32610,3,1494428003,1494426892,1111
3,2017-05-10,30.0,32610,3,1494426733,1494425401,1332
4,2017-05-10,30.0,32610,2,1494423886,1494425401,1515
5,2017-05-10,30.0,32610,2,1494425342,1494426892,1550
6,2017-05-10,30.0,32610,2,1494422956,1494425401,2445
7,2017-05-10,30.0,32610,3,1494429417,1494426892,2525
8,2017-05-10,30.0,32610,3,1494428003,1494425401,2602
9,2017-05-10,30.0,32610,2,1494422623,1494425401,2778


In [74]:
real_trip_beginning.printSchema()

root
 |-- user_trip_date: timestamp (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- start_time: long (nullable = true)
 |-- end_time: long (nullable = true)
 |-- mode: string (nullable = true)
 |-- user_trip_route: float (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- duration: long (nullable = true)
 |-- route: float (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = t

In [75]:
w = Window.partitionBy(['date','user_trip_id','route','from_stop_id']).orderBy(['timediff'])

real_trip_beginning = real_trip_beginning.withColumn('rn', F.row_number().over(w)) \
                    .where(F.col('rn') == 1)

In [76]:
printdf(real_trip_beginning.select('route','from_stop_id','timestamp','start_time') \
           .withColumn('start_time',F.from_unixtime(F.col('start_time'))))

Unnamed: 0,route,from_stop_id,timestamp,start_time
0,607.0,26247,10:51:41,2017-05-10 10:56:30
1,307.0,25896,11:21:54,2017-05-10 11:23:00
2,30.0,32612,10:40:14,2017-05-10 10:41:14
3,30.0,32610,11:09:02,2017-05-10 11:10:01


In [77]:
real_trip_beginning = real_trip_beginning \
        .select(['user_trip_date','user_trip_id','itinerary_id','leg_id','user_trip_route','busCode','tripNum','from_stop_id','start_time','timestamp','to_stop_id','end_time','duration']) \
        .withColumnRenamed('tripNum','bus_trip_num')
printdf(real_trip_beginning)

Unnamed: 0,user_trip_date,user_trip_id,itinerary_id,leg_id,user_trip_route,busCode,bus_trip_num,from_stop_id,start_time,timestamp,to_stop_id,end_time,duration
0,2017-05-10,231928234038,1,4,607.0,HL306,3,26247,1494424590,10:51:41,25894,1494426120,25
1,2017-05-10,231928234038,1,6,307.0,CL306,4,25896,1494426180,11:21:54,26208,1494427980,30
2,2017-05-10,231928234038,1,2,30.0,BB608,2,32612,1494423674,10:40:14,27636,1494424500,13
3,2017-05-10,231928234038,2,2,30.0,BB608,2,32610,1494425401,11:09:02,32508,1494429025,60


### Finding Real User Trip End Time

In [78]:
bus_trips_data.printSchema()

root
 |-- route: float (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: string (nullable = true)
 |-- datetime: long (nullable = true)



In [79]:
real_trip_beginning.printSchema()

root
 |-- user_trip_date: timestamp (nullable = true)
 |-- user_trip_id: long (nullable = true)
 |-- itinerary_id: long (nullable = true)
 |-- leg_id: long (nullable = true)
 |-- user_trip_route: float (nullable = true)
 |-- busCode: string (nullable = true)
 |-- bus_trip_num: integer (nullable = true)
 |-- from_stop_id: integer (nullable = true)
 |-- start_time: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- to_stop_id: integer (nullable = true)
 |-- end_time: long (nullable = true)
 |-- duration: long (nullable = true)



In [91]:
real_trip_end_join_cond = [real_trip_beginning.user_trip_date == bus_trips_data.date,
                                 real_trip_beginning.user_trip_route == bus_trips_data.route,
                                  real_trip_beginning.bus_trip_num == bus_trips_data.tripNum,
                                 real_trip_beginning.to_stop_id == bus_trips_data.stopPointId]
real_trip_end = real_trip_beginning.join(bus_trips_data, real_trip_end_join_cond, how='inner')
# \
#                         .na.drop(subset=['datetime']) \
#                         .withColumn('timediff',F.abs(F.col('datetime') - F.col('end_time')))

AnalysisException: u'resolved attribute(s) stopPointId#855,tripNum#843 missing from distanceTraveledShape#3325,to_stop_id#816,cardTimestamp#3335,end_time#759L,birthdate#3334,shapeLat#3323,stopPointId#3332,from_stop_id#803,cardNum#3337,user_trip_route#790,start_time#758L,leg_id#757L,duration#764L,gpsPointId#3327,route#955,gpsLat#3328,bus_trip_num#3069,busCode#849,timestamp#854,gpsLon#3329,shapeId#3321,tripNum#3320,user_trip_date#777,itinerary_id#756L,timestamp#3331,gender#3338,user_trip_id#755L,lineName#3336,datetime#978L,shapeSequence#3322,date#932,busCode#3326,problem#3333,distanceToShapePoint#3330,shapeLon#3324 in operator !Join Inner, ((((user_trip_date#777 = cast(date#932 as timestamp)) && (user_trip_route#790 = route#955)) && (bus_trip_num#3069 = tripNum#843)) && (to_stop_id#816 = stopPointId#855));;\n!Join Inner, ((((user_trip_date#777 = cast(date#932 as timestamp)) && (user_trip_route#790 = route#955)) && (bus_trip_num#3069 = tripNum#843)) && (to_stop_id#816 = stopPointId#855))\n:- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, user_trip_route#790, busCode#849, tripNum#843 AS bus_trip_num#3069, from_stop_id#803, start_time#758L, timestamp#854, to_stop_id#816, end_time#759L, duration#764L]\n:  +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, user_trip_route#790, busCode#849, tripNum#843, from_stop_id#803, start_time#758L, timestamp#854, to_stop_id#816, end_time#759L, duration#764L]\n:     +- Filter (rn#3002 = 1)\n:        +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#790, from_stop_id#803, to_stop_id#816, duration#764L, route#955, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, ... 11 more fields]\n:           +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#790, from_stop_id#803, to_stop_id#816, duration#764L, route#955, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, ... 12 more fields]\n:              +- Window [row_number() windowspecdefinition(date#932, user_trip_id#755L, route#955, from_stop_id#803, timediff#2951L ASC NULLS FIRST, ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rn#3002], [date#932, user_trip_id#755L, route#955, from_stop_id#803], [timediff#2951L ASC NULLS FIRST]\n:                 +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#790, from_stop_id#803, to_stop_id#816, duration#764L, route#955, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, ... 10 more fields]\n:                    +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#790, from_stop_id#803, to_stop_id#816, duration#764L, route#955, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, ... 10 more fields]\n:                       +- Filter AtLeastNNulls(n, datetime#978L)\n:                          +- Join Inner, (((user_trip_date#777 = cast(date#932 as timestamp)) && (user_trip_route#790 = route#955)) && (from_stop_id#803 = stopPointId#855))\n:                             :- Sort [user_trip_date#777 ASC NULLS FIRST, user_trip_id#755L ASC NULLS FIRST, itinerary_id#756L ASC NULLS FIRST, start_time#758L ASC NULLS FIRST], true\n:                             :  +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#790, from_stop_id#803, cast(cast(to_stop_id#763 as decimal(20,0)) as int) AS to_stop_id#816, duration#764L]\n:                             :     +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#790, cast(cast(from_stop_id#762 as decimal(20,0)) as int) AS from_stop_id#803, to_stop_id#763, duration#764L]\n:                             :        +- Project [user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, cast(user_trip_route#761 as float) AS user_trip_route#790, from_stop_id#762, to_stop_id#763, duration#764L]\n:                             :           +- Project [from_utc_timestamp(cast(from_unixtime(user_trip_date#754L, yyyy-MM-dd) as timestamp), GMT) AS user_trip_date#777, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#761, from_stop_id#762, to_stop_id#763, duration#764L]\n:                             :              +- LogicalRDD [user_trip_date#754L, user_trip_id#755L, itinerary_id#756L, leg_id#757L, start_time#758L, end_time#759L, mode#760, user_trip_route#761, from_stop_id#762, to_stop_id#763, duration#764L]\n:                             +- Project [route#955, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, stopPointId#855, problem#856, birthdate#857, cardTimestamp#858, lineName#859, cardNum#860, gender#861, date#932, unix_timestamp(concat(date#932,  , timestamp#854), yyyy-MM-dd HH:mm:ss) AS datetime#978L]\n:                                +- Project [cast(route#842 as float) AS route#955, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, stopPointId#855, problem#856, birthdate#857, cardTimestamp#858, lineName#859, cardNum#860, gender#861, date#932]\n:                                   +- Project [route#842, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, stopPointId#855, problem#856, birthdate#857, cardTimestamp#858, lineName#859, cardNum#860, gender#861, from_unixtime(date#906L, yyyy-MM-dd) AS date#932]\n:                                      +- Project [route#842, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, stopPointId#855, problem#856, birthdate#857, cardTimestamp#858, lineName#859, cardNum#860, gender#861, unix_timestamp(date_sub(cast(date#883 as date), 1), yyyy-MM-dd) AS date#906L]\n:                                         +- Project [route#842, tripNum#843, shapeId#844, shapeSequence#845, shapeLat#846, shapeLon#847, distanceTraveledShape#848, busCode#849, gpsPointId#850, gpsLat#851, gpsLon#852, distanceToShapePoint#853, timestamp#854, stopPointId#855, problem#856, birthdate#857, cardTimestamp#858, lineName#859, cardNum#860, gender#861, 2017-05-11 AS date#883]\n:                                            +- Relation[route#842,tripNum#843,shapeId#844,shapeSequence#845,shapeLat#846,shapeLon#847,distanceTraveledShape#848,busCode#849,gpsPointId#850,gpsLat#851,gpsLon#852,distanceToShapePoint#853,timestamp#854,stopPointId#855,problem#856,birthdate#857,cardTimestamp#858,lineName#859,cardNum#860,gender#861] csv\n+- Project [route#955, tripNum#3320, shapeId#3321, shapeSequence#3322, shapeLat#3323, shapeLon#3324, distanceTraveledShape#3325, busCode#3326, gpsPointId#3327, gpsLat#3328, gpsLon#3329, distanceToShapePoint#3330, timestamp#3331, stopPointId#3332, problem#3333, birthdate#3334, cardTimestamp#3335, lineName#3336, cardNum#3337, gender#3338, date#932, unix_timestamp(concat(date#932,  , timestamp#3331), yyyy-MM-dd HH:mm:ss) AS datetime#978L]\n   +- Project [cast(route#3319 as float) AS route#955, tripNum#3320, shapeId#3321, shapeSequence#3322, shapeLat#3323, shapeLon#3324, distanceTraveledShape#3325, busCode#3326, gpsPointId#3327, gpsLat#3328, gpsLon#3329, distanceToShapePoint#3330, timestamp#3331, stopPointId#3332, problem#3333, birthdate#3334, cardTimestamp#3335, lineName#3336, cardNum#3337, gender#3338, date#932]\n      +- Project [route#3319, tripNum#3320, shapeId#3321, shapeSequence#3322, shapeLat#3323, shapeLon#3324, distanceTraveledShape#3325, busCode#3326, gpsPointId#3327, gpsLat#3328, gpsLon#3329, distanceToShapePoint#3330, timestamp#3331, stopPointId#3332, problem#3333, birthdate#3334, cardTimestamp#3335, lineName#3336, cardNum#3337, gender#3338, from_unixtime(date#906L, yyyy-MM-dd) AS date#932]\n         +- Project [route#3319, tripNum#3320, shapeId#3321, shapeSequence#3322, shapeLat#3323, shapeLon#3324, distanceTraveledShape#3325, busCode#3326, gpsPointId#3327, gpsLat#3328, gpsLon#3329, distanceToShapePoint#3330, timestamp#3331, stopPointId#3332, problem#3333, birthdate#3334, cardTimestamp#3335, lineName#3336, cardNum#3337, gender#3338, unix_timestamp(date_sub(cast(date#883 as date), 1), yyyy-MM-dd) AS date#906L]\n            +- Project [route#3319, tripNum#3320, shapeId#3321, shapeSequence#3322, shapeLat#3323, shapeLon#3324, distanceTraveledShape#3325, busCode#3326, gpsPointId#3327, gpsLat#3328, gpsLon#3329, distanceToShapePoint#3330, timestamp#3331, stopPointId#3332, problem#3333, birthdate#3334, cardTimestamp#3335, lineName#3336, cardNum#3337, gender#3338, 2017-05-11 AS date#883]\n               +- Relation[route#3319,tripNum#3320,shapeId#3321,shapeSequence#3322,shapeLat#3323,shapeLon#3324,distanceTraveledShape#3325,busCode#3326,gpsPointId#3327,gpsLat#3328,gpsLon#3329,distanceToShapePoint#3330,timestamp#3331,stopPointId#3332,problem#3333,birthdate#3334,cardTimestamp#3335,lineName#3336,cardNum#3337,gender#3338] csv\n'

In [None]:
printdf(real_trip_end.select(['date','route','to_stop_id','tripNum','datetime','end_time','timediff']) \
                .orderBy(['date','route','to_stop_id','timediff']), l=200)

In [None]:
real_trip_beginning.printSchema()

In [None]:
w = Window.partitionBy(['date','trip_id','route','to_stop_id']).orderBy(['timediff'])

real_trip_end = real_trip_end.withColumn('rn', F.row_number().over(w)) \
                    .where(F.col('rn') == 1)

In [None]:
printdf(real_trip_end.select('route','to_stop_id','timestamp','start_time') \
           .withColumn('start_time',F.from_unixtime(F.col('start_time'))))

In [None]:
trips_plans_df.head()

In [None]:
for index, row in trips_plans_df.iterrows():
    if row['mode'] == 'BUS':
        bus_trips_data.filter((bus_trips_data.route == row['route']) & 
                              (int(bus_trips_data.stopPointId) == int(row['from_stop_id']))
    