In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import urllib2
#import pandas as pd
#import numpy as np

In [107]:
OTP_SERVER_URL = 'http://localhost:5601/otp/'

def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath, header=True,
                                     inferSchema=True,nullValue="-")
    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    data_frame = read_hdfs_folder(sqlContext,folderpath)
    
    date = "-".join(folderpath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.unix_timestamp(F.date_sub(F.col("date"),1),'yyyy-MM-dd'))
    
    return data_frame

def printdf(df,l=10):
    return df.limit(l).toPandas()

def get_timestamp_in_tz(unixtime_timestamp,ts_format,tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format),tz)

In [7]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [53]:
od_matrix = read_hdfs_folder(sqlContext,'/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a/od_matrix/')

In [9]:
od_matrix.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- o_route: integer (nullable = true)
 |-- o_bus_code: string (nullable = true)
 |-- o_date: timestamp (nullable = true)
 |-- o_

In [11]:
printdf(od_matrix)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration,dist,rn
0,860,2,2240,4494816,-25.440235,-49.277592,840.222,LC011,,-25.440471,...,6448950,-25.434527,-49.280126,29874,438086664359,32969,78541,759.533333,0.683846,1
1,870,10,2926,5386395,-25.431688,-49.276556,518.881,BC023,,-25.43163,...,5162628,-25.428355,-49.272707,28429,1262720385319,70255,52607,-1.0,0.535484,1
2,30,2,1715,6293026,-25.410378,-49.204718,25103.643,BB608,,-25.410268,...,5846746,-25.410157,-49.204149,32508,206158430263,49275,71111,363.933333,0.062245,1
3,860,8,2241,4494781,-25.436792,-49.274447,12604.144,LC026,,-25.436816,...,5255201,-25.428123,-49.271896,28556,1666447310868,69745,45888,-1.0,0.997384,1
4,826,11,2230,6062927,-25.511831,-49.324408,12069.86,JA001,,-25.511798,...,6416861,-25.510229,-49.32615,35840,1563368095816,76640,36968,-1.0,0.249584,1
5,777,2,2195,4299860,-25.435179,-49.273374,0.0,JC004,,-25.435188,...,6222433,-25.428604,-49.270626,26358,798863917103,34748,47558,213.5,0.781424,1
6,30,3,1715,6292678,-25.489034,-49.226245,11197.387,GR123,,-25.489063,...,6358078,-25.490593,-49.222306,30062,1022202216489,55367,80712,422.416667,0.431719,1
7,393,5,1907,6350328,-25.434318,-49.274526,11645.587,DN600,,-25.434348,...,6360100,-25.43537,-49.271665,26180,481036337331,64512,77787,221.25,0.310218,1
8,778,3,2196,5603972,-25.435144,-49.273284,12300.302,JC007,,-25.43519,...,6594723,-25.430341,-49.267154,26376,1589137899766,39360,35167,-1.0,0.815047,1
9,467,14,2818,5136062,-25.444211,-49.267533,1036.662,DN606,,,...,6405216,-25.444187,-49.267545,26584,627065225481,81022,40917,-1.0,0.002895,1


In [127]:
od_matrix = od_matrix.withColumn('date_in_ms', F.unix_timestamp(F.col('date'),'yyyy-MM-dd')) \
                        .withColumn('o_time_in_ms', F.unix_timestamp(F.col('o_timestamp'),'HH:mm:ss')) \
                        .withColumn('o_datetime_in_ms', F.col('date_in_ms') + F.col('o_time_in_ms')) \
                        .withColumn('o_datetime',get_timestamp_in_tz(F.col('o_datetime_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3'))
                        

In [128]:
#printdf(od_matrix.select(['o_datetime_in_ms','o_datetime','date_in_ms','date','o_timestamp']))
printdf(od_matrix.select(['date_in_ms','o_time_in_ms','o_datetime_in_ms','date','o_timestamp','o_datetime']))

Unnamed: 0,date_in_ms,o_time_in_ms,o_datetime_in_ms,date,o_timestamp,o_datetime
0,1494385200,32969,1494418169,2017-05-10,06:09:29,2017-05-10 06:09:29
1,1494385200,70255,1494455455,2017-05-10,16:30:55,2017-05-10 16:30:55
2,1494385200,49275,1494434475,2017-05-10,10:41:15,2017-05-10 10:41:15
3,1494385200,69745,1494454945,2017-05-10,16:22:25,2017-05-10 16:22:25
4,1494385200,76640,1494461840,2017-05-10,18:17:20,2017-05-10 18:17:20
5,1494385200,34748,1494419948,2017-05-10,06:39:08,2017-05-10 06:39:08
6,1494385200,55367,1494440567,2017-05-10,12:22:47,2017-05-10 12:22:47
7,1494385200,64512,1494449712,2017-05-10,14:55:12,2017-05-10 14:55:12
8,1494385200,39360,1494424560,2017-05-10,07:56:00,2017-05-10 07:56:00
9,1494385200,81022,1494466222,2017-05-10,19:30:22,2017-05-10 19:30:22


In [129]:
od_matrix = od_matrix.withColumn('o_base_timestamp_in_ms', F.col('o_datetime_in_ms') - 60 * 2) \
                        .withColumn('o_base_timestamp',get_timestamp_in_tz(F.col('o_base_timestamp_in_ms'),'yyyy-MM-dd HH:mm:ss','GMT-3')) \
                        .withColumn('o_base_time',F.split(F.col('o_base_timestamp'),'\s+')[1]) \
                        .withColumn('o_base_date',F.split(F.col('o_base_timestamp'),'\s+')[0]) \


In [131]:
printdf(od_matrix.select('o_datetime_in_ms','o_base_timestamp_in_ms','o_datetime','o_base_timestamp','o_base_date','o_base_time'))

Unnamed: 0,o_datetime_in_ms,o_base_timestamp_in_ms,o_datetime,o_base_timestamp,o_base_date,o_base_time
0,1494418169,1494418049,2017-05-10 06:09:29,2017-05-10 06:07:29,2017-05-10,06:07:29
1,1494455455,1494455335,2017-05-10 16:30:55,2017-05-10 16:28:55,2017-05-10,16:28:55
2,1494434475,1494434355,2017-05-10 10:41:15,2017-05-10 10:39:15,2017-05-10,10:39:15
3,1494454945,1494454825,2017-05-10 16:22:25,2017-05-10 16:20:25,2017-05-10,16:20:25
4,1494461840,1494461720,2017-05-10 18:17:20,2017-05-10 18:15:20,2017-05-10,18:15:20
5,1494419948,1494419828,2017-05-10 06:39:08,2017-05-10 06:37:08,2017-05-10,06:37:08
6,1494440567,1494440447,2017-05-10 12:22:47,2017-05-10 12:20:47,2017-05-10,12:20:47
7,1494449712,1494449592,2017-05-10 14:55:12,2017-05-10 14:53:12,2017-05-10,14:53:12
8,1494424560,1494424440,2017-05-10 07:56:00,2017-05-10 07:54:00,2017-05-10,07:54:00
9,1494466222,1494466102,2017-05-10 19:30:22,2017-05-10 19:28:22,2017-05-10,19:28:22


In [133]:
def get_otp_itineraries(o_lat,o_lon,d_lat,d_lon,date,time):
    otp_http_request = 'routers/ctba/plan?fromPlace={},{}&toPlace={},{}&mode=TRANSIT,WALK&date={}&time={}'
    otp_request_url = OTP_SERVER_URL + otp_http_request.format(o_lat,o_lon,d_lat,d_lon,date,time)
    return json.loads(urllib2.urlopen(otp_request_url).read())

In [114]:
test_itinerary = get_otp_itineraries(-25.413083,-49.229020,-25.428322,-49.266739,'05-10-2017','7:19:00')

print test_itinerary

http://localhost:5601/otp/routers/ctba/plan?fromPlace=-25.413083,-49.22902&toPlace=-25.428322,-49.266739&mode=TRANSIT,WALK&date=05-10-2017&time=7:19:00
{u'elevationMetadata': {u'geoidElevation': False, u'ellipsoidToGeoidDifference': 3.686111287840708}, u'plan': {u'date': 1494411540000, u'to': {u'lat': -25.428322, u'vertexType': u'NORMAL', u'lon': -49.266739, u'name': u'Destination', u'orig': u''}, u'itineraries': [{u'walkTime': 2, u'legs': [{u'distance': 0.151, u'from': {u'vertexType': u'NORMAL', u'name': u'Origin', u'lon': -49.22902, u'departure': 1494411566000, u'lat': -25.413083, u'orig': u''}, u'interlineWithPreviousLeg': False, u'transitLeg': False, u'realTime': False, u'route': u'', u'departureDelay': 0, u'agencyTimeZoneOffset': -10800000, u'to': {u'arrival': 1494411567000, u'vertexType': u'TRANSIT', u'name': u'Rua Fagundes Varela, 1295 - Jardim Social', u'stopSequence': 12, u'lon': -49.229029895014, u'departure': 1494411568000, u'stopId': u'1:31957', u'stopCode': u'130279', u'la

### New Approach

In [134]:
trips_otp_response = {}
counter = 0
for row in od_matrix.collect():
        id=long(row['o_boarding_id'])
        trip_plan = get_otp_itineraries(row['o_shape_lat'], row['o_shape_lon'], row['shapeLat'], row['shapeLon'],row['o_base_date'],row['o_base_time'])
        trips_otp_response[id] = trip_plan
        counter+=1

In [146]:
trips_plans = []

for trip in trips_otp_response.keys():
    itinerary_id = 1
    if trips_otp_response[trip]['plan'] != None:
        for itinerary in trips_otp_response[trip]['plan']['itineraries']:
            leg_id = 1
            for leg in itinerary['legs']:
                route = leg['route'] if leg['route'] != '' else None
                fromStopId = leg['from']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                toStopId = leg['to']['stopId'].split(':')[1] if leg['mode'] == 'BUS' else None
                start_time = long(leg['startTime'])
                end_time = long(leg['endTime'])
                duration = end_time - start_time
                #date_str = start_time.strftime('%Y-%m-%d')
                #start_time_str = start_time.strftime('%H:%M:%S')
                #end_time_str = end_time.strftime('%H:%M:%S')
                trips_plans.append((start_time,trip,itinerary_id,leg_id,start_time,end_time,leg['mode'],route,fromStopId,toStopId, duration))
                leg_id += 1
            itinerary_id += 1

labels=['date','trip_id','itinerary_id','leg_id','start_time','end_time','mode','route','from_stop_id','to_stop_id','duration']
trips_plans_df = pd.DataFrame.from_records(trips_plans,columns=labels)
#trips_plans_df['route'] = trips_plans_df['route'].astype(float)
#trips_plans_df['from_stop_id'] = trips_plans_df['from_stop_id'].astype(float)
#trips_plans_df['to_stop_id'] = trips_plans_df['to_stop_id'].astype(float)

KeyError: 'plan'

In [144]:
trips_otp_response[627065225216]['plan']

{u'date': 1494438765000,
 u'from': {u'lat': -25.4328495036,
  u'lon': -49.2704182564,
  u'name': u'Origin',
  u'orig': u'',
  u'vertexType': u'NORMAL'},
 u'itineraries': [{u'duration': 83,
   u'elevationGained': 0.0,
   u'elevationLost': 0.0,
   u'endTime': 1494438981000,
   u'legs': [{u'agencyTimeZoneOffset': -10800000,
     u'arrivalDelay': 0,
     u'departureDelay': 0,
     u'distance': 0.196,
     u'duration': 1.0,
     u'endTime': 1494438899000,
     u'from': {u'departure': 1494438898000,
      u'lat': -25.4328495036,
      u'lon': -49.2704182564,
      u'name': u'Origin',
      u'orig': u'',
      u'vertexType': u'NORMAL'},
     u'interlineWithPreviousLeg': False,
     u'legGeometry': {u'length': 2, u'points': u'xjfzC|bvkH??'},
     u'mode': u'WALK',
     u'pathway': False,
     u'realTime': False,
     u'rentedBike': False,
     u'route': u'',
     u'startTime': 1494438898000,
     u'steps': [{u'absoluteDirection': u'SOUTHWEST',
       u'area': False,
       u'bogusName': False,

In [138]:
trips_plans_df

Unnamed: 0,date,trip_id,itinerary_id,leg_id,start_time,end_time,mode,route,from_stop_id,to_stop_id,duration
0,2017-05-10,1434519077089,1,1,07:19:26,07:19:27,WALK,,,,00:00:01
1,2017-05-10,1434519077089,1,2,07:19:28,07:42:00,BUS,374.0,31957.0,26362.0,00:22:32
2,2017-05-10,1434519077089,1,3,07:42:01,07:42:02,WALK,,,,00:00:01
3,2017-05-10,1434519077089,2,1,07:27:14,07:27:15,WALK,,,,00:00:01
4,2017-05-10,1434519077089,2,2,07:27:16,07:48:00,BUS,361.0,31957.0,26361.0,00:20:44
5,2017-05-10,1434519077089,2,3,07:48:01,07:48:17,WALK,,,,00:00:16
6,2017-05-10,1434519077089,3,1,07:35:11,07:35:12,WALK,,,,00:00:01
7,2017-05-10,1434519077089,3,2,07:35:13,07:59:00,BUS,374.0,31957.0,26362.0,00:23:47
8,2017-05-10,1434519077089,3,3,07:59:01,07:59:02,WALK,,,,00:00:01
9,2017-05-10,249108103395,1,1,16:36:21,16:36:32,WALK,,,,00:00:11


In [127]:
trips_plans_df.describe(include='all')

Unnamed: 0,date,trip_id,itinerary_id,leg_id,start_time,end_time,mode,route,from_stop_id,to_stop_id,duration
count,84,84.0,84.0,84.0,84,84,84,27.0,27.0,27.0,84
unique,1,,,,84,84,2,,,,
top,05-10-2017,,,,07:19:28,07:26:54,WALK,,,,
freq,84,,,,1,1,57,,,,
mean,,1085809000000.0,2.047619,1.964286,,,,556.592593,29924.037037,28627.814815,0 days 00:06:51.797619
std,,509935700000.0,0.820002,0.827923,,,,323.680191,6292.226287,7718.793932,0 days 00:12:16.581975
min,,249108100000.0,1.0,1.0,,,,40.0,14298.0,4664.0,0 days 00:00:01
25%,,584115600000.0,1.0,1.0,,,,363.0,28520.0,26362.0,0 days 00:00:07
50%,,953482700000.0,2.0,2.0,,,,535.0,31957.0,30052.0,0 days 00:00:48
75%,,1477469000000.0,3.0,3.0,,,,860.0,33681.0,33375.0,0 days 00:05:02.750000


### Read Bus Data

In [128]:
bus_trips_data = pd.read_csv('buste_crowd.csv')

In [129]:
bus_trips_data.columns

Index([u'route', u'tripNum', u'shapeId', u'shapeSequence', u'shapeLat',
       u'shapeLon', u'distanceTraveledShape', u'busCode', u'gpsPointId',
       u'gpsLat', u'gpsLon', u'distanceToShapePoint', u'timestamp',
       u'stopPointId', u'problem', u'birthdate', u'cardTimestamp', u'lineName',
       u'gender', u'date', u'id', u'boarding_cnt', u'alighting_cnt',
       u'crowd_bal', u'num_pass', u'extrap_factor', u'ext_num_pass'],
      dtype='object')

In [130]:
bus_trips_data['stopPointId'] = bus_trips_data['stopPointId'].astype(float)

In [131]:
bus_trips_data.head()

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,lineName,gender,date,id,boarding_cnt,alighting_cnt,crowd_bal,num_pass,extrap_factor,ext_num_pass
0,463.0,1,2846,6348468,-25.478025,-49.203033,1128.772,DC087,,-25.478023,...,,,2017-05-10,1279900255353,0,0,0,0,39.22807,0.0
1,463.0,1,2846,6348493,-25.474027,-49.209193,1899.853,DC087,,,...,,,2017-05-10,549755815049,0,0,0,0,39.22807,0.0
2,463.0,1,2846,6348516,-25.46865,-49.213528,2638.152,DC087,,,...,,,2017-05-10,523986011178,0,0,0,0,39.22807,0.0
3,463.0,1,2846,6348531,-25.46409,-49.21717,3262.176,DC087,,-25.463931,...,,,2017-05-10,180388627571,0,0,0,0,39.22807,0.0
4,463.0,1,2846,6348545,-25.459453,-49.220872,3896.639,DC087,,-25.45961,...,,,2017-05-10,1142461301906,0,0,0,0,39.22807,0.0


In [140]:
real_trip_beginning = pd.merge(trips_plans_df,bus_trips_data, left_on=['date','route','from_stop_id'],right_on=['date','route','stopPointId'],how='inner')
real_trip_beginning = real_trip_beginning.groupby(['date','route','trip_id','itinerary_id','leg_id','from_stop_id'])

In [142]:
real_trip_beginning[['date','route','from_stop_id','tripNum','timestamp','start_time']]

Unnamed: 0,date,route,from_stop_id,tripNum,timestamp,start_time
0,2017-05-10,374.0,31957.0,1,06:24:44,07:19:28
1,2017-05-10,374.0,31957.0,2,07:19:18,07:19:28
2,2017-05-10,374.0,31957.0,3,08:22:47,07:19:28
3,2017-05-10,374.0,31957.0,4,09:39:23,07:19:28
4,2017-05-10,374.0,31957.0,5,10:36:29,07:19:28
5,2017-05-10,374.0,31957.0,6,11:37:44,07:19:28
6,2017-05-10,374.0,31957.0,7,12:45:21,07:19:28
7,2017-05-10,374.0,31957.0,8,13:52:44,07:19:28
8,2017-05-10,374.0,31957.0,9,15:03:10,07:19:28
9,2017-05-10,374.0,31957.0,10,16:05:30,07:19:28


In [168]:
trips_plans_df.head()

Unnamed: 0,trip_id,itinerary_id,leg_id,start_time,end_time,mode,route,from_stop_id,to_stop_id,duration
0,1434519077089,1,1,1494411566000,1494411567000,WALK,,,,00:00:01
1,1434519077089,1,2,1494411568000,1494412920000,BUS,374.0,31957.0,26362.0,00:22:32
2,1434519077089,1,3,1494412921000,1494412922000,WALK,,,,00:00:01
3,1434519077089,2,1,1494412034000,1494412035000,WALK,,,,00:00:01
4,1434519077089,2,2,1494412036000,1494413280000,BUS,361.0,31957.0,26361.0,00:20:44


In [173]:
for index, row in trips_plans_df.iterrows():
    if row['mode'] == 'BUS':
        bus_trips_data.filter((bus_trips_data.route == row['route']) & 
                              (int(bus_trips_data.stopPointId) == int(row['from_stop_id']))
    

SyntaxError: invalid syntax (<ipython-input-173-9376cbc206d6>, line 5)