<a href="https://colab.research.google.com/github/xukanz/Xukan/blob/main/Grab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files referenced through BASE_DIR can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt

# Root folder of the dataset on the mounted Drive; a city name is appended to
# it when the loader functions build a parquet path.
BASE_DIR = "/content/drive/MyDrive/Dataset/Grab/"
# File name of one parquet shard inside a city folder; `{}` is filled with the
# single-digit shard index by the loader functions.
FILE_NAME_PATTERN = "/part-0000{}-8bbff892-97d2-4011-9961-703e38972569.c000.snappy.parquet"

## Load dataset

In [None]:
def load_dataset_by_city(city):
  """Load every parquet shard of one city into a single DataFrame.

  Reads the ten shards (indices 0-9) stored under ``BASE_DIR + city`` and
  concatenates them in shard order. Row labels are kept exactly as read from
  each shard, so they are not unique across the combined frame.

  Args:
    city: Name of the city sub-folder under ``BASE_DIR``, e.g. ``'Singapore'``.

  Returns:
    A ``pandas.DataFrame`` containing the rows of all ten shards.
  """
  # Read all shards first and concatenate once. Calling pd.concat inside the
  # loop copies every previously loaded row again on each iteration.
  shards = [
      pd.read_parquet(BASE_DIR + city + FILE_NAME_PATTERN.format(i))
      for i in range(10)
  ]
  return pd.concat(shards)

In [None]:
def load_single_jakarta_file():
  """Read only the first Jakarta shard (index 0) and return it as a DataFrame."""
  shard_path = "".join([BASE_DIR, "Jakarta", FILE_NAME_PATTERN.format(0)])
  return pd.read_parquet(shard_path)

## Change data types

In [None]:
# functions to change dtypes
def to_category(df, *args):
    """Cast the selected columns of ``df`` to the ``category`` dtype in place.

    Every positional argument after ``df`` is used as a column selector
    (``df[selector]``); the selection is converted and assigned back to
    ``df``. Nothing is returned.
    """
    for selector in args:
        converted = df[selector].astype("category")
        df[selector] = converted
    
def to_float32(df, *args):
    """Cast the selected columns of ``df`` to ``float32`` in place.

    Every positional argument after ``df`` is used as a column selector
    (``df[selector]``); the selection is converted and assigned back to
    ``df``. Nothing is returned.
    """
    for selector in args:
        converted = df[selector].astype("float32")
        df[selector] = converted
        
def to_uint16(df, *args):
    """Cast the selected columns of ``df`` to ``uint16`` in place.

    Every positional argument after ``df`` is used as a column selector
    (``df[selector]``); the selection is converted and assigned back to
    ``df``. Nothing is returned.
    """
    for selector in args:
        converted = df[selector].astype("uint16")
        df[selector] = converted
  
def to_int32(df, *args):
    """Cast the selected columns of ``df`` to ``int32`` in place.

    Every positional argument after ``df`` is used as a column selector
    (``df[selector]``); the selection is converted and assigned back to
    ``df``. Nothing is returned.
    """
    for selector in args:
        converted = df[selector].astype("int32")
        df[selector] = converted

def format_datetime(df, col_name):
    """Add ``time``, ``day_of_week``, ``month`` and ``year`` columns to ``df``.

    The values of ``df[col_name]`` are passed to
    ``datetime.datetime.fromtimestamp``, so they are interpreted as Unix
    timestamps in seconds and rendered in the local timezone of the running
    kernel. ``df`` is modified in place; nothing is returned.

    Args:
        df: DataFrame to extend.
        col_name: Name of the column holding the epoch-second timestamps.
    """
    # A single Python-level pass builds the datetimes and keeps the local-time
    # semantics of fromtimestamp. pd.to_datetime guarantees a datetime64
    # Series so the vectorised ``.dt`` accessors below are always available.
    dt = pd.to_datetime(df[col_name].apply(datetime.datetime.fromtimestamp))

    # Vectorised field extraction replaces one lambda call per row and field.
    df["time"] = dt.dt.time
    df["day_of_week"] = dt.dt.weekday  # Monday == 0, same as datetime.weekday()
    df["month"] = dt.dt.month
    df["year"] = dt.dt.year

In [None]:
# Load all Singapore shards and derive the calendar columns from the ping time.
sg_dataset = load_dataset_by_city('Singapore')
format_datetime(sg_dataset, "pingtimestamp")

# Shrink the dtypes; column names are passed as separate positional arguments.
to_category(sg_dataset, "trj_id", "driving_mode", "osname")
to_float32(sg_dataset, "rawlat", "rawlng", "speed", "accuracy")
to_uint16(sg_dataset, "bearing", "day_of_week", "month", "year")
to_int32(sg_dataset, "pingtimestamp")

# Show the resulting dtypes and memory footprint.
sg_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30329685 entries, 0 to 3032568
Data columns (total 13 columns):
 #   Column         Dtype   
---  ------         -----   
 0   trj_id         category
 1   driving_mode   category
 2   osname         category
 3   pingtimestamp  int32   
 4   rawlat         float32 
 5   rawlng         float32 
 6   speed          float32 
 7   bearing        uint16  
 8   accuracy       float32 
 9   time           object  
 10  day_of_week    uint16  
 11  month          uint16  
 12  year           uint16  
dtypes: category(3), float32(4), int32(1), object(1), uint16(4)
memory usage: 1.4+ GB


In [None]:
# Summarise every numeric column after upcasting it to float64, one column at
# a time to keep the extra memory small. Aggregating the float32 columns
# directly is not reliable at this row count: the recorded output of the plain
# describe() call shows a mean below the column minimum for rawlat and rawlng.
numeric_cols = sg_dataset.select_dtypes(include="number").columns
pd.concat(
    [sg_dataset[col].astype("float64").describe() for col in numeric_cols],
    axis=1,
)

Unnamed: 0,pingtimestamp,rawlat,rawlng,speed,bearing,accuracy,day_of_week,month,year
count,30329680.0,30329680.0,30329680.0,30329680.0,30329680.0,30329680.0,30329680.0,30329685.0,30329685.0
mean,1555321000.0,1.106323,70.80468,17.23095,178.842,6.4237,3.134983,4.0,2019.0
std,358012.1,0.232354,33.65821,6.906806,101.7391,38.71886,1.969039,0.0,0.0
min,1554682000.0,1.241585,103.6143,-1.0,0.0,1.0,0.0,4.0,2019.0
25%,1554999000.0,1.322314,103.7903,11.68946,93.0,4.0,1.0,4.0,2019.0
50%,1555343000.0,1.339908,103.8394,17.89483,172.0,6.0,3.0,4.0,2019.0
75%,1555645000.0,1.377545,103.8748,21.86,269.0,10.0,5.0,4.0,2019.0
max,1555891000.0,1.46565,104.0314,93.0,359.0,149000.0,6.0,4.0,2019.0


## Preprocess all start and end locations and write into file

In [None]:
# Sorted array of the distinct trajectory ids found in the Singapore data.
sg_trj_id_set = np.unique(sg_dataset["trj_id"])

In [None]:
def get_route(dataset, trj_id):
  """Return the pings of one trajectory ordered by time.

  Args:
    dataset: DataFrame with at least the columns ``trj_id``, ``rawlat``,
      ``rawlng`` and ``pingtimestamp``.
    trj_id: Trajectory id to select; it is formatted as text and compared
      against the ``trj_id`` column.

  Returns:
    DataFrame with the columns ``rawlat``, ``rawlng`` and ``pingtimestamp``
    for the matching rows, sorted by ascending ``pingtimestamp``. The result
    is empty when no row matches.
  """
  # Select with a boolean mask instead of splicing the id into a query string:
  # an id that contains a quote character would break the query expression.
  # f"{trj_id}" keeps the same text formatting of the id as before.
  matches = dataset[dataset["trj_id"] == f"{trj_id}"]
  return matches.sort_values("pingtimestamp")[["rawlat", "rawlng", "pingtimestamp"]]

In [None]:
# from tqdm import tqdm
# sg_trj_starts = {}
# sg_trj_ends = {}
# for id in tqdm(sg_trj_id_set):
#   route = get_route(sg_dataset, id)
#   sg_trj_starts[id] = route.iloc[0, :]
#   sg_trj_ends[id] = route.iloc[-1, :]

In [None]:
# import pickle
# with open(BASE_DIR + "SG_ROUTE_START.pkl", "wb") as f:
#   pickle.dump(sg_trj_starts, f)

# with open(BASE_DIR + "SG_ROUTE_END.pkl", "wb") as f:
#   pickle.dump(sg_trj_ends, f)

In [None]:
# Load the cached start and end record of every Singapore trajectory from the
# pickle files under BASE_DIR (presumably the files written by the
# commented-out preprocessing cells above; verify they are up to date).
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that were produced by this notebook.
import pickle
with open(BASE_DIR + "SG_ROUTE_START.pkl", "rb") as f:
  sg_trj_starts = pickle.load(f)

with open(BASE_DIR + "SG_ROUTE_END.pkl", "rb") as f:
  sg_trj_ends = pickle.load(f)

## Preprocess the morning and night peak hours for every day

In [None]:
# Smallest and largest ping timestamp in epoch seconds, copied from the
# rounded min/max shown in the describe() output, so they are approximate.
min_timestamp = 1.554682e+09
max_timestamp = 1.555891e+09
# Number of seconds in one day, used to shift a time window from day to day.
one_day_seconds = 24 * 3600
# fromtimestamp() renders the values in the kernel's local timezone.
print(datetime.datetime.fromtimestamp(min_timestamp))
print(datetime.datetime.fromtimestamp(max_timestamp))

2019-04-08 00:06:40
2019-04-21 23:56:40


In [None]:
# Peak-hour window boundaries of the first day (2019-04-08) in epoch seconds.
# The datetimes are pinned to UTC so the values no longer depend on the
# kernel's local timezone (a naive datetime is interpreted as local time by
# .timestamp()). UTC reproduces the boundaries recorded in this notebook's
# output, e.g. the first night window 1554757200-1554771600.
# NOTE(review): Singapore local time is UTC+8; confirm that the hours below
# are intended as UTC hours.

# Morning window: 09:00-13:00.
morning_peek_hour_start = datetime.datetime(2019, 4, 8, 9, tzinfo=datetime.timezone.utc)
morning_peek_hour_start = int(morning_peek_hour_start.timestamp())

morning_peek_hour_end = datetime.datetime(2019, 4, 8, 13, tzinfo=datetime.timezone.utc)
morning_peek_hour_end = int(morning_peek_hour_end.timestamp())

# Night window: 21:00 until 01:00 of the following day.
night_peek_hour_start = datetime.datetime(2019, 4, 8, 21, tzinfo=datetime.timezone.utc)
night_peek_hour_start = int(night_peek_hour_start.timestamp())

night_peek_hour_end = datetime.datetime(2019, 4, 9, 1, tzinfo=datetime.timezone.utc)
night_peek_hour_end = int(night_peek_hour_end.timestamp())

In [None]:
# One (start, end) pair of epoch seconds per day for 14 consecutive days,
# obtained by shifting the first day's window by whole days.
all_morning_peek_hours = [
    (morning_peek_hour_start + day * one_day_seconds,
     morning_peek_hour_end + day * one_day_seconds)
    for day in range(14)
]
all_night_peek_hours = [
    (night_peek_hour_start + day * one_day_seconds,
     night_peek_hour_end + day * one_day_seconds)
    for day in range(14)
]

In [None]:
# Display the night windows: one (start, end) pair of epoch seconds per day.
all_night_peek_hours

[(1554757200, 1554771600),
 (1554843600, 1554858000),
 (1554930000, 1554944400),
 (1555016400, 1555030800),
 (1555102800, 1555117200),
 (1555189200, 1555203600),
 (1555275600, 1555290000),
 (1555362000, 1555376400),
 (1555448400, 1555462800),
 (1555534800, 1555549200),
 (1555621200, 1555635600),
 (1555707600, 1555722000),
 (1555794000, 1555808400),
 (1555880400, 1555894800)]

## Sample passengers and drivers
We sample passengers and drivers by enumerating a “pivot” time every 5 minutes inside each peak window (the maps below are drawn for every second pivot, i.e. every 10 minutes). For every pivot, we select all trajectories that ended within the 3 minutes before the pivot and take their end locations as the locations of drivers. Similarly, we select all trajectories that started within the 3 minutes after the pivot and take their start locations as the locations of passengers.

In [None]:
from tqdm import tqdm

# Spacing between consecutive pivot times and length of the sampling window
# next to a pivot, both in seconds.
PIVOT_STEP_SECONDS = 300
SAMPLE_WINDOW_SECONDS = 180


def _endpoint_records(endpoints, trj_ids):
  """Return one (lat, lng, timestamp) tuple per id in ``trj_ids``, in order.

  ``endpoints`` is looked up with the string form of each id, and each value
  is read positionally at [0], [1] and [2].
  """
  records = []
  for trj_id in trj_ids:
    entry = endpoints[str(trj_id)]
    records.append((entry[0], entry[1], entry[2]))
  return records


def _locations_in_window(records, window_start, window_end):
  """Return (lat, lng) of every record with window_start < timestamp < window_end."""
  return [
      (lat, lng)
      for lat, lng, timestamp in records
      if timestamp > window_start and timestamp < window_end
  ]


# The per-trajectory lookups do not depend on the day or the pivot, so they
# are done once here instead of once per pivot.
sg_end_records = _endpoint_records(sg_trj_ends, sg_trj_id_set)
sg_start_records = _endpoint_records(sg_trj_starts, sg_trj_id_set)

# Both results map day index -> pivot timestamp -> list of (lat, lng) tuples.
sample_passengers = {}
sample_drivers = {}

for i in tqdm(range(14)):
  sample_passengers[i] = {}
  sample_drivers[i] = {}

  # The morning window is processed first and the night window second.
  for peek_start, peek_end in (all_morning_peek_hours[i], all_night_peek_hours[i]):
    for time_point in range(peek_start, peek_end, PIVOT_STEP_SECONDS):
      # Passengers: trajectories that start shortly after the pivot.
      sample_passengers[i][time_point] = _locations_in_window(
          sg_start_records, time_point, time_point + SAMPLE_WINDOW_SECONDS)
      # Drivers: trajectories that ended shortly before the pivot.
      sample_drivers[i][time_point] = _locations_in_window(
          sg_end_records, time_point - SAMPLE_WINDOW_SECONDS, time_point)
    

In [None]:
# Persist the sampled driver and passenger locations under BASE_DIR so they
# can be reused without repeating the sampling loop.
import pickle
with open(BASE_DIR + "SAMPLE_DRIVER2.pkl", "wb") as f:
  pickle.dump(sample_drivers, f)

with open(BASE_DIR + "SAMPLE_PASSENGER2.pkl", "wb") as f:
  pickle.dump(sample_passengers, f)

## Visualization

In [None]:
import folium
from folium import plugins
from branca.element import Figure

In [None]:
# Latitude/longitude used as the centre of the folium maps in this notebook.
sg_lat = 1.290270
sg_lng = 103.851959

# Second coordinate pair (presumably the Jakarta map centre, judging by the
# `jk_` prefix); it is not used by the cells visible here.
jk_lat = -6.2088
jk_lng = 106.8456

In [None]:
# create figure
fig = Figure(width = 550, height = 350)

# add map to figure
m = folium.Map(location = [sg_lat, sg_lng],
         tiles = 'cartodbpositron',
         zoom_start = 11)
fig.add_child(m)

In [None]:
def plot_morning_by_day(day_idx):
  """Save one folium map per 10-minute pivot of a day's morning peak window.

  Pivots run from the window start to the window end (exclusive) in steps of
  600 seconds. For each pivot the sampled passenger locations are drawn as
  green markers and the sampled driver locations as blue markers on a map
  centred on (sg_lat, sg_lng). Pivots without passengers or without drivers
  are skipped. Each map is saved as
  ``BASE_DIR + "JK/day<N>/<pivot datetime>.html"`` with ``N = day_idx + 1``.

  Args:
    day_idx: Zero-based day index into ``all_morning_peek_hours``,
      ``sample_passengers`` and ``sample_drivers``.
  """
  import os  # local import: only needed to create the output folder

  morning_peek_start = all_morning_peek_hours[day_idx][0]
  morning_peek_end = all_morning_peek_hours[day_idx][1]

  # NOTE(review): the maps are centred on Singapore but are written to a
  # folder named "JK/"; confirm that this folder name is intended.
  out_dir = BASE_DIR + "JK/" + "day{}/".format(day_idx + 1)

  for time_point in range(morning_peek_start, morning_peek_end, 600):
    sample_ps = sample_passengers[day_idx][time_point]
    sample_ds = sample_drivers[day_idx][time_point]

    # Decide whether to skip before any map object is built.
    if len(sample_ps) == 0 or len(sample_ds) == 0:
      continue

    # create figure and add a fresh map to it
    fig = Figure(width = 550, height = 350)
    m = folium.Map(location = [sg_lat, sg_lng],
                   tiles = 'cartodbpositron',
                   zoom_start = 11)
    fig.add_child(m)

    # passengers are green
    for p in sample_ps:
      folium.Marker(location = [p[0], p[1]],
                    popup = 'Default popup Marker1',
                    icon = folium.Icon(color = "green")).add_to(m)

    # drivers are blue
    for d in sample_ds:
      folium.Marker(location = [d[0], d[1]],
                    popup = 'Default popup Marker1',
                    icon = folium.Icon(color = "blue")).add_to(m)

    # m.save() does not create missing folders, so make sure the day folder exists.
    os.makedirs(out_dir, exist_ok=True)
    stamp_to_datetime = str(datetime.datetime.fromtimestamp(time_point))
    m.save(out_dir + stamp_to_datetime + ".html")


In [None]:
def plot_night_by_day(day_idx):
  """Save one folium map per 10-minute pivot of a day's night peak window.

  Pivots run from the window start to the window end (exclusive) in steps of
  600 seconds. For each pivot the sampled passenger locations are drawn as
  green markers and the sampled driver locations as blue markers on a map
  centred on (sg_lat, sg_lng). Pivots without passengers or without drivers
  are skipped. Each map is saved as
  ``BASE_DIR + "JK/day<N>/<pivot datetime>.html"`` with ``N = day_idx + 1``.

  Args:
    day_idx: Zero-based day index into ``all_night_peek_hours``,
      ``sample_passengers`` and ``sample_drivers``.
  """
  import os  # local import: only needed to create the output folder

  night_peek_start = all_night_peek_hours[day_idx][0]
  night_peek_end = all_night_peek_hours[day_idx][1]

  # NOTE(review): the maps are centred on Singapore but are written to a
  # folder named "JK/"; confirm that this folder name is intended.
  out_dir = BASE_DIR + "JK/" + "day{}/".format(day_idx + 1)

  for time_point in range(night_peek_start, night_peek_end, 600):
    sample_ps = sample_passengers[day_idx][time_point]
    sample_ds = sample_drivers[day_idx][time_point]

    # Decide whether to skip before any map object is built.
    if len(sample_ps) == 0 or len(sample_ds) == 0:
      continue

    # create figure and add a fresh map to it
    fig = Figure(width = 550, height = 350)
    m = folium.Map(location = [sg_lat, sg_lng],
                   tiles = 'cartodbpositron',
                   zoom_start = 11)
    fig.add_child(m)

    # passengers are green
    for p in sample_ps:
      folium.Marker(location = [p[0], p[1]],
                    popup = 'Default popup Marker1',
                    icon = folium.Icon(color = "green")).add_to(m)

    # drivers are blue
    for d in sample_ds:
      folium.Marker(location = [d[0], d[1]],
                    popup = 'Default popup Marker1',
                    icon = folium.Icon(color = "blue")).add_to(m)

    # m.save() does not create missing folders, so make sure the day folder exists.
    os.makedirs(out_dir, exist_ok=True)
    stamp_to_datetime = str(datetime.datetime.fromtimestamp(time_point))
    m.save(out_dir + stamp_to_datetime + ".html")

In [None]:
# Write the morning and night maps for each of the 14 days (day index 0-13).
for i in range(14):
  plot_morning_by_day(i)
  plot_night_by_day(i)

In [None]:
# All raw ping coordinates as a 2-column array: (rawlat, rawlng) per row.
pings = sg_dataset[['rawlat', 'rawlng']].to_numpy()
# Number of pings per batch.
batch_size = 128
# Floor division: this counts complete batches only, so a trailing partial
# batch is not included in the count.
num_batches = pings.shape[0] // batch_size

In [None]:
# create new figure
heat_map = Figure(height = 550, width = 750)

# create new map
map = folium.Map([sg_lat, sg_lng], zoom_start = 11,
                             min_zoom = 11, max_zoom = 16)
heat_map.add_child(map)
for i in range(num_batches):
  # create heatmap and add to map
  ping_batch = pings[i * batch_size: (i + 1) * batch_size]
  plugins.HeatMap(ping_batch, radius = 4, blur = 6).add_to(map)