# Generate map tiles from NYC taxi trip data using parallel method

在[上一篇文章](https://yeshuanova.github.io/blog/posts/implement-OSM-map-tiles/)中已展示過了簡單建立圖磚資料的方式。本文將改進前篇的圖磚建立方式，加速產生速度以及能處理大檔案。

- Concurrency (平行化)
    - 使用 [concurrent.futures](https://docs.python.org/3/library/concurrent.futures.html) 套件將任務平行處理，讓多 CPU 能充分發揮處理能力。
    - 將處理流程中彼此獨立、可分散處理的部份盡量平行化。


- Large size file (單一大檔案)
  - 以 [Divide and Conquer](https://en.wikipedia.org/wiki/Divide_and_conquer_algorithm) 概念來分割檔案並單獨處理，最後再將結果合併。
      
    
- 加速 map tiles 建立速度
  - 先建立一個基底 Tiles 後，再由下往上合併的方式 (Bottom-Up)建立新 Tile，避免不必要的運算。


- 不使用 Log 方式，而改用 [Histogram Equalization](https://en.wikipedia.org/wiki/Histogram_equalization) 繪製 Map tile 來取得較好的繪圖效果。


- 不繪製 aggregation 中 count 結果未超過 **5** 的點位（程式以 `agg > 5` 篩選），避免離散雜訊資料影響結果。

## 資料前處理

### 下載 NYC Taxi Trip Data

使用 `wget` 指令取得 NYC Taxi trip data，這裡一樣使用 2016 年 5 月的資料。

```bash
wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-05.csv
```

### 將原始 CSV 分割為小檔案

依照資料筆數切割為許多小檔案

```bash
# 建立 split 資料夾
mkdir split

# 以 1,000,000 lines 為單位，將原始 csv 檔案分割為數個檔案（不包含 header）
tail -n +2 yellow_tripdata_2016-05.csv | split -d -l 1000000 - split/trip.csv.
```

完成後可得到 `trip.csv.00`，`trip.csv.01` 等不包含標頭的分割檔

## 轉換 csv 內容到目標格式

將 csv 檔案轉換到包含 Web Mercator 格式座標(epsg:3857)以及所在基底 Tile 的座標資料。

In [None]:
import os, glob, csv, mercantile
import numpy as np

from pyproj import transform, Proj

# Source coordinate system: WGS84 longitude/latitude.
proj_source = Proj(init="epsg:4326")
# Target coordinate system: Web Mercator.
proj_target = Proj(init="epsg:3857")


def toEpsg3857(lng, lat):
    """Project a WGS84 longitude/latitude pair into Web Mercator (EPSG:3857).

    Args:
        lng: Longitude in the source projection.
        lat: Latitude in the source projection.

    Returns:
        The result of ``pyproj.transform`` for the given pair, which the
        caller in this file unpacks as ``(x, y)``.
    """
    # NOTE(review): `Proj(init=...)` and `pyproj.transform` are the legacy
    # pyproj API — confirm they are still supported by the installed version.
    projected = transform(proj_source, proj_target, lng, lat)
    return projected

def toMapTileCoord(lng, lat, zoom):
    """Return the ``(x, y, z)`` index of the map tile containing a point.

    Args:
        lng: Longitude of the point.
        lat: Latitude of the point.
        zoom: Zoom level of the tile to look up.

    Returns:
        A plain tuple ``(tile.x, tile.y, tile.z)`` taken from
        ``mercantile.tile``.
    """
    containing_tile = mercantile.tile(lng, lat, zoom)
    return (containing_tile.x, containing_tile.y, containing_tile.z)

def convTripGpsToWebMercator(source, target, base_zoom):
    """Convert a header-less trip CSV into a Web Mercator / tile-index CSV.

    For every input row, columns 5 and 6 are read as longitude and latitude.
    Rows whose coordinates are valid are written to ``target`` as
    ``x, y, zoom, xtile, ytile``; all other rows are skipped.

    Args:
        source: Path of the input CSV (no header row expected).
        target: Path of the output CSV; parent directories are created.
        base_zoom: Zoom level used to compute the tile index of each point.
    """
    # Guard against a bare file name: os.makedirs('') raises FileNotFoundError.
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # newline='' is what the csv module requires for files it reads/writes,
    # otherwise embedded newlines and line endings can be mangled.
    with open(source, 'r', newline='') as cf, open(target, 'w', newline='') as wf:
        creader = csv.reader(cf, delimiter=',')
        cwriter = csv.writer(wf, delimiter=',')
        cwriter.writerow(['x', 'y', 'zoom', 'xtile', 'ytile'])
        for row in creader:
            try:
                lat, lng = float(row[6]), float(row[5])
                if (-180.0 <= lng <= 180.0) and (-90.0 <= lat <= 90.0):
                    x, y = toEpsg3857(lng, lat)
                    xtile, ytile, zoom = toMapTileCoord(lng, lat, base_zoom)
                    cwriter.writerow([x, y, zoom, xtile, ytile])
            except (ValueError, IndexError):
                # ValueError: non-numeric coordinate; IndexError: blank or
                # truncated row (csv.reader yields [] for an empty line).
                continue

def convTripGpsToWebMercatorWrapper(tup):
    """Single-argument adapter so the conversion can be used with ``map``.

    Args:
        tup: Indexable whose first three items are
            ``(source, target, base_zoom)``.
    """
    source, target, zoom = tup[0], tup[1], tup[2]
    convTripGpsToWebMercator(source, target, zoom)

- 建立 csv 來源與目標檔案列表

In [None]:
# Zoom level of the base tiles that every other level is derived from.
base_zoom = 15

# Split CSV parts produced by the `split` command.
files_parts = glob.glob('./split/*.csv.*')
# One converted output path per part, mirrored under ./split/epsg3857.
files_conv = [
    os.path.join('./split/epsg3857', os.path.basename(part_path))
    for part_path in files_parts
]
# The same base zoom repeated once per part, for zipping into task tuples.
base_zoom_list = [base_zoom for _ in files_parts]

- 以 `concurrent.futures.ProcessPoolExecutor()` 平行執行轉換步驟

In [None]:
import concurrent.futures as futures

# Convert every split CSV part in parallel, one task per part.
with futures.ProcessPoolExecutor() as executor:
    tuple_list = list(zip(files_parts, files_conv, base_zoom_list))
    fs = executor.map(convTripGpsToWebMercatorWrapper, tuple_list)
    # executor.map() yields results, not Future objects, so as_completed()
    # does not apply here. Draining the iterator waits for every task and
    # re-raises any exception that occurred inside a worker process.
    for _result in fs:
        pass
        

可得到格式為

| x | y | zoom | xtile  | ytile |
|-|-|-|-|:-|
| Web Mercator - X | Web Mercator - Y | 地圖基底 zoom | Map Tile X 位置 | Map Tile Y 位置 |

的 csv 檔案

## 建立 map tile aggregation file

建立基底 zoom 中，所有包含資料的 map tile aggregation files

- 用 Pandas 讀取分割的 csv 檔案
- 執行 Groupby() 計算所需建立的 tile group，避免沒必要的 aggregate 計算
- 使用 concurrent.futures 進行平行處理，加快計算速度


In [None]:
import gzip, pickle, yaml

def getAggFilePath(root, x, y, z):
    """Build the path of a tile's aggregation file: ``root/z/x/y.pkl.gz``."""
    file_name = f'{y}.pkl.gz'
    return os.path.join(root, f'{z}', f'{x}', file_name)

def getAggYamlFilePath(root, x, y, z):
    """Build the path of a tile's YAML summary file: ``root/z/x/y.yaml``."""
    file_name = f'{y}.yaml'
    return os.path.join(root, f'{z}', f'{x}', file_name)

def serializeAggToFile(agg, file_path):
    """Pickle ``agg`` and store it as a gzip-compressed file.

    Args:
        agg: Object to serialize (an aggregation in this file).
        file_path: Destination path; its parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    os.makedirs(parent_dir, exist_ok=True)
    with gzip.open(file_path, mode='wb') as gz_out:
        pickle.dump(agg, gz_out)

def serializeAggYaml(agg, file_path):
    """Write a YAML summary file for an aggregation.

    The file contains a single key, ``max_count``, holding the maximum of
    ``agg.values`` converted to ``int``.

    Args:
        agg: Aggregation exposing a ``values`` array.
        file_path: Destination path; its parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, mode='w') as yaml_out:
        summary = {"max_count": int(agg.values.max())}
        yaml.dump(summary, yaml_out, default_flow_style=False)

In [None]:
import datashader as ds

def mapTileCanvas(xtile, ytile, zoom, tile_size=(256, 256)):
    """Create a ``datashader.Canvas`` covering exactly one map tile.

    Args:
        xtile: Tile x index.
        ytile: Tile y index.
        zoom: Tile zoom level.
        tile_size: ``(width, height)`` of the canvas in pixels.

    Returns:
        A canvas whose x/y ranges are the tile bounds reported by
        ``mercantile.xy_bounds``.
    """
    tile_bounds = mercantile.xy_bounds(xtile, ytile, zoom)
    horizontal = (tile_bounds.left, tile_bounds.right)
    vertical = (tile_bounds.bottom, tile_bounds.top)
    return ds.Canvas(plot_width=tile_size[0],
                     plot_height=tile_size[1],
                     x_range=horizontal,
                     y_range=vertical)

In [None]:
import pandas as pd

def makeTilesAggregation(source, agg_root):
    """Aggregate one converted CSV into per-tile aggregation files.

    The CSV is grouped by ``(zoom, xtile, ytile)`` so that only tiles that
    actually contain points are aggregated. For each group a point
    aggregation and its YAML summary are written under ``agg_root``.

    Args:
        source: Path of a CSV with columns ``x, y, zoom, xtile, ytile``.
        agg_root: Root directory that receives the aggregation files.
    """
    column_types = {
        'x': np.float32,
        'y': np.float32,
        'zoom': np.int8,
        'xtile': np.int32,
        'ytile': np.int32,
    }
    points = pd.read_csv(source,
                         usecols=['x', 'y', 'zoom', 'xtile', 'ytile'],
                         dtype=column_types)

    for tile_key, tile_points in points.groupby(by=['zoom', 'xtile', 'ytile']):
        zoom, xtile, ytile = tile_key
        agg = mapTileCanvas(xtile, ytile, zoom).points(tile_points, 'x', 'y')
        serializeAggToFile(agg, getAggFilePath(agg_root, xtile, ytile, zoom))
        serializeAggYaml(agg, getAggYamlFilePath(agg_root, xtile, ytile, zoom))
        
def makeTilesAggregationWrapper(tup):
    """Single-argument adapter so the aggregation can be used with ``map``.

    Args:
        tup: Indexable whose first two items are ``(source, agg_root)``.
    """
    source, agg_root = tup[0], tup[1]
    makeTilesAggregation(source, agg_root)

In [None]:
# Build the per-part tile aggregations in parallel.
with futures.ProcessPoolExecutor() as executor:
    csv_source = glob.glob('./split/epsg3857/*')
    # Each CSV part gets its own temp folder named after the part file.
    agg_target = [os.path.join('./map-parallel/agg/temp/', os.path.basename(file)) for file in csv_source]
    tuple_list = list(zip(csv_source, agg_target))

    fs = executor.map(makeTilesAggregationWrapper, tuple_list)
    # executor.map() yields results, not Future objects, so as_completed()
    # does not apply here. Draining the iterator waits for every task and
    # re-raises any exception that occurred inside a worker process.
    for _result in fs:
        pass

## Combine multiple aggregation

合併所有 map tiles aggregation files 成為一個包含所有分割 csv 資料的整合 aggregation 結果。

In [None]:
def getTileFromPath(agg_path):
    """Recover the tile index encoded in an aggregation file path.

    The last three path components are expected to be ``z/x/y.<ext>``.

    Args:
        agg_path: Path split on ``os.sep``.

    Returns:
        ``(xtile, ytile, zoom)`` as integers.

    Raises:
        ValueError: If the path has fewer than three components, or a
            component is not an integer.
    """
    components = agg_path.split(os.sep)
    if not len(components) >= 3:
        raise ValueError("agg_path can not convert to tile path")

    xtile = int(components[-2])
    # Only the part of the file name before the first dot is the y index.
    ytile = int(components[-1].partition('.')[0])
    zoom = int(components[-3])
    return (xtile, ytile, zoom)

def readAggregationFile(file):
    """Load and return the object pickled in a gzip-compressed file."""
    with gzip.open(file, 'rb') as gz_in:
        loaded = pickle.load(gz_in)
    return loaded


In [None]:
def getCombinTiles(temp_folder):
    """List the distinct tiles that have aggregation files to combine.

    Args:
        temp_folder: Folder searched with the pattern ``*/*/*/*.pkl.gz``.

    Returns:
        A list of unique ``(x, y, z)`` tuples, in no particular order.
    """
    pattern = os.path.join(temp_folder, '*', '*', '*', '*.pkl.gz')
    unique_tiles = set()
    for agg_file in glob.glob(pattern):
        unique_tiles.add(tuple(getTileFromPath(agg_file)))
    return list(unique_tiles)

In [None]:
from functools import reduce

def combineAggregation(x, y, z, temp_folder, target_folder):
    """Sum every partial aggregation of one tile and store the total.

    Partial files are found with ``temp_folder/*/{z}/{x}/{y}.pkl.gz``; the
    summed aggregation and its YAML summary are written under
    ``target_folder``.

    Args:
        x: Tile x index.
        y: Tile y index.
        z: Tile zoom level.
        temp_folder: Folder holding the per-part aggregation trees.
        target_folder: Root folder for the combined result.
    """
    os.makedirs(target_folder, exist_ok=True)
    pattern = os.path.join(temp_folder, f'*/{z}/{x}/{y}.pkl.gz')

    # Lazily load each partial aggregation and fold them together with `+`.
    partial_aggs = (readAggregationFile(path) for path in glob.glob(pattern))
    total = reduce(lambda acc, part: acc + part, partial_aggs)

    serializeAggToFile(total, getAggFilePath(target_folder, x, y, z))
    serializeAggYaml(total, getAggYamlFilePath(target_folder, x, y, z))

def combineAggregationWrapper(tup):
    """Single-argument adapter so the combine step can be used with ``map``.

    Args:
        tup: ``((x, y, z), temp_folder, target_folder)``.
    """
    tile, temp_folder, target_folder = tup
    x, y, z = tile
    combineAggregation(x, y, z, temp_folder, target_folder)

In [None]:
# Combine the per-part aggregations of every tile in parallel.
with futures.ProcessPoolExecutor() as executor:
    temp_folder = './map-parallel/agg/temp/'
    target_folder = './map-parallel/agg/'
    tile_list = getCombinTiles(temp_folder)

    tuple_list = list(zip(tile_list, [temp_folder] * len(tile_list), [target_folder] * len(tile_list)))
    fs = executor.map(combineAggregationWrapper, tuple_list)
    # executor.map() yields results, not Future objects, so as_completed()
    # does not apply here. Draining the iterator waits for every task and
    # re-raises any exception that occurred inside a worker process.
    for _result in fs:
        pass

完成後可以得到合併後的基底 map tile 的 aggregation file

## 使用 Bottom-Up 方式合併並產生新圖層

- 從基底 tile 的 aggregation 的檔案中建立 parents tile 列表
  - 讀取 parents tile 中所有的 child tiles 檔案並合併資料後，再寫入 parents tile
  - 如果有的 child tiles 不存在，則建立內部為空值的 map tile
- 不斷往上建立 parents tiles 直到完成為止

### 範例

- 假如現在有一個 tile 座標為 (xtile, ytile, zoom) = (23, 43, 6)，則該 tile 的 parent tile 為 pTile = (11, 21, 5)。

- 而 pTile 的四個 children tiles 為 (22, 42, 6), (23, 42, 6), (23, 43, 6), (22, 43, 6)。

- 因此 pTile 的 aggregation file 可從四個 children tiles 中合併得來

In [None]:
import pandas as pd
from mercantile import Tile

# Placeholder one-row dataframe handed to a canvas when an aggregation is
# needed for a tile that has no data file.
dummy_df = pd.DataFrame({'x': [0], 'y': [0]})

In [None]:
def isPickleFile(file):
    """Tell whether a path is named like a gzip-compressed pickle file.

    The check is purely on the name: splitting on ``'.'`` must give at
    least three pieces and the last two must be ``pkl`` and ``gz``.
    """
    pieces = file.split('.')
    return len(pieces) >= 3 and pieces[-2:] == ['pkl', 'gz']

In [None]:
def poolmat(m):
    """Halve a matrix in each dimension by summing its 2x2 blocks.

    For example a 256x256 input gives a 128x128 output in which each cell
    is the sum of the four input cells it covers.
    """
    top_left = m[::2, ::2]
    top_right = m[::2, 1::2]
    bottom_right = m[1::2, 1::2]
    bottom_left = m[1::2, ::2]
    return top_left + top_right + bottom_right + bottom_left

In [None]:
def makeTileAggegation(agg_root, tile):
    """Load a tile's aggregation, or build an empty stand-in for it.

    Args:
        agg_root: Root folder of the aggregation files.
        tile: ``(x, y, z)`` tile, unpacked positionally.

    Returns:
        The unpickled aggregation when the tile's file exists and can be
        read; otherwise an aggregation produced from ``dummy_df`` on the
        tile's canvas.
    """
    file_path = getAggFilePath(agg_root, *tile)
    if os.path.exists(file_path):
        try:
            with gzip.open(file_path, 'rb') as f:
                return pickle.load(f)
        except Exception:
            # Best effort: an unreadable or corrupt file is treated like a
            # missing one. Unlike a bare `except:`, this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
    return mapTileCanvas(*tile).points(dummy_df, 'x', 'y')

In [None]:

def combineTileButtomUp(agg_root, x, y, z):
    """Build a tile's aggregation from its four child tiles (bottom-up).

    An aggregation is created for tile ``(x, y, z)`` from ``dummy_df`` and
    each of its quadrants is overwritten with the 2x2-pooled values of the
    matching child tile.

    Args:
        agg_root: Root folder of the aggregation files.
        x: Tile x index.
        y: Tile y index.
        z: Tile zoom level.

    Returns:
        The filled aggregation of the parent tile.
    """
    parent_agg = mapTileCanvas(x, y, z).points(dummy_df, 'x', 'y')

    n_rows, n_cols = parent_agg.values.shape
    half_rows = n_rows // 2
    half_cols = n_cols // 2

    # lt = left-top, rt = right-top, rb = right-bottom, lb = left-bottom
    lt, rt, rb, lb = mercantile.children(Tile(x, y, z))

    def pooled(child):
        # Child aggregation shrunk to a quarter of the parent grid.
        return poolmat(makeTileAggegation(agg_root, child).values)

    # The bottom children go into the first half of the rows — presumably
    # row 0 of the aggregate is the lowest y; verify against datashader.
    parent_agg.values[:half_rows, :half_cols] = pooled(lb)
    parent_agg.values[:half_rows, half_cols:n_cols] = pooled(rb)
    parent_agg.values[half_rows:n_rows, half_cols:n_cols] = pooled(rt)
    parent_agg.values[half_rows:n_rows, :half_cols] = pooled(lt)

    return parent_agg

In [None]:
def getParentTiles(agg_root, base_zoom):
    """List the distinct parent tiles of every tile stored at one zoom.

    Args:
        agg_root: Root folder of the aggregation files.
        base_zoom: Zoom level whose ``*.pkl.gz`` files are scanned.

    Returns:
        A list of unique ``(x, y, z)`` parent tiles, in no particular order.
    """
    pattern = os.path.join(agg_root, str(base_zoom), '*', '*.pkl.gz')
    parents = set()
    for agg_file in glob.glob(pattern):
        x, y, z = getTileFromPath(agg_file)
        parent = mercantile.parent(Tile(x, y, z))
        parents.add((parent.x, parent.y, parent.z))
    return list(parents)

In [None]:
def makeTilesBottomUp(agg_root, x, y, z):
    """Build tile ``(x, y, z)`` from its children and write it to disk.

    Both the aggregation file and its YAML summary are written under
    ``agg_root``.
    """
    combined = combineTileButtomUp(agg_root, x, y, z)
    agg_file = getAggFilePath(agg_root, x, y, z)
    serializeAggToFile(combined, agg_file)
    yaml_file = getAggYamlFilePath(agg_root, x, y, z)
    serializeAggYaml(combined, yaml_file)

def makeTilesBottomUpWrapper(tuple_obj):
    """Single-argument adapter so the bottom-up step can be used with ``map``.

    Args:
        tuple_obj: Iterable of positional arguments for
            ``makeTilesBottomUp`` — ``(agg_root, x, y, z)`` in this file.
    """
    positional = tuple(tuple_obj)
    makeTilesBottomUp(*positional)

In [None]:

import concurrent.futures as futures

agg_root = './map-parallel/agg/'

# One worker pool is reused for every zoom level instead of being re-created
# on each iteration.
with futures.ProcessPoolExecutor() as executor:
    # Walk from the base zoom upwards: the tiles found at `zoom` produce the
    # parent tiles of the next level up.
    for zoom in range(base_zoom, 0, -1):
        its = [(agg_root, x, y, z) for x, y, z in getParentTiles(agg_root, zoom)]
        fs = executor.map(makeTilesBottomUpWrapper, its)
        # executor.map() yields results, not Future objects, so
        # as_completed() does not apply here. Draining the iterator makes
        # sure this level is fully written before the next one is scanned,
        # and re-raises any exception raised inside a worker process.
        for _result in fs:
            pass
        print(f'Make parents tiles from zoom {zoom}')

## 產生 Aggregation 檔案對應的 Tile 影像

In [None]:
import datashader.transfer_functions as tf
from colorcet import fire

def getRenderImage(img_root, agg_path):
    """Map an aggregation file path to its image path ``img_root/z/x/y.png``."""
    x, y, z = getTileFromPath(agg_path)
    image_name = str(y) + '.png'
    return os.path.join(img_root, str(z), str(x), image_name)


In [None]:
def makeTileImage(source, target):
    """Render one aggregation file into a PNG tile image.

    Args:
        source: Path of a gzip-compressed pickle aggregation file. Paths
            that are not named ``*.pkl.gz`` are ignored.
        target: Path of the PNG to write; parent directories are created.
    """
    if not isPickleFile(source):
        return

    # Keep the file open only while unpickling.
    with gzip.open(source, 'rb') as f:
        agg = pickle.load(f)

    # Only cells with a count strictly greater than 5 are shaded; the rest
    # are masked out so sparse noise is not drawn.
    img = tf.shade(agg.where(agg > 5), cmap=fire)

    # Write to the path the caller asked for rather than rebuilding it from
    # a module-level `tile_root`, which is not guaranteed to exist when this
    # function runs.
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(target, mode='wb') as out:
        out.write(img.to_bytesio(format='png').read())

def makeTileImageWrapper(tup_obj):
    """Single-argument adapter so rendering can be used with ``map``.

    Args:
        tup_obj: Indexable whose first two items are ``(source, target)``.
    """
    source, target = tup_obj[0], tup_obj[1]
    makeTileImage(source, target)

In [None]:

# Render every aggregation file into its PNG tile in parallel.
with futures.ProcessPoolExecutor() as executor:
    tile_root = './map-parallel/tile/'

    render_agg_list = glob.glob(os.path.join(agg_root, '*', '*', '*.pkl.gz'))
    render_img_list = [getRenderImage(tile_root, agg_path) for agg_path in render_agg_list]

    tup_list = list(zip(render_agg_list, render_img_list))

    fs = executor.map(makeTileImageWrapper, tup_list)
    # executor.map() yields results, not Future objects, so as_completed()
    # does not apply here. Draining the iterator waits for every task and
    # re-raises any exception that occurred inside a worker process.
    for _result in fs:
        pass


## 使用 Folium 顯示圖層

In [4]:
import folium

# Base map using the Carto Dark tiles.
fmap = folium.Map(location=[40.772562, -73.974039],
                  tiles='https://cartodb-basemaps-{s}.global.ssl.fastly.net/dark_all/{z}/{x}/{y}.png',
                  max_zoom=15,
                  zoom_start=12,
                  attr='Carto Dark')

# Overlay the generated map tiles hosted on GitHub. A TileLayer is added
# explicitly rather than through the legacy Map.add_tile_layer() helper.
folium.TileLayer(tiles='https://raw.githubusercontent.com/yeshuanova/nyc_taxi_trip_map/master/map-parallel/tile/{z}/{x}/{y}.png',
                 attr='NYC taxi pickup Heatmap',
                 max_zoom=15).add_to(fmap)

# Save the map as an HTML file.
fmap.save('index-parallel.html')

# Display the map in the notebook.
fmap