# NYC Taxi Fare Prediction with RayDP and PyTorch

In [1]:
import ray
import os
import pandas as pd, numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from pyspark.sql.functions import *

import raydp
from raydp.torch.estimator import TorchEstimator
from raydp.utils import random_split

## Initialize or connect to an existing Ray cluster

In [2]:
# First, initialize a local Ray cluster or connect to an existing one.
# The original note here says include_java should be set to True.
# NOTE(review): the call below passes no such argument - confirm whether the
# installed Ray version still needs it.
# For more Ray configuration options, see the Ray docs: https://docs.ray.io/en/latest/package-ref.html
# To connect to an already running cluster instead, use something like:
# ray.init(address="auto", redis_password="123")
ray.init()

2020-12-01 14:43:29,110	INFO services.py:1169 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.1.5',
 'raylet_ip_address': '192.168.1.5',
 'redis_address': '192.168.1.5:6379',
 'object_store_address': '/tmp/ray/session_2020-12-01_14-43-28_599723_35307/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-12-01_14-43-28_599723_35307/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2020-12-01_14-43-28_599723_35307',
 'metrics_export_port': 51619,
 'node_id': '90d19e71221e9fdd2f21b03cc7e03037b307db19'}

In [4]:
# With the Ray cluster up, ask RayDP for a Spark session.
# NOTE(review): the saved output of this cell is "The spark environment has
# inited." - presumably raised when the cell is run a second time without
# raydp.stop_spark() in between; confirm and re-run from a fresh kernel.
app_name = "NYC Taxi Fare Prediction with RayDP"
num_executors, cores_per_executor, memory_per_executor = 4, 1, "2GB"
spark = raydp.init_spark(
    app_name,
    num_executors,
    cores_per_executor,
    memory_per_executor,
)

Exception: The spark environment has inited.

In [4]:
# List the notebook's working directory (IPython automagic for the shell `ls`).
ls

pytorch_nyctaxi.ipynb   raydp-0.1-SNAPSHOT.jar


## Distributed data preprocessing with PySpark

In [5]:
# From here on, `spark` is used like any other Spark session.
# The dataset can be downloaded from https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/data
# Only a subset of the training data is used here.
train = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../../data1/new-york-city-taxi-fare-prediction/train.csv")
)

# Use UTC as the Spark session time zone for the datetime processing below.
spark.conf.set("spark.sql.session.timeZone", "UTC")

NameError: name 'spark' is not defined

In [6]:
# Clean up the outlier
def clean_up(data):
    """Drop rows whose coordinates, passenger count or fare fall outside fixed bounds.

    :param data: DataFrame with the pickup/dropoff longitude and latitude,
                 ``passenger_count`` and ``fare_amount`` columns.
    :return: the filtered DataFrame.
    """
    # (column, inclusive lower bound, inclusive upper bound)
    inclusive_bounds = (
        ('pickup_longitude', -76, -72),
        ('dropoff_longitude', -76, -72),
        ('pickup_latitude', 38, 42),
        ('dropoff_latitude', 38, 42),
        ('passenger_count', 1, 6),
    )
    for name, lower, upper in inclusive_bounds:
        data = data.filter(col(name) <= upper).filter(col(name) >= lower)

    # Fares are kept only strictly between 0 and 250.
    fare = col('fare_amount')
    data = data.filter(fare > 0).filter(fare < 250)

    # Keep only rides where both the longitude and the latitude changed.
    longitude_changed = col('dropoff_longitude') != col('pickup_longitude')
    latitude_changed = col('dropoff_latitude') != col('pickup_latitude')
    return data.filter(longitude_changed).filter(latitude_changed)

In [7]:
# Add time related features
def add_time_features(data):
    """Derive calendar and time-of-day features from ``pickup_datetime``.

    Adds the columns ``day``, ``hour_of_day``, ``day_of_week``,
    ``week_of_year``, ``month_of_year``, ``quarter_of_year`` and ``year``,
    plus two 0/1 integer flags:

    * ``night``      - pickup hour in 16..20 on Monday to Friday.
    * ``late_night`` - pickup hour >= 20 or <= 6.

    The flags are built from native Spark column expressions rather than
    Python UDFs, so a null hour or weekday yields 0 instead of raising.

    :param data: DataFrame with a ``pickup_datetime`` column.
    :return: the DataFrame with the feature columns appended.
    """
    pickup = col("pickup_datetime")

    data = data.withColumn("day", dayofmonth(pickup))
    data = data.withColumn("hour_of_day", hour(pickup))
    # Spark's dayofweek() is 1 (Sunday) .. 7 (Saturday), so this column is
    # -1 for Sunday, 0 for Monday, ..., 5 for Saturday.
    data = data.withColumn("day_of_week", dayofweek(pickup) - 2)
    data = data.withColumn("week_of_year", weekofyear(pickup))
    data = data.withColumn("month_of_year", month(pickup))
    data = data.withColumn("quarter_of_year", quarter(pickup))
    data = data.withColumn("year", year(pickup))

    hour_of_day = col("hour_of_day")
    day_of_week = col("day_of_week")

    # Monday..Friday only. The lower bound is needed because Sunday is
    # encoded as -1 and would otherwise pass a bare "< 5" check.
    is_weekday = (day_of_week >= 0) & (day_of_week < 5)
    is_night = (hour_of_day >= 16) & (hour_of_day <= 20) & is_weekday
    # The late-night window wraps around midnight, so the two half-ranges
    # must be OR-ed; AND-ing them can never be true.
    is_late_night = (hour_of_day >= 20) | (hour_of_day <= 6)

    data = data.withColumn("night", when(is_night, 1).otherwise(0))
    data = data.withColumn("late_night", when(is_late_night, 1).otherwise(0))
    return data

In [8]:
# Add distance related features
def add_distance_features(data):
    """Append coordinate-difference and landmark-distance features.

    Adds ``abs_diff_longitude``, ``abs_diff_latitude`` and their sum
    ``manhattan``, then for each landmark a ``pickup_distance_<name>`` and a
    ``dropoff_distance_<name>`` column holding
    ``|longitude - landmark_lon| + |latitude - landmark_lat|`` as a float.

    The distances use native Spark column arithmetic instead of a Python UDF.
    Column names, column order and the float column type are unchanged.

    :param data: DataFrame with the pickup/dropoff longitude and latitude columns.
    :return: the DataFrame with the distance columns appended.
    """
    # (column suffix, longitude, latitude) of each reference point.
    landmarks = (
        ("jfk", -73.7822222222, 40.6441666667),
        ("ewr", -74.175, 40.69),
        # Third NYC airport (presumably LaGuardia); suffix kept as "lgr"
        # so the feature column names do not change.
        ("lgr", -73.87, 40.77),
        # NYC downtown.
        ("downtown", -74.0063889, 40.7141667),
    )

    def distance_to(endpoint, lon, lat):
        """Coordinate-wise absolute distance from an endpoint's columns to (lon, lat)."""
        total = abs(col(endpoint + "_longitude") - lit(lon)) \
            + abs(col(endpoint + "_latitude") - lit(lat))
        # Cast so the column stays a 32-bit float, as the former UDF declared.
        return total.cast("float")

    # Absolute coordinate differences between pickup and dropoff.
    data = data.withColumn("abs_diff_longitude", abs(col("dropoff_longitude") - col("pickup_longitude"))) \
            .withColumn("abs_diff_latitude", abs(col("dropoff_latitude") - col("pickup_latitude")))
    data = data.withColumn("manhattan", col("abs_diff_latitude") + col("abs_diff_longitude"))

    # Distance between pickup/dropoff and each landmark, pickup column first.
    for name, lon, lat in landmarks:
        for endpoint in ("pickup", "dropoff"):
            data = data.withColumn(endpoint + "_distance_" + name, distance_to(endpoint, lon, lat))

    return data

In [9]:
# Drop unused features
def drop_col(data):
    """Remove the raw input columns that are not used as model features.

    :param data: DataFrame to strip.
    :return: the DataFrame without the raw datetime, coordinate,
             passenger-count and key columns.
    """
    unused = (
        "pickup_datetime",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "key",
    )
    for name in unused:
        data = data.drop(name)
    return data

In [10]:
# Run the preprocessing stages in order: filter outliers, add time features,
# add distance features, then drop the raw columns.
train_data = drop_col(
    add_distance_features(
        add_time_features(
            clean_up(train)
        )
    )
)

## Distributed model training and evaluation

In [11]:
# Randomly split the preprocessed data 90% / 10% into train and test sets.
train_df, test_df = random_split(train_data, [0.9, 0.1])
# Every column except the label is a model input feature.
features = [
    column.name
    for column in train_df.schema.fields
    if column.name != "fare_amount"
]

In [12]:
# Define the model, loss function and optimizer
class NYC_Model(nn.Module):
    """Fully connected regression network.

    Four hidden stages of widths 256, 128, 64 and 16, each applying
    Linear -> ReLU -> BatchNorm1d, followed by a single-output linear layer.
    """

    def __init__(self, cols):
        """Build the layers.

        :param cols: number of input features (width of the concatenated input).
        """
        super().__init__()

        widths = (cols, 256, 128, 64, 16, 1)

        # Linear layers fc1..fc5 are registered first, then the batch norms
        # bn1..bn4, so the parameter order and state_dict keys stay the same.
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{index}", nn.Linear(fan_in, fan_out))
        for index, width in enumerate(widths[1:-1], start=1):
            setattr(self, f"bn{index}", nn.BatchNorm1d(width))

    def forward(self, *x):
        """Concatenate the input tensors along dim 1 and run the network.

        :param x: one or more tensors, joined with ``torch.cat(x, dim=1)``.
        :return: the network output with dimension 1 squeezed away.
        """
        hidden = torch.cat(x, dim=1)
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
        )
        for linear, norm in stages:
            # Activation is applied before the normalisation.
            hidden = norm(F.relu(linear(hidden)))
        return self.fc5(hidden).squeeze(1)

# One network input per feature column; Smooth L1 loss optimised with Adam.
nyc_model = NYC_Model(cols=len(features))
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(params=nyc_model.parameters(), lr=1e-3)

In [13]:
# Wrap the model, optimizer and loss in a distributed RayDP estimator.
estimator = TorchEstimator(
    num_workers=4,
    model=nyc_model,
    optimizer=optimizer,
    loss=criterion,
    feature_columns=features,
    label_column="fare_amount",
    batch_size=256,
    num_epochs=30,
)

In [14]:
# Train the model on the Spark training split; the test split is passed as
# the second argument (presumably used for evaluation - confirm against the
# TorchEstimator API).
# NOTE(review): the saved output of this cell is a failure ending in
# "AttributeError: module 'typing' has no attribute '_ClassVar'" raised from
# site-packages/dataclasses.py under Python 3.8. This looks like the PyPI
# `dataclasses` backport shadowing the standard-library module; presumably
# uninstalling that package fixes it - confirm in the target environment.
estimator.fit_on_spark(train_df, test_df)

Traceback (most recent call last):
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/function_manager.py", line 494, in _load_actor_class_from_gcs
    actor_class = pickle.loads(pickled_class)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/__init__.py", line 1, in <module>
    from ray.util.sgd.torch import TorchTrainer
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/torch/__init__.py", line 12, in <module>
    from ray.util.sgd.torch.torch_trainer import (TorchTrainer,
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/torch/torch_trainer.py", line 13, in <module>
    from ray.tune import Trainable
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/tune/__init__.py", line 2, in <module>
    from ray.tune.tune import run_experiments, run
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/tune/tune.py", line 13, in <module>
    from ray.tune.ray_trial_executor import RayTrialExecut

[2m[36m(pid=26015)[0m 2020-12-01 00:03:26,509	ERROR function_manager.py:496 -- Failed to load actor class DistributedTorchRunner.
[2m[36m(pid=26015)[0m Traceback (most recent call last):
[2m[36m(pid=26015)[0m   File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/function_manager.py", line 494, in _load_actor_class_from_gcs
[2m[36m(pid=26015)[0m     actor_class = pickle.loads(pickled_class)
[2m[36m(pid=26015)[0m   File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/__init__.py", line 1, in <module>
[2m[36m(pid=26015)[0m     from ray.util.sgd.torch import TorchTrainer
[2m[36m(pid=26015)[0m   File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/torch/__init__.py", line 12, in <module>
[2m[36m(pid=26015)[0m     from ray.util.sgd.torch.torch_trainer import (TorchTrainer,
[2m[36m(pid=26015)[0m   File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/torch/torch_trainer.py", line 13, in <module>
[2m[36m(pi

RayTaskError(AttributeError): [36mray::DistributedTorchRunner.setup_address()[39m (pid=26014, ip=192.168.1.5)
  File "python/ray/_raylet.pyx", line 426, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 429, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 446, in ray._raylet.execute_task
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 310, in deserialize_objects
    self._deserialize_object(data, metadata, object_ref))
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 248, in _deserialize_object
    return self._deserialize_msgpack_data(data, metadata_fields)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 226, in _deserialize_msgpack_data
    python_objects = self._deserialize_pickle5_data(pickle5_data)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 216, in _deserialize_pickle5_data
    obj = pickle.loads(in_band)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/__init__.py", line 1, in <module>
    from ray.util.sgd.torch import TorchTrainer
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/torch/__init__.py", line 12, in <module>
    from ray.util.sgd.torch.torch_trainer import (TorchTrainer,
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/util/sgd/torch/torch_trainer.py", line 13, in <module>
    from ray.tune import Trainable
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/tune/__init__.py", line 2, in <module>
    from ray.tune.tune import run_experiments, run
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/tune/tune.py", line 13, in <module>
    from ray.tune.ray_trial_executor import RayTrialExecutor
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 15, in <module>
    from ray.tune.durable_trainable import DurableTrainable
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/tune/durable_trainable.py", line 5, in <module>
    from ray.tune.syncer import get_cloud_sync_client
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 90, in <module>
    class SyncConfig:
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/dataclasses.py", line 958, in dataclass
    return wrap(_cls)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/dataclasses.py", line 950, in wrap
    return _process_class(cls, init, repr, eq, order, unsafe_hash, frozen)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/dataclasses.py", line 800, in _process_class
    cls_fields = [_get_field(cls, name, type)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/dataclasses.py", line 800, in <listcomp>
    cls_fields = [_get_field(cls, name, type)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/dataclasses.py", line 659, in _get_field
    if (_is_classvar(a_type, typing)
  File "/Users/dr6jl/anaconda3/lib/python3.8/site-packages/dataclasses.py", line 550, in _is_classvar
    return type(a_type) is typing._ClassVar
AttributeError: module 'typing' has no attribute '_ClassVar'

In [15]:
# Tear everything down in reverse order of creation: the estimator first,
# then the RayDP Spark session, and finally Ray itself.
estimator.shutdown()
raydp.stop_spark()
ray.shutdown()