In [None]:
from dask.distributed import Client
import os
# Host/IP of the Dask scheduler. The value below is a placeholder that has to
# be replaced with the real address before running the notebook.
scheduler = "<SCHEDULER_EXTERNAL_IP>"

# Connect to the scheduler on port 8786 and restart the cluster right away
# (presumably so the run starts from a clean worker state).
client = Client("{}:8786".format(scheduler))
client.restart()

In [None]:
%%time

# Directory holding the taxi CSV files. Named `data_dir` rather than `dir`
# so the `dir()` builtin stays usable in the rest of the notebook.
data_dir = "/ml-share/taxi-csv"
years = ('2017', '2018', '2019')

# Keep only the "yellow" files whose name mentions one of the selected years.
# os.listdir() returns entries in arbitrary order, so sort them to get the
# same partition order on every run.
files = sorted(
    os.path.join(data_dir, name)
    for name in os.listdir(data_dir)
    if 'yellow' in name and any(year in name for year in years)
)

# Columns to load from each CSV.
cols = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
        'passenger_count', 'trip_distance', 'RatecodeID',
        'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
        'payment_type', 'fare_amount', 'extra', 'mta_tax',
        'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']

from dask import dataframe as dd
import dask_cudf

# Lazily read all selected files into one Dask DataFrame.
# The datetime columns are referenced by name instead of by position so the
# parsing does not depend on how positional indices interact with `usecols`.
taxi = dd.read_csv(
    files,
    assume_missing=True,
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    usecols=cols,
)

In [None]:
# Model input columns, in the order they are fed to the regressor.
features = [
    # derived from the pickup timestamp
    'pickup_weekday',
    'pickup_hour',
    'pickup_minute',
    'pickup_week_hour',
    # taken directly from the trip record
    'passenger_count',
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'PULocationID',
    'DOLocationID',
]

In [None]:
%%time

from dask import persist
from dask.distributed import wait

# Build the engineered frame from `taxi` WITHOUT rebinding or mutating `taxi`,
# so this cell can be re-run safely. Overwriting `taxi.store_and_fwd_flag`
# in place would make a second run compare the already-converted float column
# against 'Y', which is False everywhere and silently zeroes the feature.
pickup = taxi.tpep_pickup_datetime
weekday = pickup.dt.weekday
hour = pickup.dt.hour

taxi_features = taxi.assign(
    pickup_weekday=weekday,
    pickup_hour=hour,
    pickup_minute=pickup.dt.minute,
    # hour-of-week index: weekday * 24 + hour
    pickup_week_hour=(weekday * 24) + hour,
    # encode the 'Y' flag as 1.0 and everything else as 0.0
    store_and_fwd_flag=(taxi.store_and_fwd_flag == 'Y').astype(float),
)

# Drop rows with any missing value, then keep only rows with strictly positive
# fare, total amount and passenger count.
taxi_features = taxi_features.dropna()
taxi_features = taxi_features[taxi_features["fare_amount"] > 0]
taxi_features = taxi_features[taxi_features["total_amount"] > 0]
taxi_features = taxi_features[taxi_features['passenger_count'] > 0]

# Feature matrix and regression target (total_amount), both cast to float32.
X = taxi_features[features].astype('float32')
y = taxi_features['total_amount'].astype('float32')

# Start computing both collections on the cluster and block until they finish.
X, y = persist(X, y)
_ = wait([X, y])

In [None]:
%%time

from dask_ml.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The split is shuffled, so a fixed
# random_state is passed to make it reproducible across runs (the same seed
# value the model training uses).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, shuffle=True, random_state=42
)

# Start computing the four splits on the cluster and block until they finish.
X_train, X_test, y_train, y_test = persist(X_train, X_test, y_train, y_test)
_ = wait([X_train, X_test, y_train, y_test])

In [None]:
%%time

import xgboost as xgb

# Distributed training matrix built from the persisted training split.
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)

# Parameters for the native (non-sklearn) training API.
# - The number of trees is set by `num_boost_round` below. The sklearn-style
#   `n_estimators` key is not a native training parameter and contradicted
#   the round count actually used, so it is not set here.
# - Threads are configured once via `nthread`. The extra `n_jobs: -1` entry
#   duplicated that setting with a conflicting value.
# - `seed` is the native name for the random seed.
# NOTE(review): an earlier revision had 'gpu_hist' / gpu_id options commented
# out; switch `tree_method` if the workers have GPUs — confirm before enabling.
params = {
    "verbosity": 3,
    "tree_method": 'hist',
    "objective": "reg:squarederror",
    "nthread": 7,
    "seed": 42,
}

# Train for 50 boosting rounds, evaluating on the training data each round.
output = xgb.dask.train(
    client,
    params,
    dtrain,
    num_boost_round=50,
    evals=[(dtrain, "train")],
)
                     

In [None]:
%%time

# Lazily predict on the held-out features using the result of xgb.dask.train.
y_pred = xgb.dask.predict(client=client, model=output, data=X_test)

In [None]:
%%time

import numpy as np
from sklearn import metrics

# Pull predictions and ground truth to the client as plain NumPy arrays.
# The data is loaded with dask.dataframe.read_csv, so y_test.compute() is a
# pandas Series; it has no `.to_array()` (a cuDF-only method), hence
# `.to_numpy()`.
pred = np.asarray(y_pred.compute())
gt = y_test.compute().to_numpy()

mae = metrics.mean_absolute_error(gt, pred)
mse = metrics.mean_squared_error(gt, pred)  # computed once, reused for RMSE
rmse = np.sqrt(mse)
# The earlier filtering keeps only rows with total_amount > 0, so the
# denominator is never zero.
mape = np.mean(np.abs((gt - pred) / np.abs(gt)))

print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', rmse)
print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
# "Accuracy" here is simply 100% minus MAPE, not a classification accuracy.
print('Accuracy:', round(100*(1 - mape), 2))

In [None]:
%%time

# Drop the notebook's references to the train/test splits, the predictions and
# the source frame. Names are removed in the same order as before.
del X_train, X_test
del y_test, y_train
del y_pred
del taxi