In [38]:
import ray
import os
import time
from ray import tune
from pyhdfs import HdfsClient
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost_ray import RayXGBRFRegressor, RayParams
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import pickle

ray.shutdown()
ray.init()

2022-11-22 23:08:05,995	INFO worker.py:1519 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


0,1
Python version:,3.10.6
Ray version:,2.1.0
Dashboard:,http://127.0.0.1:8266


In [6]:
nodes = ["127.0.0.1"]
client = HdfsClient(hosts=nodes, user_name="bigdata2022-VirtualBox")
df=pd.read_csv(client.open("hdfs://localhost:9870/user/bigdata2022/datasets/taxi_dataset_EDA.csv"))

In [7]:
df.head()
seed=42

In [8]:
X = df.drop(['trip_duration'], axis=1)
t = df['trip_duration']

In [9]:
X_train, X_test, t_train, t_test = train_test_split(X, t, train_size=0.75, random_state=seed)


In [10]:
start_time=time.time()


In [11]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
#ray_param = RayParams(
 #                     num_actors=4,
  #                    cpus_per_actor=1
   #                  ) 

In [19]:
model = RayXGBRFRegressor(n_jobs=4, random_state=seed, max_depth= 3,
          min_child_weight= 0.3361,
          gamma= 14.96,
          learning_rate= 0.243,
          subsample= 0.53,
          colsample_bylevel= 1.0,
          colsample_bytree= 0.6099)

In [20]:
param_grid = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
              'min_child_weight': np.arange(0.0001, 0.5, 0.001),
              'gamma': np.arange(0.0,40.0,0.005),
              'learning_rate': np.arange(0.0005,0.3,0.0005),
              'subsample': np.arange(0.01,1.0,0.01),
              'colsample_bylevel': np.round(np.arange(0.1,1.0,0.01)),
              'colsample_bytree': np.arange(0.1,1.0,0.01)
             }   

In [34]:
grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring="r2")

In [35]:
grid_result = grid_search.fit(X_train,t_train)

2022-11-22 22:21:52,045	INFO main.py:1035 -- [RayXGBoost] Created 4 new actors (4 total actors). Waiting until actors are ready for training.
2022-11-22 22:21:58,622	INFO main.py:1080 -- [RayXGBoost] Starting XGBoost training.
[2m[36m(_RemoteRayXGBoostActor pid=13759)[0m [22:22:03] task [xgboost.ray]:139696559849376 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=13766)[0m [22:22:04] task [xgboost.ray]:140025644851232 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=13767)[0m [22:22:04] task [xgboost.ray]:139813537308704 got new rank 2
[2m[36m(_RemoteRayXGBoostActor pid=13768)[0m [22:22:04] task [xgboost.ray]:139740276344096 got new rank 3
2022-11-22 22:22:30,544	INFO main.py:1575 -- [RayXGBoost] Finished XGBoost training on training data with total N=875,147 in 42.12 seconds (31.89 pure XGBoost training time).
2022-11-22 22:22:31,073	INFO main.py:1620 -- [RayXGBoost] Created 4 remote actors.
2022-11-22 22:22:37,194	INFO main.py:1637 -- [RayXGBoost] Starting XGBoost pred

In [23]:
model.fit(X_train, t_train,)

2022-11-22 22:01:40,785	INFO main.py:1035 -- [RayXGBoost] Created 4 new actors (4 total actors). Waiting until actors are ready for training.
2022-11-22 22:01:54,125	INFO main.py:1080 -- [RayXGBoost] Starting XGBoost training.
[2m[36m(_RemoteRayXGBoostActor pid=11114)[0m [22:02:01] task [xgboost.ray]:139976057177376 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=11121)[0m [22:02:01] task [xgboost.ray]:139694167408544 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=11240)[0m [22:02:01] task [xgboost.ray]:139888996069632 got new rank 2
[2m[36m(_RemoteRayXGBoostActor pid=11275)[0m [22:02:01] task [xgboost.ray]:139845519982496 got new rank 3
2022-11-22 22:02:21,505	INFO main.py:1575 -- [RayXGBoost] Finished XGBoost training on training data with total N=1,093,934 in 43.40 seconds (27.33 pure XGBoost training time).


In [36]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.001057 using {'subsample': 0.9500000000000001, 'min_child_weight': 0.0031, 'max_depth': 12, 'learning_rate': 0.28800000000000003, 'gamma': 38.565, 'colsample_bytree': 0.7999999999999996, 'colsample_bylevel': 1.0}


In [40]:
model_saved='finalized_model.sav'
pickle.dump(model, open(model_saved, 'wb'))

In [24]:
pred_ray = model.predict(X_test)
print(pred_ray)

2022-11-22 22:02:52,328	INFO main.py:1620 -- [RayXGBoost] Created 4 remote actors.
2022-11-22 22:03:00,952	INFO main.py:1637 -- [RayXGBoost] Starting XGBoost prediction.


[149.74857  154.82167  119.081825 ... 230.31985  119.081825 351.42758 ]


In [31]:
#score = model.score(X_test,pred_ray)

2022-11-22 22:19:32,550	INFO main.py:1620 -- [RayXGBoost] Created 4 remote actors.
2022-11-22 22:19:39,505	INFO main.py:1637 -- [RayXGBoost] Starting XGBoost prediction.


In [41]:
params = model.get_xgb_params()
print(params)

{'colsample_bynode': 0.8, 'learning_rate': 0.243, 'reg_lambda': 1e-05, 'subsample': 0.53, 'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1.0, 'colsample_bytree': 0.6099, 'eval_metric': None, 'gamma': 14.96, 'gpu_id': -1, 'grow_policy': 'depthwise', 'interaction_constraints': '', 'max_bin': 256, 'max_cat_threshold': 64, 'max_cat_to_onehot': 4, 'max_delta_step': 0, 'max_depth': 3, 'max_leaves': 0, 'min_child_weight': 0.3361, 'monotone_constraints': '()', 'n_jobs': 4, 'num_parallel_tree': 100, 'predictor': 'auto', 'random_state': 42, 'reg_alpha': 0, 'sampling_method': 'uniform', 'scale_pos_weight': 1, 'tree_method': 'approx', 'validate_parameters': 1, 'verbosity': None}


In [42]:
print(time.time()-start_time)

4447.034363269806
