In [24]:
import ray
import os
import time
from ray import tune
from pyhdfs import HdfsClient
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost_ray import RayDMatrix, RayParams, train, RayXGBClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, f1_score, classification_report

ray.shutdown()
ray.init()

2022-11-24 12:42:44,397	INFO worker.py:1519 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.10.6
Ray version:,2.1.0
Dashboard:,http://127.0.0.1:8265


In [9]:
nodes = ["127.0.0.1"]
client = HdfsClient(hosts=nodes, user_name="bigdata2022-VirtualBox")
df=pd.read_csv(client.open("hdfs://localhost:9870/user/bigdata2022/datasets/Australia1.csv"))

In [11]:
print(df.head())
seed=42

   MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
0     13.4     22.9       0.6     5.472516       8.5           44.0   
1      7.4     25.1       0.0     5.472516       8.5           44.0   
2     12.9     25.7       0.0     5.472516       8.5           46.0   
3      9.2     28.0       0.0     5.472516       8.5           24.0   
4     17.5     32.3       1.0     5.472516       8.5           41.0   

   WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...  WindDir3pm_NNW  \
0          20.0          24.0         71.0         22.0  ...               0   
1           4.0          22.0         44.0         25.0  ...               0   
2          19.0          26.0         38.0         30.0  ...               0   
3          11.0           9.0         45.0         16.0  ...               0   
4           7.0          20.0         82.0         33.0  ...               0   

   WindDir3pm_NW  WindDir3pm_S  WindDir3pm_SE  WindDir3pm_SSE  WindDir3pm_SSW  \
0          

In [12]:
X = df.drop(['RainTomorrow'], axis=1)
t = df['RainTomorrow']

In [13]:
X_train, X_test, t_train, t_test = train_test_split(X, t, train_size=0.75)
print(t_test.head())

84973     0.0
72501     1.0
1059      1.0
48974     0.0
111248    0.0
Name: RainTomorrow, dtype: float64


In [14]:
start_time=time.time()

In [15]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
model = RayXGBClassifier(n_jobs=4,random_state=seed)

In [20]:
model.fit(X_train,t_train)

2022-11-24 12:38:38,631	INFO main.py:1035 -- [RayXGBoost] Created 4 new actors (4 total actors). Waiting until actors are ready for training.
2022-11-24 12:38:49,045	INFO main.py:1080 -- [RayXGBoost] Starting XGBoost training.
[2m[36m(_RemoteRayXGBoostActor pid=10105)[0m [12:38:50] task [xgboost.ray]:139797684964224 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=10106)[0m [12:38:50] task [xgboost.ray]:140430396887952 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=10158)[0m [12:38:50] task [xgboost.ray]:139965277817776 got new rank 2
[2m[36m(_RemoteRayXGBoostActor pid=10180)[0m [12:38:50] task [xgboost.ray]:140630279555968 got new rank 3
2022-11-24 12:39:16,677	INFO main.py:1575 -- [RayXGBoost] Finished XGBoost training on training data with total N=104,904 in 39.04 seconds (27.59 pure XGBoost training time).


In [21]:
pred_ray = model.predict(X_test)
print(pred_ray)

2022-11-24 12:39:54,602	INFO main.py:1620 -- [RayXGBoost] Created 4 remote actors.
2022-11-24 12:40:00,768	INFO main.py:1637 -- [RayXGBoost] Starting XGBoost prediction.


[0 1 1 ... 0 0 0]


In [23]:
pred_proba_ray = model.predict_proba(X_test)
print(pred_proba_ray)

2022-11-24 12:40:25,572	INFO main.py:1620 -- [RayXGBoost] Created 4 remote actors.
2022-11-24 12:40:31,047	INFO main.py:1637 -- [RayXGBoost] Starting XGBoost prediction.


[[0.7840189  0.21598114]
 [0.28433394 0.71566606]
 [0.02068979 0.9793102 ]
 ...
 [0.8048259  0.1951741 ]
 [0.8512474  0.14875264]
 [0.9733881  0.02661194]]


In [25]:
print('Test-set accuracy score: {0:0.4f}'. format(accuracy_score(t_test, pred_ray)))

Test-set accuracy score: 0.8611


In [28]:
print(classification_report(t_test,pred_ray))

              precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     27266
         1.0       0.75      0.56      0.64      7702

    accuracy                           0.86     34968
   macro avg       0.82      0.75      0.78     34968
weighted avg       0.85      0.86      0.85     34968



In [31]:
cm = confusion_matrix(t_test, pred_ray)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])


True Positives(TP) =  25829

True Negatives(TN) =  4283

False Positives(FP) =  1437

False Negatives(FN) =  3419


    [state-dump] 	NodeManager.deadline_timer.print_event_loop_stats - 23 total (1 active, 1 running), CPU time: mean = 5.835 ms, total = 134.214 ms
    [state-dump] 	CoreWorkerService.grpc_client.LocalGC - 10 total (0 active), CPU time: mean = 28.367 us, total = 283.672 us
    [state-dump] 	PeriodicalRunner.RunFnPeriodically - 8 total (0 active), CPU time: mean = 856.358 us, total = 6.851 ms
    [state-dump] 	NodeManagerService.grpc_server.GetSystemConfig - 5 total (0 active), CPU time: mean = 126.824 us, total = 634.121 us
    [state-dump] 	ObjectManager.ObjectAdded - 5 total (0 active), CPU time: mean = 12.413 us, total = 62.063 us
    [state-dump] 	ObjectManager.ObjectDeleted - 5 total (0 active), CPU time: mean = 21.762 us, total = 108.809 us
    [state-dump] 	ClientConnection.async_write.DoAsyncWrites - 5 total (0 active), CPU time: mean = 1.084 us, total = 5.419 us
    [state-dump] 	InternalPubSubGcsService.grpc_client.GcsSubscriberPoll - 4 total (1 active), CPU time: mean = 219.