# Performance Comparison of cuML and scikit-learn
This notebook compares the performance of cuML and scikit-learn. The comparisons are conducted on datasets of identical sizes. The notebook primarily demonstrates the speedup factor that users can experience when similar machine learning algorithms from scikit-learn are run on GPUs using cuML.



In [1]:
import cuml
from cupy import asnumpy
from joblib import dump, load

In [2]:
from cuml.datasets.classification import make_classification
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier as cuRF
from sklearn.metrics import accuracy_score
import time

# synthetic dataset dimensions
n_samples = 100000
n_features = 10
n_classes = 2

# random forest depth and size
n_estimators = 25
max_depth = 10

# generate synthetic data [ binary classification task ]
X, y = make_classification(
    n_classes=n_classes,
    n_features=n_features,
    n_samples=n_samples,
    random_state=0,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = cuRF(
    max_depth=max_depth,
    n_estimators=n_estimators,
    random_state=0,
)

# Time only the fit() call. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short intervals; time.time() follows the system
# wall clock, which can be adjusted while the benchmark is running.
# NOTE(review): this is a single, un-warmed run, so any one-time start-up cost
# incurred inside fit() is part of the measurement - confirm that is intended.
start_time = time.perf_counter()
trained_RF = model.fit(X_train, y_train)
end_time = time.perf_counter()

# Time taken for cuML model training (seconds)
cuml_training_time = end_time - start_time

predictions = model.predict(X_test)

# Score the same predictions twice: with cuML's metric, and with
# scikit-learn's metric after converting the arrays with asnumpy().
cu_score = cuml.metrics.accuracy_score(y_test, predictions)
sk_score = accuracy_score(asnumpy(y_test), asnumpy(predictions))

print(" cuml accuracy: ", cu_score)
print(" sklearn accuracy : ", sk_score)
print("cuML Random Forest classification training time", cuml_training_time, "seconds")

# save the fitted model to disk
dump(trained_RF, 'RF.model')

# reload the saved model from disk
# (joblib files are pickle-based: only load files that come from a trusted source)
loaded_model = load('RF.model')

  return func(**kwargs)


 cuml accuracy:  0.9949600100517273
 sklearn accuracy :  0.99496
cuML Random Forest classification training time 0.12621426582336426 seconds


In [3]:
from cuml.datasets.classification import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time

# synthetic dataset dimensions
n_samples = 100000
n_features = 10
n_classes = 2

# random forest depth and size
n_estimators = 25
max_depth = 10

# The synthetic dataset is NOT regenerated here: this cell reuses the X and y
# produced by the cuML benchmark cell, so both models see the same data.
# That cell must therefore be run before this one.

# Split the data into training and testing sets, converting X and y with
# asnumpy() before handing them to scikit-learn.
X_train, X_test, y_train, y_test = train_test_split(
    asnumpy(X), asnumpy(y), random_state=0
)

# Create the scikit-learn RandomForestClassifier with the same depth, size and
# seed as the cuML model. n_jobs is not passed, so scikit-learn's default
# parallelism setting applies to the timed fit below.
sk_model = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=0,
)

# Time only the fit() call. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short intervals; time.time() follows the system
# wall clock, which can be adjusted while the benchmark is running.
start_time = time.perf_counter()
sk_model.fit(X_train, y_train)
end_time = time.perf_counter()

# Time taken for scikit-learn model training (seconds)
sk_learning_time = end_time - start_time

# Predictions with scikit-learn model
predictions = sk_model.predict(X_test)

# Accuracy score calculation with scikit-learn
sk_score = accuracy_score(y_test, predictions)

# Print scikit-learn results
print("scikit-learn's accuracy:", sk_score)
print("scikit-learn's training time:", sk_learning_time, "seconds")

scikit-learn's accuracy: 0.9936
scikit-learn's training time: 3.826442241668701 seconds


## Results comparison

cuML's Random Forest classification training time: **~0.12 seconds**

scikit-learn's Random Forest classification training time on the CPU: **~3.82 seconds**

cuML training is approximately **30 times faster** than the scikit-learn Random Forest classification model on 100K samples and 10 features (3.83 s / 0.126 s ≈ 30×, from the timings printed above).