# CPU Workflow

In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Column layout of the UCI Covertype file: ten named measurement columns,
# then the numbered Wilderness_Area and Soil_Type indicator columns, and
# finally the Cover_Type target.
columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    *("Wilderness_Area{}".format(area) for area in range(1, 5)),
    *("Soil_Type{}".format(soil) for soil in range(1, 41)),
    "Cover_Type",
]

# Location of the gzip-compressed dataset on the UCI archive
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"

In [3]:
# Read the header-less CSV straight from the URL, labelling the columns
# with the names defined above as part of the same call
data = pd.read_csv(url, header=None, names=columns)

In [4]:
# The Cover_Type column is the prediction target; all remaining columns are features
y = data["Cover_Type"]
X = data.drop(columns="Cover_Type")

In [5]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Time training the CPU-based RandomForestClassifier.
# time.perf_counter() is the clock meant for measuring elapsed intervals: it is
# monotonic and high-resolution, whereas time.time() follows the system clock
# and can jump if that clock is adjusted mid-run.
# NOTE(review): max_features=1 is an int, which scikit-learn documents as
# "consider exactly one feature at each split" (the float 1.0 would mean all
# features). Confirm this is intended; keep it identical to the GPU workflow's
# training cell so the timing comparison stays like-for-like.
start_time = time.perf_counter()
clf = RandomForestClassifier(n_estimators=100, max_depth=10, max_features=1, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
cpu_train_time = time.perf_counter() - start_time

In [7]:
# Score the fitted model on the held-out rows, then report timing and metrics
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 reports 0 (instead of warning) for classes that were never predicted
report_text = classification_report(y_test, y_pred, zero_division=0)

print("CPU RandomForestClassifier training time: {:.2f} seconds".format(cpu_train_time))
print("Accuracy: {:.3f}".format(test_accuracy))
print("\nClassification Report:")
print(report_text)

CPU RandomForestClassifier training time: 6.68 seconds
Accuracy: 0.683

Classification Report:
              precision    recall  f1-score   support

           1       0.78      0.54      0.64     42368
           2       0.65      0.94      0.77     56661
           3       0.72      0.40      0.52      7151
           4       0.00      0.00      0.00       549
           5       0.00      0.00      0.00      1899
           6       1.00      0.01      0.02      3473
           7       0.00      0.00      0.00      4102

    accuracy                           0.68    116203
   macro avg       0.45      0.27      0.28    116203
weighted avg       0.68      0.68      0.64    116203



In [8]:
# Persist the measured CPU training time to cpu_time.txt so it can be read
# back later for the comparison
cpu_time_text = str(cpu_train_time)
with open("cpu_time.txt", mode="w") as time_file:
    time_file.write(cpu_time_text)

# GPU Workflow

In [1]:
!pip install -q jupyter-client==7.4.9

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/133.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.5/133.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install --quiet --extra-index-url=https://pypi.nvidia.com \
    "cudf-cu12==25.2.*" \
    "dask-cudf-cu12==25.2.*" \
    "cuml-cu12==25.2.*" \
    "cugraph-cu12==25.2.*" \
    "nx-cugraph-cu12==25.2.*" \
    "cuspatial-cu12==25.2.*" \
    "cuproj-cu12==25.2.*" \
    "cuxfilter-cu12==25.2.*" \
    "cucim-cu12==25.2.*" \
    "pylibraft-cu12==25.2.*" \
    "raft-dask-cu12==25.2.*" \
    "cuvs-cu12==25.2.*" \
    "nx-cugraph-cu12==25.2.*"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m169.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m192.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m193.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.6/83.6 kB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m213.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.6/32.6 MB[0m [31m193.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.8/385.8 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Refresh the apt package index, then install the cuda-toolkit-12-2 package.
# Both commands redirect stdout and stderr to /dev/null, so a failed update or
# install produces no visible output in this cell.
# NOTE(review): if later cells depend on the toolkit, consider surfacing
# errors here instead of discarding them.
!sudo apt-get update -qq > /dev/null 2>&1
!sudo apt-get install -y -qq cuda-toolkit-12-2 > /dev/null 2>&1

In [4]:
# Load the cuml.accel IPython extension. The captured log reports accelerators
# being installed for sklearn, umap and hdbscan.
# NOTE(review): this cell runs before the sklearn imports in the next cell —
# presumably required so those imports pick up the accelerated
# implementations; confirm against the cuML documentation.
%load_ext cuml.accel

[2025-04-07 04:33:59.221] [CUML] [info] cuML: Installed accelerator for sklearn.
[2025-04-07 04:34:24.428] [CUML] [info] cuML: Installed accelerator for umap.
[2025-04-07 04:34:24.571] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-04-07 04:34:24.571] [CUML] [info] cuML: Successfully initialized accelerator.


In [5]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [6]:
# Column layout of the UCI Covertype file: ten named measurement columns,
# then the numbered Wilderness_Area and Soil_Type indicator columns, and
# finally the Cover_Type target.
columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    *("Wilderness_Area{}".format(area) for area in range(1, 5)),
    *("Soil_Type{}".format(soil) for soil in range(1, 41)),
    "Cover_Type",
]

# Location of the gzip-compressed dataset on the UCI archive
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"

In [7]:
# Read the header-less CSV straight from the URL, labelling the columns
# with the names defined above as part of the same call
data = pd.read_csv(url, header=None, names=columns)

In [8]:
# The Cover_Type column is the prediction target; all remaining columns are features
y = data["Cover_Type"]
X = data.drop(columns="Cover_Type")

In [9]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# Time training the RandomForestClassifier with the cuml.accel extension loaded.
# time.perf_counter() is the clock meant for measuring elapsed intervals: it is
# monotonic and high-resolution, whereas time.time() follows the system clock
# and can jump if that clock is adjusted mid-run.
# NOTE(review): max_features=1 is an int, which scikit-learn documents as
# "consider exactly one feature at each split" (the float 1.0 would mean all
# features). Confirm this is intended; keep it identical to the CPU workflow's
# training cell so the timing comparison stays like-for-like.
start_time_gpu = time.perf_counter()
clf_gpu = RandomForestClassifier(n_estimators=100, max_depth=10, max_features=1, n_jobs=-1, random_state=42)
clf_gpu.fit(X_train, y_train)
gpu_train_time = time.perf_counter() - start_time_gpu

In [11]:
# Score the fitted model on the held-out rows, then report timing and metrics
y_pred_gpu = clf_gpu.predict(X_test)
test_accuracy_gpu = accuracy_score(y_test, y_pred_gpu)
# zero_division=0 reports 0 (instead of warning) for classes that were never predicted
report_text_gpu = classification_report(y_test, y_pred_gpu, zero_division=0)

print("cuML-accelerated RandomForestClassifier training time: {:.2f} seconds".format(gpu_train_time))
print("Accuracy: {:.3f}".format(test_accuracy_gpu))
print("\nClassification Report:")
print(report_text_gpu)

cuML-accelerated RandomForestClassifier training time: 5.02 seconds
Accuracy: 0.521

Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.11      0.18     42368
           2       0.51      0.99      0.67     56661
           3       0.00      0.00      0.00      7151
           4       0.00      0.00      0.00       549
           5       0.00      0.00      0.00      1899
           6       0.00      0.00      0.00      3473
           7       0.00      0.00      0.00      4102

    accuracy                           0.52    116203
   macro avg       0.17      0.16      0.12    116203
weighted avg       0.50      0.52      0.40    116203



In [12]:
# Load the CPU training time stored in cpu_time.txt and print it next to the
# training time measured in this workflow
with open("cpu_time.txt", mode="r") as time_file:
    cpu_time_text = time_file.read()
saved_cpu_time = float(cpu_time_text)

print("CPU RandomForestClassifier training time: {:.2f} seconds".format(saved_cpu_time))
print("cuML-accelerated RandomForestClassifier training time: {:.2f} seconds".format(gpu_train_time))

CPU RandomForestClassifier training time: 6.68 seconds
cuML-accelerated RandomForestClassifier training time: 5.02 seconds
