In [1]:
import time
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download (or read from the local scikit-learn cache) the 784-feature MNIST dataset from OpenML
mnist = fetch_openml('mnist_784', version=1)
X = mnist.data
y = mnist.target.astype(int)  # convert the target labels to integers

# Positional split: the first 60,000 rows form the training set,
# the remaining rows (10,000 per the dataset's conventional layout) form the test set
train_rows = slice(None, 60000)
test_rows = slice(60000, None)
X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

In [2]:
# Train a Random Forest classifier on the original (un-reduced) features and time the fit.
# n_jobs=-1 lets scikit-learn use every available core; random_state pins the forest's randomness.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Time only the call to fit(). perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed intervals, whereas time.time() follows the wall clock and can jump if the
# system time is adjusted mid-run.
start_time = time.perf_counter()
rf.fit(X_train, y_train)
train_time = time.perf_counter() - start_time

# Evaluate on the held-out test rows (prediction is intentionally outside the timed region)
y_pred = rf.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)

print(f"Original training time: {train_time:.3f} seconds")
print(f"Accuracy on test set (original): {original_accuracy * 100:.2f}%")

Original training time: 6.683 seconds
Accuracy on test set (original): 97.05%


In [3]:
# Report how many feature columns the training data has before any dimensionality reduction
n_features_original = X_train.shape[1]
print(f"Number of components original : {n_features_original}")

Number of components original : 784


In [4]:
# Apply PCA to reduce dimensionality while retaining 97% of the variance.
# A float n_components strictly between 0 and 1 asks scikit-learn to keep the smallest number
# of principal components whose cumulative explained variance exceeds that fraction.
PCA_EXPLAINED_VARIANCE = 0.97
pca = PCA(n_components=PCA_EXPLAINED_VARIANCE, random_state=42)

# Fit the projection on the training rows only, then apply the same projection to the test
# rows so that no information from the test set influences the learned components.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(f"Number of components after PCA: {X_train_pca.shape[1]}")

Number of components after PCA: 214


In [5]:
# Train a Random Forest classifier on the PCA-reduced features and time the fit.
# Same hyperparameters as the model trained on the original features, so the two runs differ
# only in their input representation.
rf_pca = RandomForestClassifier(n_jobs=-1, random_state=42)

# Time only the call to fit(). perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed intervals, whereas time.time() follows the wall clock and can jump if the
# system time is adjusted mid-run.
start_time = time.perf_counter()
rf_pca.fit(X_train_pca, y_train)
train_time_pca = time.perf_counter() - start_time

# Evaluate on the PCA-projected test rows (prediction is intentionally outside the timed region)
y_pred_pca = rf_pca.predict(X_test_pca)
pca_accuracy = accuracy_score(y_test, y_pred_pca)

print(f"Training time with PCA: {train_time_pca:.3f} seconds")
print(f"Accuracy on test set (PCA): {pca_accuracy * 100:.2f}%")

Training time with PCA: 22.747 seconds
Accuracy on test set (PCA): 94.45%


In [12]:
# Collect the distinct per-column dtypes of the training DataFrame and show them
column_dtypes = X_train.dtypes.unique()
print(column_dtypes)

[dtype('int64')]


In [14]:
# Compare the element dtype of the original features (as a NumPy array)
# with the dtype of the array produced by the PCA transform
original_dtype = X_train.to_numpy().dtype
pca_dtype = X_train_pca.dtype
print("Original X_train dtype:", original_dtype)
print("PCA X_train_pca dtype:", pca_dtype)


Original X_train dtype: int64
PCA X_train_pca dtype: float64
