In [8]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Load the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
                    
X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")
                   
X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Initialize the K-NN classifier with Ball Tree and Euclidean distance
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='euclidean')

start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Print which fold is being processed

    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train the model
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Evaluate the model
    score = accuracy_score(y_test, y_pred)
    fold_scores.append(score)
    
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {score * 100:.2f}%, Time = {fold_time:.2f} seconds")
    
# Step 4: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...
Fold 1: Accuracy = 89.90%, Time = 13.15 seconds
Processing fold 2...
Fold 2: Accuracy = 93.10%, Time = 42.59 seconds
Processing fold 3...
Fold 3: Accuracy = 91.30%, Time = 92.96 seconds
Processing fold 4...
Fold 4: Accuracy = 90.70%, Time = 158.35 seconds
Processing fold 5...
Fold 5: Accuracy = 90.60%, Time = 213.16 seconds
Processing fold 6...
Fold 6: Accuracy = 91.40%, Time = 255.98 seconds
Processing fold 7...
Fold 7: Accuracy = 91.00%, Time = 296.20 seconds
Processing fold 8...
Fold 8: Accuracy = 92.40%, Time = 336.92 seconds
Processing fold 9...
Fold 9: Accuracy = 90.90%, Time = 376.35 seconds
Processing fold 10...
Fold 10: Accuracy = 91.00%, Time = 415.00 seconds

Cross-validation completed in 2200.65 seconds.
Accuracy for each fold: ['89.90%', '93.10%', '91.30%', '90.70%', '90.60%', '91.40%', '91.00%', '92.40%', '90.90%', '91.00%']
Mean accuracy: 91.23%
Sta

In [9]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Load the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")


X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Initialize the K-NN classifier with Ball Tree and Cosine distance
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='cosine')

start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Print which fold is being processed

    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train the model
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Evaluate the model
    score = accuracy_score(y_test, y_pred)
    fold_scores.append(score)
    
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {score * 100:.2f}%, Time = {fold_time:.2f} seconds")
    
# Step 4: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...


ValueError: Metric 'cosine' not valid. Use sorted(sklearn.neighbors.VALID_METRICS['ball_tree']) to get valid options. Metric can also be a callable function.

In [None]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Load the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")


X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Initialize the K-NN classifier with Ball Tree and Manhattan distance
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='manhattan')

start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Print which fold is being processed

    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train the model
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Evaluate the model
    score = accuracy_score(y_test, y_pred)
    fold_scores.append(score)
    
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {score * 100:.2f}%, Time = {fold_time:.2f} seconds")
    
# Step 4: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
