In [1]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")


X = X / 255.0  # Normalize pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features (optional but often improves performance)
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Apply 10-fold cross-validation on the whole dataset
print("Applying 10-fold cross-validation...")
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # 10-fold cross-validation
accuracies = []
fold_times = []

knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute', metric='euclidean')

start_time = time.time()
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Console output indicating the current fold
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train the k-NN classifier
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%")

end_time = time.time()

# Step 4: Report overall results
print(f"\nCross-validation completed in {end_time - start_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in accuracies]}")
print(f"Mean accuracy: {np.mean(accuracies) * 100:.2f}%")
print(f"Standard deviation: {np.std(accuracies) * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Applying 10-fold cross-validation...
Processing fold 1...
Fold 1: Accuracy = 89.90%
Processing fold 2...
Fold 2: Accuracy = 93.10%
Processing fold 3...
Fold 3: Accuracy = 91.30%
Processing fold 4...
Fold 4: Accuracy = 90.70%
Processing fold 5...
Fold 5: Accuracy = 90.60%
Processing fold 6...
Fold 6: Accuracy = 91.40%
Processing fold 7...
Fold 7: Accuracy = 91.00%
Processing fold 8...
Fold 8: Accuracy = 92.40%
Processing fold 9...
Fold 9: Accuracy = 90.90%
Processing fold 10...
Fold 10: Accuracy = 91.00%

Cross-validation completed in 17.95 seconds.
Accuracy for each fold: ['89.90%', '93.10%', '91.30%', '90.70%', '90.60%', '91.40%', '91.00%', '92.40%', '90.90%', '91.00%']
Mean accuracy: 91.23%
Standard deviation: 0.87%


In [2]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")


X = X / 255.0  # Normalize pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Initialize the K-NN classifier with Cosine distance
knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute', metric='cosine')

start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    # Split the data for the current fold
    print(f"Processing fold {fold_idx}...") 
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train the model
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Evaluate the model
    score = accuracy_score(y_test, y_pred)
    fold_scores.append(score)
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {score * 100:.2f}%, Time = {fold_time:.2f} seconds")
    
# Step 4: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...
Fold 1: Accuracy = 90.40%, Time = 2.36 seconds
Processing fold 2...
Fold 2: Accuracy = 93.00%, Time = 4.27 seconds
Processing fold 3...
Fold 3: Accuracy = 90.90%, Time = 6.14 seconds
Processing fold 4...
Fold 4: Accuracy = 90.40%, Time = 8.10 seconds
Processing fold 5...
Fold 5: Accuracy = 90.50%, Time = 10.58 seconds
Processing fold 6...
Fold 6: Accuracy = 91.80%, Time = 12.99 seconds
Processing fold 7...
Fold 7: Accuracy = 91.40%, Time = 14.88 seconds
Processing fold 8...
Fold 8: Accuracy = 92.10%, Time = 16.70 seconds
Processing fold 9...
Fold 9: Accuracy = 91.30%, Time = 18.48 seconds
Processing fold 10...
Fold 10: Accuracy = 91.60%, Time = 20.26 seconds

Cross-validation completed in 114.75 seconds.
Accuracy for each fold: ['90.40%', '93.00%', '90.90%', '90.40%', '90.50%', '91.80%', '91.40%', '92.10%', '91.30%', '91.60%']
Mean accuracy: 91.34%
Standard deviat

In [3]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")


X = X / 255.0  # Normalize pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Initialize the K-NN classifier with Manhattan distance
knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute', metric='manhattan')

start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...") 
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train the model
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Evaluate the model
    score = accuracy_score(y_test, y_pred)
    fold_scores.append(score)
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {score * 100:.2f}%, Time = {fold_time:.2f} seconds")
    
# Step 4: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...
Fold 1: Accuracy = 91.50%, Time = 12.68 seconds
Processing fold 2...
Fold 2: Accuracy = 94.70%, Time = 25.23 seconds
Processing fold 3...
Fold 3: Accuracy = 93.40%, Time = 38.00 seconds
Processing fold 4...
Fold 4: Accuracy = 91.90%, Time = 49.76 seconds
Processing fold 5...
Fold 5: Accuracy = 92.70%, Time = 62.64 seconds
Processing fold 6...
Fold 6: Accuracy = 93.30%, Time = 74.92 seconds
Processing fold 7...
Fold 7: Accuracy = 92.00%, Time = 87.34 seconds
Processing fold 8...
Fold 8: Accuracy = 93.80%, Time = 99.14 seconds
Processing fold 9...
Fold 9: Accuracy = 93.70%, Time = 112.74 seconds
Processing fold 10...
Fold 10: Accuracy = 92.80%, Time = 124.96 seconds

Cross-validation completed in 687.41 seconds.
Accuracy for each fold: ['91.50%', '94.70%', '93.40%', '91.90%', '92.70%', '93.30%', '92.00%', '93.80%', '93.70%', '92.80%']
Mean accuracy: 92.98%
Standard 