In [1]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Apply PCA for dimensionality reduction
print("Applying PCA...")
pca = PCA(n_components=0.95)  # Retain enough components to explain 95% variance
X = pca.fit_transform(X)
print(f"Reduced data shape: {X.shape}")

# Step 4: Euclidean Distance function
def euclidean_distance(x, y):
    """Calculate the Euclidean distance between two points."""
    return np.sqrt(np.sum((x - y) ** 2))

# Step 5: Implement k-NN without BallTree
def knn_predict(X_train, y_train, X_test, k=3):
    """Predict labels using k-NN and Euclidean distance."""
    y_pred = []
    
    # For each test point, calculate distances to all training points
    for test_point in X_test:
        distances = [euclidean_distance(test_point, train_point) for train_point in X_train]
        
        # Get indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        
        # Majority vote
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    
    return np.array(y_pred)

# Step 6: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Start cross-validation process
start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed
    
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Step 7: Predict using k-NN without BallTree
    print(f"Predicting using k-NN for fold {fold_idx}...")
    y_pred = knn_predict(X_train, y_train, X_test, k=3)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append(accuracy)

    # Track the time taken for this fold
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 8: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Applying PCA...
Reduced data shape: (10000, 283)
Processing fold 1...
Predicting using k-NN for fold 1...
Fold 1: Accuracy = 90.80%, Time = 374.16 seconds
Processing fold 2...
Predicting using k-NN for fold 2...
Fold 2: Accuracy = 93.90%, Time = 630.32 seconds
Processing fold 3...
Predicting using k-NN for fold 3...
Fold 3: Accuracy = 91.80%, Time = 871.80 seconds
Processing fold 4...
Predicting using k-NN for fold 4...
Fold 4: Accuracy = 91.20%, Time = 1114.85 seconds
Processing fold 5...
Predicting using k-NN for fold 5...
Fold 5: Accuracy = 90.40%, Time = 1360.21 seconds
Processing fold 6...
Predicting using k-NN for fold 6...
Fold 6: Accuracy = 92.40%, Time = 1600.69 seconds
Processing fold 7...
Predicting using k-NN for fold 7...
Fold 7: Accuracy = 91.30%, Time = 1845.26 seconds
Processing fold 8...
Predicting using k-NN for fold 8...
Fold 8: Accuracy = 92.00%, Time = 2088.27 sec

In [8]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Apply PCA for dimensionality reduction
print("Applying PCA...")
pca = PCA(n_components=0.95)  # Retain enough components to explain 95% variance
X = pca.fit_transform(X)
print(f"Reduced data shape: {X.shape}")

# Step 4: Define the custom cosine distance function
def cosine_similarity(point1, point2):
    """Calculate the cosine similarity between two points."""
    dot_product = np.dot(point1, point2)
    norm1 = np.linalg.norm(point1)
    norm2 = np.linalg.norm(point2)
    return dot_product / (norm1 * norm2)

def cosine_distance(x, y):
    """Calculate the cosine distance between two vectors."""
    return 1 - cosine_similarity(x, y)

# Step 5: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

start_time = time.time()

# Function to perform 1-NN without BallTree
def knn_predict(X_train, y_train, X_test):
    y_pred = []
    for test_point in X_test:
        # Calculate distances to all training points
        distances = [cosine_distance(test_point, train_point) for train_point in X_train]
        
        # Get index of the smallest distance (nearest neighbor)
        nearest_index = np.argmin(distances)
        predicted_label = y_train[nearest_index]
        
        y_pred.append(predicted_label)
    return np.array(y_pred)

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed
    
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Test 1-NN without Ball Tree
    y_pred = knn_predict(X_train, y_train, X_test)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append(accuracy)

    # Track the time taken for this fold
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 6: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Applying PCA...
Reduced data shape: (10000, 283)
Processing fold 1...
Fold 1: Accuracy = 90.90%, Time = 55.32 seconds
Processing fold 2...
Fold 2: Accuracy = 93.40%, Time = 117.19 seconds
Processing fold 3...
Fold 3: Accuracy = 92.40%, Time = 177.31 seconds
Processing fold 4...
Fold 4: Accuracy = 90.30%, Time = 238.18 seconds
Processing fold 5...
Fold 5: Accuracy = 91.70%, Time = 297.16 seconds
Processing fold 6...
Fold 6: Accuracy = 93.00%, Time = 360.45 seconds
Processing fold 7...
Fold 7: Accuracy = 92.70%, Time = 421.39 seconds
Processing fold 8...
Fold 8: Accuracy = 93.50%, Time = 486.86 seconds
Processing fold 9...
Fold 9: Accuracy = 92.50%, Time = 548.27 seconds
Processing fold 10...
Fold 10: Accuracy = 92.70%, Time = 609.83 seconds

Cross-validation completed in 3311.96 seconds.
Accuracy for each fold: ['90.90%', '93.40%', '92.40%', '90.30%', '91.70%', '93.00%', '92.70%', '93.

In [3]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Apply PCA for dimensionality reduction
print("Applying PCA...")
pca = PCA(n_components=0.95)  # Retain enough components to explain 95% variance
X = pca.fit_transform(X)
print(f"Reduced data shape: {X.shape}")

# Step 4: Manhattan Distance function
def manhattan_distance(x, y):
    """Calculate the Manhattan (L1) distance between two points."""
    return np.sum(np.abs(x - y))

# Step 5: Implement k-NN without BallTree
def knn_predict(X_train, y_train, X_test, k=3):
    """Predict labels using k-NN and Manhattan distance."""
    y_pred = []
    
    # For each test point, calculate distances to all training points
    for test_point in X_test:
        distances = [manhattan_distance(test_point, train_point) for train_point in X_train]
        
        # Get indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        
        # Majority vote
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    
    return np.array(y_pred)

# Step 6: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Start cross-validation process
start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed
    
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Step 7: Predict using k-NN without BallTree
    print(f"Predicting using k-NN for fold {fold_idx}...")
    y_pred = knn_predict(X_train, y_train, X_test, k=3)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append(accuracy)

    # Track the time taken for this fold
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 8: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Applying PCA...
Reduced data shape: (10000, 283)
Processing fold 1...
Predicting using k-NN for fold 1...
Fold 1: Accuracy = 87.20%, Time = 191.39 seconds
Processing fold 2...
Predicting using k-NN for fold 2...
Fold 2: Accuracy = 91.30%, Time = 384.87 seconds
Processing fold 3...
Predicting using k-NN for fold 3...
Fold 3: Accuracy = 89.70%, Time = 577.12 seconds
Processing fold 4...
Predicting using k-NN for fold 4...
Fold 4: Accuracy = 87.90%, Time = 766.06 seconds
Processing fold 5...
Predicting using k-NN for fold 5...
Fold 5: Accuracy = 89.00%, Time = 958.87 seconds
Processing fold 6...
Predicting using k-NN for fold 6...
Fold 6: Accuracy = 90.80%, Time = 1149.18 seconds
Processing fold 7...
Predicting using k-NN for fold 7...
Fold 7: Accuracy = 88.40%, Time = 1339.68 seconds
Processing fold 8...
Predicting using k-NN for fold 8...
Fold 8: Accuracy = 91.20%, Time = 1530.18 secon