In [1]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Euclidean Distance function
def euclidean_distance(x, y):
    """Calculate the Euclidean distance between two points."""
    return np.sqrt(np.sum((x - y) ** 2))

# Step 4: Implement k-NN without BallTree
def knn_predict(X_train, y_train, X_test, k=3):
    """Predict labels using k-NN and Euclidean distance."""
    y_pred = []
    
    # For each test point, calculate distances to all training points
    for test_point in X_test:
        distances = [euclidean_distance(test_point, train_point) for train_point in X_train]
        
        # Get indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        
        # Majority vote
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    
    return np.array(y_pred)

# Step 5: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Start cross-validation process
start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed
    
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Step 6: Predict using k-NN without BallTree
    print(f"Predicting using k-NN for fold {fold_idx}...")
    y_pred = knn_predict(X_train, y_train, X_test, k=3)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append(accuracy)

    # Track the time taken for this fold
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 7: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...
Predicting using k-NN for fold 1...
Fold 1: Accuracy = 89.90%, Time = 418.52 seconds
Processing fold 2...
Predicting using k-NN for fold 2...
Fold 2: Accuracy = 93.10%, Time = 699.12 seconds
Processing fold 3...
Predicting using k-NN for fold 3...
Fold 3: Accuracy = 91.30%, Time = 972.67 seconds
Processing fold 4...
Predicting using k-NN for fold 4...
Fold 4: Accuracy = 90.70%, Time = 1253.42 seconds
Processing fold 5...
Predicting using k-NN for fold 5...
Fold 5: Accuracy = 90.60%, Time = 1526.49 seconds
Processing fold 6...
Predicting using k-NN for fold 6...
Fold 6: Accuracy = 91.40%, Time = 1805.21 seconds
Processing fold 7...
Predicting using k-NN for fold 7...
Fold 7: Accuracy = 91.00%, Time = 2081.29 seconds
Processing fold 8...
Predicting using k-NN for fold 8...
Fold 8: Accuracy = 92.40%, Time = 2357.44 seconds
Processing fold 9...
Predicting using k-NN f

In [8]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Define the custom cosine distance function
def cosine_similarity(point1, point2):
    """Calculate the cosine similarity between two points."""
    dot_product = np.dot(point1, point2)
    norm1 = np.linalg.norm(point1)
    norm2 = np.linalg.norm(point2)
    return dot_product / (norm1 * norm2)

def cosine_distance(x, y):
    """Calculate the cosine distance between two vectors."""
    return 1 - cosine_similarity(x, y)

# Step 4: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

start_time = time.time()

# Function to perform k-NN without BallTree
def knn_predict(X_train, y_train, X_test, k=3):
    y_pred = []
    for test_point in X_test:
        # Calculate distances to all training points
        distances = [cosine_distance(test_point, train_point) for train_point in X_train]
        
        # Get indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        
        # Majority vote
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    return np.array(y_pred)

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed
    
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Test k-NN without Ball Tree
    y_pred = knn_predict(X_train, y_train, X_test, k=3)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append(accuracy)

    # Track the time taken for this fold
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 5: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...
Fold 1: Accuracy = 90.40%, Time = 62.33 seconds
Processing fold 2...
Fold 2: Accuracy = 93.00%, Time = 127.82 seconds
Processing fold 3...
Fold 3: Accuracy = 90.90%, Time = 190.03 seconds
Processing fold 4...
Fold 4: Accuracy = 90.40%, Time = 249.82 seconds
Processing fold 5...
Fold 5: Accuracy = 90.50%, Time = 312.83 seconds
Processing fold 6...
Fold 6: Accuracy = 91.80%, Time = 380.30 seconds
Processing fold 7...
Fold 7: Accuracy = 91.40%, Time = 444.88 seconds
Processing fold 8...
Fold 8: Accuracy = 92.10%, Time = 509.06 seconds
Processing fold 9...
Fold 9: Accuracy = 91.30%, Time = 570.48 seconds
Processing fold 10...
Fold 10: Accuracy = 91.60%, Time = 631.69 seconds

Cross-validation completed in 3479.25 seconds.
Accuracy for each fold: ['90.40%', '93.00%', '90.90%', '90.40%', '90.50%', '91.80%', '91.40%', '92.10%', '91.30%', '91.60%']
Mean accuracy: 91.34%
S

In [3]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Manhattan Distance function
def manhattan_distance(x, y):
    """Calculate the Manhattan (L1) distance between two points."""
    return np.sum(np.abs(x - y))

# Step 4: Implement k-NN without BallTree
def knn_predict(X_train, y_train, X_test, k=3):
    """Predict labels using k-NN and Manhattan distance."""
    y_pred = []
    
    # For each test point, calculate distances to all training points
    for test_point in X_test:
        distances = [manhattan_distance(test_point, train_point) for train_point in X_train]
        
        # Get indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        
        # Majority vote
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    
    return np.array(y_pred)

# Step 5: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
fold_times = []

# Start cross-validation process
start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed
    
    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Step 6: Predict using k-NN without BallTree
    print(f"Predicting using k-NN for fold {fold_idx}...")
    y_pred = knn_predict(X_train, y_train, X_test, k=3)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append(accuracy)

    # Track the time taken for this fold
    fold_end_time = time.time()
    fold_time = fold_end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 7: Report overall results
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)
best_accuracy = np.max(fold_scores)

print(f"\nCross-validation completed in {total_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...
Predicting using k-NN for fold 1...
Fold 1: Accuracy = 91.50%, Time = 225.07 seconds
Processing fold 2...
Predicting using k-NN for fold 2...
Fold 2: Accuracy = 94.70%, Time = 449.87 seconds
Processing fold 3...
Predicting using k-NN for fold 3...
Fold 3: Accuracy = 93.40%, Time = 674.65 seconds
Processing fold 4...
Predicting using k-NN for fold 4...
Fold 4: Accuracy = 91.90%, Time = 898.45 seconds
Processing fold 5...
Predicting using k-NN for fold 5...
Fold 5: Accuracy = 92.70%, Time = 1110.96 seconds
Processing fold 6...
Predicting using k-NN for fold 6...
Fold 6: Accuracy = 93.30%, Time = 1289.30 seconds
Processing fold 7...
Predicting using k-NN for fold 7...
Fold 7: Accuracy = 92.00%, Time = 1467.10 seconds
Processing fold 8...
Predicting using k-NN for fold 8...
Fold 8: Accuracy = 93.80%, Time = 1532.64 seconds
Processing fold 9...
Predicting using k-NN fo