In [None]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler

# Step 1: Download MNIST and keep a manageable subset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

# Restrict the experiment to the first 10,000 samples.
X = X[:10000]
y = y[:10000]
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

y = y.astype(int)  # labels as integers
X = X / 255.0  # scale pixels by 1/255 (range [0, 1] assuming 8-bit intensities)

# Step 2: Standardize the features
# NOTE(review): the scaler is fitted on every selected sample before the
# cross-validation split below, so held-out folds contribute to its statistics.
print("Standardizing features...")
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Step 3: Euclidean Distance function
def euclidean_distance(x, y):
    """Return the L2 (straight-line) distance separating ``x`` and ``y``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    delta = x - y
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)

# Step 4: Define the custom Euclidean distance function for BallTree
def euclidean_distance_ball_tree(x, y):
    """Metric callable for BallTree: delegates to ``euclidean_distance``."""
    distance = euclidean_distance(x, y)
    return distance

# Step 5: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []  # accuracy of each fold, in fold order
fold_times = []  # wall-clock seconds spent on each fold (per fold, not cumulative)


def knn_with_ball_tree(ball_tree, X_test, k=3, train_labels=None):
    """Classify every row of ``X_test`` by majority vote among its k nearest neighbors.

    Args:
        ball_tree: Fitted ``BallTree`` built on the training samples.
        X_test: 2-D array of query points, one sample per row.
        k: Number of neighbors that vote for each query point.
        train_labels: Non-negative integer labels aligned with the rows the
            tree was built on. Defaults to the module-level ``y_train`` of
            the fold currently being processed.

    Returns:
        1-D array with one predicted label per row of ``X_test``. Ties go to
        the smallest label because ``argmax`` returns the first maximum.
    """
    if train_labels is None:
        train_labels = y_train
    if len(X_test) == 0:
        return np.array([], dtype=int)
    # One batched query replaces a Python-level loop of single-point queries.
    _, ind = ball_tree.query(X_test, k=k)
    neighbor_labels = train_labels[ind]  # one row of k labels per query point
    return np.array([np.bincount(votes).argmax() for votes in neighbor_labels])


# Wall-clock start of the whole cross-validation run
start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed

    # Time each fold on its own; measuring from start_time would make every
    # entry cumulative and inflate the summed total reported afterwards.
    fold_start_time = time.time()

    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Build the Ball Tree for the current fold
    # NOTE: a Python callable metric is invoked once per distance evaluation
    # and is much slower than the equivalent built-in metric="euclidean".
    print(f"Building Ball Tree using custom Euclidean distance for fold {fold_idx}...")
    ball_tree = BallTree(X_train, metric=euclidean_distance_ball_tree)

    # Test k-NN with Ball Tree
    y_pred_ball_tree = knn_with_ball_tree(ball_tree, X_test, k=3, train_labels=y_train)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred_ball_tree)
    fold_scores.append(accuracy)

    # Track the time taken for this fold only
    fold_end_time = time.time()
    fold_time = fold_end_time - fold_start_time
    fold_times.append(fold_time)

    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 6: Summarize the cross-validation run
best_accuracy = np.max(fold_scores)
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)

# Format every fold's accuracy as a percentage string for display.
per_fold_report = [f'{score * 100:.2f}%' for score in fold_scores]

summary_lines = (
    f"\nCross-validation completed in {total_time:.2f} seconds.",
    f"Accuracy for each fold: {per_fold_report}",
    f"Mean accuracy: {mean_accuracy * 100:.2f}%",
    f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%",
    f"Best accuracy: {best_accuracy * 100:.2f}%",
)
for summary_line in summary_lines:
    print(summary_line)


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Standardizing features...
Processing fold 1...
Building Ball Tree using custom Euclidean distance for fold 1...
Fold 1: Accuracy = 89.90%, Time = 451.22 seconds
Processing fold 2...
Building Ball Tree using custom Euclidean distance for fold 2...


In [None]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler

# Step 1: Download MNIST and keep a manageable subset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

# Restrict the experiment to the first 10,000 samples.
X = X[:10000]
y = y[:10000]
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

y = y.astype(int)  # labels as integers
X = X / 255.0  # scale pixels by 1/255 (range [0, 1] assuming 8-bit intensities)

# Step 2: Standardize the features
# NOTE(review): the scaler is fitted on every selected sample before the
# cross-validation split below, so held-out folds contribute to its statistics.
print("Standardizing features...")
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Step 3: Define the custom cosine distance function
def cosine_similarity(point1, point2):
    """Calculate the cosine similarity between two points.

    Args:
        point1: 1-D numeric vector.
        point2: 1-D numeric vector of the same length as ``point1``.

    Returns:
        The cosine of the angle between the vectors, in ``[-1, 1]``. If either
        vector has zero norm the angle is undefined and ``0.0`` is returned
        instead of NaN, so a derived distance stays finite.
    """
    norm_product = np.linalg.norm(point1) * np.linalg.norm(point2)
    if norm_product == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning) and corrupt any
        # distance-based structure built on top of this value.
        return 0.0
    similarity = np.dot(point1, point2) / norm_product
    # Rounding can push the ratio marginally outside [-1, 1]; clamp it so
    # that 1 - similarity can never become a negative distance.
    return np.clip(similarity, -1.0, 1.0)

def cosine_distance(x, y):
    """Return one minus the cosine similarity of ``x`` and ``y``."""
    similarity = cosine_similarity(x, y)
    return 1 - similarity

# Step 4: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []  # accuracy of each fold, in fold order
fold_times = []  # wall-clock seconds spent on each fold (per fold, not cumulative)


def knn_with_ball_tree(ball_tree, X_test, k=3, train_labels=None):
    """Classify every row of ``X_test`` by majority vote among its k nearest neighbors.

    Args:
        ball_tree: Fitted ``BallTree`` built on the training samples.
        X_test: 2-D array of query points, one sample per row.
        k: Number of neighbors that vote for each query point.
        train_labels: Non-negative integer labels aligned with the rows the
            tree was built on. Defaults to the module-level ``y_train`` of
            the fold currently being processed.

    Returns:
        1-D array with one predicted label per row of ``X_test``. Ties go to
        the smallest label because ``argmax`` returns the first maximum.
    """
    if train_labels is None:
        train_labels = y_train
    if len(X_test) == 0:
        return np.array([], dtype=int)
    # One batched query replaces a Python-level loop of single-point queries.
    _, ind = ball_tree.query(X_test, k=k)
    neighbor_labels = train_labels[ind]  # one row of k labels per query point
    return np.array([np.bincount(votes).argmax() for votes in neighbor_labels])


# Wall-clock start of the whole cross-validation run
start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed

    # Time each fold on its own; measuring from start_time would make every
    # entry cumulative and inflate the summed total reported afterwards.
    fold_start_time = time.time()

    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Build the Ball Tree for the current fold
    # NOTE(review): cosine distance does not satisfy the triangle inequality,
    # which BallTree relies on for pruning, so the returned neighbors are not
    # guaranteed to be the exact nearest ones. A Python callable metric is
    # also invoked once per distance evaluation, which is slow.
    print(f"Building Ball Tree using custom cosine distance for fold {fold_idx}...")
    ball_tree = BallTree(X_train, metric=cosine_distance)

    # Test k-NN with Ball Tree
    y_pred_ball_tree = knn_with_ball_tree(ball_tree, X_test, k=3, train_labels=y_train)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred_ball_tree)
    fold_scores.append(accuracy)

    # Track the time taken for this fold only
    fold_end_time = time.time()
    fold_time = fold_end_time - fold_start_time
    fold_times.append(fold_time)

    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 5: Summarize the cross-validation run
best_accuracy = np.max(fold_scores)
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)

# Format every fold's accuracy as a percentage string for display.
per_fold_report = [f'{score * 100:.2f}%' for score in fold_scores]

summary_lines = (
    f"\nCross-validation completed in {total_time:.2f} seconds.",
    f"Accuracy for each fold: {per_fold_report}",
    f"Mean accuracy: {mean_accuracy * 100:.2f}%",
    f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%",
    f"Best accuracy: {best_accuracy * 100:.2f}%",
)
for summary_line in summary_lines:
    print(summary_line)


In [None]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler

# Step 1: Download MNIST and keep a manageable subset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

# Restrict the experiment to the first 10,000 samples.
X = X[:10000]
y = y[:10000]
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

y = y.astype(int)  # labels as integers
X = X / 255.0  # scale pixels by 1/255 (range [0, 1] assuming 8-bit intensities)

# Step 2: Standardize the features
# NOTE(review): the scaler is fitted on every selected sample before the
# cross-validation split below, so held-out folds contribute to its statistics.
print("Standardizing features...")
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Step 3: Manhattan Distance function
def manhattan_distance(x, y):
    """Return the L1 (city-block) distance separating ``x`` and ``y``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    absolute_gaps = np.abs(x - y)
    return np.sum(absolute_gaps)

# Step 4: Define the custom Manhattan distance function for BallTree
def manhattan_distance_ball_tree(x, y):
    """Metric callable for BallTree: delegates to ``manhattan_distance``."""
    distance = manhattan_distance(x, y)
    return distance

# Step 5: Use 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []  # accuracy of each fold, in fold order
fold_times = []  # wall-clock seconds spent on each fold (per fold, not cumulative)


def knn_with_ball_tree(ball_tree, X_test, k=3, train_labels=None):
    """Classify every row of ``X_test`` by majority vote among its k nearest neighbors.

    Args:
        ball_tree: Fitted ``BallTree`` built on the training samples.
        X_test: 2-D array of query points, one sample per row.
        k: Number of neighbors that vote for each query point.
        train_labels: Non-negative integer labels aligned with the rows the
            tree was built on. Defaults to the module-level ``y_train`` of
            the fold currently being processed.

    Returns:
        1-D array with one predicted label per row of ``X_test``. Ties go to
        the smallest label because ``argmax`` returns the first maximum.
    """
    if train_labels is None:
        train_labels = y_train
    if len(X_test) == 0:
        return np.array([], dtype=int)
    # One batched query replaces a Python-level loop of single-point queries.
    _, ind = ball_tree.query(X_test, k=k)
    neighbor_labels = train_labels[ind]  # one row of k labels per query point
    return np.array([np.bincount(votes).argmax() for votes in neighbor_labels])


# Wall-clock start of the whole cross-validation run
start_time = time.time()

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"Processing fold {fold_idx}...")  # Indicate which fold is being processed

    # Time each fold on its own; measuring from start_time would make every
    # entry cumulative and inflate the summed total reported afterwards.
    fold_start_time = time.time()

    # Split the data for the current fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Build the Ball Tree for the current fold
    # NOTE: a Python callable metric is invoked once per distance evaluation
    # and is much slower than the equivalent built-in metric="manhattan".
    print(f"Building Ball Tree using custom Manhattan distance for fold {fold_idx}...")
    ball_tree = BallTree(X_train, metric=manhattan_distance_ball_tree)

    # Test k-NN with Ball Tree
    y_pred_ball_tree = knn_with_ball_tree(ball_tree, X_test, k=3, train_labels=y_train)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred_ball_tree)
    fold_scores.append(accuracy)

    # Track the time taken for this fold only
    fold_end_time = time.time()
    fold_time = fold_end_time - fold_start_time
    fold_times.append(fold_time)

    print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 6: Summarize the cross-validation run
best_accuracy = np.max(fold_scores)
mean_accuracy = np.mean(fold_scores)
std_accuracy = np.std(fold_scores)
total_time = np.sum(fold_times)

# Format every fold's accuracy as a percentage string for display.
per_fold_report = [f'{score * 100:.2f}%' for score in fold_scores]

summary_lines = (
    f"\nCross-validation completed in {total_time:.2f} seconds.",
    f"Accuracy for each fold: {per_fold_report}",
    f"Mean accuracy: {mean_accuracy * 100:.2f}%",
    f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%",
    f"Best accuracy: {best_accuracy * 100:.2f}%",
)
for summary_line in summary_lines:
    print(summary_line)
