In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree

# Step 2: Download MNIST from OpenML and prepare features and labels
print("Fetching MNIST dataset...")
X, y = fetch_openml(
    'mnist_784',
    version=1,
    return_X_y=True,
    as_frame=False,
)
X = np.divide(X, 255.0)  # rescale pixel values into [0, 1]
y = y.astype(int)  # labels are cast to integers

# Cosine Similarity function
def cosine_similarity(point1, point2):
    """Calculate the cosine similarity between two points.

    Parameters
    ----------
    point1, point2 : array-like
        1-D vectors of equal length.

    Returns
    -------
    float
        ``dot(point1, point2) / (||point1|| * ||point2||)``, limited to the
        closed interval [-1, 1]. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(point1) * np.linalg.norm(point2)
    if norm_product == 0:
        # A zero vector has no direction; avoid 0/0 (NaN plus a RuntimeWarning).
        return 0.0

    similarity = np.dot(point1, point2) / norm_product
    # Floating-point rounding can yield values such as 1.0000000000000002,
    # which would turn a derived distance (1 - similarity) negative.
    return np.clip(similarity, -1.0, 1.0)

# Define the custom cosine distance function for BallTree
def cosine_distance(x, y):
    """Return one minus the cosine similarity of ``x`` and ``y``.

    NOTE(review): cosine distance does not satisfy the triangle inequality.
    scikit-learn documents that a user-defined metric used inside a BallTree
    must be a true metric, so neighbors found with this function may not be
    exact -- confirm this is acceptable.
    """
    similarity = cosine_similarity(x, y)
    return 1 - similarity

# Example dataset: keep the first `sample_size` samples, then hold out 30% for testing
print("Subsampling dataset for testing...")
sample_size = 10000  # smaller subset used for training/testing
X_train, X_test, y_train, y_test = train_test_split(
    X[:sample_size],
    y[:sample_size],
    test_size=0.3,
    random_state=42,
)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# Step 3: Index the training points in a Ball Tree that measures distance with cosine_distance
print("Building Ball Tree using sklearn with custom cosine distance...")
ball_tree = BallTree(X_train, metric=cosine_distance)

# Step 4: Query the Ball Tree to find the k nearest neighbors
def knn_with_ball_tree(ball_tree, X_test, k=3, labels=None):
    """Predict a label for every test sample by majority vote of its k nearest neighbors.

    Parameters
    ----------
    ball_tree : object
        Fitted tree exposing ``query(X, k=...)`` that returns ``(dist, ind)``,
        where ``ind`` holds one row of neighbor indices per query sample.
    X_test : array-like of shape (n_samples, n_features)
        Samples to classify.
    k : int, default=3
        Number of neighbors that vote.
    labels : array-like of non-negative ints, optional
        Labels of the points the tree was built on, in the same order.
        When omitted, the module-level ``y_train`` is used, which keeps
        existing three-argument calls working.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Predicted labels. Ties are resolved in favor of the smallest label,
        because ``argmax`` returns the first maximum of the bin counts.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook-level training labels.
        labels = y_train
    labels = np.asarray(labels)

    X_test = np.asarray(X_test)
    if X_test.size == 0:
        # Nothing to classify; skip the tree query entirely.
        return np.array([], dtype=int)

    # One batched query replaces a Python-level query call per test sample.
    _, neighbor_idx = ball_tree.query(X_test, k=k)
    neighbor_labels = labels[neighbor_idx]

    return np.array([np.bincount(row).argmax() for row in neighbor_labels])

# Step 5: Predict labels for the held-out samples with the Ball Tree classifier
print("Testing k-NN with Ball Tree...")
y_pred_ball_tree = knn_with_ball_tree(ball_tree, X_test, k=3)

# Step 6: Compare the predictions with the true test labels
accuracy_ball_tree = accuracy_score(y_true=y_test, y_pred=y_pred_ball_tree)
print(f"Accuracy of k-NN with Ball Tree (k=3): {accuracy_ball_tree:.2%}")


Fetching MNIST dataset...
Subsampling dataset for testing...
Training set size: 7000, Test set size: 3000
Building Ball Tree using sklearn with custom cosine distance...
Testing k-NN with Ball Tree...
Accuracy of k-NN with Ball Tree (k=3): 95.47%


In [2]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree

# Step 2: Download MNIST from OpenML and prepare features and labels
print("Fetching MNIST dataset...")
X, y = fetch_openml(
    'mnist_784',
    version=1,
    return_X_y=True,
    as_frame=False,
)
X = np.divide(X, 255.0)  # rescale pixel values into [0, 1]
y = y.astype(int)  # labels are cast to integers

# Euclidean Distance function
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    The element-wise difference is squared, summed, and square-rooted.
    """
    difference = x - y
    squared_total = np.square(difference).sum()
    return np.sqrt(squared_total)

# Define the custom Euclidean distance function for BallTree
def euclidean_distance_ball_tree(x, y):
    """Metric callable handed to BallTree; delegates to ``euclidean_distance``."""
    distance = euclidean_distance(x, y)
    return distance

# Example dataset: keep the first `sample_size` samples, then hold out 30% for testing
print("Subsampling dataset for testing...")
sample_size = 10000  # smaller subset used for training/testing
X_train, X_test, y_train, y_test = train_test_split(
    X[:sample_size],
    y[:sample_size],
    test_size=0.3,
    random_state=42,
)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# Step 3: Index the training points in a Ball Tree driven by the custom Euclidean callable
# NOTE(review): scikit-learn also provides a built-in 'euclidean' metric that avoids a
# Python call per distance evaluation -- consider it if build/query time matters.
print("Building Ball Tree using sklearn with custom Euclidean distance...")
ball_tree = BallTree(X_train, metric=euclidean_distance_ball_tree)

# Step 4: Query the Ball Tree to find the k nearest neighbors
def knn_with_ball_tree(ball_tree, X_test, k=3, labels=None):
    """Predict a label for every test sample by majority vote of its k nearest neighbors.

    Parameters
    ----------
    ball_tree : object
        Fitted tree exposing ``query(X, k=...)`` that returns ``(dist, ind)``,
        where ``ind`` holds one row of neighbor indices per query sample.
    X_test : array-like of shape (n_samples, n_features)
        Samples to classify.
    k : int, default=3
        Number of neighbors that vote.
    labels : array-like of non-negative ints, optional
        Labels of the points the tree was built on, in the same order.
        When omitted, the module-level ``y_train`` is used, which keeps
        existing three-argument calls working.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Predicted labels. Ties are resolved in favor of the smallest label,
        because ``argmax`` returns the first maximum of the bin counts.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook-level training labels.
        labels = y_train
    labels = np.asarray(labels)

    X_test = np.asarray(X_test)
    if X_test.size == 0:
        # Nothing to classify; skip the tree query entirely.
        return np.array([], dtype=int)

    # One batched query replaces a Python-level query call per test sample.
    _, neighbor_idx = ball_tree.query(X_test, k=k)
    neighbor_labels = labels[neighbor_idx]

    return np.array([np.bincount(row).argmax() for row in neighbor_labels])

# Step 5: Predict labels for the held-out samples with the Ball Tree classifier
print("Testing k-NN with Ball Tree...")
y_pred_ball_tree = knn_with_ball_tree(ball_tree, X_test, k=3)

# Step 6: Compare the predictions with the true test labels
accuracy_ball_tree = accuracy_score(y_true=y_test, y_pred=y_pred_ball_tree)
print(f"Accuracy of k-NN with Ball Tree (k=3): {accuracy_ball_tree:.2%}")


Fetching MNIST dataset...
Subsampling dataset for testing...
Training set size: 7000, Test set size: 3000
Building Ball Tree using sklearn with custom Euclidean distance...
Testing k-NN with Ball Tree...
Accuracy of k-NN with Ball Tree (k=3): 94.67%


In [3]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree

# Step 2: Download MNIST from OpenML and prepare features and labels
print("Fetching MNIST dataset...")
X, y = fetch_openml(
    'mnist_784',
    version=1,
    return_X_y=True,
    as_frame=False,
)
X = np.divide(X, 255.0)  # rescale pixel values into [0, 1]
y = y.astype(int)  # labels are cast to integers

# Manhattan Distance function
def manhattan_distance(x, y):
    """Return the Manhattan (L1) distance between ``x`` and ``y``.

    This is the sum of the absolute element-wise differences.
    """
    difference = x - y
    return np.absolute(difference).sum()

# Define the custom Manhattan distance function for BallTree
def manhattan_distance_ball_tree(x, y):
    """Metric callable handed to BallTree; delegates to ``manhattan_distance``."""
    distance = manhattan_distance(x, y)
    return distance

# Example dataset: keep the first `sample_size` samples, then hold out 30% for testing
print("Subsampling dataset for testing...")
sample_size = 10000  # smaller subset used for training/testing
X_train, X_test, y_train, y_test = train_test_split(
    X[:sample_size],
    y[:sample_size],
    test_size=0.3,
    random_state=42,
)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# Step 3: Index the training points in a Ball Tree driven by the custom Manhattan callable
# NOTE(review): scikit-learn also provides a built-in 'manhattan' metric that avoids a
# Python call per distance evaluation -- consider it if build/query time matters.
print("Building Ball Tree using sklearn with custom Manhattan distance...")
ball_tree = BallTree(X_train, metric=manhattan_distance_ball_tree)

# Step 4: Query the Ball Tree to find the k nearest neighbors
def knn_with_ball_tree(ball_tree, X_test, k=3, labels=None):
    """Predict a label for every test sample by majority vote of its k nearest neighbors.

    Parameters
    ----------
    ball_tree : object
        Fitted tree exposing ``query(X, k=...)`` that returns ``(dist, ind)``,
        where ``ind`` holds one row of neighbor indices per query sample.
    X_test : array-like of shape (n_samples, n_features)
        Samples to classify.
    k : int, default=3
        Number of neighbors that vote.
    labels : array-like of non-negative ints, optional
        Labels of the points the tree was built on, in the same order.
        When omitted, the module-level ``y_train`` is used, which keeps
        existing three-argument calls working.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Predicted labels. Ties are resolved in favor of the smallest label,
        because ``argmax`` returns the first maximum of the bin counts.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook-level training labels.
        labels = y_train
    labels = np.asarray(labels)

    X_test = np.asarray(X_test)
    if X_test.size == 0:
        # Nothing to classify; skip the tree query entirely.
        return np.array([], dtype=int)

    # One batched query replaces a Python-level query call per test sample.
    _, neighbor_idx = ball_tree.query(X_test, k=k)
    neighbor_labels = labels[neighbor_idx]

    return np.array([np.bincount(row).argmax() for row in neighbor_labels])

# Step 5: Predict labels for the held-out samples with the Ball Tree classifier
print("Testing k-NN with Ball Tree...")
y_pred_ball_tree = knn_with_ball_tree(ball_tree, X_test, k=3)

# Step 6: Compare the predictions with the true test labels
accuracy_ball_tree = accuracy_score(y_true=y_test, y_pred=y_pred_ball_tree)
print(f"Accuracy of k-NN with Ball Tree (k=3): {accuracy_ball_tree:.2%}")


Fetching MNIST dataset...
Subsampling dataset for testing...
Training set size: 7000, Test set size: 3000
Building Ball Tree using sklearn with custom Manhattan distance...
Testing k-NN with Ball Tree...
Accuracy of k-NN with Ball Tree (k=3): 93.60%
