In [1]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
samples=5000
X, y = X[:samples], y[:samples]  # Select first 10,000 images for processing
print(f"Using first {samples} images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Euclidean Distance function
def euclidean_distance(x, y):
    """Calculate the Euclidean distance between two points."""
    return np.sqrt(np.sum((x - y) ** 2))

# Step 4: Implement k-NN without BallTree
def knn_predict(X_train, y_train, X_test, k=3):
    """Predict labels using k-NN and Euclidean distance."""
    y_pred = []
    
    # For each test point, calculate distances to all training points
    for test_point in X_test:
        distances = [euclidean_distance(test_point, train_point) for train_point in X_train]
        
        # Get indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        
        # Majority vote
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    
    return np.array(y_pred)

# Step 5: Function to perform k-NN with k control
def perform_knn_with_k_control(X, y, k_values):
    results = {}
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    for k in k_values:
        print(f"\nRunning k-NN with k = {k}...")
        fold_scores = []
        fold_times = []
        start_time = time.time()
        
        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
            print(f"Processing fold {fold_idx} for k = {k}...")
            
            # Split the data for the current fold
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Predict using k-NN for the current value of k
            y_pred = knn_predict(X_train, y_train, X_test, k=k)

            # Calculate accuracy for this fold
            accuracy = accuracy_score(y_test, y_pred)
            fold_scores.append(accuracy)

            # Track the time taken for this fold
            fold_end_time = time.time()
            fold_time = fold_end_time - start_time
            fold_times.append(fold_time)
            
            print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")
        
        # Compute overall results for this value of k
        mean_accuracy = np.mean(fold_scores)
        std_accuracy = np.std(fold_scores)
        total_time = np.sum(fold_times)
        best_accuracy = np.max(fold_scores)

        print(f"\nResults for k = {k}:")
        print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
        print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
        print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
        print(f"Best accuracy: {best_accuracy * 100:.2f}%")
        
        # Store results
        results[k] = {
            "mean_accuracy": mean_accuracy,
            "std_accuracy": std_accuracy,
            "total_time": total_time,
            "best_accuracy": best_accuracy,
        }
    
    return results

# Step 6: Run k-NN with multiple values of k
k_values = [1,3,5,7]  # Example k values to test
results = perform_knn_with_k_control(X, y, k_values)

# Step 7: Summarize results
print("\nSummary of results for all k values:")
for k, result in results.items():
    print(f"k = {k}: Mean Accuracy = {result['mean_accuracy'] * 100:.2f}%, Best Accuracy = {result['best_accuracy'] * 100:.2f}%, Total Time = {result['total_time']:.2f} seconds")


Fetching MNIST dataset...
Using first 10,000 images: 5000 training samples.
Standardizing features...

Running k-NN with k = 1...
Processing fold 1 for k = 1...
Fold 1: Accuracy = 89.00%, Time = 13.39 seconds
Processing fold 2 for k = 1...
Fold 2: Accuracy = 89.60%, Time = 27.19 seconds
Processing fold 3 for k = 1...
Fold 3: Accuracy = 91.00%, Time = 41.48 seconds
Processing fold 4 for k = 1...
Fold 4: Accuracy = 90.20%, Time = 55.99 seconds
Processing fold 5 for k = 1...
Fold 5: Accuracy = 91.40%, Time = 70.63 seconds
Processing fold 6 for k = 1...
Fold 6: Accuracy = 88.40%, Time = 84.98 seconds
Processing fold 7 for k = 1...
Fold 7: Accuracy = 88.20%, Time = 99.34 seconds
Processing fold 8 for k = 1...
Fold 8: Accuracy = 91.20%, Time = 113.12 seconds
Processing fold 9 for k = 1...
Fold 9: Accuracy = 88.40%, Time = 127.50 seconds
Processing fold 10 for k = 1...
Fold 10: Accuracy = 91.80%, Time = 141.90 seconds

Results for k = 1:
Accuracy for each fold: ['89.00%', '89.60%', '91.00%', 

In [2]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
samples=5000
X, y = X[:samples], y[:samples]  # Select first 10,000 images for processing
print(f"Using first {5000} images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize the pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Define the custom cosine distance function
def cosine_similarity(point1, point2):
    """Calculate the cosine similarity between two points."""
    dot_product = np.dot(point1, point2)
    norm1 = np.linalg.norm(point1)
    norm2 = np.linalg.norm(point2)
    return dot_product / (norm1 * norm2)

def cosine_distance(x, y):
    """Calculate the cosine distance between two vectors."""
    return 1 - cosine_similarity(x, y)

# Step 4: Function to perform k-NN with Ball Tree and k control
def perform_knn_with_ball_tree_k_control(X, y, k_values):
    results = {}
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    for k in k_values:
        print(f"\nRunning k-NN with k = {k} using Ball Tree and cosine distance...")
        fold_scores = []
        fold_times = []
        start_time = time.time()
        
        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
            print(f"Processing fold {fold_idx} for k = {k}...")
            
            # Split the data for the current fold
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Build the Ball Tree for the current fold
            ball_tree = BallTree(X_train, metric=cosine_distance)

            # Query the Ball Tree to find the k nearest neighbors
            def knn_with_ball_tree(ball_tree, X_test, k):
                y_pred = []
                for test_point in X_test:
                    dist, ind = ball_tree.query([test_point], k=k)  # Query for k nearest neighbors
                    neighbors_labels = y_train[ind].flatten()  # Get the labels of the neighbors
                    predicted_label = np.bincount(neighbors_labels).argmax()  # Majority vote
                    y_pred.append(predicted_label)
                return np.array(y_pred)

            # Test k-NN with Ball Tree
            y_pred_ball_tree = knn_with_ball_tree(ball_tree, X_test, k=k)

            # Calculate accuracy for this fold
            accuracy = accuracy_score(y_test, y_pred_ball_tree)
            fold_scores.append(accuracy)

            # Track the time taken for this fold
            fold_end_time = time.time()
            fold_time = fold_end_time - start_time
            fold_times.append(fold_time)
            
            print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")
        
        # Compute overall results for this value of k
        mean_accuracy = np.mean(fold_scores)
        std_accuracy = np.std(fold_scores)
        total_time = np.sum(fold_times)
        best_accuracy = np.max(fold_scores)

        print(f"\nResults for k = {k}:")
        print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
        print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
        print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
        print(f"Best accuracy: {best_accuracy * 100:.2f}%")
        
        # Store results
        results[k] = {
            "mean_accuracy": mean_accuracy,
            "std_accuracy": std_accuracy,
            "total_time": total_time,
            "best_accuracy": best_accuracy,
        }
    
    return results

# Step 5: Run k-NN with multiple values of k
k_values = [1,3,5,7]  # Example k values to test
results = perform_knn_with_ball_tree_k_control(X, y, k_values)

# Step 6: Summarize results
print("\nSummary of results for all k values:")
for k, result in results.items():
    print(f"k = {k}: Mean Accuracy = {result['mean_accuracy'] * 100:.2f}%, Best Accuracy = {result['best_accuracy'] * 100:.2f}%, Total Time = {result['total_time']:.2f} seconds")


Fetching MNIST dataset...
Using first 5000 images: 5000 training samples.
Standardizing features...

Running k-NN with k = 1 using Ball Tree and cosine distance...
Processing fold 1 for k = 1...
Fold 1: Accuracy = 88.80%, Time = 18.10 seconds
Processing fold 2 for k = 1...
Fold 2: Accuracy = 90.40%, Time = 36.89 seconds
Processing fold 3 for k = 1...
Fold 3: Accuracy = 90.80%, Time = 55.37 seconds
Processing fold 4 for k = 1...
Fold 4: Accuracy = 88.80%, Time = 73.68 seconds
Processing fold 5 for k = 1...
Fold 5: Accuracy = 91.80%, Time = 89.18 seconds
Processing fold 6 for k = 1...
Fold 6: Accuracy = 90.40%, Time = 104.54 seconds
Processing fold 7 for k = 1...
Fold 7: Accuracy = 88.40%, Time = 119.79 seconds
Processing fold 8 for k = 1...
Fold 8: Accuracy = 91.20%, Time = 135.05 seconds
Processing fold 9 for k = 1...
Fold 9: Accuracy = 89.20%, Time = 150.59 seconds
Processing fold 10 for k = 1...
Fold 10: Accuracy = 92.40%, Time = 165.83 seconds

Results for k = 1:
Accuracy for each f

In [3]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch and preprocess the MNIST dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
samples=5000
X, y = X[:samples], y[:samples]  # Select first 10,000 images for processing
print(f"Using first {samples} images: {X.shape[0]} training samples.")

X = X / 255.0  # Normalize pixel values to [0, 1]
y = y.astype(int)  # Convert labels to integers

# Step 2: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Manhattan Distance function
def manhattan_distance(x, y):
    """Calculate the Manhattan (L1) distance between two points."""
    return np.sum(np.abs(x - y))

# Step 4: Implement k-NN without BallTree
def knn_predict(X_train, y_train, X_test, k=3):
    """Predict labels using k-NN and Manhattan distance."""
    y_pred = []
    
    # For each test point, calculate distances to all training points
    for test_point in X_test:
        distances = [manhattan_distance(test_point, train_point) for train_point in X_train]
        
        # Get indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        
        # Majority vote
        predicted_label = np.bincount(k_nearest_labels).argmax()
        y_pred.append(predicted_label)
    
    return np.array(y_pred)

# Step 5: Function to perform k-NN with k control
def perform_knn_with_k_control(X, y, k_values):
    results = {}
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    for k in k_values:
        print(f"\nRunning k-NN with k = {k} using Manhattan distance...")
        fold_scores = []
        fold_times = []
        start_time = time.time()
        
        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
            print(f"Processing fold {fold_idx} for k = {k}...")
            
            # Split the data for the current fold
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Predict using k-NN
            print(f"Predicting using k-NN for fold {fold_idx}...")
            y_pred = knn_predict(X_train, y_train, X_test, k=k)

            # Calculate accuracy for this fold
            accuracy = accuracy_score(y_test, y_pred)
            fold_scores.append(accuracy)

            # Track the time taken for this fold
            fold_end_time = time.time()
            fold_time = fold_end_time - start_time
            fold_times.append(fold_time)
            
            print(f"Fold {fold_idx}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")
        
        # Compute overall results for this value of k
        mean_accuracy = np.mean(fold_scores)
        std_accuracy = np.std(fold_scores)
        total_time = np.sum(fold_times)
        best_accuracy = np.max(fold_scores)

        print(f"\nResults for k = {k}:")
        print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
        print(f"Mean accuracy: {mean_accuracy * 100:.2f}%")
        print(f"Standard deviation of accuracy: {std_accuracy * 100:.2f}%")
        print(f"Best accuracy: {best_accuracy * 100:.2f}%")
        
        # Store results
        results[k] = {
            "mean_accuracy": mean_accuracy,
            "std_accuracy": std_accuracy,
            "total_time": total_time,
            "best_accuracy": best_accuracy,
        }
    
    return results

# Step 6: Run k-NN with multiple values of k
k_values = [1,3,5,7]  # Example k values to test
results = perform_knn_with_k_control(X, y, k_values)

# Step 7: Summarize results
print("\nSummary of results for all k values:")
for k, result in results.items():
    print(f"k = {k}: Mean Accuracy = {result['mean_accuracy'] * 100:.2f}%, Best Accuracy = {result['best_accuracy'] * 100:.2f}%, Total Time = {result['total_time']:.2f} seconds")


Fetching MNIST dataset...
Using first 5000 images: 5000 training samples.
Standardizing features...

Running k-NN with k = 1 using Manhattan distance...
Processing fold 1 for k = 1...
Predicting using k-NN for fold 1...
Fold 1: Accuracy = 91.80%, Time = 26.03 seconds
Processing fold 2 for k = 1...
Predicting using k-NN for fold 2...
Fold 2: Accuracy = 89.40%, Time = 54.92 seconds
Processing fold 3 for k = 1...
Predicting using k-NN for fold 3...
Fold 3: Accuracy = 93.20%, Time = 86.63 seconds
Processing fold 4 for k = 1...
Predicting using k-NN for fold 4...
Fold 4: Accuracy = 91.40%, Time = 121.50 seconds
Processing fold 5 for k = 1...
Predicting using k-NN for fold 5...
Fold 5: Accuracy = 92.00%, Time = 153.72 seconds
Processing fold 6 for k = 1...
Predicting using k-NN for fold 6...
Fold 6: Accuracy = 91.20%, Time = 188.90 seconds
Processing fold 7 for k = 1...
Predicting using k-NN for fold 7...
Fold 7: Accuracy = 93.40%, Time = 219.91 seconds
Processing fold 8 for k = 1...
Predict