<a href="https://colab.research.google.com/github/yutongye616/python-colab/blob/main/sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import numpy as np
import time
from sklearn.datasets import load_iris, load_breast_cancer, load_digits

def compute_distance_naive(X):
    """
    Compute distance matrix using nested loops
    """
    N, D = X.shape
    M = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            diff = X[i] - X[j]
            M[i, j] = np.sqrt(np.dot(diff, diff))
    return M

def compute_distance_smart(X):
    """
    Compute distance matrix using vectorization
    """
    squared_norms = np.sum(X**2, axis=1)
    dot_products = X @ X.T
    M_squared = squared_norms.reshape(-1, 1) - 2 * dot_products + squared_norms
    M_squared = np.maximum(M_squared, 0)
    return np.sqrt(M_squared)

def compute_correlation_naive(X):
    """
    Compute correlation matrix using nested loops
    """
    N, D = X.shape
    M = np.zeros((D, D))
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0, ddof=1)

    for i in range(D):
        for j in range(D):
            if stds[i] == 0 or stds[j] == 0:
                M[i, j] = 1.0 if i == j else 0.0
            else:
                covariance = np.sum((X[:, i] - means[i]) * (X[:, j] - means[j])) / (N - 1)
                M[i, j] = covariance / (stds[i] * stds[j])
    return M

def compute_correlation_smart(X):
    """
    Compute correlation matrix using vectorization
    """
    N, D = X.shape
    X_centered = X - np.mean(X, axis=0)
    covariance = (X_centered.T @ X_centered) / (N - 1)
    stds = np.sqrt(np.diag(covariance))
    denominator = np.outer(stds, stds)

    M = np.zeros((D, D))
    for i in range(D):
        for j in range(D):
            if denominator[i, j] > 1e-10:
                M[i, j] = covariance[i, j] / denominator[i, j]
            else:
                M[i, j] = 1.0 if i == j else 0.0
    return M

def sklearn_datasets():
    datasets = {
        'Iris': load_iris(),
        'Breast Cancer': load_breast_cancer(),
        'Digits': load_digits()
    }

    distance_results = []
    correlation_results = []

    for name, dataset in datasets.items():
        X = dataset.data
        N, D = X.shape

        # Compute distance matrix
        start_time = time.time()
        dist_loop = compute_distance_naive(X)
        end_time = time.time()
        time_dist_loop = end_time - start_time

        start_time = time.time()
        dist_smart = compute_distance_smart(X)
        end_time = time.time()
        time_dist_smart = end_time - start_time

        # Compute correlation matrix
        start_time = time.time()
        corr_loop = compute_correlation_naive(X)
        end_time = time.time()
        time_corr_loop = end_time - start_time

        start_time = time.time()
        corr_smart = compute_correlation_smart(X)
        end_time = time.time()
        time_corr_smart = end_time - start_time

        # Store results (remove Speedup since not required)
        distance_results.append({
            'Dataset': name,
            'N': N,
            'D': D,
            'With Loops (s)': time_dist_loop,
            'Without Loops (s)': time_dist_smart
        })

        correlation_results.append({
            'Dataset': name,
            'N': N,
            'D': D,
            'With Loops (s)': time_corr_loop,
            'Without Loops (s)': time_corr_smart
        })

    # Display tables

    print("DISTANCE MATRIX COMPUTATION TIMES")
    print("_" * 100)

    print("Dataset         | N      | D | With Loops | Without Loops")
    print("_" * 100)
    for result in distance_results:
        print(f"{result['Dataset']:15} | {result['N']:5} | {result['D']:3} | {result['With Loops (s)']:.6f} | {result['Without Loops (s)']:.6f}")

    print("\n")


    print("CORRELATION MATRIX COMPUTATION TIMES")
    print("_" * 100)

    print("Dataset         | N      | D | With Loops | Without Loops")
    print("_" * 100)
    for result in correlation_results:
        print(f"{result['Dataset']:15} | {result['N']:5} | {result['D']:3} | {result['With Loops (s)']:.6f} ")


    return distance_results, correlation_results

def main():
    distance_results, correlation_results = sklearn_datasets()

if __name__ == "__main__":
    main()

DISTANCE MATRIX COMPUTATION TIMES
____________________________________________________________________________________________________
Dataset         | N      | D | With Loops | Without Loops
____________________________________________________________________________________________________
Iris            |   150 |   4 | 0.073210 | 0.000334
Breast Cancer   |   569 |  30 | 1.515725 | 0.008162
Digits          |  1797 |  64 | 11.629039 | 0.067990


CORRELATION MATRIX COMPUTATION TIMES
____________________________________________________________________________________________________
Dataset         | N      | D | With Loops | Without Loops
____________________________________________________________________________________________________
Iris            |   150 |   4 | 0.000391 
Breast Cancer   |   569 |  30 | 0.035545 
Digits          |  1797 |  64 | 0.122956 
