<a href="https://colab.research.google.com/github/yutongye616/python-colab/blob/main/sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Problem 3: Benchmark sklearn datasets for distance and correlation matrices
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris, load_breast_cancer, load_digits

# Import the functions from your other files (or copy them here)
def compute_distance_naive(X):
    """Build the pairwise Euclidean distance matrix with explicit Python loops.

    Args:
        X: 2-D array-like with one sample per row; converted to float64.

    Returns:
        A symmetric (N, N) float64 array whose entry [i, j] is the Euclidean
        distance between rows i and j of X.
    """
    points = np.asarray(X, dtype=np.float64)
    n_points, _ = points.shape
    dist = np.zeros((n_points, n_points), dtype=np.float64)

    for row, anchor in enumerate(points):
        # Only the upper triangle (diagonal included) is computed; each value
        # is mirrored into the lower triangle.
        for col in range(row, n_points):
            delta = anchor - points[col]
            length = np.sqrt(np.dot(delta, delta))
            dist[row, col] = length
            dist[col, row] = length

    return dist

def compute_distance_smart(X):
    """Compute the pairwise Euclidean distance matrix without Python loops.

    Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b so that the
    whole matrix comes from a single matrix product.

    Args:
        X: 2-D array-like with one sample per row; converted to float64.

    Returns:
        A symmetric (N, N) float64 array of Euclidean distances between rows.
    """
    # Convert first so that plain lists (no ``.shape``) are accepted, matching
    # what compute_distance_naive allows.
    X = np.asarray(X, dtype=np.float64, order='C')
    N = X.shape[0]
    sq = np.sum(X * X, axis=1).reshape(N, 1)
    M = sq + sq.T - 2.0 * (X @ X.T)
    # Rounding in the expansion can produce tiny negative values; clamp them
    # before taking the square root.
    np.maximum(M, 0.0, out=M)
    np.sqrt(M, out=M)
    # A point is at distance exactly 0 from itself; rounding in the expansion
    # can otherwise leave a small positive residue on the diagonal.
    np.fill_diagonal(M, 0.0)
    return M

def compute_correlation_naive(X):
    """Build the feature-by-feature correlation matrix with explicit loops.

    Args:
        X: 2-D array-like of shape (N, D), samples in rows and features in
            columns; converted to float64.

    Returns:
        A symmetric (D, D) float64 array of Pearson correlations using the
        sample standard deviation (ddof=1). The diagonal is 1.0, and any pair
        involving a feature whose standard deviation is exactly 0 gets 0.0.
    """
    data = np.asarray(X, dtype=np.float64, order='C')
    n_samples, n_features = data.shape

    sigma = np.std(data, axis=0, ddof=1)
    centered = data - np.mean(data, axis=0)

    # Start from the identity: the diagonal is already 1 and every
    # off-diagonal entry defaults to 0 until a correlation is written.
    corr = np.eye(n_features)

    for a in range(n_features):
        col_a = centered[:, a]
        sigma_a = sigma[a]
        for b in range(a + 1, n_features):
            sigma_b = sigma[b]
            if sigma_a == 0.0 or sigma_b == 0.0:
                # Zero-variance feature: leave the pair at 0.
                continue
            cross = np.dot(col_a, centered[:, b])
            r = cross / ((n_samples - 1) * sigma_a * sigma_b)
            corr[a, b] = r
            corr[b, a] = r

    return corr

def compute_correlation_smart(X):
    """Compute the feature-by-feature correlation matrix without Python loops.

    Args:
        X: 2-D array-like of shape (N, D), samples in rows and features in
            columns; converted to float64.

    Returns:
        A symmetric (D, D) float64 array of Pearson correlations based on the
        sample covariance (divisor N - 1). The diagonal is 1.0 and any pair
        involving a constant (zero-variance) feature is 0.0.
    """
    # Convert first so that plain lists (no ``.shape``) are accepted.
    X = np.asarray(X, dtype=np.float64, order='C')
    N, D = X.shape
    Xc = X - np.mean(X, axis=0)
    covariance = (Xc.T @ Xc) / (N - 1)
    stds = np.sqrt(np.diag(covariance))
    denom = np.outer(stds, stds)

    # Detect constant features from the data itself (max == min) instead of
    # comparing the product of standard deviations with an absolute epsilon:
    # an absolute threshold wrongly zeroes real correlations between
    # small-scale features. ``stds > 0`` additionally protects the division
    # if a variance underflows to zero or is NaN.
    varying = (X.max(axis=0) != X.min(axis=0)) & (stds > 0.0)
    valid = varying[:, None] & varying[None, :]

    # Pre-fill with zeros: with ``where=`` the ufunc leaves masked entries of
    # ``out`` untouched, so they must hold the intended value beforehand.
    M = np.zeros_like(covariance)
    np.divide(covariance, denom, out=M, where=valid)
    np.fill_diagonal(M, 1.0)
    return M


def _time_call(func, X):
    """Run ``func(X)`` once and return ``(result, elapsed_seconds)``."""
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can be too coarse for sub-millisecond runs.
    start = time.perf_counter()
    result = func(X)
    return result, time.perf_counter() - start


def _speedup(slow_seconds, fast_seconds):
    """Return ``slow / fast``, or infinity when the fast timing measured 0."""
    if fast_seconds > 0:
        return slow_seconds / fast_seconds
    return float('inf')


def _format_timing_table(rows):
    """Render a list of timing rows as a fixed-width text table."""
    table = pd.DataFrame(rows)
    for column in ('With Loops (s)', 'Without Loops (s)'):
        table[column] = table[column].apply(lambda x: f'{x:.6f}')
    table['Speedup'] = table['Speedup'].apply(lambda x: f'{x:.2f}x')
    return table.to_string(index=False)


def _timing_row(name, N, D, loop_seconds, smart_seconds):
    """Build one result row in the column layout used by both tables."""
    return {
        'Dataset': name,
        'N': N,
        'D': D,
        'With Loops (s)': loop_seconds,
        'Without Loops (s)': smart_seconds,
        'Speedup': _speedup(loop_seconds, smart_seconds),
    }


def benchmark_sklearn_datasets():
    """Benchmark loop vs. vectorized matrix computations on three datasets.

    For each of the Iris, Breast Cancer and Digits datasets this times the
    naive and vectorized distance-matrix and correlation-matrix functions,
    checks that both implementations agree (``atol=1e-04``), prints two
    timing tables and writes them to ``problem3_results.txt``.

    Returns:
        A tuple ``(distance_results, correlation_results)``; each is a list
        with one dict per dataset holding the keys 'Dataset', 'N', 'D',
        'With Loops (s)', 'Without Loops (s)' and 'Speedup'.

    Raises:
        AssertionError: if the naive and vectorized results differ.
    """

    # Load datasets
    datasets = {
        'Iris': load_iris(),
        'Breast Cancer': load_breast_cancer(),
        'Digits': load_digits()
    }

    # Initialize results
    distance_results = []
    correlation_results = []

    print("=" * 80)
    print("PROBLEM 3: SKLEARN DATASETS BENCHMARK")
    print("=" * 80)

    for name, dataset in datasets.items():
        X = dataset.data
        N, D = X.shape

        print(f"\nProcessing {name} dataset (N={N}, D={D})...")

        # Benchmark distance matrix
        dist_loop, time_dist_loop = _time_call(compute_distance_naive, X)
        dist_smart, time_dist_smart = _time_call(compute_distance_smart, X)

        # Benchmark correlation matrix
        corr_loop, time_corr_loop = _time_call(compute_correlation_naive, X)
        corr_smart, time_corr_smart = _time_call(compute_correlation_smart, X)

        # Verify results. Explicit raises are used instead of ``assert`` so
        # the checks still run when Python is started with -O.
        if not np.allclose(dist_loop, dist_smart, atol=1e-04):
            raise AssertionError(f"Distance matrices mismatch for {name}")
        if not np.allclose(corr_loop, corr_smart, atol=1e-04):
            # Dump both matrices to help diagnose the disagreement.
            print(f"AssertionError for {name} correlation matrix.")
            print(f"Max absolute difference: {np.max(np.abs(corr_loop - corr_smart))}")
            print("corr_loop:")
            print(corr_loop)
            print("corr_smart:")
            print(corr_smart)
            raise AssertionError(f"Correlation matrices mismatch for {name}")

        # Store results
        distance_results.append(
            _timing_row(name, N, D, time_dist_loop, time_dist_smart)
        )
        correlation_results.append(
            _timing_row(name, N, D, time_corr_loop, time_corr_smart)
        )

    # Display tables
    dist_table = _format_timing_table(distance_results)
    corr_table = _format_timing_table(correlation_results)

    print("\n" + "=" * 80)
    print("TABLE 1: DISTANCE MATRIX COMPUTATION TIMES")
    print("=" * 80)
    print(dist_table)

    print("\n" + "=" * 80)
    print("TABLE 2: CORRELATION MATRIX COMPUTATION TIMES")
    print("=" * 80)
    print(corr_table)

    # Save to file
    with open('problem3_results.txt', 'w', encoding='utf-8') as f:
        f.write("PROBLEM 3 RESULTS\n")
        f.write("=" * 50 + "\n")
        f.write("Distance Matrix Times:\n")
        f.write(dist_table + "\n\n")
        f.write("Correlation Matrix Times:\n")
        f.write(corr_table + "\n")

    print("\nResults saved to 'problem3_results.txt'")

    return distance_results, correlation_results

def main():
    """Run the sklearn benchmark and print a closing summary.

    The summary lines (dataset shapes and the number of computations) are
    derived from the rows returned by ``benchmark_sklearn_datasets`` so they
    cannot drift from what was actually benchmarked.
    """
    print("Problem 3: Sklearn Datasets Benchmark")
    print("This script computes distance and correlation matrices for")
    print("Iris, Breast Cancer, and Digits datasets from sklearn.")
    print()

    distance_results, correlation_results = benchmark_sklearn_datasets()

    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETED SUCCESSFULLY!")
    print("=" * 80)
    print("\nSummary:")
    # Both result lists describe the same datasets in the same order, so the
    # distance rows are enough to report each dataset's shape.
    for row in distance_results:
        print(f"- {row['Dataset']}: N={row['N']}, D={row['D']}")
    n_datasets = len(distance_results)
    # Two matrix types (distance, correlation) times two methods (loop,
    # vectorized) are run per dataset.
    total = n_datasets * 2 * 2
    print(f"- Total computations: {n_datasets} datasets × 2 matrix types × 2 methods = {total}")
    print("- All results verified for numerical accuracy")

# Run the benchmark only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Problem 3: Sklearn Datasets Benchmark
This script computes distance and correlation matrices for
Iris, Breast Cancer, and Digits datasets from sklearn.

PROBLEM 3: SKLEARN DATASETS BENCHMARK

Processing Iris dataset (N=150, D=4)...

Processing Breast Cancer dataset (N=569, D=30)...

Processing Digits dataset (N=1797, D=64)...

TABLE 1: DISTANCE MATRIX COMPUTATION TIMES
      Dataset    N  D With Loops (s) Without Loops (s) Speedup
         Iris  150  4       0.035914          0.000460  78.05x
Breast Cancer  569 30       0.548300          0.004896 111.99x
       Digits 1797 64       6.960559          0.057974 120.06x

TABLE 2: CORRELATION MATRIX COMPUTATION TIMES
      Dataset    N  D With Loops (s) Without Loops (s) Speedup
         Iris  150  4       0.000260          0.000131   1.99x
Breast Cancer  569 30       0.002425          0.000350   6.93x
       Digits 1797 64       0.020144          0.001578  12.76x

Results saved to 'problem3_results.txt'

BENCHMARK COMPLETED SUCCESSFULLY!

