In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import timeit
import math

from archive.randomized_projected_nmf import *
from random_matrix import *
from nmf import * 
from benchmark import *
from collections import defaultdict

**Overview of Algorithms**

1. NNLS: Active Set Method
    - i) Randomized Projected NMF
        * a) Gaussian Test Matrix
        * b) SRHT Test Matrix
        * c) SRFT Test Matrix
        * d) Count Sketch Matrix
        * e) Sparse JL
    - ii) Structured Random Projected NMF
        * a) Gaussian Test Matrix
        * b) SRHT Test Matrix
        * c) SRFT Test Matrix
        * d) Count Sketch Matrix
        * e) Sparse JL
2. Multiplicative Updates
    - i) Randomized Projected NMF
        * a) Gaussian Test Matrix
        * b) SRHT Test Matrix
        * c) SRFT Test Matrix
        * d) Count Sketch Matrix
        * e) Sparse JL
    - ii) Structured Random Projected NMF
        * a) Gaussian Test Matrix
        * b) SRHT Test Matrix
        * c) SRFT Test Matrix
        * d) Count Sketch Matrix
        * e) Sparse JL

In [None]:
# Benchmark every compressed NMF variant under every projection type.
# For each matrix size we store the mean wall-clock time and the mean final
# reconstruction error over `runs` differently seeded fits.
methods = {
    'MU C': nmf_compress_mu,
    'MU SC': nmf_structured_compress_mu,
    'HALS C': nmf_compress_hals,
    'HALS SC': nmf_structured_compress_hals,
}

projection_types = ['gaussian', 'srht', 'srft', 'sparse-jl', 'count-sketch']

# stats[metric][method][projection] -> one averaged value per entry of `sizes`
stats = {
    metric: {name: defaultdict(list) for name in methods}
    for metric in ('time', 'errors')
}

# Parameters
sizes = np.arange(100, 1_001, 100)
runs = 10
r = 20

for n in sizes:
    # The same synthetic matrix is reused by every method at a given size.
    np.random.seed(1)
    A, _, _ = generate_synthetic_matrix(n, r, delta=1.0)

    for method_name, method in methods.items():
        for projection in projection_types:
            run_times, final_errors = [], []

            # Seeds run from 1 to `runs`, one per repetition.
            for seed in range(1, runs + 1):
                tic = timeit.default_timer()
                _, _, errors = method(A, r, random_state=seed, projection_type=projection)
                run_times.append(timeit.default_timer() - tic)

                # Only the last recorded error of each fit is kept.
                final_errors.append(errors[-1])

            stats['time'][method_name][projection].append(np.mean(run_times))
            stats['errors'][method_name][projection].append(np.mean(final_errors))

    print(f'Completed benchmarking for matrix size: {n}')

In [None]:
# Tabulate the 'HALS SC' benchmark results: one row per matrix size, one
# column per projection type.
# NOTE(review): the `mu_` prefix is misleading -- these frames hold HALS SC
# results. The names are kept because the plotting cell that follows reads them.
mu_errors_df, mu_times_df = (
    pd.DataFrame(stats[metric]['HALS SC'], index=sizes)
    for metric in ('errors', 'time')
)

In [None]:
import plotly.graph_objects as go
from plotly.colors import qualitative
from plotly.subplots import make_subplots

# Two panels built from mu_times_df / mu_errors_df: mean run time on the left,
# mean final reconstruction error on the right, one line per column.
fig = make_subplots(rows=1, cols=2)

# Give every column one fixed colour. Without an explicit colour Plotly
# advances its default colour cycle on every trace, so the same column would be
# drawn in different colours in the two panels even though the traces share a
# legend group.
palette = qualitative.Plotly
all_projections = list(dict.fromkeys([*mu_times_df.columns, *mu_errors_df.columns]))
projection_colors = {
    projection: palette[idx % len(palette)]
    for idx, projection in enumerate(all_projections)
}

# Time traces (left plot); these own the legend entries.
for projection in mu_times_df.columns:
    fig.add_trace(
        go.Scatter(
            x=mu_times_df.index,
            y=mu_times_df[projection],
            name=projection,
            legendgroup=projection,
            showlegend=True,
            line=dict(color=projection_colors[projection]),
        ),
        row=1, col=1
    )

# Error traces (right plot); same colours, toggled through the shared legend group.
for projection in mu_errors_df.columns:
    fig.add_trace(
        go.Scatter(
            x=mu_errors_df.index,
            y=mu_errors_df[projection],
            name=projection,
            legendgroup=projection,
            showlegend=False,
            line=dict(color=projection_colors[projection]),
        ),
        row=1, col=2
    )

fig.update_layout(
    title='Computation Time and Reconstruction Error by Method',
    xaxis_title='Size (n)',
    yaxis_title='Time (s)',
    xaxis2_title='Size (n)',
    yaxis2_title='Reconstruction Error',
)

fig.show()

**Random Matrix Projection Algorithms**

First, we compare the compression time of each of the tested random projection techniques. Specifically, we time only the initial matrix projection, both for standard random projection and for structured random projection using randomized power iteration (RPI).

*Standard Randomized Projection*

In [None]:
# Time the multiplication `sigma @ A` for each kind of random test matrix.
# Generating `sigma` itself is not included in the measurement.
sizes = np.arange(100, 1_001, 100)
runs = 10
k = 100

# Every generator is called as method(k, n, seed=seed) below. The Sparse JL
# wrapper forwards its two positional arguments unchanged and pins s=3.
methods = {
    'Gaussian': gaussian_random_matrix,
    'SRHT': srht_matrix,
    'SRFT': srft_matrix,
    'CountSketch': countsketch_matrix,
    'Sparse JL': lambda n, k, seed: sparse_jl_matrix(n, k, s=3, seed=seed),
}
times = {name: [] for name in methods}

for n in sizes:
    # Matrix to be projected, shape (n, k); identical for all methods.
    np.random.seed(1)
    A = np.random.randn(n, k)

    for method_name, method in methods.items():
        run_times = []
        for seed in range(1, runs + 1):
            sigma = method(k, n, seed=seed)

            # Mean of 10 back-to-back multiplications for this seed.
            run_times.append(timeit.Timer(lambda: sigma @ A).timeit(number=10) / 10)

        times[method_name].append(np.mean(run_times))
    print(f'Calculating for Matrix of Size: {n}')

*Plot: Standard Random Projections*

In [None]:
# One row per matrix size, one column per test-matrix type.
times_df = pd.DataFrame(data=times, index=sizes)

In [None]:
# One line per test-matrix type: mean multiplication time against matrix size.
fig = go.Figure(
    data=[
        go.Scatter(x=times_df.index, y=times_df[column], name=column)
        for column in times_df.columns
    ]
)

fig.update_layout(
    dict(
        title='Time (s) vs Matrix Size by Compression Algorithm',
        xaxis_title='n',
        yaxis_title='Time (s)',
        template='plotly_white',
    )
)
fig.show()

*Structured Random Projection*

In [None]:
# Benchmark the structured random projection pipeline. For every test-matrix
# type and matrix size we record the mean time spent on
#   'projection': generating the test matrix plus applying it to A,
#   'rpi':        randomized power iteration (RPI),
#   'total':      the sum of the two.
methods = {
    "Gaussian": gaussian_random_matrix,
    "SRHT": srht_matrix,
    "SRFT": srft_matrix,
    "CountSketch": countsketch_matrix,
    "Sparse JL": lambda n, k, seed: sparse_jl_matrix(n, k, s=3, seed=seed)
}

# times[component][method] -> one averaged timing per entry of `sizes`
times = {
    'projection': {method: [] for method in methods},
    'rpi': {method: [] for method in methods},
    'total': {method: [] for method in methods}
}

# Parameters
sizes = np.arange(100, 1_001, 100)
runs = 10
k = 100
q = 2  # Number of power iterations for RPI

for n in sizes:
    # Non-negative matrix to project, shape (n, k); identical for all methods.
    np.random.seed(1)
    A = np.abs(np.random.randn(n, k))

    for method_name, method in methods.items():
        projection_times = []
        rpi_times = []
        total_times = []

        for i in range(runs):
            seed = i + 1

            # Build the test matrix once, timing the construction, and keep
            # the result for the steps below (previously it was generated
            # twice: once for timing and once for use).
            start_time = timeit.default_timer()
            sigma = method(k, n, seed=seed)
            proj_time = timeit.default_timer() - start_time

            # Time projection operation (mean of 10 multiplications)
            timer_mult = timeit.Timer(lambda: sigma @ A)
            mult_time = timer_mult.timeit(number=10) / 10

            # Time RPI operation (mean of 10 calls)
            timer_rpi = timeit.Timer(lambda: randomized_power_iteration(A, sigma, q))
            rpi_time = timer_rpi.timeit(number=10) / 10

            projection_times.append(proj_time + mult_time)
            rpi_times.append(rpi_time)
            total_times.append(proj_time + mult_time + rpi_time)

        # Store average times
        times['projection'][method_name].append(np.mean(projection_times))
        times['rpi'][method_name].append(np.mean(rpi_times))
        times['total'][method_name].append(np.mean(total_times))

    print(f'Completed benchmarking for matrix size: {n}')

In [None]:
# One frame per timing component; rows are matrix sizes, columns are
# test-matrix types.
proj_times_df, total_times_df, rpi_times_df = (
    pd.DataFrame(times[component], index=sizes)
    for component in ('projection', 'total', 'rpi')
)

*Plot: Structured Random Compression*

In [None]:
# Mean RPI time against matrix size, one line per test-matrix type.
fig = go.Figure()

for method in rpi_times_df.columns:
    fig.add_trace(
        go.Scatter(
            x = rpi_times_df.index,
            y = rpi_times_df[method],
            name = method
        )
    )

# Label the figure so it can be read on its own, in the same way as the
# standard random projection plot.
fig.update_layout(
    title = 'RPI Time (s) vs Matrix Size by Compression Algorithm',
    xaxis_title = 'n',
    yaxis_title = 'Time (s)',
    template = 'plotly_white'
)
fig.show()

*Plot: Time Breakdown by Component of Algorithm*

In [None]:
# Stacked-area breakdown of total time per test-matrix type: the share spent in
# RPI versus the share spent generating and applying the test matrix.
methods = rpi_times_df.columns

plots_per_row = 3
num_methods = len(methods)
num_rows = math.ceil(num_methods / plots_per_row)

# Create subplots (one per method, laid out row by row)
fig = make_subplots(
    rows=num_rows,
    cols=plots_per_row,
    subplot_titles=[f"{method}" for method in methods],
    vertical_spacing=0.15,
    horizontal_spacing=0.1
)

# (legend label, source frame, fill colour, line colour), in stacking order.
components = [
    ('RPI', rpi_times_df, 'rgba(0, 100, 80, 0.3)', 'rgb(0, 100, 80)'),
    ('Generation + Multiplication', proj_times_df, 'rgba(100, 0, 80, 0.3)', 'rgb(100, 0, 80)'),
]

for idx, method in enumerate(methods):
    row = idx // plots_per_row + 1
    col = idx % plots_per_row + 1

    for label, component_df, fill, stroke in components:
        fig.add_trace(
            go.Scatter(
                x=rpi_times_df.index,
                # Share of the total time taken by this component.
                y=component_df[method] / total_times_df[method],
                name=label,
                mode='lines',
                stackgroup='one',
                line=dict(width=0.5, color=stroke),
                fillcolor=fill,
                # Group by component rather than by method: only the first
                # subplot contributes legend entries, and grouping by
                # component makes each entry toggle that component in every
                # subplot instead of just the first one.
                legendgroup=label,
                showlegend=(idx == 0)
            ),
            row=row, col=col
        )

    # Y-axis in percent
    fig.update_yaxes(
        tickformat=',.0%',
        range=[0, 1],
        row=row, col=col
    )

# Update layout
fig.update_layout(
    title_text="Time Breakdown by Method",
    height=300 * num_rows,
    hovermode='x unified',
    legend=dict(orientation='h', yanchor='bottom', y=-0.2, xanchor='center', x=0.5),
)

fig.show()


### **NMF Algorithms**

*$\delta = 1$*

In [None]:
# Benchmark plain, compressed (C) and structured-compressed (SC) variants of
# MU and HALS on synthetic matrices generated with delta = 1.0.
methods = {
    'MU': nmf_mu,
    'MU C': nmf_compress_mu,
    'MU SC': nmf_structured_compress_mu,
    'HALS': nmf_hals,
    'HALS C': nmf_compress_hals,
    'HALS SC': nmf_structured_compress_hals,
}

# stats[metric][method] -> one averaged value per entry of `sizes`
stats = {
    metric: {name: [] for name in methods}
    for metric in ('time', 'errors')
}

# Parameters
sizes = np.arange(1_000, 5_001, 1_000)
runs = 10
r = 20

for n in sizes:
    # The same synthetic matrix is reused by every method at a given size.
    np.random.seed(1)
    A, _, _ = generate_synthetic_matrix(n, r, delta=1.0)

    for method_name, method in methods.items():
        run_times, final_errors = [], []

        # Seeds run from 1 to `runs`, one per repetition.
        for seed in range(1, runs + 1):
            tic = timeit.default_timer()
            _, _, errors = method(A, r, random_state=seed)
            run_times.append(timeit.default_timer() - tic)

            # Only the last recorded error of each fit is kept.
            final_errors.append(errors[-1])

        stats['time'][method_name].append(np.mean(run_times))
        stats['errors'][method_name].append(np.mean(final_errors))

    print(f'Completed benchmarking for matrix size: {n}')

In [None]:
# Rows are matrix sizes, columns are NMF methods.
nmf_times_df, nmf_errors_df = (
    pd.DataFrame(stats[metric], index=sizes)
    for metric in ('time', 'errors')
)

*Plot: Standard NMF Implementations*

In [None]:
# Two panels built from nmf_times_df / nmf_errors_df: mean run time on the
# left, mean final reconstruction error on the right. `go` and `make_subplots`
# come from the notebook's import cell.

# Fixed colour per method so both panels use the same colour for a method.
color_map = {
    'MU': '#1f77b4',  # Blue
    'MU C': '#9467bd',
    'MU SC': '#e377c2',
    'HALS': '#ff7f0e',  # Orange
    'HALS C': '#17becf',
    'HALS SC': '#bcbd22'
    # Add more methods as needed
}

fig = make_subplots(rows=1, cols=2)

# (source frame, subplot column, whether its traces own the legend entries).
# Time traces are added first, then error traces; only the time traces appear
# in the legend, and the shared legend group toggles both panels together.
panels = [
    (nmf_times_df, 1, True),
    (nmf_errors_df, 2, False),
]

for frame, panel_col, show_legend in panels:
    for method in frame.columns:
        fig.add_trace(
            go.Scatter(
                x=frame.index,
                y=frame[method],
                name=method,
                legendgroup=method,
                showlegend=show_legend,
                line=dict(color=color_map[method])
            ),
            row=1, col=panel_col
        )

fig.update_layout(
    title='Computation Time and Reconstruction Error by Method',
    xaxis_title='Size (n)',
    yaxis_title='Time (s)',
    xaxis2_title='Size (n)',
    yaxis2_title='Reconstruction Error',
)

fig.show()

*$\delta = 0.01$*

In [None]:
# Benchmark plain MU and HALS on synthetic matrices generated with delta = 0.01.
methods = {
    'MU': nmf_mu,
    'HALS': nmf_hals,
}

# stats[metric][method] -> one averaged value per entry of `sizes`
stats = {
    metric: {name: [] for name in methods}
    for metric in ('time', 'errors')
}

# Parameters
sizes = np.arange(100, 1_001, 100)
runs = 10
r = 20

for n in sizes:
    # The same synthetic matrix is reused by every method at a given size.
    np.random.seed(1)
    A, _, _ = generate_synthetic_matrix(n, r, delta=0.01)

    for method_name, method in methods.items():
        run_times, final_errors = [], []

        # Seeds run from 1 to `runs`, one per repetition.
        for seed in range(1, runs + 1):
            tic = timeit.default_timer()
            _, _, errors = method(A, r, random_state=seed)
            run_times.append(timeit.default_timer() - tic)

            # Only the last recorded error of each fit is kept.
            final_errors.append(errors[-1])

        stats['time'][method_name].append(np.mean(run_times))
        stats['errors'][method_name].append(np.mean(final_errors))

    print(f'Completed benchmarking for matrix size: {n}')

In [None]:
# Rows are matrix sizes, columns are NMF methods.
nmf_times_df, nmf_errors_df = (
    pd.DataFrame(stats[metric], index=sizes)
    for metric in ('time', 'errors')
)

In [None]:
# Two panels built from nmf_times_df / nmf_errors_df: mean run time on the
# left, mean final reconstruction error on the right.

# Fixed colour per method so both panels use the same colour for a method.
color_map = {
    'MU': '#1f77b4',  # Blue
    'HALS': '#ff7f0e',  # Orange
    # Add more methods as needed
}

fig = make_subplots(rows=1, cols=2)

# (source frame, subplot column, whether its traces own the legend entries).
# Time traces are added first, then error traces; only the time traces appear
# in the legend, and the shared legend group toggles both panels together.
panels = [
    (nmf_times_df, 1, True),
    (nmf_errors_df, 2, False),
]

for frame, panel_col, show_legend in panels:
    for method in frame.columns:
        fig.add_trace(
            go.Scatter(
                x=frame.index,
                y=frame[method],
                name=method,
                legendgroup=method,
                showlegend=show_legend,
                line=dict(color=color_map[method])
            ),
            row=1, col=panel_col
        )

fig.update_layout(
    title='Computation Time and Reconstruction Error by Method',
    xaxis_title='Size (n)',
    yaxis_title='Time (s)',
    xaxis2_title='Size (n)',
    yaxis2_title='Reconstruction Error',
)

fig.show()

##### ***Application 1: CBCL***

In [2]:
# Load the face images with scikit-learn's Olivetti faces fetcher and keep only
# the `.data` array (one row per image; later cells reshape each row to 64x64).
# NOTE(review): the surrounding heading and the plot titles call this dataset
# "CBCL", while the loader used here is the Olivetti faces one -- confirm which
# label is intended.
from sklearn.datasets import fetch_olivetti_faces
X_faces = fetch_olivetti_faces(shuffle=True).data

*Method Benchmarks*

In [3]:
# Number of rows in X_faces, i.e. the number of face images.
X_faces.shape[0]

400

In [None]:
# Benchmark the compressed NMF variants on the face matrix via the project's
# `benchmark_faces` helper (rank 49, 10 runs) and display the result.
methods = dict([
    ('MU C', nmf_compress_mu),
    ('MU SC', nmf_structured_compress_mu),
    ('HALS C', nmf_compress_hals),
    ('HALS SC', nmf_structured_compress_hals),
])

# 'srht' and 'givens' are not included in this run.
projection_types = ['gaussian', 'srft', 'sparse-jl', 'count-sketch']

stats = benchmark_faces(X_faces, methods, projection_types, r=49, runs=10)
stats

HALS SC performs the best, across all projection methods, but takes the longest to compute. 

HALS C performs best with traditional Gaussian compression, MU C performs best with count-sketch, and MU SC performs best with Gaussian.

**Multiplicative Updates**

In [None]:
# Fit the three MU variants on the face matrix with the same rank and
# iteration budget, then collect their per-iteration errors.
faces_rank = 49
faces_max_iter = 100

# Plain MU
X_mu, Y_mu, errors_mu = nmf_mu(X_faces, faces_rank, max_iter=faces_max_iter)

# Compressed MU with a count-sketch projection
X_c_mu, Y_c_mu, errors_s_mu = nmf_compress_mu(
    X_faces, faces_rank, max_iter=faces_max_iter, projection_type='count-sketch'
)

# Structured compressed MU with a Gaussian projection
X_sc_mu, Y_sc_mu, errors_sc_mu = nmf_structured_compress_mu(
    X_faces, faces_rank, max_iter=faces_max_iter, projection_type='gaussian'
)

# One column of errors per variant
mu_errors_df = pd.DataFrame(
    dict(zip(('MU', 'C MU', 'SC MU'), (errors_mu, errors_s_mu, errors_sc_mu)))
)

*Plot: Reconstruction Errors*

In [None]:
# Error curves of the three MU variants; the x axis is the position of each
# recorded error (the row index of mu_errors_df).
fig = go.Figure()

for method in mu_errors_df.columns:
    fig.add_trace(
        go.Scatter(
            y = mu_errors_df[method],
            name = method
        )
    )

# This figure plots the MU variants, so the title names MU (it previously said
# "HALS", copied from the HALS plot).
fig.update_layout(
    title = 'CBCL Reconstruction Error by MU Method',
    yaxis_title = 'Reconstruction Error',
    xaxis_title = 'Iteration'
)
fig.show()

*Plot: Visual Reconstruction Error by Method*

In [None]:
# Image grid: one row per method, showing the first `n_images` faces of the
# original data and of each MU reconstruction.
n_images = 4     # faces shown per row
image_side = 64  # each data row is reshaped to image_side x image_side

# (row title, matrix whose rows are flattened images)
matrix_pairs = [
    ("Original Images", X_faces),
    ("MU", X_mu @ Y_mu),
    ("Comp. MU (Count-Sketch)", X_c_mu @ Y_c_mu),
    ("Struct. Comp. MU (Gaussian)", X_sc_mu @ Y_sc_mu)
]

# The grid shape and the titles follow `n_images` and `matrix_pairs`, so
# changing either keeps the figure consistent (the grid used to be hard-coded
# to 4 x 4). Only the first subplot of each row carries the row title.
fig = make_subplots(
    rows=len(matrix_pairs), cols=n_images,
    subplot_titles=[
        title if position == 0 else ' '
        for title, matrix in matrix_pairs
        for position in range(n_images)
    ],
    vertical_spacing=0.05,
    horizontal_spacing=0.02
)

for row, (title, Z) in enumerate(matrix_pairs, start=1):
    for col in range(1, n_images + 1):
        z = Z[col-1].reshape(image_side, image_side)
        fig.add_trace(
            go.Heatmap(z=z, colorscale='gray', showscale=False),
            row=row, col=col
        )

fig.update_layout(
    title="NMF MU Reconstruction",
    width=800,
    height=600,
    font=dict(size=8),
    margin=dict(l=10, r=10, b=10, t=40, pad=0),
    plot_bgcolor='white',
    paper_bgcolor='white'
)

fig.update_annotations(font_size=10)
# Heatmaps put row 0 at the bottom by default; flip so faces are upright.
fig.update_yaxes(autorange='reversed')
fig.show()

**Hierarchical Alternating Least Squares (HALS)**

In [None]:
# Fit the three HALS variants on the face matrix with the same rank and
# iteration budget, then collect their per-iteration errors.
faces_rank = 49
faces_max_iter = 100

# Plain HALS
X_hals, Y_hals, errors_hals = nmf_hals(X_faces, faces_rank, max_iter=faces_max_iter)

# Compressed HALS with a Gaussian projection
X_c_hals, Y_c_hals, errors_s_hals = nmf_compress_hals(
    X_faces, faces_rank, max_iter=faces_max_iter, projection_type='gaussian'
)

# Structured compressed HALS with a count-sketch projection
X_sc_hals, Y_sc_hals, errors_sc_hals = nmf_structured_compress_hals(
    X_faces, faces_rank, max_iter=faces_max_iter, projection_type='count-sketch'
)

# One column of errors per variant
hals_errors_df = pd.DataFrame(
    dict(zip(('HALS', 'C HALS', 'SC HALS'), (errors_hals, errors_s_hals, errors_sc_hals)))
)

*Plot: Reconstruction Error*

In [None]:
# Error curves of the three HALS variants; the x axis is the position of each
# recorded error (the row index of hals_errors_df).
fig = go.Figure(
    data=[
        go.Scatter(y=hals_errors_df[column], name=column)
        for column in hals_errors_df.columns
    ]
)

fig.update_layout(
    dict(
        title='CBCL Reconstruction Error by HALS Method',
        yaxis_title='Reconstruction Error',
        xaxis_title='Iteration',
    )
)
fig.show()

*Plot: Visual Reconstruction Error by Method*

In [None]:
# Image grid: one row per method, showing the first `n_images` faces of the
# original data and of each HALS reconstruction.
n_images = 4     # faces shown per row
image_side = 64  # each data row is reshaped to image_side x image_side

# (row title, matrix whose rows are flattened images)
matrix_pairs = [
    ("Original Images", X_faces),
    ("HALS", X_hals @ Y_hals),
    ("Comp. HALS (Gaussian)", X_c_hals @ Y_c_hals),
    ("Struct. Comp. HALS (Count-Sketch)", X_sc_hals @ Y_sc_hals)
]

# The grid shape and the titles follow `n_images` and `matrix_pairs`, so
# changing either keeps the figure consistent (the grid used to be hard-coded
# to 4 x 4). Only the first subplot of each row carries the row title.
fig = make_subplots(
    rows=len(matrix_pairs), cols=n_images,
    subplot_titles=[
        title if position == 0 else ' '
        for title, matrix in matrix_pairs
        for position in range(n_images)
    ],
    vertical_spacing=0.05,
    horizontal_spacing=0.02
)

for row, (title, Z) in enumerate(matrix_pairs, start=1):
    for col in range(1, n_images + 1):
        z = Z[col-1].reshape(image_side, image_side)
        fig.add_trace(
            go.Heatmap(z=z, colorscale='gray', showscale=False),
            row=row, col=col
        )

fig.update_layout(
    title="NMF HALS Reconstruction",
    width=800,
    height=600,
    font=dict(size=8),
    margin=dict(l=10, r=10, b=10, t=40, pad=0),
    plot_bgcolor='white',
    paper_bgcolor='white'
)

fig.update_annotations(font_size=10)
# Heatmaps put row 0 at the bottom by default; flip so faces are upright.
fig.update_yaxes(autorange='reversed')
fig.show()

### **Harvard Microarray Dataset**

In [None]:
# Load the microarray spreadsheet and keep everything from the third column on
# as a NumPy array.
# NOTE(review): presumably the first two columns hold identifiers/annotations
# rather than measurements -- confirm against the file.
df = pd.read_excel('dataseta_12600gene.xls')
X = df.iloc[:, 2:].values

# Preprocessing: replace negative entries by 0, then apply log2(x + 1).
X = np.log2(np.where(X < 0, 0, X) + 1)

*Multiplicative Updates*

In [None]:
# Fit the three MU variants on the microarray matrix with the same rank and
# iteration budget; the compressed variants use their default projection.
microarray_rank = 5
microarray_max_iter = 100

# Plain MU
X_mu, Y_mu, errors_mu = nmf_mu(X, microarray_rank, max_iter=microarray_max_iter)

# Compressed MU
X_c_mu, Y_c_mu, errors_s_mu = nmf_compress_mu(X, microarray_rank, max_iter=microarray_max_iter)

# Structured compressed MU
X_sc_mu, Y_sc_mu, errors_sc_mu = nmf_structured_compress_mu(
    X, microarray_rank, max_iter=microarray_max_iter
)

In [None]:
# One column of errors per MU variant, one row per recorded error.
mu_errors_df = pd.DataFrame(
    dict(zip(('MU', 'C MU', 'SC MU'), (errors_mu, errors_s_mu, errors_sc_mu)))
)

In [None]:
# Error curves of the three MU variants on the microarray data; the x axis is
# the position of each recorded error (the row index of mu_errors_df).
fig = go.Figure(
    data=[
        go.Scatter(y=mu_errors_df[column], name=column)
        for column in mu_errors_df.columns
    ]
)

fig.update_layout(
    dict(
        title='Microarray Reconstruction Error by MU Method',
        yaxis_title='Reconstruction Error',
        xaxis_title='Iteration',
        width=600,
        height=500,
    )
)
fig.show()