**Section 5b: Empirical Performance - 20 NewsGroups**

*Imports*

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from collections import defaultdict
from random_matrix import *
from nmf import *
from benchmark import *

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

*Data*

In [2]:
# Load the 20 Newsgroups 'test' split with headers, footers and quoted
# replies stripped from every post.
newsgroups = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes')
)

texts = newsgroups.data
true_labels = newsgroups.target

# Create TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,       # cap on vocabulary size
    stop_words='english',    # built-in English stop-word list
    min_df=5,                # int: minimum document count for a term
    max_df=0.7               # float: maximum document fraction for a term
)

X_tfidf = tfidf.fit_transform(texts)

# Convert to NCW
# Weight vector: (X^T X) e, evaluated as X^T (X e) so the terms-by-terms
# Gram matrix is never materialised.
# NOTE(review): Xu, Liu & Gong (2003) define NCW on a term-by-document matrix,
# which weights each *document*. With the document-by-term layout used here
# the weights below are per *term* -- confirm this is the intended variant.
e = np.ones(X_tfidf.shape[1])
term_weights = X_tfidf.T @ (X_tfidf @ e)
term_weights = np.asarray(term_weights).ravel()

# epsilon keeps the reciprocal finite for a term whose weight is zero.
epsilon = 1e-6
inv_sqrt_weights = 1 / np.sqrt(term_weights + epsilon)

# Right-multiplying by diag(d) scales column j by d[j]. Broadcasting the
# vector over the dense array does the same thing without building a dense
# 5000 x 5000 diagonal matrix. X_ncw is a dense ndarray, as before.
X_ncw = X_tfidf.toarray()
X_ncw *= inv_sqrt_weights

In [3]:
# Report the matrix dimensions, then the fraction of entries that are zero.
print(f'Shape: {X_ncw.shape}')
nonzero_count = len(np.nonzero(X_ncw)[0])
total_entries = X_ncw.shape[0]*X_ncw.shape[1]
print(f'Sparsity: {1 - nonzero_count/total_entries}')

Shape: (7532, 5000)
Sparsity: 0.9914033988316516


*Benchmark*

In [4]:
def clustering_accuracy(y_true, y_pred):
    """
    Clustering accuracy (AC) under the best one-to-one matching between
    predicted clusters and true labels.

    The confusion matrix of ``y_true`` against ``y_pred`` is used as the
    score matrix of a linear assignment problem (the problem addressed by
    the Kuhn-Munkres algorithm). The matching with the largest total count
    is selected and that count is divided by the sum of all counts.
    """
    contingency = confusion_matrix(y_true, y_pred)
    # linear_sum_assignment minimises cost, so negate the counts to maximise.
    true_idx, cluster_idx = linear_sum_assignment(-contingency)
    matched = contingency[true_idx, cluster_idx].sum()
    return matched / contingency.sum()

In [5]:
import timeit
import numpy as np
from collections import defaultdict
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def _evaluate_configuration(method, X_ncw, true_labels, k, runs, **method_kwargs):
    """
    Fit ``method`` ``runs`` times at rank ``k`` and average the per-run metrics.

    Each run calls ``method(X_ncw, k, random_state=seed, **method_kwargs)``
    with ``seed`` going from 1 to ``runs`` inclusive, and expects a
    ``(W, H, errors)`` triple back. Only that call is timed.

    Predicted labels are ``W.argmax(axis=1)``.
    NOTE(review): this assumes W has one row per sample and one column per
    component -- confirm against the NMF implementations.

    Returns a dict with the mean over runs of: 'Time' (seconds for the fit),
    'Errors' (last entry of ``errors``), 'ARI', 'AC' and 'NMI'.
    """
    times, final_errors, ari, ac, nmi = [], [], [], [], []

    for seed in range(1, runs + 1):
        # Time NMF Method
        start_time = timeit.default_timer()
        W, _, errors = method(X_ncw, k, random_state=seed, **method_kwargs)
        elapsed_time = timeit.default_timer() - start_time

        # Calculate metrics
        pred_labels = W.argmax(axis=1)

        final_errors.append(errors[-1])
        times.append(elapsed_time)
        ari.append(adjusted_rand_score(true_labels, pred_labels))
        ac.append(clustering_accuracy(true_labels, pred_labels))
        nmi.append(normalized_mutual_info_score(true_labels, pred_labels))

    return {
        'Time': np.mean(times),
        'Errors': np.mean(final_errors),
        'ARI': np.mean(ari),
        'AC': np.mean(ac),
        'NMI': np.mean(nmi)
    }


def run_experiment(X_ncw, true_labels,k_values,runs,methods,methods_baseline,projection_types):
    """
    Benchmark NMF variants on ``X_ncw`` against ``true_labels``.

    Parameters
    ----------
    X_ncw : data matrix handed unchanged to every method.
    true_labels : reference labels used for ARI, AC and NMI.
    k_values : iterable of ranks; iterated once per method/projection pair,
        so it must be re-iterable (e.g. a list or ``range``).
    runs : number of seeded repetitions per configuration (seeds 1..runs).
    methods : mapping name -> callable accepting
        ``(X, k, random_state=..., projection_type=...)``.
    methods_baseline : mapping name -> callable accepting
        ``(X, k, random_state=...)``; run without any projection.
    projection_types : projection names tried with every entry of ``methods``.

    Returns
    -------
    list of dict
        One row per configuration with keys 'Method', 'Projection', 'K',
        'Time', 'Errors', 'ARI', 'AC', 'NMI'. Baseline rows come first and
        carry ``'Projection': 'none'``. A progress line is printed after
        each configuration.
    """
    results = []

    # First run baseline methods (no projections)
    for method_name, method in methods_baseline.items():
        for k in k_values:
            summary = _evaluate_configuration(method, X_ncw, true_labels, k, runs)
            results.append({
                'Method': method_name,
                'Projection': 'none',  # Mark as baseline
                'K': k,
                **summary
            })

            print(f"Completed baseline {method_name}, k={k}")

    # Then run projection methods
    for method_name, method in methods.items():
        for projection in projection_types:
            for k in k_values:
                summary = _evaluate_configuration(
                    method, X_ncw, true_labels, k, runs,
                    projection_type=projection
                )
                results.append({
                    'Method': method_name,
                    'Projection': projection,
                    'K': k,
                    **summary
                })

                print(f"Completed {method_name}, {projection}, k={k}")

    return results

# Compressed / structured-compressed variants (per the imported function
# names); run_experiment calls each of these with a `projection_type` keyword.
methods = {
    'MU C': nmf_compress_mu,
    'MU SC': nmf_structured_compress_mu,
    'HALS C': nmf_compress_hals,
    'HALS SC': nmf_structured_compress_hals,
}

# Baselines are called without a projection.
methods_baseline = {
    'MU': nmf_mu,
    'HALS': nmf_hals,
}

projection_types = ['gaussian', 'srht', 'srft', 'sparse-jl', 'count-sketch']

runs = 10
k_values = range(20, 21, 2)  # yields a single rank: k = 20

# Run the experiment
# Two separate calls keep compressed and baseline rows in separate lists;
# passing an empty dict skips the corresponding family of methods.
rows_compressed = run_experiment(
    X_ncw, true_labels,
    k_values=k_values,
    runs=runs,
    methods=methods,
    methods_baseline={},
    projection_types=projection_types,
)
rows_baseline = run_experiment(
    X_ncw, true_labels,
    k_values=k_values,
    runs=runs,
    methods={},
    methods_baseline=methods_baseline,
    projection_types=projection_types,
)

Completed MU C, gaussian, k=20
Completed MU C, srht, k=20
Completed MU C, srft, k=20
Completed MU C, sparse-jl, k=20
Completed MU C, count-sketch, k=20
Completed MU SC, gaussian, k=20
Completed MU SC, srht, k=20
Completed MU SC, srft, k=20
Completed MU SC, sparse-jl, k=20
Completed MU SC, count-sketch, k=20
Completed HALS C, gaussian, k=20
Completed HALS C, srht, k=20
Completed HALS C, srft, k=20
Completed HALS C, sparse-jl, k=20
Completed HALS C, count-sketch, k=20
Completed HALS SC, gaussian, k=20
Completed HALS SC, srht, k=20
Completed HALS SC, srft, k=20
Completed HALS SC, sparse-jl, k=20
Completed HALS SC, count-sketch, k=20
Completed baseline MU, k=20
Completed baseline HALS, k=20


*Baseline Stats*

In [6]:
# One row per (baseline method, k) configuration, as returned by run_experiment.
df_baseline = pd.DataFrame(data=rows_baseline)

In [7]:
# AC, NMI and fit time per baseline method, rounded to 4 decimals
# (pivot_table aggregates with the mean by default).
(
    df_baseline
    .pivot_table(index=['Method'], values=['AC', 'NMI', 'Time'])
    .round(4)
)

Unnamed: 0_level_0,AC,NMI,Time
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HALS,0.0841,0.0751,10.5167
MU,0.3639,0.4089,14.351


*Compressed Stats*

In [8]:
# Build the frame of compressed-method results once, then pivot it into a
# Method x Projection layout: quality metrics in one table, fit time in another.
df_compressed = pd.DataFrame(rows_compressed)[['Method', 'Projection', 'Time', 'AC', 'NMI']]
df_stats = df_compressed.pivot_table(
    values=['AC', 'NMI'],
    columns=['Projection'],
    index=['Method'],
)
df_time = df_compressed.pivot_table(
    values=['Time'],
    columns=['Projection'],
    index=['Method'],
)

In [11]:
# AC and NMI per method and projection, shown to 4 decimal places.
df_stats.round(decimals=4)

Unnamed: 0_level_0,AC,AC,AC,AC,AC,NMI,NMI,NMI,NMI,NMI
Projection,count-sketch,gaussian,sparse-jl,srft,srht,count-sketch,gaussian,sparse-jl,srft,srht
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
HALS C,0.0668,0.0664,0.0632,0.0615,0.0614,0.0048,0.0042,0.0036,0.003,0.0031
HALS SC,0.0816,0.0817,0.0806,0.0827,0.0812,0.0802,0.0755,0.0806,0.0717,0.0703
MU C,0.0766,0.0708,0.0737,0.0729,0.0743,0.0146,0.0114,0.0127,0.01,0.0116
MU SC,0.3534,0.3812,0.4001,0.3739,0.3771,0.3485,0.3688,0.3777,0.3738,0.3765


In [12]:
# Mean fit time in seconds per method and projection, shown to 4 decimal places.
df_time.round(decimals=4)

Unnamed: 0_level_0,Time,Time,Time,Time,Time
Projection,count-sketch,gaussian,sparse-jl,srft,srht
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
HALS C,8.7212,8.5627,8.2775,8.0707,7.5491
HALS SC,8.4334,8.5587,8.3121,8.3823,8.6492
MU C,8.7794,8.3193,8.5105,8.363,8.0753
MU SC,8.7321,8.8139,8.7562,8.7441,9.0416
