**Section 5b: Empirical Performance - 20 NewsGroups**

*Imports*

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from collections import defaultdict
from random_matrix import *
from nmf import *
from benchmark import *

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import normalized_mutual_info_score

*Data*

In [2]:
def clustering_accuracy(y_true, y_pred):
    """
    Computes clustering accuracy (AC) using the Kuhn-Munkres algorithm
    to find the optimal mapping from predicted clusters to true labels.
    """
    cm = confusion_matrix(y_true, y_pred)
    row_ind, col_ind = linear_sum_assignment(-cm)  # maximize accuracy
    return cm[row_ind, col_ind].sum() / np.sum(cm)

In [3]:
newsgroups = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes')
)

texts = newsgroups.data 
true_labels = newsgroups.target

# Create TF-IDF vectorizer 
tfidf = TfidfVectorizer(
    max_features=5000,        
    stop_words='english',    
    min_df=5,                
    max_df=0.7               
)

X_tfidf = tfidf.fit_transform(texts)

# Convert to NCW
e = np.ones(X_tfidf.shape[1]) 
term_weights = X_tfidf.T @ X_tfidf @ e  
term_weights = np.array(term_weights).flatten()  

epsilon = 1e-6
D_inv_sqrt = np.diag(1 / np.sqrt(term_weights + epsilon)) 

X_ncw = X_tfidf @ D_inv_sqrt

In [4]:
e = np.ones(X_tfidf.shape[1])  # Vector of ones (shape: n_terms,)
term_weights = X_tfidf.T @ X_tfidf @ e  # Shape: (n_terms,)
term_weights = np.array(term_weights).flatten()  # Ensure 1D array

# 2. Avoid division by zero for unused terms
epsilon = 1e-6
D_inv_sqrt = np.diag(1 / np.sqrt(term_weights + epsilon))  # Sparse diagonal matrix

# 3. Apply weighting: X_ncw = X_tfidf D^{-1/2}
X_ncw = X_tfidf @ D_inv_sqrt

*Benchmark*

In [5]:
import timeit
import numpy as np
from collections import defaultdict
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def run_experiment(X_ncw, true_labels,k_values,runs,methods,methods_baseline,projection_types):
    results = []
    
    # First run baseline methods (no projections)
    for method_name, method in methods_baseline.items():
        for k in k_values:
            metrics = {
                'errors': [],
                'times': [],
                'ARI': [],
                'AC': [],
                'NMI': []
            }
            
            for seed in range(1, runs + 1):
                # Time NMF Method
                start_time = timeit.default_timer()
                W, H, errors = method(X_ncw, k, random_state=seed)
                elapsed_time = timeit.default_timer() - start_time

                # Calculate metrics
                pred_labels = W.argmax(axis=1)
                
                metrics['errors'].append(errors[-1])
                metrics['times'].append(elapsed_time)
                metrics['ARI'].append(adjusted_rand_score(true_labels, pred_labels))
                metrics['AC'].append(clustering_accuracy(true_labels, pred_labels))
                metrics['NMI'].append(normalized_mutual_info_score(true_labels, pred_labels))
            
            # Store results for baseline
            results.append({
                'Method': method_name,
                'Projection': 'none',  # Mark as baseline
                'K': k,
                'Time': np.mean(metrics['times']),
                'Errors': np.mean(metrics['errors']),
                'ARI': np.mean(metrics['ARI']),
                'AC': np.mean(metrics['AC']),
                'NMI': np.mean(metrics['NMI'])
            })
            
            print(f"Completed baseline {method_name}, k={k}")
    
    # Then run projection methods
    for method_name, method in methods.items():
        for projection in projection_types:
            for k in k_values:
                metrics = {
                    'errors': [],
                    'times': [],
                    'ARI': [],
                    'AC': [],
                    'NMI': []
                }
                
                for seed in range(1, runs + 1):
                    # Time NMF Method
                    start_time = timeit.default_timer()
                    W, H, errors = method(X_ncw, k, random_state=seed, projection_type=projection)
                    elapsed_time = timeit.default_timer() - start_time

                    # Calculate metrics
                    pred_labels = W.argmax(axis=1)
                    
                    metrics['errors'].append(errors[-1])
                    metrics['times'].append(elapsed_time)
                    metrics['ARI'].append(adjusted_rand_score(true_labels, pred_labels))
                    metrics['AC'].append(clustering_accuracy(true_labels, pred_labels))
                    metrics['NMI'].append(normalized_mutual_info_score(true_labels, pred_labels))
                
                # Store results for this configuration
                results.append({
                    'Method': method_name,
                    'Projection': projection,
                    'K': k,
                    'Time': np.mean(metrics['times']),
                    'Errors': np.mean(metrics['errors']),
                    'ARI': np.mean(metrics['ARI']),
                    'AC': np.mean(metrics['AC']),
                    'NMI': np.mean(metrics['NMI'])
                })
                
                print(f"Completed {method_name}, {projection}, k={k}")
    
    return results

methods = {
    "MU C": nmf_compress_mu,
    'MU SC': nmf_structured_compress_mu,
    'HALS C': nmf_compress_hals,
    'HALS SC': nmf_structured_compress_hals
}

methods_baseline = {
    "MU": nmf_mu,
    'HALS': nmf_hals,
}

projection_types = [
    'gaussian',
    'srht',
    'srft',
    'sparse-jl',
    'count-sketch',
]

runs = 1
k_values = range(2, 5, 2)

# Run the experiment
rows = run_experiment(X_ncw, true_labels,k_values,runs,methods,methods_baseline,projection_types)

Completed baseline MU, k=2
Completed baseline MU, k=4
Completed baseline HALS, k=2
Completed baseline HALS, k=4
Completed MU C, gaussian, k=2
Completed MU C, gaussian, k=4
Completed MU C, srht, k=2
Completed MU C, srht, k=4
Completed MU C, srft, k=2
Completed MU C, srft, k=4
Completed MU C, sparse-jl, k=2
Completed MU C, sparse-jl, k=4
Completed MU C, count-sketch, k=2
Completed MU C, count-sketch, k=4
Completed MU SC, gaussian, k=2
Completed MU SC, gaussian, k=4
Completed MU SC, srht, k=2
Completed MU SC, srht, k=4
Completed MU SC, srft, k=2
Completed MU SC, srft, k=4
Completed MU SC, sparse-jl, k=2
Completed MU SC, sparse-jl, k=4
Completed MU SC, count-sketch, k=2
Completed MU SC, count-sketch, k=4
Completed HALS C, gaussian, k=2
Completed HALS C, gaussian, k=4
Completed HALS C, srht, k=2
Completed HALS C, srht, k=4
Completed HALS C, srft, k=2
Completed HALS C, srft, k=4
Completed HALS C, sparse-jl, k=2
Completed HALS C, sparse-jl, k=4
Completed HALS C, count-sketch, k=2
Completed HA

In [6]:
pd.DataFrame(rows)[['Method','Projection','Time','AC','NMI']]

Unnamed: 0,Method,Projection,Time,AC,NMI
0,MU,none,13.645584,0.100903,0.189129
1,MU,none,13.759416,0.145645,0.256505
2,HALS,none,9.939849,0.055098,0.001395
3,HALS,none,10.064094,0.086697,0.111899
4,MU C,gaussian,7.88398,0.058152,0.001449
5,MU C,gaussian,7.911336,0.065985,0.004223
6,MU C,srht,7.972725,0.057621,0.000689
7,MU C,srht,7.998276,0.062666,0.002336
8,MU C,srft,7.779707,0.061471,0.001533
9,MU C,srft,8.095565,0.061073,0.00236


*Plots*