**Section 5b: Empirical Performance - 20 NewsGroups**

*Imports*

In [1]:
# Standard library
import timeit
from collections import defaultdict

# Project-local helpers. The wildcard imports are placed ahead of the explicit
# third-party imports so that an explicitly imported name always wins if one of
# these modules happens to export the same identifier.
from random_matrix import *
from nmf import *
from benchmark import *

# Third-party
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import sparse
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_olivetti_faces
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score

*Data*

In [2]:
# Load the 'test' subset of 20 Newsgroups with headers, footers and quotes
# removed, so only the body text of each post is kept.
newsgroups = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Raw documents and their ground-truth newsgroup labels.
texts, true_labels = newsgroups.data, newsgroups.target

# TF-IDF configuration:
#   stop_words   - drop English stopwords
#   max_features - keep only the top 5000 terms
#   min_df       - ignore terms that occur in fewer than 5 documents
#   max_df       - ignore terms that occur in more than 70% of documents
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    min_df=5,
    max_df=0.7,
)

# Fit the vocabulary and vectorise in one pass -> sparse matrix (documents × terms)
X_tfidf = tfidf.fit_transform(texts)

In [13]:
# Column re-weighting of the TF-IDF matrix: X_ncw = X_tfidf D^{-1/2}, where
# D = diag(X^T X e) and e is the all-ones vector.
# NOTE(review): the `ncw` suffix presumably stands for normalized-cut
# weighting - confirm against the accompanying write-up.

# 1. Per-term weights d = X^T X e.
#    The product is grouped as X^T (X e) so every intermediate is a vector;
#    evaluating left to right would first build the (n_terms × n_terms)
#    matrix X^T X only to collapse it again.
e = np.ones(X_tfidf.shape[1])  # Vector of ones (shape: n_terms,)
term_weights = X_tfidf.T @ (X_tfidf @ e)  # Shape: (n_terms,)
term_weights = np.asarray(term_weights).ravel()  # Ensure 1D array

# 2. Avoid division by zero for unused terms
epsilon = 1e-6
# Stored as a sparse diagonal matrix: only the n_terms diagonal entries are
# kept, instead of a dense (n_terms × n_terms) array that is almost all zeros.
D_inv_sqrt = sparse.diags(1.0 / np.sqrt(term_weights + epsilon))

# 3. Apply weighting: X_ncw = X_tfidf D^{-1/2}
#    The sparse product only rescales the stored non-zeros column by column.
#    The result is densified so the NMF routines in the benchmark cell receive
#    a plain ndarray.
#    NOTE(review): if those routines accept sparse input, dropping `.toarray()`
#    would save memory - verify before changing.
X_ncw = (X_tfidf @ D_inv_sqrt).toarray()

*Benchmark*

In [16]:
# NMF variants under test, keyed by the label shown in the results table.
methods = {
    "MU C": nmf_compress_mu,
    'MU SC': nmf_structured_compress_mu,
    'HALS C': nmf_compress_hals,
    'HALS SC': nmf_structured_compress_hals
}

# Projection schemes forwarded to each method through `projection_type`.
# The commented-out entries are excluded from this run.
projection_types = [
    'gaussian',
    # 'srht',
    # 'givens',
    'srft',
    'sparse-jl',
    'count-sketch',
]

# NOTE(review): `stats` is created here but never written to in this cell;
# the results are collected in `rows` instead. It is kept only in case a
# later cell still refers to it - confirm and remove if unused.
stats = {
    'errors': {method: defaultdict(int) for method in methods},
    'time': {method: defaultdict(int) for method in methods},
    'ARI': {method: defaultdict(int) for method in methods}
}

# Benchmark settings.
r = 20     # second positional argument to every method (presumably the NMF rank - confirm)
runs = 1   # repetitions per (method, projection) pair; run i uses seed i + 1
rows = []  # one summary record per (method, projection) pair

for method_name, method in methods.items():
    for projection in projection_types:
        total_errors = []
        total_times = []
        ari_scores = []
        for i in range(runs):
            # Set seed per run
            seed = i + 1

            # Time the NMF method. The duration goes into `elapsed` rather than
            # a global called `time`, which would overwrite any `time` module
            # already present in the kernel namespace.
            start_time = timeit.default_timer()
            W, H, errors = method(X_ncw, r, random_state=seed, projection_type=projection)
            elapsed = timeit.default_timer() - start_time

            # Cluster assignment: each row of W is labelled with the index of
            # its largest entry, then compared to the true labels with ARI.
            pred_labels = W.argmax(axis=1)
            ari_score = adjusted_rand_score(true_labels, pred_labels)

            # Store per-run results; errors[-1] is the last entry of the
            # error sequence returned by the method.
            total_times.append(elapsed)
            total_errors.append(errors[-1])
            ari_scores.append(ari_score)

        # Store the averages over all runs (time, final error and ARI)
        rows.append({
            'Method': method_name,
            'Projection': projection,
            'Time': np.mean(total_times),
            'Errors': np.mean(total_errors),
            'ARI': np.mean(ari_scores)
        })
        print(f"Completed {method_name},{projection}")

Completed MU C,gaussian
Completed MU C,srft
Completed MU C,sparse-jl
Completed MU C,count-sketch
Completed MU SC,gaussian
Completed MU SC,srft
Completed MU SC,sparse-jl
Completed MU SC,count-sketch
Completed HALS C,gaussian
Completed HALS C,srft
Completed HALS C,sparse-jl
Completed HALS C,count-sketch
Completed HALS SC,gaussian
Completed HALS SC,srft
Completed HALS SC,sparse-jl
Completed HALS SC,count-sketch


In [17]:
pd.DataFrame(rows)

Unnamed: 0,Method,Projection,Time,Errors,ARI
0,MU C,gaussian,11.931203,1.076159,0.00114
1,MU C,srft,11.861955,0.997384,0.00127
2,MU C,sparse-jl,12.12197,1.11136,0.000797
3,MU C,count-sketch,11.935314,1.077865,0.000916
4,MU SC,gaussian,12.076636,0.986291,0.197624
5,MU SC,srft,24.759421,0.986083,0.154895
6,MU SC,sparse-jl,12.163921,0.987753,0.195202
7,MU SC,count-sketch,12.126181,0.98915,0.18855
8,HALS C,gaussian,11.606488,1.009426,0.000605
9,HALS C,srft,11.618306,0.998528,0.000892


*Plots*