**NMF Benchmarks**

In [None]:
import numpy as np
import pandas as pd
from archive.algo import randomized_nmf
import plotly.graph_objects as go

**Example 1, Sparse Application: 20 Newsgroups Dataset**

In [82]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch every 20 Newsgroups post, asking sklearn to remove headers, footers and quotes
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# TF-IDF features; max_features=5000 limits the vocabulary for efficiency
vectorizer = TfidfVectorizer(max_features=5000)

# Vectorize and densify in one step. Despite its name, X_sparse ends up holding
# a dense array because of .toarray().
X_sparse = vectorizer.fit_transform(newsgroups.data).toarray()

In [85]:
# Factorize the TF-IDF matrix. The call returns two factors (X, Y) and `errors`,
# which the next cell plots.
# NOTE(review): the meaning of the positional 50 and 10 is not visible in this
# notebook - confirm against randomized_nmf's signature.
X, Y, errors = randomized_nmf(
    X_sparse,
    50,
    10,
    w=1,
    compression='standard',
)

In [86]:
# Line plot of the values in `errors`; x defaults to the position in the sequence
fig = go.Figure(data=[go.Scatter(y=errors)])
fig.show()

In [105]:
import numpy as np
import pandas as pd

# Vocabulary words from the fitted vectorizer, as an array so it can be fancy-indexed
feature_names = np.array(vectorizer.get_feature_names_out())

# How many of the highest-weighted words to report for each topic
n_top_words = 10

# Map each topic (one row of Y) to its highest-weighted words, strongest first:
# argsort ascending, reverse to descending, then keep the leading n_top_words.
top_words_per_topic = {
    topic_idx: feature_names[np.argsort(topic)[::-1][:n_top_words]]
    for topic_idx, topic in enumerate(Y)
}

# Print one line per topic, numbering topics from 1
for topic_idx, top_words in top_words_per_topic.items():
    print(f"Topic #{topic_idx + 1}: {' '.join(top_words)}")

Topic #1: different other doing officials passages denning economic phones others compromise
Topic #2: arizona arithmetic arm armed arguments armenia attention argument attempts attempting
Topic #3: ap alot baud article along automatic adequate cnn auto apart
Topic #4: but busy button carry case cases cars calm carrying chicago
Topic #5: preferably plot prog plenty phil prefer register raw push pushing
Topic #6: pray perspective played park player peter paris prayer produced play
Topic #7: roy rpm schedule scene row rs scheduled scanner rsa scan
Topic #8: ansi answer another answered answering helping anonymous arguments helps appropriate
Topic #9: 34u 68 abc 67 35 1985 34 1986 1984 04
Topic #10: gui guess glory global guide gm glass g9 glad guard


**Example 2, Dense: Olivetti Faces Dataset (Image Processing)**

In [2]:
from sklearn.datasets import fetch_olivetti_faces

# Olivetti faces in shuffled order: 400 images of size 64x64, one image per row
faces = fetch_olivetti_faces(shuffle=True)
X_dense = faces.data

In [3]:
# Factorize the face matrix. The plotting cells below use np.dot(X, Y) as the
# reconstruction of X_dense.
# NOTE(review): the meaning of the positional 49 and 5 is not visible in this
# notebook - confirm against randomized_nmf's signature.
X, Y, errors = randomized_nmf(
    X_dense,
    49,
    5,
    compression='standard',
)

In [113]:
import pandas as pd
from scipy.sparse import csr_matrix

# Ratings file hosted by Stanford SNAP (Books subset); it has no header row,
# so the column names are supplied here.
url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv"
df = pd.read_csv(
    url,
    names=['user_id', 'product_id', 'rating', 'timestamp'],
)

# Turn the user and product identifiers into integer codes to use as matrix indices
user_codes = df['user_id'].astype('category').cat.codes
product_codes = df['product_id'].astype('category').cat.codes

# Users along the rows, products along the columns, ratings as the stored values
sparse_matrix = csr_matrix((df['rating'], (user_codes, product_codes)))

In [None]:
# Factorize the user x product ratings matrix.
# NOTE(review): this rebinds X, Y and errors. The face-plotting cells further down
# reshape rows of np.dot(X, Y) to 64x64, so they need the factors from the faces
# run - confirm the intended cell order.
# NOTE(review): the other calls pass dense arrays; confirm randomized_nmf accepts
# a scipy csr_matrix.
X, Y, errors = randomized_nmf(
    sparse_matrix,
    10,
    5,
    compression='standard',
)

*Errors Plot*

In [101]:
# Plot whatever `errors` currently holds, i.e. the output of the most recently
# executed randomized_nmf call.
error_trace = go.Scatter(y=errors)
fig = go.Figure(data=[error_trace])
fig.show()

In [4]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Number of learned components to display (the first `n_components` rows of Y)
n_components = 4

# The basis images are the rows of Y. np.dot(X, Y) is the reconstruction of the
# input faces: the "Original vs Reconstructed" cell reshapes its rows to 64x64,
# so each row of Y has 64*64 entries too. Plotting np.dot(X, Y) here would show
# reconstructed faces under "Basis" titles.
basis_images = Y

# Lay the images out on a grid with n_cols images per row
n_cols = 4
rows = (n_components + n_cols - 1) // n_cols  # ceiling division

fig = make_subplots(
    rows=rows,
    cols=n_cols,
    subplot_titles=[f'Basis {i+1}' for i in range(n_components)],
)

# One grayscale heatmap per component, filled row by row
for i in range(n_components):
    fig.add_trace(
        go.Heatmap(z=basis_images[i].reshape(64, 64), colorscale='gray', showscale=False),
        row=(i // n_cols) + 1, col=(i % n_cols) + 1
    )

# go.Heatmap draws z[0] at the bottom; reverse the y axes so images are upright
fig.update_yaxes(autorange='reversed')

fig.update_layout(
    title="NMF Basis Images",
    showlegend=False
)

fig.show()

In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Number of original face images to display
n_images = 4

# Rows of X_dense are the raw input faces, not anything learned by NMF,
# so they are named and titled as originals.
original_images = X_dense

# Lay the images out on a grid with n_cols images per row
n_cols = 4
rows = (n_images + n_cols - 1) // n_cols  # ceiling division

fig = make_subplots(
    rows=rows,
    cols=n_cols,
    subplot_titles=[f'Original {i+1}' for i in range(n_images)],
)

# One grayscale heatmap per face, filled row by row
for i in range(n_images):
    fig.add_trace(
        go.Heatmap(z=original_images[i].reshape(64, 64), colorscale='gray', showscale=False),
        row=(i // n_cols) + 1, col=(i % n_cols) + 1
    )

# go.Heatmap draws z[0] at the bottom; reverse the y axes so faces are upright
fig.update_yaxes(autorange='reversed')

fig.update_layout(
    title="Original Face Images",
    showlegend=False
)

fig.show()

In [104]:
from plotly.subplots import make_subplots

# X and Y are the two factors returned by randomized_nmf for the faces data
# (X_dense); their product is the reconstruction of X_dense.
reconstructed_images = np.dot(X, Y)

# Number of faces to compare; each gets one column in the figure
n_images = 4

# Two rows: originals on top, reconstructions underneath. Subplot titles are
# assigned row by row, so using n_images columns keeps them aligned with the
# traces for any n_images.
fig = make_subplots(
    rows=2, cols=n_images,
    subplot_titles=[f'Original {i+1}' for i in range(n_images)] + [f'Reconstructed {i+1}' for i in range(n_images)]
)

# Original images (first row)
for i in range(n_images):
    fig.add_trace(
        go.Heatmap(z=X_dense[i].reshape(64, 64), colorscale='gray', showscale=False),
        row=1, col=i + 1
    )

# Reconstructed images (second row)
for i in range(n_images):
    fig.add_trace(
        go.Heatmap(z=reconstructed_images[i].reshape(64, 64), colorscale='gray', showscale=False),
        row=2, col=i + 1
    )

# go.Heatmap draws z[0] at the bottom; reverse the y axes so faces are upright
fig.update_yaxes(autorange='reversed')

fig.update_layout(
    title="Original vs Reconstructed Faces",
    showlegend=False
)

fig.show()


In [7]:
# Read the spreadsheet from the working directory.
# Note: this rebinds `df`, which the ratings cell above also assigns.
dataset_path = 'dataseta_12600gene.xls'
df = pd.read_excel(dataset_path)

In [None]:
# Drop the first two columns and keep the rest as a plain array
raw_values = df.iloc[:, 2:].values

# 2. Preprocessing: clamp negatives to zero, then apply log2(x + 1).
# Note: this rebinds X, which earlier cells use for an NMF factor.
non_negative = np.where(raw_values < 0, 0, raw_values)
X = np.log2(non_negative + 1)

In [19]:
# Show the preprocessed matrix (NumPy abbreviates large arrays in the repr)
X

array([[0.        , 3.33913738, 0.        , ..., 4.80219322, 4.63226822,
        0.        ],
       [3.52857132, 3.33913738, 0.        , ..., 3.51601515, 4.91360751,
        0.        ],
       [0.01435529, 3.48284828, 0.        , ..., 4.26528686, 5.03562391,
        0.        ],
       ...,
       [3.88557436, 0.        , 3.51916445, ..., 0.        , 6.88581837,
        6.2320851 ],
       [0.        , 0.        , 0.        , ..., 0.05658353, 0.        ,
        0.        ],
       [6.28503225, 6.73782194, 6.22371214, ..., 6.18923155, 7.08831124,
        5.36352154]])