### Non-negative matrix factorization

- dimension reduction
- interpretable (unlike PCA)
- all sample features must be non-negative

NMF expresses documents as combinations of topics, and images as combinations of patterns.


In [1]:
from sklearn.decomposition import NMF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer that turns raw text into tf-idf weighted word counts.
tfidf = TfidfVectorizer()

# Small example corpus: each string is one document (one row of the matrix).
documents = ['INSTRUCTIONS 100XP',
'Import TfidfVectorizer from sklearn.feature_extraction.text.',
'Create a TfidfVectorizer instance called tfidf.',
'Apply .fit_transform() method of tfidf to documents and assign the result to csr_mat. This is a word-frequency array in csr_matrix format.',
'Inspect csr_mat by calling its .toarray() method and printing the result. This has been done for you.',
'The columns of the array correspond to words. Get the list of words by calling the .get_feature_names() method of tfidf, and assign the result to words.']

# Learn the vocabulary and return the sparse document-by-word tf-idf matrix.
csr = tfidf.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a list to keep `words` a plain list of strings as before.
words = list(tfidf.get_feature_names_out())
words[:4]

['100xp', 'and', 'apply', 'array']

In [4]:
# NMF model with 3 components (topics). A fixed random_state pins any
# randomness used by the initialization/solver so that the fitted features and
# components are repeatable after a kernel restart.
model = NMF(n_components=3, random_state=0)

In [5]:
# Learn the NMF components from the tf-idf matrix of the example documents.
# fit() is the last expression, so the notebook displays the fitted estimator.
model.fit(csr)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=3, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

The NMF features of a sample, together with the learned components, can be used to (approximately) reconstruct that sample.

In [8]:
# Express each document in terms of the fitted components: the result has one
# row per document and one column per NMF component.
nmf_features = model.transform(csr)
# Bare last expression so the notebook displays the feature array.
nmf_features

array([[0.        , 0.        , 1.        ],
       [0.        , 0.73715549, 0.        ],
       [0.02273933, 0.73104802, 0.        ],
       [0.68738356, 0.01115013, 0.        ],
       [0.53375894, 0.        , 0.        ],
       [0.69457824, 0.        , 0.        ]])

In [11]:
# Dense tf-idf vector of the second document (row index 1): select the sparse
# row first, then densify just that row and unwrap it to a 1-D array.
csr[1].toarray()[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.4198708 , 0.        , 0.        , 0.        ,
       0.        , 0.4198708 , 0.        , 0.        , 0.        ,
       0.4198708 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.4198708 , 0.4198708 , 0.        ,
       0.34430007, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

Dimension of components = dimension of samples: each component has one entry per feature (here, per word), just like a sample row.

In [7]:
# The learned components: one row per NMF component, with one entry per
# vocabulary word (the same length as a row of csr.toarray()).
model.components_

array([[0.        , 0.23443291, 0.13267166, 0.18201299, 0.18201299,
        0.11665981, 0.16888304, 0.00248152, 0.16888304, 0.08929153,
        0.08929153, 0.00248152, 0.20445538, 0.13267166, 0.13267166,
        0.11665981, 0.        , 0.13267166, 0.11665981, 0.13267166,
        0.13267166, 0.        , 0.08929153, 0.08929153, 0.11665981,
        0.        , 0.13267166, 0.11665981, 0.00248152, 0.        ,
        0.13267166, 0.11665981, 0.08929153, 0.23443291, 0.32845366,
        0.11665981, 0.23443291, 0.        , 0.        , 0.15542011,
        0.        , 0.48170345, 0.20445538, 0.36402599, 0.11665981,
        0.13267166, 0.26787459, 0.11665981],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.33277847, 0.        , 0.        ,
        0.        , 0.33277847, 0.        , 0.        , 0.        ,
        0.        , 0.28712975, 0.        , 0.        , 0.        ,
        0.        , 0.28712975, 0.        , 0.        , 0.        ,
   

#### Sample reconstruction

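- multiply each component by the sample's corresponding NMF feature value and add the results up (equivalently, the matrix product of `nmf_features` and `model.components_` approximately reconstructs the samples)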


In [None]:
# Import pandas
import pandas as pd

# Create a pandas DataFrame: df
# Rows are the samples' NMF features, labelled by title so they can be looked
# up by name below.
# NOTE(review): `titles` is not defined in the visible part of this notebook;
# it must hold one label per row of nmf_features (and include the names looked
# up below) — confirm it is defined before this cell runs.
df = pd.DataFrame(nmf_features, index=titles)

# Print the row for 'Anne Hathaway'
print(df.loc['Anne Hathaway'])

# Print the row for 'Denzel Washington'
# (the closing parenthesis was missing here, which made the cell a SyntaxError)
print(df.loc['Denzel Washington'])