
# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


NMF relies on linear algebra: it fixes the values of the probability vectors of the multinomials.
NMF is a deterministic algorithm that arrives at a single representation of the corpus.
NMF performs better when the topic probabilities should remain fixed per document (which is unlikely), or on small datasets where the additional variability coming from the hyperpriors is too much.

LDA is based on probabilistic graphical modeling. Documents that have similar words or groups of words usually have the same topic. Thus, LDA is a probabilistic model capable of expressing uncertainty about the placement of topics across texts and the assignment of words to topics.

Extract topic “descriptions” based on top ranked terms in basis vectors:
the output is a list of topics, each represented as a list of terms (weights are not shown).

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Load the raw recipes workbook into `df` and report its (rows, columns) shape.
# NOTE(review): the path is absolute and machine-specific; it is kept verbatim here,
# but should eventually come from a configurable data directory.
RECIPES_XLSX = "C:\\Users\\thesk\\Desktop\\RAW_recipes.xlsx"
RECIPE_COLUMNS = [
    'name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
    'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients',
]

# `read_excel` takes explicit column labels through `names`; `feature_names` is not one
# of its parameters. header=0 (the default) is spelled out so it is clear that the
# sheet's first row is consumed as the header and then relabelled with RECIPE_COLUMNS.
# Cells containing the text 'NA' are read as missing values.
df = pd.read_excel(RECIPES_XLSX, header=0, names=RECIPE_COLUMNS, na_values=['NA'])
df.shape

(231637, 12)

In [3]:
# Show the first three rows of the loaded frame.
df.head(n=3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13


# <p style="color:teal;">Use tf (raw term count, i.e. count vectorizer) features ..</p>

In [4]:
# Bag-of-words vocabulary built from raw term counts over the whole data set.
#   max_df=0.95        -> keep only terms that occur in no more than 95% of the documents
#   min_df=2           -> keep only terms that occur in at least 2 documents
#   max_features=1000  -> cap the vocabulary for speed reasons (the author notes there
#                         would otherwise be 10,000+ features)
#   stop_words         -> use the built-in 'english' stop-word list
count_vectorizer_options = dict(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
tfv = CountVectorizer(**count_vectorizer_options)

In [5]:
# Turn the recipe names into a sparse document-term count matrix.
# Missing names are replaced with an empty string first: casting the column straight
# to unicode (`.values.astype('U')`) converts NaN into the literal text "nan", which
# the vectorizer would then tokenize as if it were a real word.
recipe_names = df['name'].fillna('').astype(str)
tf_word_matrix = tfv.fit_transform(recipe_names)
tf_word_matrix

<231637x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 724091 stored elements in Compressed Sparse Row format>

<p style="color:teal;">Each of the 230,000+ rows in our Excel file (i.e. each document) is represented as a 1000-dimensional vector,
which means that our vocabulary has 1000 words.</p>

## <p style="color:teal;">.. for LDA</p>

In [6]:
# Configure the LDA model that will produce the topics, together with a weight for
# every vocabulary word in every topic.
#   n_components=5            -> five topics
#   max_iter=5                -> five passes over the data
#   learning_method='online'  -> online (mini-batch) updates
#   random_state=0            -> fixed seed so the run is repeatable
lda = LatentDirichletAllocation(
    n_components=5,
    max_iter=5,
    learning_method='online',
    learning_offset=50.0,
    random_state=0,
)

In [7]:
# Fit the topic model on the term-count matrix; the fitted estimator is echoed below.
lda.fit(X=tf_word_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [8]:
# Row 0 of `components_` is the first topic: one weight per vocabulary term.
first_topic = lda.components_[0, :]
print(first_topic)

[2.05490103e-01 2.06514134e-01 1.14421202e+02 2.04710032e-01
 2.01484157e-01 2.05240018e-01 2.02577830e-01 1.33139134e+02
 2.00384912e-01 2.02009858e-01 2.01307638e-01 2.01232461e-01
 2.03612920e-01 2.00786480e-01 2.00722084e-01 1.07947316e+02
 2.03281119e-01 2.01613638e-01 2.00519415e-01 2.02670524e-01
 2.01443797e-01 2.01563290e-01 5.91781542e+02 2.01321420e-01
 2.01817192e-01 2.01083719e-01 2.03168566e-01 2.04352746e+02
 2.01713190e-01 2.03673322e-01 2.02928319e-01 1.09902939e+03
 4.69583893e+02 2.01361804e-01 2.01389493e-01 2.02166770e-01
 2.01607308e-01 8.18024778e+02 2.01990305e-01 2.02080804e-01
 2.03246472e-01 2.02574795e-01 3.44925099e+03 2.00957651e-01
 2.01664301e-01 2.03448431e-01 2.02006286e-01 2.01344802e-01
 3.09214689e+02 2.01275170e-01 2.01390010e-01 1.08687095e+02
 2.01624633e-01 2.00510250e-01 2.02111024e-01 2.03916849e-01
 2.01965797e-01 2.01929825e-01 2.01424964e-01 3.01168721e+02
 3.52165242e+03 2.82105161e+03 2.03145833e-01 2.02533098e-01
 2.02484609e-01 2.034395

In [10]:
# `first_topic` holds one weight per vocabulary word (1000 of them) for this topic.
# The weights are unnormalised (see the values printed above), so they only become
# probabilities after dividing by their sum; their ranking is the same either way.
# argsort returns the indexes in ascending order of weight, so the last 10 entries
# are the indexes of the 10 highest-weighted words.

top_words_in_topic = np.argsort(first_topic)[-10:]
print(top_words_in_topic)

[291  60 937 999 870 154 751 844 698  97]


In [11]:
## Use these indexes to look up the corresponding words in the tfv (count vectorizer) vocabulary.
# The vocabulary is fetched once, outside the loop, instead of rebuilding the full
# term list for every printed word. `get_feature_names_out` is preferred when the
# installed scikit-learn provides it; `get_feature_names` is the fallback for older
# releases that only have the original method.
_list_tf_terms = getattr(tfv, 'get_feature_names_out', None) or tfv.get_feature_names
tf_terms = list(_list_tf_terms())

for i in top_words_in_topic:
    print(tf_terms[i])

dip
bean
turkey
zucchini
stuffed
casserole
roasted
spicy
potato
bread


In [12]:
# Print the 10 highest-weighted vocabulary terms of every LDA topic.
# The vocabulary is fetched once up front rather than once per term inside the
# comprehension. `get_feature_names_out` is used when available, with
# `get_feature_names` as the fallback for older scikit-learn releases.
_list_tf_terms = getattr(tfv, 'get_feature_names_out', None) or tfv.get_feature_names
tf_terms = list(_list_tf_terms())

for topic_idx, topic in enumerate(lda.components_):
    print(f'Top words for topic #{topic_idx}:')
    # A distinct name for the term index avoids shadowing the topic counter.
    print([tf_terms[term_idx] for term_idx in topic.argsort()[-10:]])

Top words for topic #0:
['dip', 'bean', 'turkey', 'zucchini', 'stuffed', 'casserole', 'roasted', 'spicy', 'potato', 'bread']
Top words for topic #1:
['peanut', 'muffins', 'garlic', 'white', 'butter', 'easy', 'cookies', 'sweet', 'chocolate', 'sauce']
Top words for topic #2:
['fried', 'chili', 'grilled', 'sausage', 'pumpkin', 'potatoes', 'spinach', 'pork', 'rice', 'chicken']
Top words for topic #3:
['hot', 'simple', 'apple', 'red', 'cream', 'lemon', 'shrimp', 'pasta', 'pie', 'salad']
Top words for topic #4:
['cream', 'vegetable', 'orange', 'pot', 'strawberry', 'beef', 'tomato', 'soup', 'cake', 'cheese']


In [13]:
# First step towards adding a topic column to the dataset:
# pass the document-word matrix to lda.transform() to obtain, for every Excel row
# (document), one value per topic -- i.e. 5 columns of topic weights.

topic_values = lda.transform(X=tf_word_matrix)
topic_values.shape

(231637, 5)

In [14]:
## Label every document with the index of its highest-scoring topic.
# argmax with axis=1 scans across the topic columns of each row and returns the
# column index of the maximum, giving one topic index per document.
# NOTE: this adds the 'Topic' column to `df` in place.

df['Topic'] = np.argmax(topic_values, axis=1)
df.head(n=3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,Topic
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,4
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,2


In [15]:
# 5 most common words, for curiosity reasons.
# Column sums of the count matrix give the total number of occurrences per term.
occ = np.asarray(tf_word_matrix.sum(axis=0)).ravel().tolist()

# Fetch the vocabulary with `get_feature_names_out` when the installed scikit-learn
# provides it, falling back to `get_feature_names` on older releases.
_list_tf_terms = getattr(tfv, 'get_feature_names_out', None) or tfv.get_feature_names
tf_terms = list(_list_tf_terms())

counts_df = pd.DataFrame({'term': tf_terms, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(5)

Unnamed: 0,term,occurrences
170,chicken,22966
766,salad,13299
774,sauce,10075
163,cheese,9745
182,chocolate,9029


# <p style="color:fuchsia;">Use tf-idf features ..</p>

In [17]:
# tf-idf vectorizer configured with the same vocabulary limits as the count vectorizer:
# terms in at most 95% of the documents and in at least 2 documents, capped at 1000
# features, with the built-in 'english' stop-word list.
tfidf_vectorizer_options = dict(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
tiv = TfidfVectorizer(**tfidf_vectorizer_options)

In [18]:
# Turn the recipe names into a sparse tf-idf document-term matrix.
# Missing names are replaced with an empty string first: casting the column straight
# to unicode (`.values.astype('U')`) converts NaN into the literal text "nan", which
# the vectorizer would then tokenize as if it were a real word.
recipe_names = df['name'].fillna('').astype(str)
tfidf_word_matrix = tiv.fit_transform(recipe_names)

## <p style="color:purple;">.. for NMF, the Frobenius norm</p>

In [19]:
# 5-component NMF with the default (Frobenius) loss, a fixed seed, and mixed
# L1/L2 regularisation (alpha=0.1, l1_ratio=0.5).
# NOTE(review): newer scikit-learn releases replace `alpha` with `alpha_W` / `alpha_H`,
# and the new parameters are presumably not numerically equivalent -- confirm against
# the installed version before migrating.
nmf_frobenius_options = dict(
    n_components=5,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
nmf = NMF(**nmf_frobenius_options)

# Fit the NMF model with tf-idf features
nmf.fit(X=tfidf_word_matrix)

NMF(alpha=0.1, beta_loss='frobenius', init=None, l1_ratio=0.5, max_iter=200,
  n_components=5, random_state=1, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [20]:
# Report the Frobenius-norm NMF topics and tag every document with its strongest one.

# Fetch the tf-idf vocabulary once instead of rebuilding the full term list for every
# printed word. `get_feature_names_out` is used when available, with
# `get_feature_names` as the fallback for older scikit-learn releases.
_list_tfidf_terms = getattr(tiv, 'get_feature_names_out', None) or tiv.get_feature_names
tfidf_terms = list(_list_tfidf_terms())

print("Top words for Topic 1:\n")
first_topic = nmf.components_[0]
# argsort is ascending, so the last 10 indexes belong to the 10 largest weights.
top_words_in_topic = first_topic.argsort()[-10:]

for i in top_words_in_topic:
    print(tfidf_terms[i])

print("\nTopics in NMF model (Frobenius norm):")

for topic_idx, topic in enumerate(nmf.components_):
    print(f'Top words for topic #{topic_idx}:')
    # A distinct name for the term index avoids shadowing the topic counter.
    print([tfidf_terms[term_idx] for term_idx in topic.argsort()[-10:]])

# One row of topic weights per document; the column with the largest weight becomes
# the document's label in the new 'Topic_F' column (added to `df` in place).
topic_values = nmf.transform(tfidf_word_matrix)
df['Topic_F'] = topic_values.argmax(axis=1)
df.head(3)

Top words for Topic 1:

crock
easy
breasts
fried
grilled
lemon
casserole
pot
rice
chicken

Topics in NMF model (Frobenius norm):
Top words for topic #0:
['crock', 'easy', 'breasts', 'fried', 'grilled', 'lemon', 'casserole', 'pot', 'rice', 'chicken']
Top words for topic #1:
['cucumber', 'tuna', 'tomato', 'fruit', 'bean', 'spinach', 'pasta', 'dressing', 'potato', 'salad']
Top words for topic #2:
['apple', 'cream', 'banana', 'pie', 'peanut', 'butter', 'chip', 'cookies', 'cake', 'chocolate']
Top words for topic #3:
['easy', 'pot', 'beef', 'bean', 'bread', 'casserole', 'cream', 'potato', 'cheese', 'soup']
Top words for topic #4:
['chops', 'lemon', 'cream', 'spaghetti', 'shrimp', 'garlic', 'tomato', 'pasta', 'pork', 'sauce']


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,Topic,Topic_F
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,0,3
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,4,3
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,2,3



## <p style="color:purple;">.. the generalized Kullback-Leibler divergence</p>

In [22]:
# 5-component NMF minimising the generalized Kullback-Leibler divergence.
# This loss is paired here with the multiplicative-update solver ('mu') and a larger
# iteration budget (max_iter=1000); seed and regularisation match the Frobenius run.
# NOTE(review): newer scikit-learn releases replace `alpha` with `alpha_W` / `alpha_H`,
# and the new parameters are presumably not numerically equivalent -- confirm against
# the installed version before migrating.
nmf_kl_options = dict(
    n_components=5,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
)
nmf = NMF(**nmf_kl_options)
nmf.fit(X=tfidf_word_matrix)

NMF(alpha=0.1, beta_loss='kullback-leibler', init=None, l1_ratio=0.5,
  max_iter=1000, n_components=5, random_state=1, shuffle=False,
  solver='mu', tol=0.0001, verbose=0)

In [24]:
# Report the Kullback-Leibler NMF topics and tag every document with its strongest one.

# Fetch the tf-idf vocabulary once instead of rebuilding the full term list for every
# printed word. `get_feature_names_out` is used when available, with
# `get_feature_names` as the fallback for older scikit-learn releases.
_list_tfidf_terms = getattr(tiv, 'get_feature_names_out', None) or tiv.get_feature_names
tfidf_terms = list(_list_tfidf_terms())

print("Top words for Topic 1:\n")
first_topic = nmf.components_[0]
# argsort is ascending, so the last 10 indexes belong to the 10 largest weights.
top_words_in_topic = first_topic.argsort()[-10:]

for i in top_words_in_topic:
    print(tfidf_terms[i])

print("\nTopics in NMF model (generalized Kullback-Leibler):")

for topic_idx, topic in enumerate(nmf.components_):
    print(f'Top words for topic #{topic_idx}:')
    # A distinct name for the term index avoids shadowing the topic counter.
    print([tfidf_terms[term_idx] for term_idx in topic.argsort()[-10:]])

# One row of topic weights per document; the column with the largest weight becomes
# the document's label in the new 'Topic_KL' column (added to `df` in place).
topic_values = nmf.transform(tfidf_word_matrix)
df['Topic_KL'] = topic_values.argmax(axis=1)
df.head(3)

Top words for Topic 1:

curry
mexican
salsa
quick
fried
spicy
style
easy
rice
chicken

Topics in NMF model (generalized Kullback-Leibler):
Top words for topic #0:
['curry', 'mexican', 'salsa', 'quick', 'fried', 'spicy', 'style', 'easy', 'rice', 'chicken']
Top words for topic #1:
['fresh', 'egg', 'tomato', 'sandwich', 'tuna', 'fruit', 'dressing', 'dip', 'spinach', 'salad']
Top words for topic #2:
['pumpkin', 'banana', 'muffins', 'apple', 'butter', 'cream', 'cookies', 'pie', 'cake', 'chocolate']
Top words for topic #3:
['beans', 'crock', 'pot', 'sweet', 'potato', 'potatoes', 'beef', 'casserole', 'cheese', 'soup']
Top words for topic #4:
['lemon', 'roasted', 'stuffed', 'grilled', 'garlic', 'pasta', 'shrimp', 'pork', 'bread', 'sauce']


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,Topic,Topic_F,Topic_KL
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,0,3,0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,4,3,4
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,2,3,3


In [25]:
# 5 highest-weighted words, for curiosity reasons.
# The matrix holds tf-idf weights here, so the column sums stored under 'occurrences'
# are total tf-idf weights per term rather than raw occurrence counts.
occ = np.asarray(tfidf_word_matrix.sum(axis=0)).ravel().tolist()

# Fetch the vocabulary with `get_feature_names_out` when the installed scikit-learn
# provides it, falling back to `get_feature_names` on older releases.
_list_tfidf_terms = getattr(tiv, 'get_feature_names_out', None) or tiv.get_feature_names
tfidf_terms = list(_list_tfidf_terms())

counts_df = pd.DataFrame({'term': tfidf_terms, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(5)

Unnamed: 0,term,occurrences
170,chicken,8341.169923
766,salad,5386.820373
774,sauce,3926.177617
163,cheese,3829.282234
830,soup,3806.244214
