In [1]:
import os
import os.path

import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn    
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

import pandas as pd
import numpy as np

In [2]:
# Shared text-processing tools used by the cleaning step:
#   tokenizer  - splits text into runs of word characters
#   stemmer    - Porter stemmer
#   wnl        - WordNet lemmatizer
#   stop_words - NLTK's English stop-word list
tokenizer = RegexpTokenizer(pattern=r'\w+')
stemmer, wnl = PorterStemmer(), WordNetLemmatizer()

stop_words = stopwords.words('english')

In [3]:
# Location of the Excel workbook that holds the raw recipe names.
# NOTE(review): this is an absolute, machine-specific path - adjust it when
# running the notebook on another machine.
inFilePath = "C:\\Users\\thesk\\Desktop\\RAW_recipes_name.xlsx"

# Give pandas the path itself so that it opens *and closes* the workbook;
# wrapping the path in a bare open(...) left the file handle open for the
# lifetime of the kernel.
df = pd.read_excel(inFilePath)
strdata = df.values.tolist() #each row becomes a separate list

In [4]:
# Peek at the first ten rows to sanity-check the load.
strdata[:10]

[['arriba   baked winter squash mexican style'],
 ['a bit different  breakfast pizza'],
 ['all in the kitchen  chili'],
 ['alouette  potatoes'],
 ['amish  tomato ketchup  for canning'],
 ['apple a day  milk shake'],
 ['aww  marinated olives'],
 ['backyard style  barbecued ribs'],
 ['bananas 4 ice cream  pie'],
 ['beat this  banana bread']]

In [5]:
def _clean_tokens(words, stop_set):
    """Return the normalised tokens of a single document.

    Parameters
    ----------
    words : list of str
        Raw tokens produced by the tokenizer.
    stop_set : set of str
        Lower-cased stop words to discard.

    Returns
    -------
    list of str
        Tokens that are not stop words and are longer than two characters,
        each passed through the stemmer and then the lemmatizer.
    """
    kept = [w for w in words if w.lower() not in stop_set and len(w) > 2]
    # NOTE(review): lemmatizing an already-stemmed word is kept from the
    # original pipeline; confirm that both steps are really wanted.
    return [wnl.lemmatize(stemmer.stem(w)) for w in kept]


# Tokenize the text of each row. The cells are joined explicitly instead of
# tokenizing str(row): the repr of a list can inject escape sequences (and
# 'nan' for empty cells) that would then show up as bogus tokens.
tokens = [
    tokenizer.tokenize(' '.join(str(cell) for cell in row if pd.notna(cell)))
    for row in strdata
]

# A set makes every stop-word lookup O(1) instead of scanning the list.
_stop_set = set(stop_words)
cleaned_list = [_clean_tokens(words, _stop_set) for words in tokens]

# Wide token table (one token per column), kept for inspection.
backtodf = pd.DataFrame(cleaned_list) #convert list back to pandas dataframe
remove_NaN = backtodf.fillna('') #short rows are padded with None/NaN; blank them out

# One cleaned string per recipe. Joining the token lists directly avoids the
# trailing spaces that joining the padded table columns produced.
mergeddf = pd.Series([' '.join(doc) for doc in cleaned_list], dtype=object)

In [6]:
# Show the first five cleaned recipe names.
mergeddf.head(5)

0    arriba bake winter squash mexican style     
1               bit differ breakfast pizza       
2                          kitchen chili         
3                         alouett potato         
4             amish tomato ketchup canning       
dtype: object

## LDA with Gensim

In [None]:
import gensim
from gensim import corpora

# Build the token <-> integer-id mapping from the cleaned documents.
dictionary = corpora.Dictionary(cleaned_list)

# Encode every cleaned document as a bag-of-words using that mapping.
corpus = list(map(dictionary.doc2bow, cleaned_list))

In [8]:
import pickle

# Persist the bag-of-words corpus and the dictionary for future use.
# The context manager guarantees the pickle file is flushed and closed;
# passing a bare open(...) to pickle.dump left the handle open.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [9]:
# Now asking LDA to find N topics in the data:

NUM_TOPICS = 5
RANDOM_SEED = 42  # fixed seed so that re-running the cell reproduces the same topics

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=5,
    random_state=RANDOM_SEED,
)
# Derive the file name from NUM_TOPICS so the saved model is never mislabelled
# when the topic count is changed (still 'model5.gensim' for NUM_TOPICS = 5).
ldamodel.save('model{}.gensim'.format(NUM_TOPICS))

# Print the five highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.054*"chocol" + 0.054*"cake" + 0.040*"cooki" + 0.029*"strawberri" + 0.022*"appl"')
(1, '0.075*"salad" + 0.032*"sauc" + 0.030*"chicken" + 0.028*"yummi" + 0.022*"dip"')
(2, '0.044*"chees" + 0.039*"chicken" + 0.037*"zucchini" + 0.028*"tomato" + 0.027*"bake"')
(3, '0.053*"sweet" + 0.042*"bean" + 0.036*"pie" + 0.035*"potato" + 0.029*"white"')
(4, '0.056*"soup" + 0.029*"bread" + 0.029*"veget" + 0.024*"whole" + 0.023*"pork"')


Ideally, the topics above should correspond to some of the following recipe types: <br>
Breads, Dessert, Dips/salad, Sandwiches, Side dishes, Soups/stews, Breakfast, Main courses <br>
(as in the picture below)
<img src="food_classification.png" width="300" />