In [None]:
import pandas as pd
import numpy as np

In [None]:
#if running bertopic

!pip install bertopic
# !pip install sentence-transformers

### Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('/content/drive/MyDrive/IntroML/data/train_dataset_all.csv')
df.head(3)

Unnamed: 0,recipe_id,category,name,n_ingredients,minutes,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),log calories,log fat,log sugar,log sodium,log protein,log sat fat,log carbs,user_id,date,review,rating,description
0,9014,vegetables,cheesy potato skillet stuff,5,40,357.8,33.0,6.0,22.0,30.0,37.0,8.0,5.879974,3.496508,1.791759,3.091042,3.401197,3.610918,2.079442,11297,2001-06-18,What a great quick & easy meal especially for ...,4,this recipe came about when a friend of mine r...
1,9492,meat,chicken honey glazed,7,100,484.9,11.0,189.0,20.0,110.0,6.0,16.0,6.183943,2.397895,5.241747,2.995732,4.70048,1.791759,2.772589,11297,2001-06-26,So easy and tasty. To make the clean-up even...,4,serve with a green salad and a your favorite r...
2,2731,meat,jaegerschnitzel,17,35,382.2,28.0,16.0,17.0,61.0,34.0,5.0,5.945944,3.332205,2.772589,2.833213,4.110874,3.526361,1.609438,11297,2001-07-02,The man of the house really loved the sauce wi...,4,(breaded veal cutlet with mushroom sauce)


### Getting the relevant columns

#### (not used) For reviews

In [None]:
# grab only the review from the original df
df_review = df[['recipe_id', 'review']]

#set the ratings to be the index, and drop the 'rating' column name
# df_review = df_review.set_index('rating')
# df_review.index.name = None
df_review.head(2)

Unnamed: 0,recipe_id,review
0,9014,What a great quick & easy meal especially for ...
1,9492,So easy and tasty. To make the clean-up even...


#### Get combined column (name+desc)

In [None]:
# grab the name + description 
df_namedesc = df[['recipe_id', 'category', 'name', 'description']]

#since it's the name & desc we want, every row should be unique based on the recipe_id. thus, we will drop duplicates (that have arised from reviews)
df_namedesc.drop_duplicates(subset=['recipe_id'], inplace=True)
df_namedesc.reset_index(drop=True, inplace=True)

#concatenate name & desc into 1 column with space inbtwn
df_namedesc['combined'] = df_namedesc[['name', 'description']].agg(' '.join, axis=1)

df_namedesc.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,recipe_id,category,name,description,combined
0,9014,vegetables,cheesy potato skillet stuff,this recipe came about when a friend of mine r...,cheesy potato skillet stuff this recipe came a...
1,9492,meat,chicken honey glazed,serve with a green salad and a your favorite r...,chicken honey glazed serve with a green salad ...


##### Cleaning up the text

In [None]:
import re 
import string

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import collections

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
stop = stopwords.words('english')
stop.extend(['recipe', 'dish', 'meal', 'food', 'dinner']) #saw these words, won't be useful 

def clean_text(text):
    # Lowercase, Remove Non-alphanumeric characters, Punctuations, Numbers & Stopwords
    text = text.lower()
    text = re.sub('[^A-Za-z\s]+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text) 

    # stemmer = PorterStemmer()
    # text = stemmer.stem(text)

    lemmatizer = WordNetLemmatizer()
    text = lemmatizer.lemmatize(text)
    
    text = ' '.join([word for word in text.split() if word not in (stop)])

    return text

In [None]:
#clean the combined column
df_namedesc['combined'] = pd.DataFrame(df_namedesc['combined'].apply(clean_text))

#tokenize that cleaned combined column 
# df_namedesc['tokenized'] = [word_tokenize(row) for row in df_namedesc['combined']]

# convert category into numbers for using in topic modelling 
df_namedesc['category'] = pd.Categorical(df_namedesc['category'])
df_namedesc['cat_code'] = df_namedesc.category.cat.codes
df_namedesc.head(2)

df_namedesc.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,recipe_id,category,name,description,combined,cat_code
0,9014,vegetables,cheesy potato skillet stuff,this recipe came about when a friend of mine r...,cheesy potato skillet stuff came friend mine r...,9
1,9492,meat,chicken honey glazed,serve with a green salad and a your favorite r...,chicken honey glazed serve green salad favorit...,4


#### (not used) Generating additional column for food words found in wordnet

In [None]:
import re

from nltk.corpus import wordnet as wn

In [None]:
food_synset = wn.synsets('food')

# Check synset
for syn in food_synset:
    print(syn, '\n')
    print(syn.hyponyms())

Synset('food.n.01') 

[Synset('beverage.n.01'), Synset('chyme.n.01'), Synset('comestible.n.01'), Synset('comfort_food.n.01'), Synset('commissariat.n.01'), Synset('culture_medium.n.01'), Synset('fare.n.04'), Synset('feed.n.01'), Synset('foodstuff.n.02'), Synset('micronutrient.n.01'), Synset('miraculous_food.n.01'), Synset('nutriment.n.01'), Synset('soul_food.n.01'), Synset('water.n.06'), Synset('yolk.n.02')]
Synset('food.n.02') 

[Synset('baked_goods.n.01'), Synset('breakfast_food.n.01'), Synset('butter.n.01'), Synset('cheese.n.01'), Synset('chocolate.n.02'), Synset('coconut.n.01'), Synset('convenience_food.n.01'), Synset('dika_bread.n.01'), Synset('fish.n.02'), Synset('fresh_food.n.01'), Synset('health_food.n.01'), Synset('junk_food.n.01'), Synset('leftovers.n.01'), Synset('loaf.n.02'), Synset('meat.n.01'), Synset('pasta.n.02'), Synset('produce.n.01'), Synset('seafood.n.01'), Synset('slop.n.04'), Synset('yogurt.n.01')]
Synset('food.n.03') 

[Synset('pabulum.n.02')]


In [None]:
food = wn.synset('food.n.02')
food_list = list(set([w for s in food.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))

In [None]:
def recursive_hypernym(food_word_list, food_synset,n,m,index):
    if n == 1:
        return 
    elif m == n:
        food_name = [food_name.name().split('.')[0] for food_name in food_synset[index].hyponyms()]
        for word in food_name:

            food_word_list.append(word)
            recursive_hypernym(food_word_list,wn.synsets(word),n-1,m,index)
    else:
        
        for syn in food_synset: 
            food_name = [food_name.name().split('.')[0] for food_name in syn.hyponyms()]
            for word in food_name:

                food_word_list.append(word)
                recursive_hypernym(food_word_list,wn.synsets(word),n-1,m,index)
                
    return food_word_list

# parameters
# food_word_list = empty list
# synset of word = wn.synset("word")
# n = how many iterations you want
# m = conditional iteration count to exempt the first iteration for the below index
# index = what index of the first synset that you want

In [None]:
food_word_list = []
food_synset = wn.synsets("food")
food_word_list = recursive_hypernym(food_word_list,food_synset,4,4,1)
print(food_word_list[:10])
# print(food_word_list)

['baked_goods', 'bread', 'anadama_bread', 'bap', 'barmbrack', 'breadstick', 'brown_bread', 'bun', 'caraway_seed_bread', 'challah']


In [None]:
df_namedesc['food_words'] = df_namedesc['tokenized'].apply(lambda x: list([w for w in x if w in food_word_list]))

In [None]:
df_namedesc.reset_index(inplace=True)

In [None]:
def join_tokens(text):
  text = ' '.join([word for word in text])

  return text

In [None]:
df_namedesc['food_words_joined'] = df_namedesc['food_words'].apply(join_tokens)

In [None]:
df_namedesc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56137 entries, 0 to 56136
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   level_0            56137 non-null  int64   
 1   index              56137 non-null  int64   
 2   recipe_id          56137 non-null  int64   
 3   category           56137 non-null  category
 4   name               56137 non-null  object  
 5   description        56137 non-null  object  
 6   combined           56137 non-null  object  
 7   tokenized          56137 non-null  object  
 8   cat_code           56137 non-null  int8    
 9   food_words         56137 non-null  object  
 10  food_words_joined  56137 non-null  object  
dtypes: category(1), int64(3), int8(1), object(6)
memory usage: 4.0+ MB


## Topic Modelling with BERTopic

### Import packages, enable GPU

In [None]:
## enable colab to use gpu
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

### Generate topics based on category and combined name+desc

1) Generate `n` topics from combined `name+desc` and `category`

2) Get a list of words associated with each topic generated

3) Label each recipe to a cluster(s) based on whether the word appeared in combined `name+desc` 
- meaning a recipe can be found in more than 1 cluster

In [None]:
df_namedesc.head(2)

Unnamed: 0,recipe_id,category,name,description,combined,cat_code
0,9014,vegetables,cheesy potato skillet stuff,this recipe came about when a friend of mine r...,cheesy potato skillet stuff came friend mine r...,9
1,9492,meat,chicken honey glazed,serve with a green salad and a your favorite r...,chicken honey glazed serve green salad favorit...,4


In [None]:
docs = df_namedesc['combined']
y = df_namedesc['cat_code'] #for supervised topic modelling 

In [None]:
topic_model = BERTopic(min_topic_size=70,
                       nr_topics='auto',
                       embedding_model='all-MiniLM-L6-v2')

topics, _ = topic_model.fit_transform(docs, y=y)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]



In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,17423,0_chocolate_cookies_dessert_cream
1,-1,14747,-1_sauce_salad_pasta_casserole
2,1,5320,1_chicken_pork_steak_marinade
3,2,3189,2_soup_chowder_broth_vegetable
4,3,2588,3_potatoes_potato_fries_baked
5,4,2576,4_salad_dressing_vinaigrette_salads
6,5,2155,5_bread_loaf_pizza_dough
7,6,1167,6_muffins_muffin_breakfast_chocolate
8,7,1090,7_salmon_fish_fillets_catfish
9,8,1088,8_pasta_spaghetti_salad_sauce


In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.save('/content/drive/MyDrive/IntroML/models/bertopic_70_auto_55')

## Labelling recipes in train df with topics

In [None]:
all_topics = topic_model.get_topics()

In [None]:
all_topics[1]

[('chicken', 0.034943698958301524),
 ('pork', 0.03246110757630752),
 ('steak', 0.014024286908908355),
 ('marinade', 0.011801103460295866),
 ('grilled', 0.009958636641507167),
 ('marinating', 0.008816234138028695),
 ('cooking', 0.008274752716396333),
 ('grill', 0.007491613788364649),
 ('mustard', 0.007062052074591514),
 ('baked', 0.007035028234757445)]

In [None]:
#remove the probabilites, keep only the words for each topic
topic_words = {topic: [] for topic in all_topics}

for topic, word_list in all_topics.items():
  # print(topic, word_list)
  for word_tuple in word_list:
    # print(word_tuple)
    word = word_tuple[0]
    # print(word)
    topic_words[topic].append(word)


In [None]:
print(topic_words)
#save to .txt file incase need these words

{-1: ['sauce', 'salad', 'pasta', 'casserole', 'cheese', 'chicken', 'cooking', 'tomato', 'meat', 'tomatoes'], 0: ['chocolate', 'cookies', 'dessert', 'cream', 'coffee', 'cookie', 'pudding', 'smoothie', 'strawberry', 'brownies'], 1: ['chicken', 'pork', 'steak', 'marinade', 'grilled', 'marinating', 'cooking', 'grill', 'mustard', 'baked'], 2: ['soup', 'chowder', 'broth', 'vegetable', 'lentil', 'corn', 'soups', 'stew', 'chili', 'spicy'], 3: ['potatoes', 'potato', 'fries', 'baked', 'garlic', 'casserole', 'salad', 'bacon', 'cooking', 'onion'], 4: ['salad', 'dressing', 'vinaigrette', 'salads', 'spinach', 'greens', 'lettuce', 'tuna', 'cheese', 'ingredients'], 5: ['bread', 'loaf', 'pizza', 'dough', 'loaves', 'flour', 'yeast', 'sourdough', 'breads', 'baking'], 6: ['muffins', 'muffin', 'breakfast', 'chocolate', 'flour', 'wheat', 'oatmeal', 'cinnamon', 'oat', 'raisins'], 7: ['salmon', 'fish', 'fillets', 'catfish', 'tilapia', 'baked', 'trout', 'fillet', 'mustard', 'cooking'], 8: ['pasta', 'spaghetti'

In [None]:
# if grab from txt file
topic_words = {-1: ['sauce', 'salad', 'pasta', 'casserole', 'cheese', 'chicken', 'cooking', 'tomato', 'meat', 'tomatoes'], 0: ['chocolate', 'cookies', 'dessert', 'cream', 'coffee', 'cookie', 'pudding', 'smoothie', 'strawberry', 'brownies'], 1: ['chicken', 'pork', 'steak', 'marinade', 'grilled', 'marinating', 'cooking', 'grill', 'mustard', 'baked'], 2: ['soup', 'chowder', 'broth', 'vegetable', 'lentil', 'corn', 'soups', 'stew', 'chili', 'spicy'], 3: ['potatoes', 'potato', 'fries', 'baked', 'garlic', 'casserole', 'salad', 'bacon', 'cooking', 'onion'], 4: ['salad', 'dressing', 'vinaigrette', 'salads', 'spinach', 'greens', 'lettuce', 'tuna', 'cheese', 'ingredients'], 5: ['bread', 'loaf', 'pizza', 'dough', 'loaves', 'flour', 'yeast', 'sourdough', 'breads', 'baking'], 6: ['muffins', 'muffin', 'breakfast', 'chocolate', 'flour', 'wheat', 'oatmeal', 'cinnamon', 'oat', 'raisins'], 7: ['salmon', 'fish', 'fillets', 'catfish', 'tilapia', 'baked', 'trout', 'fillet', 'mustard', 'cooking'], 8: ['pasta', 'spaghetti', 'salad', 'sauce', 'italian', 'tomato', 'tomatoes', 'garlic', 'cheese', 'tuna'], 9: ['rice', 'fried', 'beans', 'peas', 'casserole', 'cooking', 'chicken', 'saffron', 'cooker', 'cooked'], 10: ['shrimp', 'crab', 'prawns', 'appetizer', 'seafood', 'crabmeat', 'crawfish', 'crabs', 'cajun', 'appetizers'], 11: ['salsa', 'chili', 'tacos', 'taco', 'burritos', 'tortilla', 'burrito', 'tortillas', 'guacamole', 'quesadillas'], 12: ['meatloaf', 'meatballs', 'meatball', 'meat', 'turkey', 'spaghetti', 'sandwiches', 'beef', 'gravy', 'recipes'], 13: ['corn', 'cornbread', 'creamed', 'casserole', 'butter', 'skillet', 'popcorn', 'chili', 'pudding', 'paula'], 14: ['zucchini', 'squash', 'butternut', 'acorn', 'casserole', 'stuffed', 'fritters', 'baked', 'tasty', 'delicious'], 15: ['mushrooms', 'mushroom', 'portabella', 'steak', 'appetizer', 'portabello', 'cheese', 'portabellas', 'tasty', 'spinach'], 16: ['beans', 'bean', 'bacon', 'garlic', 'canned', 'casserole', 'almonds', 'cooking', 'flavor', 'onions'], 17: ['broccoli', 'cabbage', 'casserole', 'sauerkraut', 'kielbasa', 'garlic', 'cheese', 'delicious', 'steamed', 'cooked'], 18: ['macaroni', 'cheese', 'casserole', 'baked', 'kraft', 'cheeseburger', 'ham', 'cheddar', 'cheeses', 'pasta'], 19: ['roast', 'stew', 'crock', 'beef', 'gravy', 'cooker', 'crockpot', 'cooking', 'cook', 'roasts'], 20: ['asparagus', 'prosciutto', 'sesame', 'garlic', 'sauce', 'delicious', 'mushrooms', 'ham', 'contessa', 'cooking'], 21: ['carrots', 'carrot', 'vegetables', 'roast', 'veggies', 'vegetable', 'eat', 'thanksgiving', 'cooked', 'cooking'], 22: ['rolls', 'dough', 'cinnamon', 'yeast', 'bread', 'oven', 'cinnabon', 'baking', 'flour', 'homemade'], 23: ['scones', 'scallops', 'scone', 'scallop', 'breakfast', 'cranberry', 'seared', 'cheese', 'seafood', 'cranberries'], 24: ['tuna', 'wasabi', 'sandwich', 'casserole', 'lunch', 'sandwiches', 'melts', 'steaks', 'mayo', 'sardines'], 25: ['biscuits', 'biscuit', 'lobster', 'flour', 'breakfast', 'butter', 'baking', 'shortcake', 'dough', 'cinnamon'], 26: ['burgers', 'burger', 'hamburgers', 'hamburger', 'patties', 'grill', 'beef', 'cheeseburgers', 'bbq', 'cheeseburger'], 27: ['noodles', 'noodle', 'ramen', 'sesame', 'spicy', 'cabbage', 'vegetables', 'tofu', 'pasta', 'chicken'], 28: ['tomatoes', 'tomato', 'basil', 'cheese', 'mozzarella', 'baked', 'garlic', 'appetizer', 'pizza', 'herbs'], 29: ['ham', 'sandwiches', 'sandwich', 'cheese', 'glazed', 'glaze', 'baked', 'hams', 'steak', 'eggs'], 30: ['sandwich', 'pancakes', 'bacon', 'breakfast', 'sandwiches', 'waffles', 'egg', 'pancake', 'eggs', 'lunch'], 31: ['lamb', 'chops', 'rosemary', 'meat', 'marinade', 'cooking', 'cooked', 'marinating', 'stew', 'marinated'], 32: ['spinach', 'creamed', 'casserole', 'cheese', 'appetizer', 'eat', 'berghoff', 'serve', 'garlic', 'bake'], 33: ['cucumber', 'cucumbers', 'salad', 'pickles', 'tomato', 'cukes', 'onion', 'pickled', 'tomatoes', 'yogurt'], 34: ['lasagna', 'lasagne', 'noodles', 'ricotta', 'cheese', 'vegetarian', 'sausage', 'meat', 'beef', 'lasagnas'], 35: ['ribs', 'bbq', 'rib', 'barbecue', 'barbecued', 'pork', 'grill', 'beef', 'napkins', 'cooker'], 36: ['oatmeal', 'granola', 'breakfast', 'oats', 'cereal', 'milk', 'snack', 'oat', 'raisins', 'cinnamon'], 37: ['eggplant', 'aubergine', 'eggplants', 'vegetarian', 'baba', 'casserole', 'slices', 'garlic', 'meat', 'flavors'], 38: ['turkey', 'stuffing', 'thanksgiving', 'bird', 'gravy', 'roast', 'meat', 'cooking', 'cooked', 'pheasant'], 39: ['enchiladas', 'enchilada', 'tortillas', 'chicken', 'beef', 'mexican', 'sauce', 'casserole', 'cream', 'cheese'], 40: ['couscous', 'cous', 'salad', 'chickpeas', 'vegetables', 'vegetarian', 'spicy', 'cooking', 'stew', 'chicken'], 41: ['onions', 'onion', 'vidalia', 'steak', 'meats', 'grilled', 'vidalias', 'burgers', 'mustard', 'bbq'], 42: ['cauliflower', 'cheese', 'garlic', 'cheddar', 'gratin', 'vegetable', 'olive', 'mustard', 'cheesy', 'broccoli'], 43: ['coleslaw', 'cabbage', 'mayo', 'dressing', 'kfc', 'mayonnaise', 'dressings', 'spicy', 'fridge', 'refrigeration'], 44: ['curry', 'paneer', 'thai', 'curried', 'curries', 'vegetables', 'vegetable', 'rice', 'spices', 'potatoes'], 45: ['sausage', 'sausages', 'breakfast', 'gravy', 'biscuits', 'pork', 'patties', 'meat', 'onion', 'appetizers'], 46: ['sprouts', 'brussels', 'brussel', 'sprout', 'bacon', 'cabbage', 'mustard', 'walnuts', 'vegetable', 'fennel'], 47: ['fry', 'tofu', 'vegetables', 'stirfry', 'broccoli', 'vegetable', 'rice', 'veggies', 'veggie', 'sesame'], 48: ['risotto', 'rice', 'barley', 'pumpkin', 'mushroom', 'mushrooms', 'squash', 'risottos', 'peas', 'cooking'], 49: ['sloppy', 'joes', 'joe', 'bbq', 'sauce', 'sandwiches', 'buns', 'favorite', 'chili', 'recipes'], 50: ['peas', 'pea', 'frozen', 'green', 'fresh', 'sugar', 'mushrooms', 'onions', 'sesame', 'eat'], 51: ['buns', 'dough', 'bun', 'cinnamon', 'flour', 'hamburger', 'burger', 'bread', 'egg', 'sourdough'], 52: ['dip', 'chips', 'tortilla', 'hummus', 'taco', 'dips', 'corn', 'appetizer', 'party', 'veggies'], 53: ['artichokes', 'artichoke', 'garlic', 'appetizer', 'marinated', 'mushrooms', 'casserole', 'cheese', 'vinaigrette', 'steamed'], 54: ['alfredo', 'fettuccine', 'sauce', 'fettuccini', 'pasta', 'chicken', 'shrimp', 'cheese', 'calories', 'garlic']}

In [None]:
#function to add a topic if the word is found in combined column
def add_topic_to_recipe(text):
  topic_list = []
  for topic, word_list in topic_words.items():
    if any(keyword in text for keyword in word_list):
      topic_list.append(topic)
  return topic_list

In [None]:
df_namedesc['topic_list'] = df_namedesc['combined'].apply(add_topic_to_recipe)

In [None]:
df_namedesc.head(3)

Unnamed: 0,recipe_id,category,name,description,combined,cat_code,topic_list
0,9014,vegetables,cheesy potato skillet stuff,this recipe came about when a friend of mine r...,cheesy potato skillet stuff came friend mine r...,9,"[3, 13, 42]"
1,9492,meat,chicken honey glazed,serve with a green salad and a your favorite r...,chicken honey glazed serve green salad favorit...,4,"[-1, 1, 3, 4, 8, 9, 27, 29, 32, 33, 39, 40, 44..."
2,2731,meat,jaegerschnitzel,(breaded veal cutlet with mushroom sauce),jaegerschnitzel breaded veal cutlet mushroom s...,4,"[-1, 5, 8, 15, 20, 22, 39, 48, 49, 51, 54]"


In [None]:
df_namedesc.to_csv('/content/drive/MyDrive/IntroML/data/train_dataset_recipewithtopic.csv', index=False)

## Generating topics for the test dataset

In [None]:
df_test = pd.read_csv('/content/drive/MyDrive/IntroML/data/test_dataset_all.csv')
df_test.head(3)

Unnamed: 0,recipe_id,category,name,n_ingredients,minutes,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),log calories,log fat,log sugar,log sodium,log protein,log sat fat,log carbs,user_id,date,review,rating,description
0,45348,pasta,basil walnut pesto,7,12,213.8,32.0,1.0,5.0,11.0,18.0,0.0,5.365041,3.465736,0.0,1.609438,2.397895,2.890372,0.0,706608,2009-10-03,Just like most I tried this recipe because I n...,4,this is my favorite pesto recipe; rich but hea...
1,330059,pasta,bulgur and chickpeas with preserved lemon vina...,17,25,225.2,12.0,6.0,24.0,13.0,5.0,11.0,5.416989,2.484907,1.791759,3.178054,2.564949,1.609438,2.397895,621626,2009-10-03,DH really enjoyed this dish. I did leave out t...,5,this dish is loaded with texture and flavor. w...
2,279553,soup,turkey and brown rice chilli,14,485,202.4,2.0,37.0,22.0,34.0,1.0,10.0,5.310246,0.693147,3.610918,3.091042,3.526361,0.0,2.302585,621626,2009-10-03,I made this on the stove top since it was a la...,5,fantstic crock pot recipe fron betty crocker. ...


In [None]:
# grab the name + description 
df_test_namedesc = df_test[['recipe_id', 'category', 'name', 'description']]

#since it's the name & desc we want, every row should be unique based on the recipe_id. thus, we will drop duplicates (that have arised from reviews)
df_test_namedesc.drop_duplicates(subset=['recipe_id'], inplace=True)
df_test_namedesc.reset_index(drop=True, inplace=True)

#concatenate name & desc into 1 column with space inbtwn
df_test_namedesc['combined'] = df_test_namedesc[['name', 'description']].agg(' '.join, axis=1)

df_test_namedesc.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,recipe_id,category,name,description,combined
0,45348,pasta,basil walnut pesto,this is my favorite pesto recipe; rich but hea...,basil walnut pesto this is my favorite pesto r...
1,330059,pasta,bulgur and chickpeas with preserved lemon vina...,this dish is loaded with texture and flavor. w...,bulgur and chickpeas with preserved lemon vina...


In [None]:
#clean the combined column
df_test_namedesc['combined'] = pd.DataFrame(df_test_namedesc['combined'].apply(clean_text))
df_test_namedesc.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,recipe_id,category,name,description,combined
0,45348,pasta,basil walnut pesto,this is my favorite pesto recipe; rich but hea...,basil walnut pesto favorite pesto rich healthy...
1,330059,pasta,bulgur and chickpeas with preserved lemon vina...,this dish is loaded with texture and flavor. w...,bulgur chickpeas preserved lemon vinaigrette l...


In [None]:
topic_words = {-1: ['sauce', 'casserole', 'chicken', 'potatoes', 'pasta', 'cheese', 'salad', 'cooking', 'meat', 'garlic'], 0: ['cake', 'cookies', 'chocolate', 'dessert', 'pie', 'cookie', 'pudding', 'cheesecake', 'coffee', 'christmas'], 1: ['drink', 'smoothie', 'coffee', 'tea', 'cocktail', 'iced', 'lemonade', 'vodka', 'drinks', 'beverage'], 2: ['pork', 'chops', 'chicken', 'marinade', 'marinating', 'grilled', 'cooking', 'baked', 'grill', 'cook'], 3: ['soup', 'chowder', 'broth', 'vegetable', 'soups', 'corn', 'stew', 'pea', 'chili', 'spicy'], 4: ['salad', 'dressing', 'vinaigrette', 'salads', 'spinach', 'lettuce', 'greens', 'tuna', 'ingredients', 'ranch'], 5: ['bread', 'loaf', 'loaves', 'wheat', 'dough', 'flour', 'yeast', 'sourdough', 'baking', 'breads'], 6: ['potatoes', 'potato', 'baked', 'garlic', 'oven', 'casserole', 'bacon', 'microwave', 'bake', 'rosemary'], 7: ['pasta', 'spaghetti', 'salad', 'garlic', 'tomato', 'tomatoes', 'macaroni', 'cheese', 'basil', 'broccoli'], 8: ['rice', 'beans', 'casserole', 'peas', 'cooked', 'cooking', 'cooker', 'chicken', 'cook', 'dishes'], 9: ['muffins', 'muffin', 'breakfast', 'wheat', 'flour', 'oatmeal', 'chocolate', 'oat', 'blueberries', 'raisins'], 10: ['salmon', 'fish', 'fillets', 'tilapia', 'catfish', 'trout', 'flounder', 'fillet', 'swordfish', 'halibut'], 11: ['shrimp', 'crab', 'prawns', 'lobster', 'seafood', 'crabmeat', 'crabs', 'prawn', 'cajun', 'appetizers'], 12: ['salsa', 'chili', 'tacos', 'taco', 'burritos', 'tortilla', 'burrito', 'spicy', 'tortillas', 'enchiladas'], 13: ['mushrooms', 'mushroom', 'portabella', 'steak', 'appetizer', 'beef', 'portabello', 'portabellas', 'cheese', 'steaks'], 14: ['zucchini', 'squash', 'butternut', 'acorn', 'stuffed', 'casserole', 'baked', 'vegetable', 'tomato', 'cooking'], 15: ['stew', 'crock', 'roast', 'cooker', 'gravy', 'crockpot', 'cooking', 'cook', 'potatoes', 'cooked'], 16: ['steak', 'steaks', 'marinade', 'beef', 'marinated', 'grilled', 'meat', 'mignon', 'grill', 'marinating'], 17: ['corn', 'cornbread', 'creamed', 'skillet', 'butter', 'casserole', 'chili', 'jalapeno', 'fried', 'frozen'], 18: ['beans', 'bean', 'bacon', 'garlic', 'almonds', 'cooking', 'onions', 'tasty', 'flavor', 'cook'], 19: ['macaroni', 'cheese', 'casserole', 'cheddar', 'baked', 'kraft', 'cheeseburger', 'cheeses', 'pasta', 'chili'], 20: ['carrots', 'cabbage', 'carrot', 'vegetables', 'vegetable', 'veggies', 'casserole', 'veggie', 'cook', 'cooking'], 21: ['asparagus', 'sesame', 'prosciutto', 'ham', 'sauce', 'delicious', 'cooking', 'garlic', 'salad', 'tasty'], 22: ['meatballs', 'meatball', 'spaghetti', 'meat', 'balls', 'gravy', 'turkey', 'ball', 'beef', 'meatloaf'], 23: ['tuna', 'casserole', 'melt', 'sandwich', 'sandwiches', 'lunch', 'melts', 'sardines', 'mayo', 'toast'], 24: ['burgers', 'burger', 'hamburgers', 'hamburger', 'patties', 'beef', 'cheeseburgers', 'cheeseburger', 'ketchup', 'stuffed'], 25: ['sandwich', 'pancakes', 'breakfast', 'eggs', 'sandwiches', 'scrambled', 'egg', 'waffles', 'lunch', 'pancake'], 26: ['noodles', 'noodle', 'ramen', 'sesame', 'rice', 'spicy', 'salad', 'cooked', 'pasta', 'vegetables'], 27: ['meatloaf', 'meat', 'loaf', 'sandwiches', 'ketchup', 'beef', 'crumbs', 'breadcrumbs', 'recipes', 'mashed'], 28: ['scones', 'scallops', 'scone', 'scallop', 'breakfast', 'dough', 'cranberry', 'seafood', 'cinnamon', 'baking'], 29: ['biscuits', 'biscuit', 'bisquick', 'breakfast', 'flour', 'butter', 'dough', 'baking', 'shortcake', 'cinnamon'], 30: ['rolls', 'cinnamon', 'dough', 'yeast', 'bread', 'cinnabon', 'flour', 'oven', 'mixer', 'baking'], 31: ['tomatoes', 'tomato', 'bruschetta', 'basil', 'pasta', 'garlic', 'appetizer', 'baked', 'mozzarella', 'herbs'], 32: ['broccoli', 'casserole', 'garlic', 'steamed', 'sesame', 'broccolini', 'cauliflower', 'broccoflower', 'cooked', 'cook'], 33: ['lasagna', 'lasagne', 'noodles', 'vegetarian', 'cheese', 'sausage', 'meat', 'pasta', 'cheeses', 'lasagnas'], 34: ['lamb', 'chops', 'rack', 'meat', 'marinade', 'marinating', 'marinated', 'cooking', 'racks', 'grill'], 35: ['ribs', 'bbq', 'rib', 'barbecue', 'pork', 'barbecued', 'grill', 'beef', 'racks', 'cooking'], 36: ['spinach', 'casserole', 'appetizer', 'cheese', 'garlic', 'berghoff', 'bacon', 'pernod', 'serve', 'bake'], 37: ['ham', 'sandwiches', 'cheese', 'glazed', 'sandwich', 'hams', 'baked', 'sliced', 'slices', 'slice'], 38: ['eggplant', 'aubergine', 'eggplants', 'vegetarian', 'baba', 'garlic', 'casserole', 'aubergines', 'turkish', 'meat'], 39: ['cucumber', 'cucumbers', 'salad', 'cukes', 'onion', 'pickled', 'spicy', 'yogurt', 'tomatoes', 'onions'], 40: ['pizza', 'dough', 'crust', 'pizzas', 'toppings', 'bread', 'crusts', 'flour', 'mozzarella', 'yeast'], 41: ['curry', 'paneer', 'vegetables', 'vegetarian', 'vegetable', 'curries', 'curried', 'thai', 'spices', 'spinach'], 42: ['pie', 'pies', 'shepherds', 'pastry', 'biscuit', 'leftovers', 'veggies', 'stuffing', 'vegetables', 'veggie'], 43: ['oatmeal', 'granola', 'oats', 'breakfast', 'cereal', 'oat', 'porridge', 'snack', 'grains', 'nutritious'], 44: ['onions', 'onion', 'vidalia', 'grilled', 'steak', 'vidalias', 'bbq', 'meats', 'grill', 'dishes'], 45: ['coleslaw', 'cabbage', 'mayo', 'dressing', 'mayonnaise', 'kfc', 'slaws', 'spicy', 'dressings', 'fridge'], 46: ['turkey', 'stuffing', 'thanksgiving', 'gravy', 'roast', 'brining', 'cooking', 'brine', 'turkeys', 'cooked'], 47: ['couscous', 'moroccan', 'cous', 'chickpeas', 'vegetarian', 'vegetables', 'ingredients', 'lemony', 'flavors', 'herbs'], 48: ['stir', 'fry', 'stirfry', 'tofu', 'vegetables', 'szechuan', 'rice', 'broccoli', 'veggies', 'spicy'], 49: ['dip', 'chips', 'dips', 'crackers', 'tortilla', 'hummus', 'appetizer', 'dipper', 'snack', 'dipping'], 50: ['enchiladas', 'enchilada', 'tortillas', 'mexican', 'chicken', 'casserole', 'cream', 'sauce', 'cheese', 'spicy'], 51: ['wings', 'wing', 'chicken', 'spicy', 'sauce', 'heat', 'appetizer', 'glazed', 'grill', 'grilled'], 52: ['risotto', 'rice', 'stirring', 'mushroom', 'barley', 'asparagus', 'mushrooms', 'risottos', 'squash', 'peas'], 53: ['cauliflower', 'cheese', 'cheddar', 'garlic', 'vegetable', 'broccoli', 'gratin', 'spices', 'mustard', 'dijon'], 54: ['sprouts', 'brussels', 'sprout', 'hollandaise', 'mustard', 'walnuts', 'onions', 'chestnuts', 'flavor', 'cabbage'], 55: ['sausage', 'sausages', 'gravy', 'patties', 'biscuits', 'rolls', 'pork', 'turkey', 'meat', 'skillet'], 56: ['fries', 'crispy', 'fry', 'fried', 'frying', 'potato', 'chips', 'potatoes', 'mcdonalds', 'burgers'], 57: ['sloppy', 'joes', 'joe', 'kids', 'bbq', 'favorite', 'chili', 'recipes', 'sandwiches', 'sauce'], 58: ['peas', 'pea', 'sesame', 'onions', 'snaps', 'carrots', 'mushrooms', 'delicious', 'peppers', 'pods'], 59: ['points', 'watchers', 'weight', 'fat', 'calories', 'carb', 'pork', 'chicken', 'cookbook', 'watcher'], 60: ['buns', 'bun', 'dough', 'flour', 'rolls', 'cinnamon', 'wheat', 'bread', 'egg', 'yeast'], 61: ['alfredo', 'fettuccine', 'sauce', 'fettuccini', 'pasta', 'shrimp', 'creamy', 'fettucini', 'calories', 'restaurant'], 62: ['brisket', 'beef', 'meat', 'bbq', 'pot', 'barbecue', 'cooking', 'smoked', 'briskets', 'cooker'], 63: ['quiche', 'crustless', 'crust', 'cheese', 'broccoli', 'lunch', 'bacon', 'chayote', 'quiches', 'breakfast'], 64: ['beets', 'beet', 'beetroot', 'greens', 'baked', 'roast', 'pickled', 'salad', 'sour', 'fermenting'], 65: ['artichokes', 'artichoke', 'hearts', 'garlic', 'appetizer', 'steamed', 'cheese', 'bruschetta', 'marinated', 'served'], 66: ['mussels', 'mussel', 'steamed', 'crusty', 'wine', 'marinires', 'serve', 'clams', 'oysters', 'vermouth'], 67: ['venison', 'deer', 'steaks', 'beef', 'meat', 'steak', 'elk', 'gravy', 'moose', 'hamburger']}

In [None]:
df_test_namedesc['topic_list'] = df_test_namedesc['combined'].apply(add_topic_to_recipe)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df_test_namedesc.head(2)

Unnamed: 0,recipe_id,category,name,description,combined,topic_list
0,45348,pasta,basil walnut pesto,this is my favorite pesto recipe; rich but hea...,basil walnut pesto favorite pesto rich healthy...,"[7, 31, 57]"
1,330059,pasta,bulgur and chickpeas with preserved lemon vina...,this dish is loaded with texture and flavor. w...,bulgur chickpeas preserved lemon vinaigrette l...,"[3, 4, 8, 18, 23, 25, 36, 47, 52, 54, 58, 63, ..."


In [None]:
df_test_namedesc.to_csv('/content/drive/MyDrive/IntroML/data/test_dataset_recipewithtopic.csv', index=False)