In [1]:
import pandas as pd
path_to_allrecipes = 'archive/allrecipes-recipes.json.xz'
path_to_kaggle = 'archive/Food Ingredients and Recipe Dataset with Image Name Mapping.csv'

In [2]:
def combine_datasets(path_to_kaggle, path_to_allrecipes):
    
    def read_file(path):
        recipes_lines = [json.loads(line) for line in open(path, 'r')]
        return pd.DataFrame(recipes_lines)

    df_all_recipies = pd.read_json(path_to_allrecipes,lines=True, compression='xz')
    df_all_recipies = df_all_recipies[df_all_recipies['photo_url'] != "http://images.media-allrecipes.com/global/recipes/nophoto/nopicture-910x511.png"]
    df_all_recipies = df_all_recipies[["ingredients", "instructions", "photo_url", "title"]]
    
    df_kaggle = pd.read_csv(path_to_kaggle)
    df_kaggle['Instructions'] = df_kaggle.apply(lambda x: [x.Instructions], axis=1)
    df_kaggle.rename(columns={"Title":"title", "Cleaned_Ingredients":"ingredients", "Instructions": "instructions", "Image_Name":"photo_url"}, inplace = True)
    df_kaggle = df_kaggle[["ingredients", "instructions", "photo_url", "title"]]
    final_df = pd.concat([df_all_recipies, df_kaggle])
    
    return final_df

In [193]:
data = combine_datasets(path_to_kaggle, path_to_allrecipes)
data.iloc[10]['photo_url']
data.iloc[10]['title']


'Sun Dried Tomato and Asiago Cheese Bread'

In [26]:
import re
def clean_dataset(dataset):
    def remove_values(x):
        prohibitedWords = [' and ', ' with ', ' the ', 'easy', 'best', ' i ', 'ii', 'iii', 'iv']
        big_regex = re.compile('|'.join(map(re.escape, prohibitedWords)))
        l = [big_regex.sub(" ", str(t).lower()) for t in x]
        return l
    data = dataset.assign(recipe_name = lambda x: remove_values(x['title']))
    return data
data = clean_dataset(data)
data

Unnamed: 0,ingredients,instructions,photo_url,title,recipe_name,category
0,"[1/2 cup unsalted butter, chilled and cubed, 1...",[Preheat oven to 400 degrees F (205 degrees C)...,http://images.media-allrecipes.com/userphotos/...,"Basil, Roasted Peppers and Monterey Jack Cornb...","basil, roasted peppers monterey jack cornbread","[roasted, chicken]"
1,"[1/2 cup Parmesan cheese, 3/4 teaspoon ground ...","[Combine parmesan cheese, pepper and garlic po...",http://images.media-allrecipes.com/userphotos/...,Crispy Cheese Twists,crispy cheese twists,"[cheese, baked, macaroni, cheese]"
2,"[2 cups hot water, 1/2 cup margarine, 1/3 cup ...",[Melt margarine in hot water. Add sugar and sa...,http://images.media-allrecipes.com/userphotos/...,Mom's Yeast Rolls,mom's yeast rolls,"[cinnamon, rolls]"
3,"[1 1/2 cups white sugar, 1/2 cup vegetable oil...",[Combine sugar and oil; beat well. Add eggs an...,http://images.media-allrecipes.com/userphotos/...,Sweet Potato Bread I,sweet potato bread i,"[sweet, potato, potato, salad]"
4,"[1/4 cup butter, 1 teaspoon white sugar, 1 cup...",[Stir butter and 1 teaspoon sugar into the hot...,http://images.media-allrecipes.com/userphotos/...,Orange Buns,orange buns,"[orange, chicken]"
...,...,...,...,...,...,...
13496,"['1 cup all-purpose flour', '2/3 cup unsweeten...",[Preheat the oven to 350°F. Into a bowl sift t...,brownie-pudding-cake-14408,Brownie Pudding Cake,brownie pudding cake,[pudding]
13497,"['1 preserved lemon', '1 1/2 pound butternut s...",[Preheat oven to 475°F.\nHalve lemons and scoo...,israeli-couscous-with-roasted-butternut-squash...,Israeli Couscous with Roasted Butternut Squash...,israeli couscous roasted butternut squash pres...,"[spaghetti, squash]"
13498,['Leftover katsuo bushi (dried bonito flakes) ...,"[If using katsuo bushi flakes from package, mo...",rice-with-soy-glazed-bonito-flakes-and-sesame-...,Rice with Soy-Glazed Bonito Flakes and Sesame ...,rice soy-glazed bonito flakes sesame seeds,[rice]
13499,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,[Melt 1 tablespoon butter in a 12-inch heavy s...,spanakopita-107344,Spanakopita,spanakopita,"[spinach, salad]"


In [27]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
de_stop = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')
final_names=[]
recipe_name = data['recipe_name'].tolist()
for rec in recipe_name:
    raw = rec.lower()
    tokens = tokenizer.tokenize(raw)
    stop_t = [recipe_name for recipe_name in tokens if not recipe_name in de_stop]
    stem_t = [i for i in stop_t if len(i)>1]
    if len(stem_t)==0: final_names.append(['error'])
    else: final_names.append(stem_t)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
final_names

[['basil', 'roasted', 'peppers', 'monterey', 'jack', 'cornbread'],
 ['crispy', 'cheese', 'twists'],
 ['mom', 'yeast', 'rolls'],
 ['sweet', 'potato', 'bread'],
 ['orange', 'buns'],
 ['cornish', 'splits'],
 ['jalapeno', 'cheese', 'bread'],
 ['dee', 'health', 'bread'],
 ['oatmeal', 'bread'],
 ['strawberry', 'bread'],
 ['sun', 'dried', 'tomato', 'asiago', 'cheese', 'bread'],
 ['hawa', 'sweet', 'bread'],
 ['corn', 'bread', 'ever', 'eat'],
 ['knead', 'refrigerator', 'rolls'],
 ['sourdough', 'starter'],
 ['sourdough', 'bread'],
 ['dilly', 'bread'],
 ['pumpkin', 'bread'],
 ['sy', 'challah'],
 ['onion', 'bread'],
 ['grandma', 'vandoren', 'white', 'bread'],
 ['italian', 'bread'],
 ['sunday', 'dinner', 'rolls'],
 ['banana', 'wheat', 'bread'],
 ['banana', 'bread'],
 ['candied', 'fruit', 'bread'],
 ['nut', 'fruit', 'bread'],
 ['mother', 'banana', 'bread'],
 ['pumpernickel', 'bread'],
 ['buttermilk', 'bread'],
 ['sun', 'dried', 'tomato', 'focaccia'],
 ['michell', 'jenny', 'challah'],
 ['potato', 'br

In [13]:
from gensim import corpora, models
dictionary = corpora.Dictionary(final_names)
corpus = [dictionary.doc2bow(text) for text in final_names]

In [8]:
import gensim
num_topics = list(range(50,150,10))
num_keywords = 15

LDA_models = {}
LDA_topics = {}
for i in num_topics:
    print(i)
    LDA_models[i] = gensim.models.ldamodel.LdaModel(corpus, num_topics=i,passes=1, chunksize=len(corpus), random_state=0, id2word=dictionary)
    print(1)
    shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                             num_words=num_keywords,
                                             formatted=False)
    print(2)
    LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]
    
    print('done')

50


KeyboardInterrupt: 

In [None]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = (A ∩ B)/(A ∪ B)
    - Goal is low Jaccard scores for coverage of the diverse elements
    """
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))
                    
    return float(len(intersection))/float(len(union))

In [None]:
import numpy as np
LDA_stability = {}
for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            sims.append(jaccard_similarity(topic1, topic2))    
        
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

In [None]:
from gensim.models import CoherenceModel
coherences = [CoherenceModel(model=LDA_models[i], texts=final_names, dictionary=dictionary, coherence='c_v').get_coherence()\
              for i in num_topics[:-1]]

In [None]:
coherences
mean_stabilities

In [None]:
coh_sta_diffs = [coherences[i] - mean_stabilities[i] for i in range(len(num_topics[:-1]))] # limit topic numbers to the number of keywords
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))
ax = sns.lineplot(x=num_topics[:-1], y=mean_stabilities, label='Average Topic Overlap')
ax = sns.lineplot(x=num_topics[:-1], y=coherences, label='Topic Coherence')

ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

y_max = max(max(mean_stabilities), max(coherences)) + (0.10 * max(max(mean_stabilities), max(coherences)))
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1]-1])
                
ax.axes.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
plt.legend(fontsize=20)
plt.show()   

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=100, passes=1, chunksize=len(corpus), random_state=0, id2word=dictionary)
ldamodel.print_topics(num_topics=100,num_words = 15)

In [29]:
final_names[:20]

[['basil', 'roasted', 'peppers', 'monterey', 'jack', 'cornbread'],
 ['crispy', 'cheese', 'twists'],
 ['mom', 'yeast', 'rolls'],
 ['sweet', 'potato', 'bread'],
 ['orange', 'buns'],
 ['cornish', 'splits'],
 ['jalapeno', 'cheese', 'bread'],
 ['dee', 'health', 'bread'],
 ['oatmeal', 'bread'],
 ['strawberry', 'bread'],
 ['sun', 'dried', 'tomato', 'asiago', 'cheese', 'bread'],
 ['hawa', 'sweet', 'bread'],
 ['corn', 'bread', 'ever', 'eat'],
 ['knead', 'refrigerator', 'rolls'],
 ['sourdough', 'starter'],
 ['sourdough', 'bread'],
 ['dilly', 'bread'],
 ['pumpkin', 'bread'],
 ['sy', 'challah'],
 ['onion', 'bread']]

In [30]:
data_samples = [' '.join(ti) for ti in final_names]
print(len(data_samples))

85986


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.99, max_features=None)
tfidf = tfidf_vectorizer.fit_transform(data_samples)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()



In [32]:
import operator
def rank_terms(A, terms):
    sums = A.sum(axis=0)
    weights = {}
    for col, term in enumerate(terms):
        weights[term] = sums[0,col]
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

In [33]:
ranking = rank_terms(tfidf, tfidf_feature_names)
topic_n = []
for i, pair in enumerate(ranking[0:50]):
    topic_n.append(pair[0])
    print( "%02d. %s (%.2f)" % (i+1, pair[0], pair[1]))

01. chicken (2272.10)
02. salad (1798.09)
03. chocolate (1270.53)
04. cake (1231.05)
05. pie (1133.84)
06. soup (1127.89)
07. cheese (1058.56)
08. sauce (1028.70)
09. cookies (1024.20)
10. bread (901.91)
11. cream (845.05)
12. potato (799.06)
13. rice (780.32)
14. apple (779.09)
15. pork (763.69)
16. butter (727.13)
17. grilled (715.79)
18. casserole (710.80)
19. sweet (710.15)
20. beef (676.21)
21. lemon (671.09)
22. pasta (653.07)
23. baked (617.94)
24. roasted (606.30)
25. pumpkin (596.65)
26. tomato (587.82)
27. sausage (584.39)
28. turkey (570.96)
29. dip (542.90)
30. shrimp (541.06)
31. bean (536.18)
32. orange (532.04)
33. creamy (529.37)
34. corn (527.61)
35. potatoes (520.92)
36. peanut (518.40)
37. garlic (517.41)
38. spicy (511.37)
39. pizza (510.44)
40. green (499.18)
41. spinach (493.83)
42. banana (492.77)
43. bacon (479.35)
44. muffins (468.81)
45. stuffed (467.14)
46. cranberry (465.23)
47. chili (454.18)
48. style (446.85)
49. coconut (446.55)
50. bars (444.66)


In [34]:
topic_n

['chicken',
 'salad',
 'chocolate',
 'cake',
 'pie',
 'soup',
 'cheese',
 'sauce',
 'cookies',
 'bread',
 'cream',
 'potato',
 'rice',
 'apple',
 'pork',
 'butter',
 'grilled',
 'casserole',
 'sweet',
 'beef',
 'lemon',
 'pasta',
 'baked',
 'roasted',
 'pumpkin',
 'tomato',
 'sausage',
 'turkey',
 'dip',
 'shrimp',
 'bean',
 'orange',
 'creamy',
 'corn',
 'potatoes',
 'peanut',
 'garlic',
 'spicy',
 'pizza',
 'green',
 'spinach',
 'banana',
 'bacon',
 'muffins',
 'stuffed',
 'cranberry',
 'chili',
 'style',
 'coconut',
 'bars']

In [176]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=50, random_state=1, verbose=1, max_iter=200).fit(tfidf)



violation: 1.0
violation: 0.2155689762227214
violation: 0.10908997888529301
violation: 0.06910238414897138
violation: 0.044263619457772636
violation: 0.0292798200530504
violation: 0.022590922739200894
violation: 0.018530313455421582
violation: 0.0161409092968958
violation: 0.013599791828444374
violation: 0.01131850640482199
violation: 0.009432006069528652
violation: 0.008282131100097295
violation: 0.007611003315890785
violation: 0.006994055013590916
violation: 0.006297264222245237
violation: 0.005614696384997175
violation: 0.005102165375110644
violation: 0.004581471538779771
violation: 0.004116063422080472
violation: 0.003675494420306746
violation: 0.003237463009717813
violation: 0.0028843250952322653
violation: 0.0026637584020613235
violation: 0.0024798466809158753
violation: 0.0023415959836157532
violation: 0.002246965419063107
violation: 0.002185134362516504
violation: 0.002113775724112471
violation: 0.002056660984216301
violation: 0.0020135514450478195
violation: 0.0019854123278774

In [177]:
def print_top_words(model, feature_names, n_top_words):
    manual_topic_cat = []
    for topic_idx, topic in enumerate(model.components_[:50]): # just show first 10 topics
        print("Topic #%d:" % topic_idx)
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            manual_topic_cat.append(feature_names[i])
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return manual_topic_cat

In [178]:
topic_cat = print_top_words(nmf, tfidf_feature_names, 15)

Topic #0:
chicken fried curry wings buffalo parmesan breasts enchiladas thai skillet pot spicy honey style bbq
Topic #1:
chocolate cookies chip oatmeal sugar white double mousse almond cookie mint dark hot pudding chewy
Topic #2:
salad dressing cucumber fruit avocado tuna summer vinaigrette caesar pea greek carrot egg macaroni quinoa
Topic #3:
soup vegetable lentil noodle tortilla squash carrot onion butternut bean barley mushroom hearty french leek
Topic #4:
pie pecan pot crust shepherd cherry peach custard rhubarb blueberry chocolate meringue lime berry key
Topic #5:
cake coffee pound chocolate carrot upside bundt layer spice cherry food pudding angel pineapple sheet
Topic #6:
cheese blue mac macaroni goat three ball spread johnsonville frosting cottage feta skillet beer cream
Topic #7:
ham pineapple honey glazed salsa mustard sandwiches glaze mango sugar swiss upside brown egg loaf
Topic #8:
bread pudding wheat nut raisin quick whole monkey machine herb cinnamon soda french irish ho

In [179]:
import numpy as np
nmf_emb = nmf.transform(tfidf)
top_idx = np.argsort(nmf_emb,axis=0)[-1:]
show_ten = 301
count_idxs = 0
final_topics = []
for idxs in top_idx.T:
    if count_idxs == show_ten: break
    for idx in idxs:
        final_topics.append(final_names[idx])
    count_idxs += 1

violation: 1.0
violation: 0.05401436983592342
violation: 0.0007153169321544086
violation: 1.1625615988039283e-05
Converged at iteration 5


In [180]:
final_topics

[['chicken'],
 ['chocolate', 'chocolate', 'chip', 'cookies'],
 ['salad'],
 ['chicken', 'soup'],
 ['pie', 'strawberry', 'pie'],
 ['cake'],
 ['cheese', 'baked', 'macaroni', 'cheese'],
 ['ham', 'pineapple'],
 ['bread'],
 ['sweet', 'spicy', 'sweet', 'potatoes'],
 ['peanut', 'butter'],
 ['pork', 'chops'],
 ['casserole'],
 ['sweet', 'cream', 'ice', 'cream'],
 ['ham', 'sauce', 'mustard', 'sauce'],
 ['rice'],
 ['apple', 'salad'],
 ['italian', 'style', 'sausage'],
 ['slow', 'cooker', 'chicken'],
 ['potatoes'],
 ['green', 'bean', 'salad'],
 ['zucchini', 'salad'],
 ['lemon', 'lemon', 'loaf'],
 ['pumpkin', 'cake'],
 ['chicken', 'pasta'],
 ['baked', 'chicken'],
 ['beef', 'stew'],
 ['dip'],
 ['muffins'],
 ['turkey'],
 ['corn', 'salad'],
 ['tomato', 'chicken'],
 ['cranberry', 'orange', 'cookies'],
 ['sweet', 'potato', 'potato', 'salad'],
 ['grilled', 'chicken'],
 ['creamy', 'chicken'],
 ['chicken', 'shrimp'],
 ['bars'],
 ['spinach', 'salad'],
 ['roasted', 'chicken'],
 ['stuffed', 'chicken'],
 ['garli

In [181]:
def get_top_recipes_for_category(recipes_all_list, w_vector, recipe_index, top):
    top_indic = np.argsort(w_vector[:,recipe_index])[::-1] # umdrehen
    top_result = [] # return result as list
    for rec_index in top_indic[0:top]:
        top_result.append(recipes_all_list[rec_index])
    return top_result

In [182]:
recipes_for_category = get_top_recipes_for_category(recipe_name, nmf_emb, 1, 10)
recipes_for_category

['chocolate chocolate chip cookies i',
 'chocolate chocolate chip cookies  ',
 'chocolate chocolate chip cookies  i',
 'chocolate cookies',
 'very chocolate cookies',
 'chocolate cookies',
 '  chocolate cookies',
 'chocolate chip cookies  i',
 'chocolate chip cookies i',
 'chocolate chip cookies v ']

In [183]:
def get_top_cat_for_recipe(recipes_all_list, w_vector):
    final_recipes_cat_name = []
    recipe_cat_link = []
    for r in range(len(recipes_all_list)):
        one_hot_matrix = w_vector[r,:]
        all_zeros = not one_hot_matrix.any() # no fitting category
        if all_zeros: 
            recipe_cat_link.append(-1) # no fitting category
        else:
            top_indic = np.argsort(one_hot_matrix)[::-1][0] # 1. umdrehen 2. return erste
            recipe_cat_link.append(top_indic)
    for num in recipe_cat_link:
        if num == -1: 
            final_recipes_cat_name.append('no_cat')
        else: 
            final_recipes_cat_name.append(final_topics[num])
    return final_recipes_cat_name

In [184]:
categories_for_recipes = get_top_cat_for_recipe(recipe_name, nmf_emb)

In [185]:
count_no_cat = 0
for recipe_s in categories_for_recipes:
    if recipe_s == 'no_cat': count_no_cat+=1
print('Für {} Rezepte wurde keine Kategorie festgestellt'.format(count_no_cat))

Für 514 Rezepte wurde keine Kategorie festgestellt


In [186]:
categories_for_recipes

[['roasted', 'chicken'],
 ['cheese', 'baked', 'macaroni', 'cheese'],
 ['bread'],
 ['bread'],
 ['cranberry', 'orange', 'cookies'],
 ['banana', 'banana', 'bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['stuffed', 'chicken'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['pumpkin', 'cake'],
 ['bread'],
 ['banana', 'banana', 'bread'],
 ['bread'],
 ['bread'],
 ['banana', 'banana', 'bread'],
 ['bread'],
 ['bread'],
 ['tomato', 'chicken'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['muffins'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['muffins'],
 ['bread'],
 ['bread'],
 ['chocolate', 'chocolate', 'chip', 'cookies'],
 ['banana', 'banana', 'bread'],
 ['banana', 'banana', 'bread'],
 ['banana', 'banana', 'bread'],
 ['bacon'],
 ['bread'],
 ['garlic', 'steak', 'garlic'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['bread'],
 ['stra

In [201]:
data['category'] = categories_for_recipes

In [202]:
pd.set_option('display.max_rows', 200)
data[:200]

Unnamed: 0,ingredients,instructions,photo_url,title,category
0,"[1/2 cup unsalted butter, chilled and cubed, 1...",[Preheat oven to 400 degrees F (205 degrees C)...,http://images.media-allrecipes.com/userphotos/...,"Basil, Roasted Peppers and Monterey Jack Cornb...","[roasted, chicken]"
1,"[1/2 cup Parmesan cheese, 3/4 teaspoon ground ...","[Combine parmesan cheese, pepper and garlic po...",http://images.media-allrecipes.com/userphotos/...,Crispy Cheese Twists,"[cheese, baked, macaroni, cheese]"
2,"[2 cups hot water, 1/2 cup margarine, 1/3 cup ...",[Melt margarine in hot water. Add sugar and sa...,http://images.media-allrecipes.com/userphotos/...,Mom's Yeast Rolls,[bread]
3,"[1 1/2 cups white sugar, 1/2 cup vegetable oil...",[Combine sugar and oil; beat well. Add eggs an...,http://images.media-allrecipes.com/userphotos/...,Sweet Potato Bread I,[bread]
4,"[1/4 cup butter, 1 teaspoon white sugar, 1 cup...",[Stir butter and 1 teaspoon sugar into the hot...,http://images.media-allrecipes.com/userphotos/...,Orange Buns,"[cranberry, orange, cookies]"
5,"[1 teaspoon active dry yeast, 1 1/4 cups lukew...","[In a small bowl, dissolve the yeast in the mi...",http://images.media-allrecipes.com/userphotos/...,Cornish Splits,"[banana, banana, bread]"
6,"[8 cups all-purpose flour, 4 cups shredded Che...","[In a very large bowl, combine 7 cups of flour...",http://images.media-allrecipes.com/userphotos/...,Jalapeno Cheese Bread,[bread]
7,"[2 tablespoons active dry yeast, 1 teaspoon wh...","[In a small bowl, dissolve the yeast and sugar...",http://images.media-allrecipes.com/userphotos/...,Dee's Health Bread,[bread]
8,"[1 cup rolled oats, 1/2 cup molasses, 1/3 cup ...","[Combine oats, molasses, oil, salt and boiling...",http://images.media-allrecipes.com/userphotos/...,Oatmeal Bread I,[bread]
9,"[3 cups all-purpose flour, 2 cups white sugar,...",[Grease and flour a 9 x 5 inch pan well. Prehe...,http://images.media-allrecipes.com/userphotos/...,Strawberry Bread I,[bread]


In [203]:
import os
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
def movePic(src, dest):
    os.rename(src, dest)
    
categories_for_recipes
directories_names = []

for directt in categories_for_recipes:
    d_name = '_'.join(directt)
    directories_names.append(d_name)

for curr_dir_category in directories_names:
    createFolder('./input/images/images/' + curr_dir_category)

In [215]:
data['directories_names'] = directories_names
image_category = dict(zip(data.photo_url, data.directories_names))

In [229]:
from collections import defaultdict
image_category = defaultdict(list)
print(type(data))
for index, row in data.iterrows():
    #print(row['photo_url'], row['directories_names'])
    image_category[row.directories_names].append(row.photo_url)

<class 'pandas.core.frame.DataFrame'>


In [None]:
for directory in directories_names:
    print(directory+':')
    for image in image_category[directory]:
        if "http://images.media-allrecipes.com/" in image:
            image_dir = image.split(".com")[1]
            image_name = image_dir.split('/')[-1]
            try:
                movePic(r"C:/Users/Shru/Desktop/Georgia Tech Master/Summer 2022/project/archive"+image_dir, r"C:/Users/Shru/Desktop/Georgia Tech Master/Summer 2022/project/archive/input/images/images/"+directory+'/'+image_name)
            except:
                print(image_dir)
        else:
            try:
                movePic(r"C:/Users/Shru/Desktop/Georgia Tech Master/Summer 2022/project/archive/dataset/"+image+'.jpg', r"C:/Users/Shru/Desktop/Georgia Tech Master/Summer 2022/project/archive/input/images/images/"+directory+'/'+image+'.jpg')
            except:
                print(image)
            #print(image_dir.split('/')[-1])