In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

recipes = pd.read_csv('PP_recipes.csv')

with open('ingr_map.pkl', 'rb') as f:
    ingr_map = pickle.load(f)

In [2]:
# hacky way to convert to dataframe and also save ingredients list
ingr_map['replaced'].to_csv('replaced_ingredients.csv')
ingredients = pd.read_csv('replaced_ingredients.csv')

In [5]:
# class to reduce amount of types of ingredients in map

# NOTE: the reduce() method seems to be bugged out right now. I am only using this to remove duplicate ingredients

class Reducer():

    # constructor
    # recommended to use this on ingr_map['replaced']
    def __init__(self, map: pd.Series):
        self.unreduced_map = map # original ingredients map
        self.index_mapping = []     # mapping from old ingredient IDs to new ingredient IDs


    # methods for reducing size of ingredients list

    # modifies ingredients map so there are no duplicate ingredient names
    def ingr_reduce(self):
        # find all ingredients with identical names
        names = {}
        for idx, ingr in enumerate(self.unreduced_map):
            if ingr not in names:
                names[ingr] = [idx]
            else:
                names[ingr].append(idx)
        # update map with list of unique names
        self.map = pd.Series(data=names.keys(), index=range(len(names.keys())))
        # save mapping from old IDs to new IDs
        self.index_mapping = [0] * self.unreduced_map.size
        for new_idx, name in enumerate(names):
            for old_idx in names[name]:
                self.index_mapping[old_idx] = new_idx

    def ingr_reduce_aggressive(self):
        self.ingr_reduce(ingr_map)
        # TODO: Use more heuristics to reduce ingredient count

    # methods for modifying datasets

    # uses self.index_mapping to swap the original ingredient IDs with the new ones
    # assumes format consistent with original version of PP_recipes.csv
    def reduce(self, df: pd.DataFrame):
        # change ingredient IDs
        for idx, id_list in enumerate(df['ingredient_ids']):
            print(id_list)
            if type(id_list) == type(int()):
                df.loc[idx, 'ingredient_ids'] = self.index_mapping[id_list]
            else:
                for id in id_list:
                    df.loc[idx, 'ingredient_ids'] = self.index_mapping[id]
        return df

In [6]:
# remove duplicate ingredients
r = Reducer(ingr_map['replaced'])
r.ingr_reduce()
r.map.to_csv('replaced_ingredients.csv')

In [7]:
import csv

# Get most common words in ingredients list
ingredient_words = {}
ingredients = []
with open('replaced_ingredients.csv') as csvfile:
    data = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in data:
        # sanitize inputs
        s = row[1].replace('"', '')
        s = s.replace("'", "")
        s = s.replace('-', ' ')
        s = s.replace('flmy', 'flour')  # idk how they managed to mess this up, but they did
        ingredients.append(s)
        tokens = s.split(' ')
        for tok in tokens:
            if len(tok) > 1 and tok in ingredient_words:
                ingredient_words[tok] += 1
            elif len(tok) > 1:
                ingredient_words[tok] = 1

In [8]:
ingr_words_sorted = []
for word in ingredient_words:
    ingr_words_sorted.append((word, ingredient_words[word]))
ingr_words_sorted.sort(key=lambda x:x[1], reverse=True)

In [9]:
# list of words to remove
bad_words = ['fresh', 'frozen', 'and', 'whole', 'leaf', 'of', 'in', 'hot', 'cooked', 'canned', 'light', 'de']

In [175]:
# remove the "bad words" and anything else that was filered out by the cleaning above from the ingredients list
new_ingredients = []
for ingredient in ingredients:
    for b in bad_words:
        tokens = ingredient.split(' ')
        if b in tokens:
            tokens.remove(b)
            # rebuild string
            ingredient = ''
            for tok in tokens:
                ingredient += (tok + ' ')
            ingredient = ingredient[:-1]
    new_ingredients.append(ingredient)
ingredients = new_ingredients

In [176]:
len(ingredients)

8024

In [12]:
# create new set of ingredient words and their frequencies
ingredient_words = {}
for ingredient in ingredients:
    tokens = ingredient.split(' ')
    for tok in tokens:
        if len(tok) > 1 and tok in ingredient_words:
            ingredient_words[tok] += 1
        elif len(tok) > 1:
            ingredient_words[tok] = 1

In [13]:
# sort the new (reduced) list of ingredient words
ingr_words_sorted = []
for word in ingredient_words:
    ingr_words_sorted.append((word, ingredient_words[word]))
ingr_words_sorted.sort(key=lambda x:x[1], reverse=True)

In [14]:
# build list of which words are used by which ingredients
ingr_words_ingrs = {}
for word in ingredient_words:
    ingr_words_ingrs[word] = []
for idx, ingredient in enumerate(ingredients):
    words = ingredient.split(' ')
    for word in words:
        if word in ingredient_words:
            ingr_words_ingrs[word].append(idx)

In [15]:
# build the ingredient shared-word graph
weights = np.zeros(shape=(len(ingredients), len(ingredients)), dtype=np.int8)
count = 0
for word in ingr_words_ingrs:
    for idx, ingredient in enumerate(ingr_words_ingrs[word]):
        if idx + 1 < len(ingr_words_ingrs[word]):
            for other in ingr_words_ingrs[word][idx+1:]:
                weights[ingredient][other] += 1
                weights[other][ingredient] += 1
                count += 1

In [16]:
# for generating the list of clusters (takes about 2 minutes to run on my laptop)
def get_clusters(graph: np.ndarray, threshold: int):
    clusters = []
    nodes_remaining = set(range(len(graph)))
    while len(nodes_remaining) > 0:
        # remove arbitrary node
        node = nodes_remaining.pop()
        queue = [node]
        clusters.append(set())
        # BFS the cluster
        while len(queue) > 0:
            node = queue.pop(0)
            clusters[len(clusters) - 1].add(node)
            for idx, n in enumerate(graph[node]):
                if n > threshold and idx in nodes_remaining: # found a neighbor
                    queue.append(idx)
                    nodes_remaining.remove(idx)
    return clusters

In [17]:
weights_copy = weights.copy()
clusters = get_clusters(weights_copy, 1)

In [23]:
cluster_lengths = [len(cluster) for cluster in clusters]
cluster_lengths

[1,
 1,
 3657,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 9,
 1,
 1,
 1,
 1,
 9,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 19,
 1,
 1,
 2,
 3,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 9,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 8,
 1,
 2,
 1,
 1,
 1,
 2,
 3,
 5,
 1,
 1,
 2,
 1,
 4,
 19,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 4,
 3,
 2,
 4,
 1,
 1,
 9,
 1,
 5,
 1,
 4,
 1,
 1,
 1,
 2,
 1,
 2,
 11,
 7,
 3,
 1,
 3,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 3,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 1,
 1,
 1,
 3,
 1,
 6,
 3,
 5,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 3,
 1,
 1,
 3,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 1,
 3,
 2,
 7,
 1,
 1,
 4,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 3,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 3,
 1,
 1,
 3,
 1,
 1,
 3,
 7,
 1,
 1,
 3,
 1,
 1,
 1,
 2,
 8,
 2,
 1,
 1,
 1,
 2

In [24]:
# second largest cluster size (19 seems like it could be reasonable)
max(cluster_lengths[3:])

19

In [25]:
for idx, cluster in enumerate(clusters):
    if len(cluster) == 19:
        print(idx)
        for ingredient in cluster:
            print(ingredients[ingredient])

25
angel hair pastum
bow tie pastum
wheat pastum
dry penne pastum
dry pastum
spiral shaped pastum
angel hair pastum
tri colored pastum
tri colored fusilli
wheat spiral pastum
penne pastum
wheat bow tie pastum
dry bow tie pastum
star shaped pastum
spiral shaped pastum
tri color spiral pastum
bow tie pastum
wheat angel hair pastum
tri colored pastum
81
shell macaroni
dry pasta shell macaroni
pasta shell
elbow macaroni
elbow macaroni
wheat elbow macaroni
small wheat pasta shell
small macaroni noodle
small shell pastum
jumbo pasta shell
medium pasta shell
small shell pastum
shell pastum
small macaroni pastum
wheat pasta shell
macaroni noodle
small elbow macaroni
wheat macaroni
pasta shell


In [38]:
'''
Commented out because it is like 100 times slower than the optimized version, so I will not be updating it with bug fixes
# this is an improved version that breaks up very large clusters
# for generating the list of clusters (takes about 2 minutes to run on my laptop per recursion)
# params:
# graph: ndarray that represents shared words between ingredients. will be modifed by this function
# min_threshold: weights of this value and below will be discarded 
# max_cluster_size: clusters above this size will be broken into more clusters
# ignore: nodes to ignore
def better_get_clusters(graph: np.ndarray, min_threshold: int, max_cluster_size: int, ignore=set()):
    clusters = []
    nodes_remaining = set(range(len(graph)))
    all_nodes = nodes_remaining
    for node in ignore:
        if node in nodes_remaining:
            nodes_remaining.remove(node)
    while len(nodes_remaining) > 0:
        # remove arbitrary node
        node = nodes_remaining.pop()
        queue = [node]
        clusters.append(set())
        # BFS the cluster
        while len(queue) > 0:
            node = queue.pop(0)
            clusters[len(clusters) - 1].add(node)
            for idx, n in enumerate(graph[node]):
                if n > min_threshold and idx in nodes_remaining: # found a neighbor
                    queue.append(idx)
                    nodes_remaining.remove(idx)
        # split up large clusters
        if len(clusters[-1]) > max_cluster_size:
            # create graph of cluster
            cluster_graph = graph.copy()
            for idx in range(len(graph)):
                # remove ingredients that are not in cluster
                if idx not in clusters[-1]:
                    cluster_graph[idx] = np.zeros(shape=(len(graph)), dtype=np.int8)
                    cluster_graph[:][idx] = np.zeros(shape=(len(graph)), dtype=np.int8)
            # recursively call this with a higher threshold
            not_in_cluster = all_nodes - set(clusters[-1])
            split_cluster = better_get_clusters(cluster_graph, min_threshold+1, max_cluster_size, ignore=not_in_cluster)
            # replace the large cluster with many smaller clusters
            clusters.pop()
            clusters.extend(split_cluster)
    return clusters
'''

In [55]:
# NOTE: This version should be functionally the same, but optimized for performance (runs in >2 seconds on my laptop)
# this is an improved version that breaks up very large clusters
# params:
# graph: ndarray that represents shared words between ingredients. will be modifed by this function
# min_threshold: weights of this value and below will be discarded 
# max_cluster_size: clusters above this size will be broken into more clusters
# ignore: nodes to ignore
def fast_better_get_clusters(graph: np.ndarray, min_threshold: int, max_cluster_size: int, ignore=set()):
    clusters = []
    nodes_remaining = set(range(len(graph)))
    all_nodes = nodes_remaining.copy()
    print("ignoring " + str(len(ignore)) + " nodes")
    for node in ignore:
        if node in nodes_remaining:
            nodes_remaining.remove(node)
    while len(nodes_remaining) > 0:
        # remove arbitrary node
        node = nodes_remaining.pop()
        queue = [node]
        clusters.append(set())
        # BFS the cluster
        while len(queue) > 0:
            node = queue.pop(0)
            clusters[-1].add(node)
            indices = np.argwhere(graph[node] > min_threshold)
            for idx in indices:
                if int(idx) in nodes_remaining:# found a neighbor
                    queue.append(int(idx))
                    nodes_remaining.remove(int(idx))
        # split up large clusters
        if len(clusters[-1]) > max_cluster_size:
            # create graph of cluster
            cluster_graph = graph.copy()
            for idx in range(len(graph)):
                # remove ingredients that are not in cluster
                if idx not in clusters[-1]:
                    cluster_graph[idx] = np.zeros(shape=(len(graph)), dtype=np.int8)
                    cluster_graph[:][idx] = np.zeros(shape=(len(graph)), dtype=np.int8)
            # recursively call this with a higher threshold
            print("spliting " + str(len(clusters[-1])) + " nodes")
            not_in_cluster = all_nodes - set(clusters[-1])
            split_cluster = fast_better_get_clusters(cluster_graph, min_threshold+1, max_cluster_size, ignore=not_in_cluster)
            # replace the large cluster with many smaller clusters
            clusters.pop()
            clusters.extend(split_cluster)
    return clusters

In [69]:
weights_copy = weights.copy()
clusters = fast_better_get_clusters(weights_copy, 1, 40) # experimentally determined parameters

ignoring 0 nodes
spliting 3657 nodes
ignoring 4367 nodes
spliting 211 nodes
ignoring 7813 nodes
spliting 92 nodes
ignoring 7932 nodes


In [70]:
len(clusters)

6903

In [71]:
cluster_lengths = [len(cluster) for cluster in clusters]
max(cluster_lengths)

31

In [72]:
# the largest cluster
for idx, cluster in enumerate(clusters):
    if len(cluster) == 31:
        print(idx)
        for ingredient in cluster:
            print(ingredients[ingredient])

2
instant chocolate pudding mix
french vanilla pudding pie filling mix
chocolate fudge instant pudding mix
white chocolate pudding mix
banana cream pudding mix
instant banana cream pudding mix
chocolate instant pudding pie mix
jell o chocolate pudding pie filling
cheesecake instant pudding pie filling mix
chocolate instant pudding pie filling mix
vanilla instant pudding pie filling mix
instant white chocolate pudding pie filling mix
french vanilla pudding mix
instant chocolate fudge pudding mix
chocolate instant pudding mix
instant white chocolate pudding mix
vanilla instant pudding mix
instant coconut cream pudding mix
instant cheesecake pudding mix
lemon instant pudding pie filling
instant vanilla pudding pie filling
banana cream pudding pie filling mix
cheesecake instant pudding pie filling
jello instant vanilla pudding mix
instant coconut pudding mix
coconut cream pudding mix
instant banana pudding mix
chocolate fudge pudding mix
banana cream instant pudding
instant vanilla pudding

In [73]:
# another large cluster
for idx, cluster in enumerate(clusters):
    if len(cluster) == 29:
        print(idx)
        for ingredient in cluster:
            print(ingredients[ingredient])

351
dried mild red chili pepper
green chili pepper
dried red pepper flake
dried red pepper
ground red chili pepper
dried chili pepper flake
dried chili pepper
mild chili pepper
dried red chili pepper
crushed red pepper flake
ground red pepper
green chili pepper
dry red pepper
red chili pepper flake
red chili pepper flake
dried red chili pepper
green bell pepper
red chili pepper
ground ancho chili pepper
thai red chili pepper
dried red chili
red chili pepper
green bell pepper flake
green pepper flake
green chili pepper flake
red pepper flake
red pepper flake
chili pepper flake
dry crushed red pepper


In [100]:
# Now, let's remove ingredient clusters that do not appear frequently in the training data (borrowed heavily from Sammy)
recipes = pd.read_csv('PP_recipes.csv')
recipes.drop(columns=["calorie_level", "name_tokens", "i", "id"], inplace=True)
recipes.head()


Unnamed: 0,ingredient_tokens,steps_tokens,techniques,ingredient_ids
0,"[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[389, 7655, 6270, 1527, 3406]"
1,"[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,"[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,"[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,"[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[3484, 6324, 7594, 243]"


In [128]:
# convert to numpy and to values instead of strings
ingredient_np = recipes["ingredient_ids"].to_numpy()
for x in range(len(ingredient_np)):
    ingredient_np[x] = np.array(eval(ingredient_np[x]))

In [110]:
# create mapping from ingredient ids to cluster ids
ingredient_to_cluster = np.ones(shape=len(ingredients), dtype=np.int16)
ingredient_to_cluster = np.multiply(ingredient_to_cluster, -1)
for idx, cluster in enumerate(clusters):
    for ingredient in cluster:
        assert(ingredient_to_cluster[ingredient] == -1)
        ingredient_to_cluster[ingredient] = idx

In [150]:
# sanity check that mapping is complete
np.argwhere(ingredient_to_cluster == -1)

array([], shape=(0, 1), dtype=int64)

In [125]:
# replace all ingredient ids in dataset with cluster ids
ingredient_np_copy = ingredient_np.copy()
for row in ingredient_np_copy:
    np.copyto(row, np.array([ingredient_to_cluster[n] for n in row])) 

In [134]:
# oops, I need to restore this if I want to use it again
recipes = pd.read_csv('PP_recipes.csv')
recipes.drop(columns=["calorie_level", "name_tokens", "i", "id"], inplace=True)
recipes.head()
ingredient_np = recipes["ingredient_ids"].to_numpy()
for x in range(len(ingredient_np)):
    ingredient_np[x] = np.array(eval(ingredient_np[x]))
# rename this array
cluster_np = ingredient_np_copy

In [165]:
# Remove clusters that are not used frequently

min_frequency = 100
frequencies = np.zeros(shape=len(clusters), dtype=np.int16)
for row in cluster_np:
    np.add.at(frequencies, row, 1) # increment all clusters used in this ingredient
frequencies

array([   1,   13, 2231, ..., 3661,    2,    0], dtype=int16)

In [167]:
cluster_np

array([array([ 411, 6535, 2908, 3383, 3821]),
       array([ 395, 4745, 3253, 4951,  668, 3642, 5723, 3055,  524, 1979, 3858,
       2908]),
       array([ 860, 6535, 2908,  519, 3347,  795, 3670, 5772, 6825, 5151, 6152,
       6585, 6476,  219,  395]),
       ...,
       array([3611, 6535, 1963, 3447, 2553, 4963, 4169,  395, 3642, 5540,  668,
       5784,  219, 4883]),
       array([5138,  273, 5022, 3820, 6858,  782,  860, 6683, 5772, 3253,  914,
       2119]),
       array([5615, 3253, 4642,  395,  383, 1029,  524, 1979, 3642])],
      dtype=object)

In [146]:
# there are 645 clusters that appear in at least 300 recipes
np.shape( np.argwhere(frequencies >= min_frequency) )

(1282, 1)

In [163]:
# create list of common clusters
common_clusters = np.argwhere(frequencies >= min_frequency)

# create deep copy of cluster_np without recipes that contain uncommon clusters
common_recipes = []
for recipe in cluster_np:
    # only copy recipes that only contain common clusters
    #test = np.argwhere(recipe not in common_clusters)
    test = [cluster in common_clusters for cluster in recipe]
    if False not in test:
        common_recipes.append(recipe)
        break
common_recipes = np.array(common_recipes, dtype=object)
common_recipes.size

8

In [160]:
np.shape(common_recipes)

(178265,)

In [153]:
(np.argwhere(ingredient_to_cluster == -1)).size

0

In [154]:
common_clusters

array([[   2],
       [   3],
       [   5],
       ...,
       [6891],
       [6898],
       [6900]], dtype=int64)

In [141]:
np.shape(frequencies)

(6903,)

In [135]:
ingredient_np

array([array([ 389, 7655, 6270, 1527, 3406]),
       array([2683, 4969,  800, 5298,  840, 2499, 6632, 7022, 1511, 3248, 4964,
       6270]),
       array([1257, 7655, 6270,  590, 5024, 1119, 4883, 6696, 7946, 5648, 7239,
       7705, 7594, 1168, 2683]),
       ...,
       array([2378, 7655, 3219, 2320, 5168, 5319, 4189, 2683, 2499, 6363,  840,
       6711, 1168, 5180]),
       array([5627, 2807, 5412, 3399, 7979, 1093, 1257, 7803, 6696,  800, 1833,
       3512]),
       array([6473,  800, 4807, 2683,  335, 1563, 1511, 3248, 2499])],
      dtype=object)

In [132]:
cluster_np

array([array([ 411, 6535, 2908, 3383, 3821]),
       array([ 395, 4745, 3253, 4951,  668, 3642, 5723, 3055,  524, 1979, 3858,
       2908]),
       array([ 860, 6535, 2908,  519, 3347,  795, 3670, 5772, 6825, 5151, 6152,
       6585, 6476,  219,  395]),
       ...,
       array([3611, 6535, 1963, 3447, 2553, 4963, 4169,  395, 3642, 5540,  668,
       5784,  219, 4883]),
       array([5138,  273, 5022, 3820, 6858,  782,  860, 6683, 5772, 3253,  914,
       2119]),
       array([5615, 3253, 4642,  395,  383, 1029,  524, 1979, 3642])],
      dtype=object)

In [124]:
ingredient_to_cluster[389]

411

In [109]:
clusters

[{0},
 {1},
 {2,
  22,
  23,
  28,
  30,
  33,
  35,
  38,
  57,
  69,
  70,
  73,
  75,
  83,
  111,
  128,
  147,
  149,
  156,
  211,
  219,
  323,
  390,
  423,
  467,
  859,
  931,
  980,
  1015,
  1116,
  1344},
 {14, 48, 82, 224, 345, 380, 452, 479, 965},
 {17, 158, 499},
 {37, 78, 122, 197, 295, 313, 1014, 1227, 3078},
 {45, 107, 110, 143, 196, 202, 215, 340, 669, 712, 848, 941, 1199},
 {63},
 {68},
 {86},
 {118, 1076},
 {159},
 {169, 429},
 {199, 326},
 {212},
 {213},
 {240, 900},
 {245},
 {247},
 {249},
 {261},
 {315},
 {342},
 {349},
 {353, 2230, 2547},
 {359},
 {373},
 {375},
 {401},
 {434},
 {437},
 {456, 1111},
 {470},
 {481},
 {482},
 {486},
 {493},
 {502},
 {513, 863},
 {516},
 {526},
 {528},
 {533},
 {538},
 {556},
 {563},
 {569},
 {579},
 {580},
 {601},
 {623},
 {625},
 {630},
 {632},
 {646},
 {648},
 {689},
 {692},
 {710},
 {731},
 {769},
 {779},
 {792},
 {879},
 {905},
 {924},
 {942},
 {957},
 {968},
 {977},
 {981},
 {989},
 {995},
 {1006},
 {1057},
 {1063},
 {1064}

In [172]:
len(ingredients)
ingredients[8023]

'khoya'

In [101]:
type(recipes["ingredient_ids"][0])

str

In [102]:
# Note that ingredient 8023, "khoya" never appears in the dataset
for row in recipes["ingredient_ids"]:
    if '8023' in row:
        print('success')

In [92]:
all_ingredients = ingredient_np[0]
for row in ingredient_np:
    all_ingredients = np.append(all_ingredients, row)
all_ingredients.max()

8022

In [174]:
ingredients[2]

'french vanilla pudding pie filling mix'