In [1]:
import numpy as np
import pandas as pd
import itertools
import pickle


In [3]:
# Load the recipe corpus and the ingredient vocabulary from the .npz archive.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, and
# unpickling can execute arbitrary code -- only load this file from a trusted source.
with np.load('data/simplified-recipes-1M.npz', allow_pickle=True) as data:
    # presumably each recipe is an array of indices into `ingredients` -- confirm
    recipes = data['recipes']
    ingredients = data['ingredients']


In [4]:
# Fixed-size prefixes of the corpus for quick experiments; the smaller
# subsets are sliced from the larger one so all three share the same recipes.
recipes_10k = [recipes[idx] for idx in range(10000)]
recipes_1k = recipes_10k[:1000]
recipes_100 = recipes_1k[:100]

# The same first 10,000 recipes with indices replaced by ingredient names.
word_recipes_10k = [ingredients[recipe] for recipe in recipes_10k]

In [5]:
def get_combs(in_lst, comb_len=2):
    """Count how often each combination of items occurs across the input.

    Args:
    in_lst (list): recipes to scan; each recipe is an iterable of items
    comb_len (int): number of items per combination (default 2)

    Returns:
    dict mapping each combination (a tuple, in the order the items appear
    within the recipe) to the number of times it was generated

    """
    counts = {}

    for items in in_lst:
        # Combinations keep the recipe's own item order, so (a, b) and
        # (b, a) are tallied under separate keys.
        for combo in itertools.combinations(items, comb_len):
            counts[combo] = counts.get(combo, 0) + 1

    return counts

def recipe_words(recipe, vocab=None):
    """Get the ingredient names for a recipe.

    Args:
    recipe (np arr): input recipe as np array of ingredient indices
    vocab (sequence): optional lookup table mapping an ingredient index to
        its name; when omitted, the notebook-level `ingredients` is used

    Returns:
    input recipe as list of ingredient strings

    """
    # Fall back to the notebook global only at call time, so the function can
    # also be used (and tested) with an explicitly supplied vocabulary.
    if vocab is None:
        vocab = ingredients
    return [vocab[i] for i in recipe]


def dict_to_pickle(dict, path):
    """Serialize a dictionary to a pickle file and print a confirmation.

    Args:
    dict: dictionary to serialize (this parameter name shadows the builtin
        `dict` inside the function)
    path (str): destination file; it is opened in binary write mode, so an
        existing file at that location is overwritten

    Returns:
    nothing
    """
    with open(path, 'wb') as out_file:
        pickle.dump(dict, out_file, pickle.HIGHEST_PROTOCOL)

    confirmation = 'saved ' + str(path)
    print(confirmation)


def NPMI(ing1, ing2, recipes):
    """Calculate the normalized point-wise mutual information (NPMI) of two ingredients.

    Probabilities are estimated as the fraction of recipes that contain
    ingredient 1, ingredient 2, and both of them:

        PMI  = log2(p(1, 2) / (p(1) * p(2)))
        NPMI = PMI / -log2(p(1, 2))

    Args:
    ing1: first ingredient, in the same representation used inside each recipe
    ing2: second ingredient, in the same representation used inside each recipe
    recipes: sized collection of recipes; each recipe must support `in`

    Returns:
    NPMI as a numpy float: 0 when the ingredients occur independently,
    positive when they co-occur more often than independence predicts and
    negative when less often. By convention -1.0 is returned when the pair
    never co-occurs (including when either ingredient is absent from every
    recipe) and 1.0 when every recipe contains both ingredients; the raw
    formula is undefined (log of 0, or 0 / 0) in those two cases.

    Raises:
    ValueError: if `recipes` is empty, since no probability can be estimated
    """
    num_recipes = len(recipes)
    if num_recipes == 0:
        raise ValueError('recipes must contain at least one recipe')

    # Tally all three occurrence counts in one pass instead of building
    # intermediate lists of matching recipes.
    count_ing1 = 0
    count_ing2 = 0
    count_both = 0
    for recipe in recipes:
        has_ing1 = ing1 in recipe
        has_ing2 = ing2 in recipe
        count_ing1 += has_ing1
        count_ing2 += has_ing2
        count_both += has_ing1 and has_ing2

    # Never together: PMI tends to -infinity, so NPMI takes its lower bound.
    if count_both == 0:
        return np.float64(-1.0)

    # Always together in every recipe: the normalizer -log2(1) is 0, so use
    # the limit of NPMI for perfectly co-occurring ingredients.
    if count_both == num_recipes:
        return np.float64(1.0)

    # prob of each ingredient occurring, and of both occurring together
    p_of_ing1 = count_ing1 / num_recipes
    p_of_ing2 = count_ing2 / num_recipes
    p_of_ing_1_and_2 = count_both / num_recipes

    # calculate PMI
    pmi = np.log2(p_of_ing_1_and_2 / (p_of_ing1 * p_of_ing2))

    # normalize PMI by the self-information of the joint event
    npmi = pmi / -np.log2(p_of_ing_1_and_2)

    return npmi


In [10]:
# Count every ingredient pair over the full corpus (get_combs defaults to pairs).
existing_combs_counts = get_combs(recipes)

# The distinct pairs themselves, in the order they were first encountered.
existing_combs = list(existing_combs_counts)

In [14]:
# Peek at the first pair to check the key format: a tuple of two recipe items.
existing_combs[0]


(233, 2754)

In [None]:
# Preview the NPMI score for the first 30 ingredient pairs.
# Each NPMI call scans the whole `recipes` collection.
for i in range(30):
    pair = existing_combs[i]
    first_ing, second_ing = pair
    print(pair)
    print(NPMI(first_ing, second_ing, recipes))