In [1]:
import re
import pandas as pd
import nltk
import unicodedata
import csv
import itertools
from tqdm import trange, tqdm
from nltk import word_tokenize, pos_tag, corpus
from nltk.stem import WordNetLemmatizer
from collections import defaultdict, Counter

In [2]:
# Read in the datasets
df_directions = pd.read_excel('/Users/nessyliu/Desktop/RA/AllRecipes_Directions_2019-11-26.xlsx')
df = pd.read_csv('/Users/nessyliu/Desktop/RA/AllReviews_26thNov2019.csv')
df_ingredients_raw = pd.read_csv('/Users/nessyliu/Desktop/RA/part_2/Ingredients.csv')
df_cluster = pd.read_excel('/Users/nessyliu/Desktop/RA/part_2/Cluster_names.xlsx')

In [3]:
full_directions_list = []
recipe_id_list = list(set(df_directions.recipe_id.tolist()))
for recipe_id in recipe_id_list:
    full_dir_this_recipe = ' '.join(df_directions.loc[df_directions['recipe_id']==recipe_id, 'directions_step_text'])
    full_directions_list.append(full_dir_this_recipe)
dict_recipe_direction = dict(zip(recipe_id_list, full_directions_list))

In [4]:
# list of cluster names with spaces (e.g. apple juice) sorted by number of words in each name, 
# e.g. "apple juice" should appear before "juice"
cluster_name_orig_list = df_cluster.cluster_name.tolist()
cluster_name_orig_list.sort(key=lambda x: len(x.split()), reverse=True)

# create dict to map recipe_id to ingredient_ids
df_ingredients = df_ingredients_raw.groupby('recipe_id')['ingredient_id'].apply(list).reset_index(name ='ingredients')
dict_recipe_ingredients = dict(zip(df_ingredients.recipe_id, df_ingredients.ingredients))

# create dict to map ingredient_id to cluster_name
df_cluster = df_cluster.replace(' ', '_', regex=True)
dict_ingredient_clustername = dict(zip(df_cluster.ingredient_id, df_cluster.cluster_name))

In [5]:
# lemmatization dict
lemma_list = pd.read_csv('/Users/nessyliu/Desktop/RA/lemma_list.csv')
lemma_dict = lemma_list.set_index('word_list').to_dict()['lemma_list']

def lemmatization(text):
    # use the custom lemma dict first
    text = " ".join(str(lemma_dict.get(word, word)) for word in text.split())
    
    # then use the WordNetLemmatizer from nltk
    wnl = WordNetLemmatizer()
    # lemmatize each word based on its pos tagging
    text_after = " ".join([wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i) for i,j in pos_tag(word_tokenize(text))])    
    return text_after

In [6]:
unigram_exclude_list = ['purpose','extra', 'whole', 'frying', 'cut',
                        'sun', 'baby', 'five', 'star',
                        'white', 'green', 'black', 'red', 'pink', 'yellow', 'brown', 'golden', 'blue', 
                        'color', 'colored',
                        'ground', 'sea', 'part', 'baked', 'raw', 'new', 'active',
                        'italian', 'dark', 'light', 'fresh', 'sweet', 'candied',
                        'dried', 'dry', 'heavy', 'condensed', 'firm', 'free', 'mixed', 'flavored', 'evaporated',
                        'hot', 'self', 'rising', 'split', 'cooking', 'stewed',
                        'de', 'dr']
unigram_exclude_list += corpus.stopwords.words('english')

ngram_exclude_list = ['all_purpose', 'purpose_flour', 'free_all']

ngram_not_start_end = ['for', 'of', 'and', 'with', 'in']

In [7]:
perfect_match_list = []
partial_match_list = []
no_match_list = []
recipe_list = []
direction_list = []
lemmatized_direction_list = []

num_ingredient_list = []
num_perfect_match_list = []
num_partial_match_list = []
num_no_match_list = []

for recipe_id in tqdm(recipe_id_list):
    # list of ingre that appeared exactly the same in the direction
    perfect_match_this_recipe = []
    # list of ingre that matched shorter form in the direction
    partial_match_this_recipe = []
    # list fo ingre that are not matched
    no_match_this_recipe = []
    
    # count number of ingredients
    num_perfect_match_this_recipe = 0
    num_partial_match_this_recipe = 0
    num_no_match_this_recipe = 0

    # get the ingredients of this recipe id
    try:
        recipe = [dict_ingredient_clustername[ingre_id] for ingre_id in dict_recipe_ingredients[recipe_id]]
        recipe.sort(key=lambda x: len(x.split('_')), reverse=True)
    except:
        recipe = [] # if the recipe id is not in the recipe_ingredient map list
        
    recipe.sort(key=lambda x: len(x.split('_')), reverse=True)
    # get the direction of this recipe id
    direction = dict_recipe_direction[recipe_id]
    lemmatized_direction = lemmatization(dict_recipe_direction[recipe_id])
    # map each ingredient to the words in the direction
    for ingre in recipe:
        ingre_tokens = ingre.split('_')
        ingre_original = ' '.join(ingre_tokens)
        if ingre_original in direction or ingre_original in lemmatized_direction:
            perfect_match_this_recipe.append(ingre)
            num_perfect_match_this_recipe += 1
        else:
            partial_match_this_ingre = []
            if len(ingre_tokens) > 1:
                n = len(ingre_tokens)-1
                while n > 0: # length of the short form
                    for i in range(0,len(ingre_tokens)-n+1): # start index in the full cluster name
                        short_form = ' '.join(ingre_tokens[i:i+n])
                        if short_form in direction or short_form in lemmatized_direction:
                            # the short form is a map to the ingre
                            # don't break the loop if matched, cuz 'oil for frying' may end up matched 'for frying' and break
                            # but ignore subsequence part, e.g. if 'firm tofu' is matched, ignore 'firm' & 'tofu'
                            if not any(short_form in existing_short_form for existing_short_form in partial_match_this_ingre):
                                if n > 1: # if the short form >=2 words
                                    if short_form.replace(' ','_') not in ngram_exclude_list\
                                        and ingre_tokens[i] not in ngram_not_start_end \
                                        and ingre_tokens[i+n-1] not in ngram_not_start_end:
                                            # if the short form not in ngram_exclude_list
                                            # and the short form does not start or end with words in ngram_not_start_end list
                                            partial_match_this_ingre.append(short_form.replace(' ','_'))
                                else: # if the short form is unigram
                                    if short_form not in unigram_exclude_list:
                                        partial_match_this_ingre.append(short_form)
                            
                    n -= 1
            if len(partial_match_this_ingre) == 0:
                no_match_this_recipe.append(ingre)
                num_no_match_this_recipe += 1
            else:
                partial_match_this_recipe.append([ingre, partial_match_this_ingre])
                num_partial_match_this_recipe += 1
    
    recipe_list.append(recipe)
    direction_list.append(direction)
    lemmatized_direction_list.append(lemmatized_direction)
    perfect_match_list.append(perfect_match_this_recipe)
    partial_match_list.append(partial_match_this_recipe)
    no_match_list.append(no_match_this_recipe)
    num_perfect_match_list.append(num_perfect_match_this_recipe)
    num_partial_match_list.append(num_partial_match_this_recipe)
    num_no_match_list.append(num_no_match_this_recipe)
    num_ingredient_list.append(len(recipe))

matched_df = pd.DataFrame({
    'recipe_id': recipe_id_list, 
    'recipe': recipe_list,
    'direction': direction_list,
    'lemma_direction': lemmatized_direction_list,
    'perfect_match': perfect_match_list,
    'partial_match': partial_match_list,
    'no_match': no_match_list,
    'num_ingredient': num_ingredient_list,
    'num_perfect_match': num_perfect_match_list,
    'num_partial_match': num_partial_match_list,
    'num_no_match': num_no_match_list
})

100%|██████████| 69121/69121 [06:12<00:00, 185.44it/s]


In [8]:
# test the 1-gram to (n-1)-gram shorter forms
ingre_tokens = ['fresh','white','chocolate','chips']

n = len(ingre_tokens)-1
while n > 0:
    for i in range(0,len(ingre_tokens)-n+1):
        short_form = ' '.join(ingre_tokens[i:i+n])
        print(short_form)
    n = n-1

fresh white chocolate
white chocolate chips
fresh white
white chocolate
chocolate chips
fresh
white
chocolate
chips


In [9]:
# n = 10 # check a random recipe
# print('recipe:',matched_df.loc[n,'recipe'])
# print('perfect:',matched_df.loc[n,'perfect_match'])
# print('partial:',matched_df.loc[n,'partial_match'])
# print('no:',matched_df.loc[n,'no_match'])
# print('\ndirection:',matched_df.loc[n,'direction'])

In [10]:
matched_df.to_csv('/Users/nessyliu/Desktop/RA/part_2/result/matched_directions.csv')

In [11]:
# matched_df.head()

In [12]:
# Create a dict using ingredient (cluster name) as key, 
# all appeared short forms (across all directions) as values, excluding meaningless terms (e.g. 'all','purpose','for')
all_short_forms = []
dict_ingre_shortforms = defaultdict(list)
for partial_match_this_recipe in partial_match_list:
    for partial_match_this_ingre in partial_match_this_recipe:
        ingre = partial_match_this_ingre[0]
        short_forms = partial_match_this_ingre[1]
        for short_form in short_forms:
            if short_form not in dict_ingre_shortforms[ingre]:
                all_short_forms.append(short_form)
                dict_ingre_shortforms[ingre].append(short_form)

In [13]:
count_short_forms = Counter(all_short_forms)
# count_short_forms.most_common()

In [14]:
# manually add some more mappings
dict_ingre_shortforms['beef_sirloin'].append('steak')
dict_ingre_shortforms['crabmeat'].append('crab')
dict_ingre_shortforms['espresso'].append('coffee_bean')

In [15]:
for key in dict_ingre_shortforms:
    print('\n',key)
    print(dict_ingre_shortforms[key])


 all_purpose_flour
['flour']

 other_milk
['milk']

 olive_oil
['oil', 'olive']

 ground_coriander
['coriander']

 ground_turmeric
['turmeric']

 cumin_seed
['cumin', 'seed']

 extra_virgin_olive_oil
['olive_oil', 'oil', 'virgin_olive_oil', 'virgin_olive', 'olive']

 ground_black_pepper
['pepper', 'black_pepper']

 split_pea
['pea']

 whole_milk
['milk']

 parmesan_cheese
['cheese', 'parmesan']

 red_bell_pepper
['bell_pepper', 'pepper', 'bell', 'red_bell']

 ground_cumin
['cumin']

 flavored_gelatin
['gelatin']

 granny_smith_apple
['apple', 'smith_apple']

 beef_stew_meat
['beef', 'meat', 'stew', 'stew_meat', 'beef_stew']

 vegetable_oil
['oil', 'vegetable']

 white_sugar
['sugar']

 whole_clove
['clove']

 oil_for_frying
['oil']

 ricotta_cheese
['cheese', 'ricotta']

 whole_kernel_corn
['corn', 'kernel', 'kernel_corn']

 white_rice
['rice']

 cheddar_cheese
['cheese', 'cheddar']

 ground_nutmeg
['nutmeg']

 white_vinegar
['vinegar']

 red_onion
['onion']

 green_chile_pepper
['chi

In [16]:
import csv

with open('/Users/nessyliu/Desktop/RA/part_2/result/dict_ingre_shortforms.csv', 'w') as f:
    for key in dict_ingre_shortforms.keys():
        f.write("%s,%s\n"%(key,dict_ingre_shortforms[key]))

In [17]:
# test on a subset of data
df = df.head(1000)    

In [18]:
# Combine each review with its direction
recipe_id_list = df.recipe_id.tolist()
review_id_list = df.review_id.tolist()
review_list = df.review_text.tolist()

comb_dir_review_list = []
for recipe_id, review  in zip(recipe_id_list, review_list):
    direction = dict_recipe_direction[recipe_id]
    combined_text = direction + ' ' + review
    comb_dir_review_list.append(combined_text)

In [19]:
import spacy
import neuralcoref

text = comb_dir_review_list[300]
print(text)

nlp = spacy.load('en')
neuralcoref.add_to_pipe(nlp, greedyness=0.4, max_dist = 100, 
                        conv_dict={'paprika': ['ingredient']})
doc = nlp(text)
for cluster in doc._.coref_clusters:
    print('')
    print(cluster)
    
    
test_sentence = 'My sister has a dog. She loves him.'
test_doc = nlp(test_sentence)
print(test_doc._.coref_clusters)

In a shallow bowl, mix 1 1/2 cup of flour, paprika, and garlic powder. In a second shallow bowl, whisk together the eggs and milk until thoroughly blended. In a third bowl, combine the crushed cracker crumbs, potato flakes, and 1/2 cup of flour. With a fork, prick the pork chops thoroughly to tenderize the meat. Season the chops with salt and pepper. Gently press the pork chops into the flour to coat, and shake off the excess flour. Dip into the beaten egg mixture, then press each chop into the cracker crumb mixture. Gently toss between your hands so any crumbs that haven't stuck can fall away Heat the oil in a large, deep skillet over medium heat, and gently place the pork chops into the hot oil. Fry the chops until the meat is no longer pink in the center and the crust is crisp and golden brown, about 5 minutes per side. Remove the chops from the pan, and drain the excess oil on paper towels. Easy and delicious. Great with country gravy and mashed potatoes.

the pork: [the pork, the 