In [1]:
import re
import numpy as np
import pandas as pd
import nltk
import unicodedata
import csv
import itertools
from tqdm.notebook import trange, tqdm
from nltk import word_tokenize, sent_tokenize, pos_tag, corpus
from nltk.stem import WordNetLemmatizer
from collections import defaultdict, Counter
import seaborn as sn
import matplotlib.pyplot as plt
import copy
from ast import literal_eval
import dill as pickle

In [2]:
def save_obj(obj, name, directory='/Users/nessyliu/Desktop/RA/part_2/result/'):
    """Serialise ``obj`` to ``<directory>/<name>.pkl``.

    Args:
        obj: Object to serialise with the module bound to ``pickle`` in this
            notebook (``dill``, imported as ``pickle``).
        name: File stem; the ``.pkl`` extension is appended.
        directory: Folder to write into. Defaults to the result folder that
            used to be hard-coded, so existing ``save_obj(obj, name)`` calls
            write to the same file as before.
    """
    import os  # local import: the notebook's import cell does not bring in os

    path = os.path.join(directory, name + '.pkl')
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name, directory='/Users/nessyliu/Desktop/RA/part_2/result/'):
    """Load and return the object stored in ``<directory>/<name>.pkl``.

    Args:
        name: File stem; the ``.pkl`` extension is appended.
        directory: Folder to read from. Defaults to the result folder that
            used to be hard-coded, so existing ``load_obj(name)`` calls read
            the same file as before.

    Returns:
        The unpickled object.

    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
    """
    import os  # local import: the notebook's import cell does not bring in os

    path = os.path.join(directory, name + '.pkl')
    with open(path, 'rb') as f:
        # SECURITY: unpickling executes arbitrary code embedded in the file;
        # only load .pkl files produced by this project.
        return pickle.load(f)

In [3]:
# Read in the datasets.
# NOTE: every path below is an absolute path on the author's machine; adjust before running elsewhere.
# df: reviews (recipe_id, review_id and review_text are read from it further down).
df = pd.read_csv('/Users/nessyliu/Desktop/RA/AllReviews_26thNov2019.csv')
# df_ingredients_raw: recipe_id / ingredient_id pairs.
df_ingredients_raw = pd.read_csv('/Users/nessyliu/Desktop/RA/part_2/Ingredients.csv')
# df_cluster: ingredient_id / cluster_name pairs.
df_cluster = pd.read_excel('/Users/nessyliu/Desktop/RA/part_2/Cluster_names.xlsx')
# NOTE(review): df_matched is not referenced anywhere in the visible part of the notebook.
df_matched = pd.read_csv('/Users/nessyliu/Desktop/RA/part_2/result/matched_directions_w7.csv')
# Load the ingredient-name mapping dict (read via load_obj, i.e. unpickled from the result folder).
dict_ingre_mapping = load_obj('dict_ingre_mapping')
# Load the result from week 2; it is left-joined onto this week's result on review_id.
w2_result = pd.read_csv('/Users/nessyliu/Desktop/RA/part_2/result/review_with_ingre_df_w2.csv')

In [4]:
# Keep only the first 10,000 reviews. This rebinds `df`, so the full frame
# loaded above is no longer reachable under this name.
df = df.head(10000)

In [5]:
# Cluster names in their spaced form (e.g. "apple juice"), ordered by descending word count,
# so that "apple juice" appears before "juice".
# NOTE(review): this reads df_cluster BEFORE the replace() below rewrites spaces to underscores.
# Because df_cluster is rebound in this same cell, re-running the cell on its own would build
# this list from the already-underscored names.
cluster_name_orig_list = df_cluster.cluster_name.tolist()
cluster_name_orig_list.sort(key=lambda x: len(x.split()), reverse=True)

# Map recipe_id -> list of ingredient_ids (one list per recipe).
df_ingredients = df_ingredients_raw.groupby('recipe_id')['ingredient_id'].apply(list).reset_index(name ='ingredients')
dict_recipe_ingredients = dict(zip(df_ingredients.recipe_id, df_ingredients.ingredients))

# Map ingredient_id -> cluster_name, with spaces turned into underscores.
# NOTE(review): DataFrame.replace(..., regex=True) is applied to the whole frame, not only to
# the cluster_name column — confirm no other string column is affected.
df_cluster = df_cluster.replace(' ', '_', regex=True)
dict_ingredient_clustername = dict(zip(df_cluster.ingredient_id, df_cluster.cluster_name))

# Underscored cluster names (e.g. apple_juice), so that a check for 'apple' is not confused
# with 'apple juice'. One entry per df_cluster row.
cluster_name_list = df_cluster.cluster_name.tolist()

# Columns of the review frame, copied out for the result DataFrame built after processing.
recipe_id_list = df.recipe_id.tolist()
review_id_list = df.review_id.tolist()
review_list = df.review_text.tolist()


In [6]:
# Negation contractions -> expanded form, passed to expand_contractions().
# Keys are lower-case. Some keys are prefixes of others (e.g. "couldn't" and
# "couldn't've"), so a matcher built from these keys has to try the longer key first.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "cannot": "can not",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mayn't": "may not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
}

# Per-review accumulators. text_processing() appends one entry per review to each list.
# They are only reset when this cell runs, so re-run this cell before calling
# text_processing() again; otherwise the lists grow past len(df).
contain_ingre_flag_list = []  # True if the review mentions at least one cluster name
addition_flag_list = []  # True if the review has at least one real addition term
deletion_flag_list = []  # True if the review has at least one real deletion term
replacement_flag_list = []  # True if the review has at least one replacement term

ingre_in_recipe_list = []  # mentioned ingredients that are in the recipe
ingre_not_recipe_list = []  # mentioned ingredients that are not in the recipe
ngram_in_list = []  # 4-grams around the in-recipe mentions
ngram_not_list = []  # 4-grams around the not-in-recipe mentions
recipe_list = []  # recipe (list of cluster names) looked up for each review

real_addition_list = [] # terms caught by addition grammar whose ingredient is NOT in the recipe
false_addition_list = [] # terms caught by addition grammar whose ingredient is already in the recipe (e.g. a quantity change)
real_deletion_list = [] # terms caught by deletion grammar and the ingredient is in the recipe
false_deletion_list = [] # terms caught by deletion grammar but the ingredient is not in the recipe
replacement_list = [] # terms indicating replacement

new_ingre_not_matched_list = [] # ingredients not in the recipe that also do not appear in any detected addition/deletion/replacement term

clean_text_list = [] # review text after cleaning

In [7]:
def deal_with_ingredient(text, recipe):
    """
    Find cluster-name tokens in a cleaned review and split them by recipe membership.

    Args:
        text: cleaned review text (ingredient names already underscored).
        recipe: list of cluster names belonging to the reviewed recipe.

    Returns:
        (flag, ingre_in_recipe, ingre_not_recipe, ngram_in, ngram_not) where
        flag is True when at least one token is a cluster name, the two
        ingredient lists are de-duplicated, and each ngram list holds, per
        mention (duplicates included), the 4-grams containing that token.
    """
    # Flatten sentence-wise tokenisation into a single token stream.
    tokens = [tok for sent in nltk.sent_tokenize(text) for tok in nltk.word_tokenize(sent)]

    # Every window of exactly four consecutive tokens, as lists.
    four_grams = [list(gram) for gram in nltk.everygrams(tokens, 4, 4)]

    mentioned = [tok for tok in tokens if tok in cluster_name_list]

    ingre_in_recipe, ingre_not_recipe = [], []
    ngram_in, ngram_not = [], []
    for name in mentioned:
        context = [gram for gram in four_grams if name in gram]
        if name in recipe:
            ingre_in_recipe.append(name)
            ngram_in.append(context)
        else:
            ingre_not_recipe.append(name)
            ngram_not.append(context)

    return len(mentioned) > 0, list(set(ingre_in_recipe)), list(set(ingre_not_recipe)), ngram_in, ngram_not

In [22]:
def text_processing(df):
    """
    Clean every review in ``df`` and record what it says about ingredients.

    For each row the review text is lower-cased, Unicode-normalised, has its
    negation contractions expanded, is lemmatised, has cue phrases joined and
    ingredient mentions rewritten to cluster names. The cleaned text is then
    scanned for ingredient mentions and for addition / deletion / replacement
    terms.

    Args:
        df: DataFrame with ``recipe_id`` and ``review_text`` columns.

    Returns:
        None. Exactly one entry per row is appended to each of the
        module-level result lists (``clean_text_list``, ``recipe_list``,
        ``*_flag_list``, ``*_addition_list``, ...). The lists are not cleared
        here, so calling this twice without re-initialising them doubles them.
    """
    # Pull the two columns out once instead of building a row Series per lookup.
    recipe_ids = df['recipe_id'].tolist()
    review_texts = df['review_text'].tolist()

    for i in trange(0, len(df)):

        # get the recipe for the review, e.g. [white_chocolate_chip, apple_juice, water]
        try:
            recipe = [dict_ingredient_clustername[ingre_id]
                      for ingre_id in dict_recipe_ingredients[recipe_ids[i]]]
        except KeyError:
            # recipe id (or one of its ingredient ids) is missing from the lookup dicts
            recipe = []
        # longest names first, so multi-word ingredients are matched before their parts
        recipe.sort(key=lambda x: len(x.split('_')), reverse=True)

        text = review_texts[i]

        # Convert to lower case
        text = text.lower()

        # Normalize the accented characters
        text = normalize_accented_characters(text)

        # Expand contractions, e.g. didn't --> did not
        text = expand_contractions(text, CONTRACTION_MAP)

        # Lemmatize the text, e.g. removed --> remove
        text = lemmatization(text)

        # concatenate some special phrases such as "instead_of"
        text = concat_phrase(text)

        # Standardize ingredient cluster names in the reviews
        text = standardize_ingredient_in_review(text, recipe)

        clean_text_list.append(text)

        # check whether the review contains cluster names, split them into in-recipe and
        # not-in-recipe, and collect the ngrams around each mention
        flag, ingre_in_recipe, ingre_not_recipe, ngram_in, ngram_not = deal_with_ingredient(text, recipe)
        contain_ingre_flag_list.append(flag)
        ingre_in_recipe_list.append(ingre_in_recipe)
        ingre_not_recipe_list.append(ingre_not_recipe)
        ngram_in_list.append(ngram_in)
        ngram_not_list.append(ngram_not)

        recipe_list.append(recipe)

        # Extract terms indicating altering
        false_addition_terms, real_addition_terms = addition_extraction(text, recipe)
        false_deletion_terms, real_deletion_terms = deletion_extraction(text, recipe)
        replacement_terms = replacement_extraction(text, recipe)

        addition_flag_list.append(len(real_addition_terms) > 0)
        deletion_flag_list.append(len(real_deletion_terms) > 0)
        replacement_flag_list.append(len(replacement_terms) > 0)

        # Ingredients that are not in the recipe and also do not occur as a token in any
        # detected add/delete/replace term
        all_detected_terms = (false_addition_terms + real_addition_terms
                              + false_deletion_terms + real_deletion_terms + replacement_terms)
        detected_tokens = set(' '.join(all_detected_terms).split())
        new_ingre_not_matched = [ingre for ingre in ingre_not_recipe if ingre not in detected_tokens]

        # append all the above results
        replacement_list.append(replacement_terms)
        real_addition_list.append(real_addition_terms)
        false_addition_list.append(false_addition_terms)
        real_deletion_list.append(real_deletion_terms)
        false_deletion_list.append(false_deletion_terms)
        new_ingre_not_matched_list.append(new_ingre_not_matched)


def concat_phrase(text):
    """Join selected two-word cue phrases with an underscore so each becomes one token."""
    cue_phrases = (
        (r'\binstead of\b', 'instead_of'),
        (r'\bleave out\b', 'leave_out'),
        (r'\bleave off\b', 'leave_off'),
        (r'\bbother with\b', 'bother_with'),
    )
    for pattern, joined in cue_phrases:
        text = re.sub(pattern, joined, text)
    return text


def normalize_accented_characters(text):
    """
    Compose the text to Unicode NFC and map typographic punctuation to ASCII.

    NFC (not NFKD) is used, so accented letters stay as single composed
    characters rather than being stripped. Curly quotes become straight
    quotes, the en dash becomes '-', and dotless 'ı' becomes 'I'.
    """
    ascii_equivalents = str.maketrans({
        "“": '"',
        "”": '"',
        "’": "'",
        "‘": "'",
        '–': '-',
        'ı': 'I',
    })
    composed = unicodedata.normalize('NFC', text)
    return composed.translate(ascii_equivalents)


def expand_contractions(text, contraction_mapping):
    """
    Expand contractions in case of negations, e.g. isn't -> is not, then drop
    every remaining apostrophe.

    Args:
        text: input string.
        contraction_mapping: dict of contraction -> expansion. Matching is
            case-insensitive; an exact-case key is preferred, then the
            lower-cased match is looked up.

    Returns:
        The text with every mapped contraction expanded and all "'" removed.
        If the matched contraction starts with an upper-case letter, the
        expansion is capitalised ("Don't" -> "Do not").
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string everywhere.
        return re.sub("'", "", text)

    # Longest keys first, so "couldn't've" is tried before its prefix "couldn't";
    # keys are escaped so they are always matched literally.
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match) \
            or contraction_mapping.get(match.lower())
        if not expanded_contraction:
            # e.g. mixed-case key that matched only via IGNORECASE: leave the text as is
            return match
        # Carry over capitalisation only. Splicing in the matched first letter itself
        # is wrong when the expansion starts differently ("ain't" -> "is not").
        if match[0].isupper():
            expanded_contraction = expanded_contraction[0].upper() + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


# Custom lemmatization dict: maps each entry of the CSV's 'word_list' column to the value in
# its 'lemma_list' column. lemmatization() applies it before the WordNet lemmatizer.
# NOTE: absolute path on the author's machine.
lemma_list = pd.read_csv('/Users/nessyliu/Desktop/RA/lemma_list.csv')
lemma_dict = lemma_list.set_index('word_list').to_dict()['lemma_list']
def lemmatization(text):
    """
    Lower-case the text, apply fixed phrase rewrites, then lemmatize word by word.

    Lemmatization is two-stage: the custom ``lemma_dict`` is applied to each
    whitespace-separated word first, then NLTK's WordNetLemmatizer is applied
    to each token whose POS tag starts with a/n/v, unless the token is in the
    protected list.

    Returns the tokens joined by single spaces.
    """
    # Words that must not be lemmatized because they appear in cluster names.
    protected_words = (
        'corned', 'sparkling', 'canning', 'roasted', 'baked', 'processed', 'flavored',
        'colored', 'candied', 'stuffing', 'dressing', 'shortening', "pig's", 'based',
        'stewed', 'curing', 'decorating', 'coated', 'evaporated', 'pickled', 'fried',
        'dripping', 'rising', "confectioners'", 'frying', 'coating', 'smoked', 'seasoned',
        'rolled', 'filling', "devil's", 'sweetened', 'dried', 'pickling', 'topping', 'frosting',
        'coloring', 'rose', 'pulled', 'crystallized', 'seasoning', 'whipped', 'condensed', 'baking',
        'frenchfries', 'fries', 'flavoring',
    )

    # Literal rewrites, applied strictly in this order (e.g. '-n-' has to be handled
    # before the generic '-' rule, and ' & ' before the bare '&').
    rewrites = (
        ('-n-', ' and '),
        (' & ', ' and '),
        ('&', ' and '),
        ('-', ' '),
        ('sugar substitute', 'sweetener'),
        ('french fries', 'frenchfries'),
        ('dry milk', 'milk powder'),
        ('powder milk', 'milk powder'),
        ('lowfat', 'low fat'),
        ('nonfat', 'non fat'),
        ('glutenfree', 'gluten free'),
        ('corn flakes', 'cornflakes'),
        ('flaxseed', 'flax seed'),
        ('lemongrass', 'lemon grass'),
        ('coconutmilk', 'coconut milk'),
        ('almondmilk', 'almond milk'),
        ('crab meat', 'crabmeat'),
        ('starfruit', 'star fruit'),
        ('breadcrumb', 'bread crumb'),
        ('red and yellow bell pepper', 'red bell pepper and yellow bell pepper'),
        ('red and green bell pepper', 'red bell pepper and green bell pepper'),
    )

    text = text.lower()
    for old, new in rewrites:
        text = text.replace(old, new)

    # Stage 1: custom lemma dict, word by word.
    text = " ".join(str(lemma_dict.get(word, word)) for word in text.split())

    # Stage 2: WordNet lemmatizer, driven by a per-token POS tag.
    wnl = WordNetLemmatizer()
    lemmatized = []
    for token in word_tokenize(text):
        # Each token is tagged on its own (tagging in sentence context may give another tag).
        pos = pos_tag([token])[0][1][0].lower()
        # NOTE(review): Penn Treebank adjective tags start with 'J', so the 'a' entry
        # presumably never fires with the default tagset — confirm whether adjectives
        # were meant to be lemmatized.
        if pos in ('a', 'n', 'v') and token not in protected_words:
            token = wnl.lemmatize(token, pos)
        lemmatized.append(token)
    return ' '.join(lemmatized)


def replacement_extraction(text, recipe):
    """
    Extract replacement terms.

    Each sentence is tokenized, tagged with ``tagger`` and chunked with
    ``key_phrases_cp``; the words of every REPLACEMENT_TERM chunk are joined
    by single spaces into one phrase.

    Args:
        text: cleaned review text.
        recipe: not used; kept so the signature matches addition_extraction
            and deletion_extraction.

    Returns:
        List of unique replacement phrases (order is not guaranteed).
    """
    replacement_terms = set()
    for sent in nltk.sent_tokenize(text):  # each sentence
        sent_tagged = tagger.tag(nltk.word_tokenize(sent))
        sent_tree = key_phrases_cp.parse(sent_tagged)  # chunk the sentence based on grammar structure
        for subtree in sent_tree.subtrees():
            if subtree.label() == "REPLACEMENT_TERM":
                # Read the words off the leaves, as the addition/deletion extractors do,
                # rather than regex-scrubbing tags and brackets out of str(subtree).
                replacement_terms.add(' '.join(word for word, _tag in subtree.leaves()))
    return list(replacement_terms)


def addition_extraction(text, recipe):
    """
    Extract addition terms and split them by whether they name a new ingredient.

    Args:
        text: cleaned review text.
        recipe: list of cluster names in the reviewed recipe.

    Returns:
        (false_additions, real_additions), both de-duplicated lists of phrases.
        A phrase is a *false* addition when every INGREDIENT-tagged word in it
        is already in the recipe (e.g. a quantity change), and a *real*
        addition when at least one is not.
    """
    already_in_recipe = []
    brings_new_ingredient = []
    for sent in nltk.sent_tokenize(text):
        # tag the sentence, then chunk it based on grammar structure
        tree = key_phrases_cp.parse(tagger.tag(nltk.word_tokenize(sent)))
        for chunk in tree.subtrees():
            if chunk.label() != "ADDITION_TERM":
                continue
            leaves = chunk.leaves()
            phrase = ' '.join(word for word, _tag in leaves)
            every_ingredient_known = all(word in recipe for word, tag in leaves if tag == 'INGREDIENT')
            if every_ingredient_known:
                already_in_recipe.append(phrase)
            else:
                brings_new_ingredient.append(phrase)
    return list(set(already_in_recipe)), list(set(brings_new_ingredient))


def deletion_extraction(text, recipe):
    """
    Extract deletion terms and split them by whether the ingredient was in the recipe.

    Args:
        text: cleaned review text.
        recipe: list of cluster names in the reviewed recipe.

    Returns:
        (false_deletions, real_deletions), both de-duplicated lists of phrases.
        A phrase is a *real* deletion when every INGREDIENT-tagged word in it
        is in the recipe, and a *false* deletion when at least one is not
        (e.g. "It's good that the recipe contains no milk cuz I don't have any milk").
    """
    not_from_recipe = []
    from_recipe = []
    for sent in nltk.sent_tokenize(text):
        # tag the sentence, then chunk it based on grammar structure
        tree = key_phrases_cp.parse(tagger.tag(nltk.word_tokenize(sent)))
        for chunk in tree.subtrees():
            if chunk.label() != "DELETION_TERM":
                continue
            leaves = chunk.leaves()
            phrase = ' '.join(word for word, _tag in leaves)
            has_unknown_ingredient = any(word not in recipe for word, tag in leaves if tag == 'INGREDIENT')
            if has_unknown_ingredient:
                not_from_recipe.append(phrase)
            else:
                from_recipe.append(phrase)
    return list(set(not_from_recipe)), list(set(from_recipe))



# Build the tagger used by the extraction functions: a word -> tag lookup for cue words and
# ingredient names, with NLTK's pre-trained maxent treebank tagger as backoff for all other words.
custom_tagger = {
    # replacement
    'replace': 'REPLACEMENT', 'substitute': 'REPLACEMENT', 'instead_of': 'REPLACEMENT',
    # deletion
    'delete': 'DELETION', 'remove': 'DELETION', 'omit': 'DELETION', 'subtract': 'DELETION', 'skip': 'DELETION',
    'eliminate': 'DELETION', 
    'leave_out': 'DELETION', 'leave_off': 'DELETION',
    'bother_with': 'BOTHERWITH',
    # addition
    'add': 'ADDITION', 'use': 'ADDITION', 'put': 'ADDITION', 'incorporate': 'ADDITION',
    # other tags
    'no': 'NO',
    'not': 'NOT',
    'that': 'DT',
    'with': 'IN',
    'even': 'EVEN',
    'have': 'HAVE',
    'like': 'LIKE'}

# Every (underscored) cluster name is tagged INGREDIENT.
ingredient_tagger = {}
for ingre in cluster_name_list:
    ingredient_tagger[ingre] = 'INGREDIENT'
    
# Ingredient entries are unpacked last, so a cluster name that equals a cue word above
# would be tagged INGREDIENT.
custom_tagger_combined = {**custom_tagger, **ingredient_tagger}

# NOTE: absolute path on the author's machine; the file is a pickle, so load trusted copies only.
default_tagger = nltk.data.load("/Users/nessyliu/nltk_data/taggers/maxent_treebank_pos_tagger/english.pickle")
tagger = nltk.tag.UnigramTagger(model=custom_tagger_combined, backoff=default_tagger)

# Chunk grammar for the three kinds of alteration phrase (REPLACEMENT_TERM, DELETION_TERM,
# ADDITION_TERM), written over the custom tags assigned by `tagger`.
# NOTE(review): the filler pattern <[A-Z]*> only accepts tags made of upper-case letters, so a
# punctuation tag such as ',' ends a match. This appears to be why the demo sentence
# "i do not have any beer , so i use ..." is chunked as a DELETION_TERM plus an ADDITION_TERM
# and not as one REPLACEMENT_TERM — confirm whether that is intended.
key_phrases_grammar = r"""
    REPLACEMENT_TERM:
        # e.g. don't have/like apple so ADDITION(add/use/put/...) banana
        {<NOT>(<HAVE>|<LIKE>)<[A-Z]*>{0,4}<INGREDIENT><[A-Z]*>{0,4}<ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
        
        # e.g. have no apple so ADDITION(add/use/put/...) banana
        {<HAVE><NO><INGREDIENT><[A-Z]*>{0,4}<ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
        
        # e.g. ADDITION(add/use/put/...) banana cuz don't have/like apple
        {<ADDITION><[A-Z]*>{0,4}<INGREDIENT><[A-Z]*>{0,4}<NOT>(<HAVE>|<LIKE>)<[A-Z]*>{0,4}<INGREDIENT>}
        
        # e.g. ADDITION(add/use/put/...) banana cuz have no apple
        {<ADDITION><[A-Z]*>{0,4}<INGREDIENT><[A-Z]*>{0,4}<HAVE><NO><INGREDIENT>}
        
        # e.g. ADDITION(add/use/put/...) banana instead of apple
        {<ADDITION><[A-Z]*>{0,4}<INGREDIENT><REPLACEMENT><INGREDIENT>}
        
        # e.g. instead of apple I ADDITION(add/use/put/...) banana 
        {<REPLACEMENT><INGREDIENT><[A-Z]*>{0,4}<ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
        
        # e.g. replace the Apple with some Banana
        {<REPLACEMENT><[A-Z]*>{0,4}<INGREDIENT><IN><DT>?<[A-Z]*>{0,4}<INGREDIENT>}
        
        # e.g. didn't use Apple, replace with Banana
        {<INGREDIENT><[A-Z]*>{0,4}<REPLACEMENT><IN><[A-Z]*>{0,4}<INGREDIENT>}
        
        # <DELETION> ingre and <REPLACEMENT> ingre
        # e.g. I omitted apple and substitute with banana
        {<DELETION><[A-Z]*>{0,4}<INGREDIENT><[A-Z]*>{0,4}<REPLACEMENT><[A-Z]*>*<INGREDIENT>}
        
    DELETION_TERM:
        ### Extract negation with "have" 
        # not have + ingredient"
        # e.g. "I didn't have any Beef"
        {<NOT><HAVE><[A-Z]*>{0,4}<INGREDIENT>}
        
        # "have no + ingredient"
        # e.g. "I had no Beef"
        {<HAVE><NO><INGREDIENT>}
        
        ### Extract negation with addition words
        # e.g. "I didn't add any water", "I use no milk"
        {<NOT><ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
        {<ADDITION><NO><INGREDIENT>}
        
        ### Extract "deletion + ingredient"
        # e.g. "I removed Apple", "I deleted the Beef", "I left off the Beef"
        {<DELETION><[A-Z]*>{0,3}<INGREDIENT>}
        
        ### Extract negation terms with "bother with"
        # e.g. "I didn't bother with the Beef"
        {<NOT><BOTHERWITH><[A-Z]*>*<INGREDIENT>}      
    
    ADDITION_TERM:
        # e.g. I added {0-4 words such as "some", "a bit"} Milk
        {<ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
"""
key_phrases_cp = nltk.RegexpParser(key_phrases_grammar)

In [28]:
# Ad-hoc check of the replacement grammar on one already-cleaned sentence.
# NOTE(review): the recorded output contains a printed parse tree, but the print(sent_tree)
# line in replacement_extraction is commented out, and this cell's execution count (28) is out
# of sequence — the output is probably stale. Re-run after Restart & Run All.
text = "i do not have any beer , so i use a can of club_soda water "
replacement_extraction(text, ['ground_black_pepper', 'worcestershire_sauce', 'liquid_smoke', 'garlic_powder', 'barbeque_sauce', 'onion', 'beer', 'beef', 'water', 'salt'])

(S
  i/PRP
  do/VBP
  (DELETION_TERM not/NOT have/HAVE any/DT beer/INGREDIENT)
  ,/,
  so/RB
  i/PRP
  (ADDITION_TERM
    use/ADDITION
    a/DT
    can/NN
    of/IN
    club_soda/INGREDIENT
    water/INGREDIENT))


[]

In [10]:
def standardize_ingredient_in_review(text, recipe):
    """
    Rewrite every ingredient mention in a review to its underscored cluster name.

    Example:
        Input text: I omit chocolate chip.
        Recipe: [white_chocolate_chip, others...], while both "white chocolate chip"
                and "chocolate chip" are cluster names
        Output text: I omit white_chocolate_chip.

    Args:
        text: review text with ingredient names written with spaces.
        recipe: cluster names of the reviewed recipe, expected to be sorted
            longest-first (by number of words) by the caller.

    Returns:
        The text with matched phrases replaced by underscored cluster names.

    Uses the module-level ``cluster_name_list`` and ``dict_ingre_mapping``.
    Alias rewriting for recipe ingredients (Step 3) can over-match: the demo
    'i use apple' with 'dried_apple' in the recipe yields 'i use dried_apple'.
    """
    def substitute_phrase(source, phrase, replacement):
        # Whole-word literal replacement. The phrase is escaped so regex metacharacters
        # in a name cannot alter the pattern, and the replacement is inserted verbatim.
        return re.sub(r'\b' + re.escape(phrase) + r'\b', lambda _m: replacement, source)

    # Step 1:
    # concatenate the full-match recipe-ingre phrases in the review.
    # Since the recipe is sorted by # of words, the longest ingredient is matched first,
    # e.g. "white chocolate chip" becomes "white_chocolate_chip", not "white chocolate_chip".
    for ingre in recipe:
        text = substitute_phrase(text, ingre.replace('_', ' '), ingre)

    non_recipe_candidates = list(set(cluster_name_list) - set(recipe))
    non_recipe_candidates.sort(key=lambda x: len(x.split('_')), reverse=True)

    # Step 2:
    # concatenate the full-match nonrecipe-ingre in the review.
    # The still-unmatched names are collected in a new list: removing items from the list
    # being iterated makes the loop skip the element after each removal.
    non_recipe_ingre_not_matched = []
    for ingre in non_recipe_candidates:
        ingre_with_space = ingre.replace('_', ' ')
        if ingre_with_space in text:
            # the cluster name occurs in the review: replace it with the concatenated version
            text = substitute_phrase(text, ingre_with_space, ingre)
        else:
            non_recipe_ingre_not_matched.append(ingre)

    # Step 3:
    # for recipe-ingredients only, check whether other names (from dict) exist in the review,
    # if yes then replace them with the full-length cluster name
    for ingre in recipe:
        if ingre in dict_ingre_mapping:  # if ingre has other names in dict
            short = dict_ingre_mapping[ingre]['short']
            synonym = dict_ingre_mapping[ingre]['synonym']
            parent = dict_ingre_mapping[ingre]['parent']
            child = dict_ingre_mapping[ingre]['child']
            all_possible_names = short + synonym + child + parent
            # remove the names that overlap with recipe-ingre,
            # e.g. "chocolate" is short for "chocolate chip", but "chocolate" could also be a
            #      recipe-ingre; in that case "chocolate" is not rewritten to "chocolate chip"
            all_possible_names = list(set(all_possible_names) - set(recipe))
            all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True)  # longest first
            for name in all_possible_names:
                text = substitute_phrase(text, name.replace('_', ' '), ingre)

    # Step 4:
    # for the remaining unmatched nonrecipe-ingre, check whether other names (from dict) are in
    # the review and replace them with the full name (synonyms and children only)
    for ingre in non_recipe_ingre_not_matched:
        if ingre in dict_ingre_mapping:  # if ingre has other names in dict
            synonym = dict_ingre_mapping[ingre]['synonym']
            child = dict_ingre_mapping[ingre]['child']
            all_possible_names = synonym + child
            all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True)  # longest first
            for name in all_possible_names:
                if name not in recipe:  # in case the name is itself a recipe-ingre
                    text = substitute_phrase(text, name.replace('_', ' '), ingre)

    return text

In [11]:
# Two spot checks of standardize_ingredient_in_review; the trailing comment is the author's
# verdict on the printed result.
# Case 1: "chocolate" is rewritten to the recipe ingredient chocolate_ice_cream, and the
# non-recipe "strawberry ice cream" is concatenated.
print(standardize_ingredient_in_review("i do not like chocolate, so I use strawberry ice cream.", 
                                       ['chocolate_ice_cream', 'salt'])) # correct
# Case 2: "apple" is rewritten to the recipe ingredient dried_apple — a known over-match.
print(standardize_ingredient_in_review('i use apple', ['white_chocolate_chip','dried_apple'])) # incorrect

i do not like chocolate_ice_cream, so I use strawberry_ice_cream.
i use dried_apple


In [12]:
# Inspect one entry of the mapping dict (alias lists keyed by relation type).
dict_ingre_mapping['bean']

defaultdict(list, {'child': ['soybean'], 'parent': [], 'synonym': []})

In [13]:
# Run the pipeline over all reviews. This appends to the module-level result lists, so re-run
# the cell that initialises those lists before running this cell a second time.
text_processing(df)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [14]:
# One row per review: the original review columns next to everything text_processing() collected.
result_df = pd.DataFrame(dict(
    recipe_id=recipe_id_list,
    review_id=review_id_list,
    review_text=review_list,
    clean_text=clean_text_list,
    recipe=recipe_list,
    ingre_flag=contain_ingre_flag_list,
    ingre_in_recipe=ingre_in_recipe_list,
    ingre_not_recipe=ingre_not_recipe_list,
    ngram_in=ngram_in_list,
    ngram_not=ngram_not_list,
    addition_flag=addition_flag_list,
    deletion_flag=deletion_flag_list,
    replacement_flag=replacement_flag_list,
    real_addition_terms=real_addition_list,
    false_addition_terms=false_addition_list,
    real_deletion_terms=real_deletion_list,
    false_deletion_terms=false_deletion_list,
    replacement_terms=replacement_list,
    new_ingre_not_matched=new_ingre_not_matched_list,
))

# Only reviews mentioning at least one ingredient are kept for export
# (the full result_df is not written to disk).
review_with_ingre_df = result_df[result_df['ingre_flag'].eq(True)]

# Left-join the week-2 result on review_id, then drop the week-2 columns that duplicate this
# week's columns or are not needed.
merged_with_w2_result = (
    review_with_ingre_df
    .merge(w2_result, how='left', on='review_id', suffixes=['', '_w2'])
    .drop(columns=['flag', 'ngram_in_w2', 'ngram_not_w2',
                   'Unnamed: 0', 'recipe_id_w2', 'review_text_w2',
                   'clean_text_w2', 'recipe_w2'])
)

merged_with_w2_result.to_csv('/Users/nessyliu/Desktop/RA/part_2/result/review_altering_with_ingre_df_w7.csv')

In [15]:
# Number of reviews that mention at least one ingredient, then the number of reviews processed.
print(len(review_with_ingre_df))
print(len(result_df))

7446
10000


In [16]:
# Flatten the per-review replacement terms into one list for inspection.
list(itertools.chain.from_iterable(replacement_list))

['instead_of onion i use onion',
 'not like onion so i use onion_powder',
 'use some fresh garlic_powder and beef_stock instead_of water',
 'use beef_broth instead_of water',
 'use 1 beer and beef_broth instead_of water',
 'omit the liquid_smoke flavoring and substitute an 18oz bottle of sweet baby ray original barbeque_sauce',
 'not have can tomato on hand i use spaghetti_sauce',
 'use a mexican_cheese_blend instead_of mozzarella_cheese',
 'use american_cheese instead_of mozzarella_cheese',
 'not like cooked tomato so i have to add some water',
 'use cheddar_cheese instead_of mozzarella_cheese',
 'substitute condensed_cream_of_mushroom_soup of luncheon_meat',
 'use macaroni instead_of white_rice',
 'use ground stew ground_beef instead_of ground_beef',
 'substitute lime for lemon_juice',
 'replace brandy for gin and bitter for dry_vermouth',
 'substitute vodka for the gin',
 'not have any potato flake so i use bread_crumb',
 'instead_of seasoning with salt and pepper',
 'not have any s

In [17]:
# Flatten the per-review real deletion terms into one list for inspection.
list(itertools.chain.from_iterable(real_deletion_list))

['omit the water',
 'leave_out the barbeque_sauce',
 'not have any beer',
 'not add barbeque_sauce',
 'not have beer',
 'not put any barbeque_sauce',
 'use no water',
 'not use much liquid_smoke',
 'not have any liquid_smoke',
 'not use dice tomato',
 'not put the mozzarella_cheese',
 'not use tomato',
 'not put any mozzarella_cheese',
 'leave_out the white_rice',
 'not add tomato',
 'eliminate the salt',
 'not have ground_nutmeg',
 'not have saltine_cracker or potato',
 'not have enough oil_for_frying',
 'not use potato',
 'not have any paprika',
 'have no potato',
 'not use the butter',
 'not have any fresh garlic',
 'omit the mushroom',
 'omit the celery',
 'not have the beef_broth',
 'leave_out the salt',
 'not add any ground_black_pepper',
 'leave_out the chili_powder',
 'leave_out the hot_sauce',
 'leave_out the tomato_juice',
 'not have that overpower tomato_juice',
 'omit the mushroom',
 'omit the ketchup',
 'not have any cherry_tomato',
 'not have any self_rising_flour',
 'not

In [18]:
# Flatten the per-review real addition terms into one list for inspection.
list(itertools.chain.from_iterable(real_addition_list))

['use a pork_roast',
 'use luncheon_meat',
 'use a 4 lb chuck_roast',
 'use kraft honey hickory liquid_smoke',
 'use a bottle of local honey',
 'add 3 cup of beef_broth',
 'use a chuck_roast',
 'use a can of club_soda water',
 'use this recipe for other_meat',
 'use an english chuck_roast',
 'use a chuck_roast',
 'use a venison',
 'use this recipe on a pork_roast',
 'use 1 c beef_broth',
 'use rotel dice tomato with chile_de_arbol_pepper',
 'add an onion',
 'use minute white_rice and alittle luncheon_meat',
 'use dice tomato with basil',
 'add something green like bell pepper',
 'add onion',
 'add one chopped onion',
 'put some cheddar_cheese',
 'add mince onion',
 'add chopped onion and green_bell_pepper',
 'add some milk',
 'put it in the tortilla pie_crust',
 'add chili_powder and onion',
 'add garlic',
 'add some jalapeno to topping',
 'add to it by saute onion',
 'add some pizza blend mozzarella_cheese',
 'add the onion',
 'put in a can of celery',
 'add some pepper and onion',
 '