In [1]:
import re
import numpy as np
import pandas as pd
import nltk
import unicodedata
import csv
import itertools
from tqdm.notebook import trange, tqdm
from nltk import word_tokenize, sent_tokenize, pos_tag, corpus
from nltk.stem import WordNetLemmatizer
from collections import defaultdict, Counter
import seaborn as sn
import matplotlib.pyplot as plt
import copy
from ast import literal_eval
import dill as pickle
import more_itertools as mit

In [2]:
# Functions for reading and saving Python objects

def save_obj(obj, name):
    """Pickle ``obj`` to ``OUTPUT_OF_FINAL_CODE/<name>.pkl``.

    Parameters
    ----------
    obj : object
        Any object the ``pickle`` module in scope (dill, imported as pickle) can serialise.
    name : str
        File name without the ``.pkl`` extension.
    """
    import os  # local import so this cell does not depend on an extra top-level import

    path = 'OUTPUT_OF_FINAL_CODE/' + name + '.pkl'
    # Create the output folder on first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``OUTPUT_OF_FINAL_CODE/<name>.pkl``."""
    # NOTE(review): unpickling can execute arbitrary code; only load files this project wrote.
    pkl_path = 'OUTPUT_OF_FINAL_CODE/' + name + '.pkl'
    with open(pkl_path, 'rb') as pkl_file:
        loaded = pickle.load(pkl_file)
    return loaded

In [3]:
# Prefix prepended to every ingredient token detected in a review, e.g. "INGRE_apple_juice"
prefix = 'INGRE_'

# NLTK resource name of the maxent treebank POS tagger. nltk.data.load() resolves it against
# every directory in nltk.data.path, so no machine-specific absolute path is needed.
# If the tagger is missing, download it once with: nltk.download('maxent_treebank_pos_tagger')
path_to_nltk_tagger = "taggers/maxent_treebank_pos_tagger/english.pickle"

## Read in the data files and some data pre-processing

In [4]:
# Read in the datasets (paths are relative to the notebook's working directory)
df = pd.read_csv('input_data/AllReviews_26thNov2019.csv')  # presumably the review texts -- TODO confirm schema
df_ingredients_raw = pd.read_csv('input_data/Ingredients.csv')  # provides the recipe_id / ingredient_id columns used below
df_cluster = pd.read_excel('input_data/Cluster_names.xlsx')  # provides the ingredient_id / cluster_name columns used below

# load the pickled mapping dict: cluster name -> dict with 'short', 'parent', 'synonym' and 'child' name lists
dict_ingre_mapping = load_obj('dict_ingre_mapping')

In [5]:
# Cluster names in their original spaced form (e.g. "apple juice"), ordered by word count,
# longest first, so that "apple juice" appears before "juice"
cluster_name_orig_list = sorted(
    df_cluster.cluster_name.tolist(),
    key=lambda name: len(name.split()),
    reverse=True,
)

# recipe_id -> list of its ingredient_ids
df_ingredients = (
    df_ingredients_raw
    .groupby('recipe_id')['ingredient_id']
    .apply(list)
    .reset_index(name='ingredients')
)
dict_recipe_ingredients = dict(zip(df_ingredients['recipe_id'], df_ingredients['ingredients']))

# ingredient_id -> cluster name; spaces are replaced by underscores across df_cluster first
df_cluster = df_cluster.replace(' ', '_', regex=True)
dict_ingredient_clustername = dict(zip(df_cluster['ingredient_id'], df_cluster['cluster_name']))

# Cluster names in underscore form (e.g. apple_juice), longest first, so that checking
# for 'apple' is not confused with 'apple juice'
cluster_name_list = sorted(
    df_cluster['cluster_name'].tolist(),
    key=lambda name: len(name.split('_')),
    reverse=True,
)

# underscore form -> original spaced form, e.g. 'apple_juice' -> 'apple juice'
dict_ingre_concat_to_origin = {
    concat_name: concat_name.replace('_', ' ') for concat_name in cluster_name_list
}

In [6]:
# Map each recipe id to the cluster names of its ingredients, sorted so that names with
# more '_'-separated words come first.
dict_recipe_to_name = {}
for recipe_id in dict_recipe_ingredients.keys():
    try:
        recipe = [dict_ingredient_clustername[ingre_id] for ingre_id in dict_recipe_ingredients[recipe_id]]
    except KeyError:
        # At least one ingredient id of this recipe has no cluster name: skip the recipe.
        # Only the lookup is guarded, so any other error surfaces instead of being swallowed.
        continue
    recipe.sort(key=lambda x: len(x.split('_')), reverse=True)
    dict_recipe_to_name[recipe_id] = recipe

## Auxiliary functions - to be called in the main text processing function

In [7]:
# Negation contractions (lower-case) paired with their expanded form.
# The pair order is the dict's key order.
CONTRACTION_MAP = dict([
    ("ain't", "is not"),
    ("aren't", "are not"),
    ("can't", "can not"),
    ("cannot", "can not"),
    ("couldn't", "could not"),
    ("couldn't've", "could not have"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("hadn't", "had not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("isn't", "is not"),
    ("mayn't", "may not"),
    ("mightn't", "might not"),
    ("mustn't", "must not"),
    ("needn't", "need not"),
    ("shan't", "shall not"),
    ("sha'n't", "shall not"),
    ("shouldn't", "should not"),
    ("shouldn't've", "should not have"),
    ("wasn't", "was not"),
    ("weren't", "were not"),
    ("won't", "will not"),
    ("wouldn't", "would not"),
])

In [8]:
# Container / measure nouns stripped by lemmatization() when they appear as "<number> <noun> of".
# Order matters: a multi-word entry must come before the words it contains
# ('fluid ounce' before 'ounce', 'can or bottle' before 'bottle' and 'can').
# Each noun is listed once ('bottle' and 'container' used to be duplicated).
amount_nouns = ['cup', 'teaspoon', 'fluid ounce', 'ounce',
                'tablespoon', 'can or bottle', 'bottle',
                'can', 'package', 'pound', 'inch', 'jar',
                'packet', 'carton', 'milliliter',
                'box', 'bunch', 'tube', 'container',
                'jigger', 'quart', 'slice', 'liter', 'pouch',
                'bag', 'pint', 'bar', 'gallon']
# Unit abbreviations stripped by lemmatization() together with any number in front of them
amount_units = ['lb', 'oz']

# lemmatization dict: custom word -> lemma overrides read from input_data/lemma_list.csv
# (column 'word_list' holds the word, column 'lemma_list' its lemma);
# lemmatization() applies it before the WordNet lemmatizer
lemma_list = pd.read_csv('input_data/lemma_list.csv')
lemma_dict = lemma_list.set_index('word_list').to_dict()['lemma_list']
def lemmatization(text):
    """Normalise a piece of text, lemmatize it word by word and strip quantities.

    The text is lower-cased, a fixed list of spelling variants is rewritten
    (e.g. 'lowfat' -> 'low fat'), every token is mapped through the custom
    ``lemma_dict`` and then through NLTK's WordNetLemmatizer, and finally
    quantity expressions and all remaining digits are removed.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The processed text, tokens separated by single spaces.
    """
    # words that are forced not to lemmatize, those are the words appearing in cluster names
    force_keep = frozenset([
        'corned', 'sparkling', 'canning', 'roasted', 'baked', 'processed', 'flavored',
        'colored', 'candied', 'stuffing', 'dressing', 'shortening', "pig's", 'based',
        'stewed', 'curing', 'decorating', 'coated', 'evaporated', 'pickled', 'fried',
        'dripping', 'rising', "confectioners'", 'frying', 'coating', 'smoked', 'seasoned',
        'rolled', 'filling', "devil's", 'sweetened', 'dried', 'pickling', 'topping', 'frosting',
        'coloring', 'rose', 'pulled', 'crystallized', 'seasoning', 'whipped', 'condensed', 'baking',
        'frenchfries', 'fries', 'flavoring',
    ])

    # Literal rewrites, applied strictly in this order (e.g. '-n-' must be handled before '-',
    # and ' & ' before '&').
    replacements = (
        ('-n-', ' and '),
        (' & ', ' and '),
        ('&', ' and '),
        ('-', ' '),
        ('/', ''),
        ('sugar substitute', 'sweetener'),
        ('french fries', 'frenchfries'),
        ('dry milk', 'milk powder'),
        ('powder milk', 'milk powder'),
        ('lowfat', 'low fat'),
        ('nonfat', 'non fat'),
        ('glutenfree', 'gluten free'),
        ('corn flakes', 'cornflakes'),
        ('flaxseed', 'flax seed'),
        ('lemongrass', 'lemon grass'),
        ('coconutmilk', 'coconut milk'),
        ('almondmilk', 'almond milk'),
        ('crab meat', 'crabmeat'),
        ('starfruit', 'star fruit'),
        ('breadcrumb', 'bread crumb'),
        ('red and yellow bell pepper', 'red bell pepper and yellow bell pepper'),
        ('red and green bell pepper', 'red bell pepper and green bell pepper'),
        ('used to', ''),
    )

    text = text.lower()
    for old, new in replacements:
        text = text.replace(old, new)

    # use the custom lemma dict first
    word_list = [str(lemma_dict.get(word, word)) for word in word_tokenize(text)]

    # then use the WordNetLemmatizer from nltk
    wnl = WordNetLemmatizer()
    word_list_after = []
    for word in word_list:
        # tag word by word (otherwise the tag may differ) and lemmatize according to the
        # first letter of the tag, except for the words in the force-keep list
        _, t = pos_tag([word])[0]
        pos = t[0].lower()
        if pos in ('a', 'n', 'v') and word not in force_keep:
            word = wnl.lemmatize(word, pos)
        word_list_after.append(word)

    text = ' '.join(word_list_after)

    # remove the terms indicating amount, e.g. "2 to 3 cup of beer"
    for amount_noun in amount_nouns:
        text = re.sub(r'([0-9]* to )*[0-9]* ' + amount_noun + r' of', '', text)

    # remove more terms indicating amount, e.g. "use a 4 lb roast" or "2oz beer".
    # The unit must not be glued to a preceding letter and must end at a word boundary
    # (optional plural "s"); otherwise "oz"/"lb" would also be cut out of ordinary words
    # such as "frozen", "mozzarella" or "bulb".
    # NOTE(review): artifacts produced with the old, over-eager pattern may need regenerating.
    for amount_unit in amount_units:
        text = re.sub(r'([0-9]* to )*[0-9]*( )*(?<![^\W\d])' + amount_unit + r's?\b', '', text)

    # remove all numbers
    text = re.sub(r'[0-9]', '', text)

    text = ' '.join(text.split())  # replace multiple spaces as single space

    return text

In [9]:
# sanity checks

# print(lemmatization('I used 2 to 3 cups of beer together with the milk'))
# print(lemmatization('I used 3 cups of beer together with the milk'))
# print(lemmatization('I used cups of beer together with the milk'))
# print(lemmatization('I used a 4 to 5 lb chuck roast and 2oz beer'))

In [10]:
def deal_with_ingredient(text, recipe):
    """Collect the ingredient tokens of a review and split them by recipe membership.

    Parameters
    ----------
    text : str
        Review text in which ingredient mentions carry the 'INGRE_' marker.
    recipe : container of str
        Ingredient names (without marker) of the reviewed recipe.

    Returns
    -------
    tuple
        ``(flag, in_recipe, not_in_recipe)``: ``flag`` is True when at least one marked
        token was found; the two lists hold the distinct detected ingredients that are,
        respectively are not, in ``recipe`` (marker removed).
    """
    marker = 'INGRE_'

    # tokenize sentence by sentence and flatten into one token stream
    tokens = [tok
              for sent in nltk.sent_tokenize(text)
              for tok in nltk.word_tokenize(sent)]

    # keep the marked tokens only, without the marker
    detected_ingredients = [tok.replace(marker, '') for tok in tokens if marker in tok]
    flag = len(detected_ingredients) > 0

    ingre_in_recipe = [ingre for ingre in detected_ingredients if ingre in recipe]
    ingre_not_recipe = [ingre for ingre in detected_ingredients if ingre not in recipe]

    return flag, list(set(ingre_in_recipe)), list(set(ingre_not_recipe))

In [11]:
def concat_phrase(text):
    """Join selected two-word phrases with '_' so each becomes a single token.

    The phrases are only replaced when they appear as whole words.
    """
    key_phrases = ('instead of', 'rather than', 'leave out', 'leave off', 'bother with')
    for phrase in key_phrases:
        text = re.sub(r'\b' + phrase + r'\b', phrase.replace(' ', '_'), text)
    return text


def normalize_accented_characters(text):
    """
    Normalise Unicode text: compose it to NFC, then map curly quotes to straight
    quotes, the en dash to '-' and the dotless 'ı' to 'I'.
    """
    char_map = str.maketrans({
        "“": '"',
        "”": '"',
        "’": "'",
        "‘": "'",
        '–': '-',
        'ı': 'I',
    })
    # NFC (not NFKD) is used, so accented letters stay accented but become single code points
    return unicodedata.normalize('NFC', text).translate(char_map)


def expand_contractions(text, contraction_mapping):
    """
    Expand contractions in case of negations, e.g. isn't -> is not, then drop every
    remaining apostrophe.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Contraction -> expansion. Matching is case-insensitive; the first character of
        the matched text is kept so that "Don't" becomes "Do not".

    Returns
    -------
    str
        The text with contractions expanded and apostrophes removed.
    """
    # Longest keys first: in a regex alternation the first alternative that matches wins,
    # so "couldn't've" has to be tried before its prefix "couldn't".
    keys = sorted((key for key in contraction_mapping.keys() if key), key=len, reverse=True)

    expanded_text = text
    if keys:  # an empty alternation would match the empty string everywhere
        contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                                          flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            expanded_contraction = contraction_mapping.get(match) \
                if contraction_mapping.get(match) \
                else contraction_mapping.get(match.lower())
            if not expanded_contraction:
                # matched case-insensitively but no usable expansion: leave the text as it is
                return match
            return first_char + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, text)

    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


def replacement_extraction(text, recipe):
    """
    extract Replacement terms

    The text is split into sentences, each sentence is tagged with ``tagger`` and
    chunked with ``key_phrases_cp``; every REPLACEMENT_TERM chunk is turned back into
    a plain string of its words.

    Parameters
    ----------
    text : str
        Review text.
    recipe : list
        Not used by this function; kept so the signature matches the other extractors.

    Returns
    -------
    list of str
        The distinct replacement phrases found (in no particular order).
    """
    sentences = nltk.sent_tokenize(text)
    tokens_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tagged_sentences = [tagger.tag(sent) for sent in tokens_sentences]
    DESC_list = []
    for sent_tagged in tagged_sentences:  # each sentence
        sent_tree = key_phrases_cp.parse(sent_tagged)  # chunk each sentence based on grammar structure
        for subtree in sent_tree.subtrees():
            if subtree.label() == "REPLACEMENT_TERM":
                # str(subtree) looks like "(REPLACEMENT_TERM word/TAG word/TAG ...)", possibly
                # wrapped over several lines; strip it down to the bare words.
                # Raw strings keep the regex escapes valid (no invalid-escape warnings).
                string = str(subtree)
                string = re.sub('\n', '', string)
                string = re.sub(r'\s+', ' ', string)
                string = re.sub('REPLACEMENT_TERM ', '', string)
                string = re.sub(r'\/[A-Z]+\$*', '', string)  # drop the "/TAG" suffixes
                string = re.sub(r'\(', '', string)
                string = re.sub(r'\)', '', string)

                DESC_list.append(string)
    DESC_list = list(set(DESC_list))
    return DESC_list


def addition_extraction(text, recipe):
    """
    extract Addition terms

    Every ADDITION_TERM chunk found in ``text`` is classified by the ingredient tokens
    it contains: if all of them belong to ``recipe`` the phrase is a "false" addition
    (e.g. a modified quantity), otherwise it is a "real" addition.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(false_additions, real_additions)``, each without duplicates.
    """
    prefix_recipe = [prefix + ingre for ingre in recipe]
    false_additions = []
    real_additions = []

    for sent in nltk.sent_tokenize(text):
        sent_tagged = tagger.tag(nltk.word_tokenize(sent))
        # chunk the sentence based on the grammar structure
        sent_tree = key_phrases_cp.parse(sent_tagged)
        for subtree in sent_tree.subtrees():
            if subtree.label() != "ADDITION_TERM":
                continue
            leaves = subtree.leaves()
            phrase = ' '.join(word for word, _ in leaves)
            # True when no ingredient word of this term is missing from the recipe
            all_in_recipe = all(word in prefix_recipe
                                for word, tag in leaves
                                if tag == 'INGREDIENT')
            if all_in_recipe:
                false_additions.append(phrase)
            else:
                real_additions.append(phrase)

    return list(set(false_additions)), list(set(real_additions))


def deletion_extraction(text, recipe):
    """
    extract Deletion terms

    Every DELETION_TERM chunk found in ``text`` is classified by the ingredient tokens
    it contains: if any of them is not part of ``recipe`` the phrase is a "false"
    deletion (e.g. "It's good that the recipe contains no milk cuz I don't have any
    milk"), otherwise it is a "real" deletion.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(false_deletions, real_deletions)``, each without duplicates.
    """
    prefix_recipe = [prefix + ingre for ingre in recipe]
    false_deletions = []
    real_deletions = []

    for sent in nltk.sent_tokenize(text):
        sent_tagged = tagger.tag(nltk.word_tokenize(sent))
        # chunk the sentence based on the grammar structure
        sent_tree = key_phrases_cp.parse(sent_tagged)
        for subtree in sent_tree.subtrees():
            if subtree.label() != "DELETION_TERM":
                continue
            leaves = subtree.leaves()
            phrase = ' '.join(word for word, _ in leaves)
            # True when at least one ingredient word of this term is not in the recipe
            some_not_in_recipe = any(word not in prefix_recipe
                                     for word, tag in leaves
                                     if tag == 'INGREDIENT')
            if some_not_in_recipe:
                false_deletions.append(phrase)
            else:
                real_deletions.append(phrase)

    return list(set(false_deletions)), list(set(real_deletions))



# Custom word -> tag lookup used for POS tagging; words that are not listed are tagged by
# the NLTK tagger loaded from `path_to_nltk_tagger`, which serves as the backoff
custom_tagger = {
    # replacement
    **dict.fromkeys(('replace', 'substitute', 'instead_of', 'rather_than'), 'REPLACEMENT'),
    # deletion
    **dict.fromkeys(('delete', 'remove', 'omit', 'subtract', 'skip', 'eliminate'), 'DELETION'),
    # 'discard': 'DELETION',
    **dict.fromkeys(('leave_out', 'leave_off'), 'DELETION'),
    'bother_with': 'BOTHERWITH',
    # addition
    **dict.fromkeys(('add', 'use', 'put', 'incorporate'), 'ADDITION'),
    # other tags
    'no': 'NO',
    'not': 'NOT',
    'that': 'DT',
    'with': 'IN',
    'even': 'EVEN',
    'have': 'HAVE',
    'like': 'LIKE',
    # punctuations
    ',': 'PUNC',
}

# every prefixed cluster name is tagged as an ingredient
ingredient_tagger = {prefix + cluster_name: 'INGREDIENT' for cluster_name in cluster_name_list}

# ingredient entries are applied after the custom ones
custom_tagger_combined = dict(custom_tagger)
custom_tagger_combined.update(ingredient_tagger)

default_tagger = nltk.data.load(path_to_nltk_tagger)
tagger = nltk.tag.UnigramTagger(model=custom_tagger_combined, backoff=default_tagger)

# Chunk grammar for nltk.RegexpParser, matched against the tags produced by `tagger`.
# It defines three chunk labels -- REPLACEMENT_TERM, DELETION_TERM and ADDITION_TERM --
# which the *_extraction functions look up through subtree.label().
key_phrases_grammar = r"""
    REPLACEMENT_TERM:
        # e.g. don't have/like apple, so ADDITION(add/use/put/...) banana
        {<NOT>(<HAVE>|<LIKE>)<[A-Z]+>{0,4}<INGREDIENT><[A-Z]+>{0,4}<ADDITION><[A-Z]+>{0,4}<INGREDIENT>}
        
        # e.g. have no apple so ADDITION(add/use/put/...) banana
        {<HAVE><NO><INGREDIENT><[A-Z]+>{0,4}<ADDITION><[A-Z]+>{0,4}<INGREDIENT>}
        
        # e.g. ADDITION(add/use/put/...) banana cuz don't have/like apple
        {<ADDITION><[A-Z]+>{0,4}<INGREDIENT><[A-Z]+>{0,4}<NOT>(<HAVE>|<LIKE>)<[A-Z]+>{0,4}<INGREDIENT>}
        
        # e.g. ADDITION(add/use/put/...) banana cuz have no apple
        {<ADDITION><[A-Z]*>{0,4}<INGREDIENT><[A-Z]*>{0,4}<HAVE><NO><INGREDIENT>}
        
        # e.g. ADDITION(add/use/put/...) banana instead of apple
        {<ADDITION><[A-Z]*>{0,4}<INGREDIENT><REPLACEMENT><INGREDIENT>}
        
        # e.g. instead of apple I ADDITION(add/use/put/...) banana 
        {<REPLACEMENT><INGREDIENT><[A-Z]*>{0,4}<ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
        
        # e.g. replace the Apple with some Banana
        {<REPLACEMENT><[A-Z]*>{0,4}<INGREDIENT><IN><DT>?<[A-Z]*>{0,4}<INGREDIENT>}
        
        # e.g. didn't use Apple, replace with Banana
        {<INGREDIENT><[A-Z]*>{0,4}<REPLACEMENT><IN><[A-Z]*>{0,4}<INGREDIENT>}
        
        # <DELETION> ingre and <REPLACEMENT> ingre
        # e.g. I omitted apple and substitute with banana
        {<DELETION><[A-Z]*>{0,4}<INGREDIENT><[A-Z]*>{0,4}<REPLACEMENT><[A-Z]*>*<INGREDIENT>}
        
    DELETION_TERM:
        ### Extract negation with "have" 
        # not have + ingredient"
        # e.g. "I didn't have any Beef"
        {<NOT><HAVE><[A-Z]*>{0,4}<INGREDIENT>}
        
        # "have no + ingredient"
        # e.g. "I had no Beef"
        {<HAVE><NO><INGREDIENT>}
        
        ### Extract negation with addition words
        # e.g. "I didn't add any water", "I use no milk"
        {<NOT><ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
        {<ADDITION><NO><INGREDIENT>}
        
        ### Extract "deletion + ingredient"
        # e.g. "I removed Apple", "I deleted the Beef", "I left off the Beef"
        {<DELETION><[A-Z]*>{0,3}<INGREDIENT>}
        
        ### Extract negation terms with "bother with"
        # e.g. "I didn't bother with the Beef"
        {<NOT><BOTHERWITH><[A-Z]*>*<INGREDIENT>}      
    
    ADDITION_TERM:
        # e.g. I added {0-4 words such as "some", "a bit"} Milk
        {<ADDITION><[A-Z]*>{0,4}<INGREDIENT>}
"""
# parser that applies the grammar above to one tagged sentence at a time
key_phrases_cp = nltk.RegexpParser(key_phrases_grammar)

In [12]:
def standardize_ingredient_in_review(text, recipe, seq=3):
    '''
    Standardize all mentionings of ingre in the review to its cluster name
    (including short/parent/child/synonym name matching in mapping dict)

    Example:
    Input text: I omit chocolate chip.
    Recipe: [white_chocolate_chip, others...], while both "white chocolate chip" & "chocolate chip" are cluster names
    Output text: I omit white_chocolate_chip.
    
    Also output the mappings that happened in the text.
    
    Sub-functions that can be freely combined in different sequences (some default seq (i.e. seq 1,2,3) has been set):
        1. full_recipe_ingre_match
        2. full_nonrecipe_ingre_match
        merged 1,2: full_ingre_match
        3. partial_recipe_ingre_match
        4. synonym_recipe_ingre_match
        5. parent_recipe_ingre_match
        6. child_recipe_ingre_match
        7. synonym_nonrecipe_ingre_match
        8. child_nonrecipe_ingre_match
        merged 4,5,6: synonym_child_parent_recipe_ingre_match
        merged 3,4,5,6: partial_synonym_child_parent_recipe_ingre_match
        merged 7,8: synonym_child_nonrecipe_ingre_match
    '''
    
    # list of full-match ingre in this text
    full_match_in_this_text = []
    # list of pairs of other matches (partial, synonym, parent, child names) happen in this text
    partial_match_in_this_text = []
    synonym_match_in_this_text = []
    parent_match_in_this_text = []
    child_match_in_this_text = []
    # list of detected new spice
    new_spice_ls = []
    
    # keep (and sort) a list of nonrecipe-ingre that are not matched in the review text yet
    non_recipe_ingre_not_matched = list(set(cluster_name_list) - set(recipe))
    non_recipe_ingre_not_matched.sort(key=lambda x: len(x.split('_')), reverse=True)
    
    def detect_new_spice(text):
        name = 'spice'
        tokens = text.split()
        word_before_spice = tokens[tokens.index(name) - 1]
        if prefix in word_before_spice: # if the word before "spice" is an INGRE
            new_spice = word_before_spice + '_' + name
            text = re.sub(r'\b' + word_before_spice +' ' + name + r'\b', 'INGRE_mixed_spice', text)
            new_spice_ls.append(new_spice)
        return text
    
    def full_recipe_ingre_match(text = text):
        # concatenate the full-match recipe-ingre with '_' in review
        # since the recipe is sorted, longest ingredient will be matched first
        # e.g. "white chocolate chip" will be detected and replaced by "white_chocolate_chip", not "chocolate_chip"
        for ingre in recipe:
            ingre_origin = dict_ingre_concat_to_origin[ingre]
            if re.search(r'\b' + ingre_origin + r'\b', text) is not None: # a (whole-word) full-match exist
                text = re.sub(r'\b' + ingre_origin + r'\b', prefix+ingre, text) # replace with concatenated version
                full_match_in_this_text.append(ingre)
        return text
    
    def full_nonrecipe_ingre_match(text = text, non_recipe_ingre_not_matched = non_recipe_ingre_not_matched):
        # concatenate the full-match nonrecipe-ingre in the review
        for ingre in non_recipe_ingre_not_matched:
            ingre_origin = dict_ingre_concat_to_origin[ingre]
            if re.search(r'\b' + ingre_origin + r'\b', text) is not None: # a (whole-word) full-match exist
                text = re.sub(r'\b' + ingre_origin + r'\b', prefix+ingre, text) # replace with concatenated version
                non_recipe_ingre_not_matched.remove(ingre)
                full_match_in_this_text.append(ingre)    
        return text
    
    def full_ingre_match(text, non_recipe_ingre_not_matched = non_recipe_ingre_not_matched):
        # concatenate the full-match ingre (both recipe and non-recipe ingre) in the review from the longest
        for ingre in cluster_name_list:
            ingre_origin = dict_ingre_concat_to_origin[ingre]
            if re.search(r'\b' + ingre_origin + r'\b', text) is not None: # a (whole-word) full-match exist
                text = re.sub(r'\b' + ingre_origin + r'\b', prefix+ingre, text) # replace with concatenated version
                if ingre in non_recipe_ingre_not_matched:
                    non_recipe_ingre_not_matched.remove(ingre)
                full_match_in_this_text.append(ingre)    
        return text
            
    
    def partial_recipe_ingre_match(text = text):
        # for recipe-ingredients only, check whether short names exists in review,
        # if yes then replace it with cluster name
        for ingre in recipe:
            if ingre in dict_ingre_mapping.keys(): # if ingre has other names in dict
                short = dict_ingre_mapping[ingre]['short']
                all_possible_names = short
                # remove the names that are overlaped with recipe-ingre
                # e.g. "chocolate" is short for "chocolate chip", but "chocolate" could also be a recipe-ingre,
                #       in this case, won't replace "chocolate" in review with "chocolate chip"
                all_possible_names = list(set(all_possible_names) - set(recipe)) 
                all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True) # sort all names with length
                for name in all_possible_names:
                    name_origin = name.replace('_',' ')
                    if re.search(r'\b' + name_origin + r'\b', text) is not None: # if this name (whole-word) found in text
                        if name == 'spice':
                            text = detect_new_spice(text)
                        else:
                            # replace this name with concatenated cluster name
                            text = re.sub(r'\b' + name_origin + r'\b', prefix+ingre, text) 
                            partial_match_in_this_text.append([name, ingre])
        return text
                        
    def parent_recipe_ingre_match(text = text):
        # for recipe-ingredients only, check whether parent names exists in review,
        # if yes then replace this parent name with its cluster name
        for ingre in recipe:
            if ingre in dict_ingre_mapping.keys(): # if ingre has other names in dict
                parent = dict_ingre_mapping[ingre]['parent']
                all_possible_names = parent
                # remove the names that are overlaped with recipe-ingre
                all_possible_names = list(set(all_possible_names) - set(recipe)) 
                all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True) # sort all names with length
                for name in all_possible_names:
                    name_origin = name.replace('_',' ')
                    if re.search(r'\b' + name_origin + r'\b', text) is not None: # if this name (whole-word) found in text
                        if name == 'spice':
                            text = detect_new_spice(text)
                        else:
                            # replace this name with concatenated cluster name
                            text = re.sub(r'\b' + name_origin + r'\b', prefix+ingre, text) 
                            parent_match_in_this_text.append([name, ingre])
        return text
    
    def synonym_recipe_ingre_match(text = text):
        # for recipe-ingredients only, check whether synonym names exists in review,
        # if yes then replace it with cluster name
        for ingre in recipe:
            if ingre in dict_ingre_mapping.keys(): # if ingre has other names in dict
                synonym = dict_ingre_mapping[ingre]['synonym']
                all_possible_names = synonym
                # remove the names that are overlaped with recipe-ingre
                all_possible_names = list(set(all_possible_names) - set(recipe)) 
                all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True) # sort all names with length
                for name in all_possible_names:
                    name_origin = name.replace('_',' ')
                    if re.search(r'\b' + name_origin + r'\b', text) is not None: # if this name (whole-word) found in text
                        if name == 'spice':
                            text = detect_new_spice(text)
                        else:
                            # replace this name with concatenated cluster name
                            text = re.sub(r'\b' + name_origin + r'\b', prefix+ingre, text) 
                            synonym_match_in_this_text.append([name, ingre])
        return text
    
    
    def child_recipe_ingre_match(text = text):
        # for recipe-ingredients only, check whether child names exists in review,
        # if yes then replace this child name with its cluster name
        for ingre in recipe:
            if ingre in dict_ingre_mapping.keys(): # if ingre has other names in dict
                child = dict_ingre_mapping[ingre]['child']
                all_possible_names = child
                # remove the names that are overlaped with recipe-ingre
                all_possible_names = list(set(all_possible_names) - set(recipe)) 
                all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True) # sort all names with length
                for name in all_possible_names:
                    name_origin = name.replace('_',' ')
                    if re.search(r'\b' + name_origin + r'\b', text) is not None: # if this name (whole-word) found in text
                        if name == 'spice':
                            text = detect_new_spice(text)
                        else:
                            # replace this name with concatenated cluster name
                            text = re.sub(r'\b' + name_origin + r'\b', prefix+ingre, text) 
                            child_match_in_this_text.append([name, ingre])
        return text
    
    
    def child_nonrecipe_ingre_match(text = text, non_recipe_ingre_not_matched = non_recipe_ingre_not_matched):
    # for remained unmatched nonrecipe-ingre
    # check whether its child names are in the review and replace with its cluster name
        for ingre in non_recipe_ingre_not_matched:
            if ingre in dict_ingre_mapping.keys(): # if ingre has other names in dict
                child = dict_ingre_mapping[ingre]['child']
                all_possible_names = child
                all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True) # sort all names with length
                for name in all_possible_names:
                    if name not in recipe: # in case the name is a recipe-ingre
                        name_origin = name.replace('_',' ')
                        if re.search(r'\b' + name_origin + r'\b', text) is not None: # if this name (whole-word) found in text
                            if name == 'spice':
                                text = detect_new_spice(text)
                            else:
                                # replace its names in dict with full-length cluster name
                                text = re.sub(r'\b' + name_origin + r'\b', prefix+ingre, text)
                                child_match_in_this_text.append([name, ingre])
        return text
    
    
    def synonym_nonrecipe_ingre_match(text = text, non_recipe_ingre_not_matched = non_recipe_ingre_not_matched):
        """
        For the remaining unmatched nonrecipe-ingre: check whether one of its synonym
        names is in the review and replace it with the prefixed cluster name.

        Synonyms are tried from the most words to the fewest. A synonym that is itself
        a recipe-ingre is skipped. The name 'spice' is handed to detect_new_spice
        instead of being replaced. Every replacement is recorded in
        synonym_match_in_this_text as [name, ingre].

        Returns the (possibly rewritten) review text.
        """
        for ingre in non_recipe_ingre_not_matched:
            if ingre not in dict_ingre_mapping: # ingre has no other names in dict
                continue
            # sorted() works on a copy, so the synonym list stored in dict_ingre_mapping
            # is no longer reordered as a side effect of matching
            all_possible_names = sorted(dict_ingre_mapping[ingre]['synonym'],
                                        key=lambda x: len(x.split('_')), reverse=True)
            for name in all_possible_names:
                if name in recipe: # the name is a recipe-ingre, leave it alone
                    continue
                name_origin = name.replace('_',' ')
                # escape the name so characters such as '.', '+' or '(' are matched literally
                pattern = r'\b' + re.escape(name_origin) + r'\b'
                if re.search(pattern, text) is None: # this name (whole-word) not found in text
                    continue
                if name == 'spice':
                    text = detect_new_spice(text)
                else:
                    # replace the name with the full-length cluster name; a function
                    # replacement inserts the string verbatim (no backslash processing)
                    replacement = prefix + ingre
                    text = re.sub(pattern, lambda _match: replacement, text)
                    synonym_match_in_this_text.append([name, ingre])
        return text
    
    
    def synonym_child_nonrecipe_ingre_match(text = text, non_recipe_ingre_not_matched = non_recipe_ingre_not_matched):
        """
        For the remaining unmatched nonrecipe-ingre: check whether one of its merged
        synonym & child names is in the review and replace it with the prefixed
        cluster name.

        The merged names are tried from the most words to the fewest. A name that is
        itself a recipe-ingre is skipped. The name 'spice' is handed to
        detect_new_spice instead of being replaced. Every replacement is recorded as
        [name, ingre] in synonym_match_in_this_text when the name is a synonym,
        otherwise in child_match_in_this_text.

        Returns the (possibly rewritten) review text.
        """
        for ingre in non_recipe_ingre_not_matched:
            if ingre not in dict_ingre_mapping: # ingre has no other names in dict
                continue
            synonym = dict_ingre_mapping[ingre]['synonym']
            child = dict_ingre_mapping[ingre]['child']
            # sort all names by number of words, longest first
            all_possible_names = sorted(synonym + child,
                                        key=lambda x: len(x.split('_')), reverse=True)
            for name in all_possible_names:
                if name in recipe: # the name is a recipe-ingre, leave it alone
                    continue
                name_origin = name.replace('_',' ')
                # escape the name so characters such as '.', '+' or '(' are matched literally
                pattern = r'\b' + re.escape(name_origin) + r'\b'
                if re.search(pattern, text) is None: # this name (whole-word) not found in text
                    continue
                if name == 'spice':
                    text = detect_new_spice(text)
                else:
                    # replace the name with the full-length cluster name; a function
                    # replacement inserts the string verbatim (no backslash processing)
                    replacement = prefix + ingre
                    text = re.sub(pattern, lambda _match: replacement, text)
                    if name in synonym:
                        synonym_match_in_this_text.append([name, ingre])
                    else:
                        child_match_in_this_text.append([name, ingre])
        return text
    
    
    def synonym_child_parent_recipe_ingre_match(text = text):
        """
        For recipe-ingre: check whether one of its merged synonym & child & parent
        names is in the review and replace it with the prefixed cluster name.

        Names that are themselves recipe-ingre are removed from the candidates, and the
        rest are tried from the most words to the fewest. The name 'spice' is handed to
        detect_new_spice instead of being replaced. Every replacement is recorded
        exactly once as [name, ingre], in synonym_match_in_this_text,
        child_match_in_this_text or parent_match_in_this_text depending on which list
        of the mapping the name came from (checked in that order).

        Returns the (possibly rewritten) review text.
        """
        recipe_names = set(recipe)
        for ingre in recipe:
            if ingre not in dict_ingre_mapping: # ingre has no other names in dict
                continue
            child = dict_ingre_mapping[ingre]['child']
            synonym = dict_ingre_mapping[ingre]['synonym']
            parent = dict_ingre_mapping[ingre]['parent']
            # remove duplicates and the names that overlap with recipe-ingre, keeping
            # first-seen order: going through set() made the order of equally long
            # names (and therefore which one is matched first) vary between runs
            all_possible_names = []
            for candidate in child + synonym + parent:
                if candidate not in recipe_names and candidate not in all_possible_names:
                    all_possible_names.append(candidate)
            all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True) # longest names first
            for name in all_possible_names:
                name_origin = name.replace('_',' ')
                # escape the name so characters such as '.', '+' or '(' are matched literally
                pattern = r'\b' + re.escape(name_origin) + r'\b'
                if re.search(pattern, text) is None: # this name (whole-word) not found in text
                    continue
                if name == 'spice':
                    text = detect_new_spice(text)
                else:
                    # replace this name with the concatenated cluster name; a function
                    # replacement inserts the string verbatim (no backslash processing)
                    replacement = prefix + ingre
                    text = re.sub(pattern, lambda _match: replacement, text)
                    # record the match once, in its own category only (it used to be
                    # appended to the child matches unconditionally as well)
                    if name in synonym:
                        synonym_match_in_this_text.append([name, ingre])
                    elif name in child:
                        child_match_in_this_text.append([name, ingre])
                    else:
                        parent_match_in_this_text.append([name, ingre])
        return text
    
    def partial_synonym_child_parent_recipe_ingre_match(text = text):
        """
        For recipe-ingre: check whether one of its merged synonym & child & parent
        names is in the review and replace it with the prefixed cluster name.

        Names that are themselves recipe-ingre are removed from the candidates, and the
        rest are tried from the most words to the fewest. The name 'spice' is handed to
        detect_new_spice instead of being replaced. Every replacement is recorded
        exactly once as [name, ingre] in the synonym / child / parent / partial match
        list (checked in that order).

        NOTE(review): the candidates are built from child + synonym + parent only, so
        the partial branch below is never reached and no partial names are tried here,
        although the caller labels this step "merge 3,4,5,6". Confirm whether partial
        names were meant to be added to the candidates.

        Returns the (possibly rewritten) review text.
        """
        recipe_names = set(recipe)
        for ingre in recipe:
            if ingre not in dict_ingre_mapping: # ingre has no other names in dict
                continue
            child = dict_ingre_mapping[ingre]['child']
            synonym = dict_ingre_mapping[ingre]['synonym']
            parent = dict_ingre_mapping[ingre]['parent']
            # remove duplicates and the names that overlap with recipe-ingre, keeping
            # first-seen order: going through set() made the order of equally long
            # names (and therefore which one is matched first) vary between runs
            all_possible_names = []
            for candidate in child + synonym + parent:
                if candidate not in recipe_names and candidate not in all_possible_names:
                    all_possible_names.append(candidate)
            all_possible_names.sort(key=lambda x: len(x.split('_')), reverse=True) # longest names first
            for name in all_possible_names:
                name_origin = name.replace('_',' ')
                # escape the name so characters such as '.', '+' or '(' are matched literally
                pattern = r'\b' + re.escape(name_origin) + r'\b'
                if re.search(pattern, text) is None: # this name (whole-word) not found in text
                    continue
                if name == 'spice':
                    text = detect_new_spice(text)
                else:
                    # replace this name with the concatenated cluster name; a function
                    # replacement inserts the string verbatim (no backslash processing)
                    replacement = prefix + ingre
                    text = re.sub(pattern, lambda _match: replacement, text)
                    # record the match once, in its own category only (it used to be
                    # appended to the child matches unconditionally as well)
                    if name in synonym:
                        synonym_match_in_this_text.append([name, ingre])
                    elif name in child:
                        child_match_in_this_text.append([name, ingre])
                    elif name in parent:
                        parent_match_in_this_text.append([name, ingre])
                    else:
                        partial_match_in_this_text.append([name, ingre])
        return text
    
    
    # Run the matching sub-functions in the order selected by `seq`; each step takes
    # the text produced by the previous one. Any value other than 1 or 2 falls
    # through to sequence 3.
    if seq == 1:
        ## sequence 1
        text = full_ingre_match(text, non_recipe_ingre_not_matched) # merge 1&2, match from longest
        text = partial_recipe_ingre_match(text) # 3
        text = synonym_recipe_ingre_match(text) # 4
        text = parent_recipe_ingre_match(text) # 5
        text = child_recipe_ingre_match(text) # 6
        text = synonym_nonrecipe_ingre_match(text, non_recipe_ingre_not_matched) # 7
        text = child_nonrecipe_ingre_match(text, non_recipe_ingre_not_matched) # 8
    
    elif seq == 2:
        ## sequence 2
        text = full_ingre_match(text, non_recipe_ingre_not_matched) # merge 1&2, match from longest
        text = partial_recipe_ingre_match(text) # 3
        text = synonym_child_parent_recipe_ingre_match(text) # merge 4,5,6, match from longest
        text = synonym_child_nonrecipe_ingre_match(text, non_recipe_ingre_not_matched) # merge 7,8, match from longest
    
    else:
        ## sequence 3
        text = full_ingre_match(text, non_recipe_ingre_not_matched) # merge 1&2, match from longest
        text = partial_synonym_child_parent_recipe_ingre_match(text) # merge 3,4,5,6, match from longest
        text = synonym_child_nonrecipe_ingre_match(text, non_recipe_ingre_not_matched) # merge 7,8, match from longest
    
    ### Note: sub-functions 1 and 2 are not called on their own in any of the 3 sequences;
    ### they can be used separately like this:
    # text = full_recipe_ingre_match(text) # i.e. sub-function 1
    # text = full_nonrecipe_ingre_match(text, non_recipe_ingre_not_matched) # i.e. sub-function 2
        
    # return the rewritten text together with the per-category match lists and new_spice_ls
    return text, full_match_in_this_text, partial_match_in_this_text, \
           synonym_match_in_this_text, parent_match_in_this_text, child_match_in_this_text, new_spice_ls


## Main text processing function

### The output is the result DataFrame (`result_df`)

In [13]:
def text_processing(df, seq):
    """
    Clean every review in `df`, standardize its ingredient names and extract the
    terms that indicate an alteration (addition / deletion / replacement).

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews to process; the columns `recipe_id`, `review_id` and `review_text`
        are read.
    seq : int
        Matching sequence setting, passed through to standardize_ingredient_in_review.

    Returns
    -------
    pandas.DataFrame
        One row per input review, holding the ids, the raw and cleaned text, the
        recipe ingredients, the detected ingredients, the altering flags and terms,
        and the match records returned by standardize_ingredient_in_review.
    """

    recipe_id_list = df.recipe_id.tolist()
    review_id_list = df.review_id.tolist()
    review_list = df.review_text.tolist()

    contain_ingre_flag_list = []
    addition_flag_list = []
    deletion_flag_list = []
    replacement_flag_list = []

    ingre_in_recipe_list = []
    ingre_not_recipe_list = []
    recipe_list = []

    # NOTE(review): the comments on the two addition lists looked swapped in the
    # original (compare with the deletion lists); confirm against addition_extraction.
    real_addition_list = [] # terms caught by addition grammar and the ingredient is not in the recipe
    false_addition_list = [] # terms caught by addition grammar but the ingredient is already in the recipe
    real_deletion_list = [] # terms caught by deletion grammar and the ingredient is in the recipe
    false_deletion_list = [] # terms caught by deletion grammar but the ingredient is not in the recipe
    replacement_list = [] # terms indicating replacement

    new_ingre_not_matched_list = [] # the ingredients not in recipe, but also not detected as any altering

    clean_text_list = [] # review text after cleaning

    full_match_list = [] # the full-match ingre in the review
    partial_match_list = [] # the partial-match pairs ingre happened in the review, e.g. ["juice", "apple_juice"]
    parent_match_list = [] # the parent-match pairs ingre happened in the review
    child_match_list = [] # the child-match pairs ingre happened in the review
    synonym_match_list = [] # the synonym-match pairs ingre happened in the review
    new_spice_list = [] # the new spice detected in the review, will be replaced by "mixed_spice" in the text

    # iterate over the already extracted columns instead of calling df.iloc[i] twice per row
    for recipe_id, text in tqdm(zip(recipe_id_list, review_list), total=len(review_list)):

        # get the recipe-ingredients for the review
        try:
            recipe = dict_recipe_to_name[recipe_id]
        except KeyError:
            recipe = [] # the recipe id is not in the dict_recipe_to_name map

        # Convert to lower case
        text = text.lower()

        # Normalize the accented characters
        text = normalize_accented_characters(text)

        # Expand contractions, e.g. didn't --> did not
        text = expand_contractions(text, CONTRACTION_MAP)

        # Lemmatize the text, e.g. removed --> remove
        text = lemmatization(text)

        # concatenate some special phrases such as "instead_of"
        text = concat_phrase(text)

        # Standardize ingredient cluster names in the reviews
        text, full_match, partial_match, synonym_match, parent_match, child_match, new_spice = standardize_ingredient_in_review(text, recipe, seq)

        clean_text_list.append(text)
        full_match_list.append(full_match) # the full-matches that happened in this review
        partial_match_list.append(partial_match) # the partial-match pairs that happened in this review
        synonym_match_list.append(synonym_match) # the synonym-match pairs that happened in this review
        parent_match_list.append(parent_match) # the parent-match pairs that happened in this review
        child_match_list.append(child_match) # the child-match pairs that happened in this review
        new_spice_list.append(new_spice) # the new spices detected in this review

        # check whether review contain ingredient words in cluster name list,
        # get the ingredients in the recipe and the ingredients not in the recipe
        flag, ingre_in_recipe, ingre_not_recipe = deal_with_ingredient(text, recipe)
        contain_ingre_flag_list.append(flag)
        ingre_in_recipe_list.append(ingre_in_recipe)
        ingre_not_recipe_list.append(ingre_not_recipe)

        recipe_list.append(recipe)

        # Extract terms indicating altering
        false_addition_terms, real_addition_terms = addition_extraction(text, recipe)
        false_deletion_terms, real_deletion_terms = deletion_extraction(text, recipe)
        replacement_terms = replacement_extraction(text, recipe)

        # a flag is True when at least one corresponding term was extracted
        addition_flag_list.append(len(real_addition_terms) > 0)
        deletion_flag_list.append(len(real_deletion_terms) > 0)
        replacement_flag_list.append(len(replacement_terms) > 0)

        # Filter the ingredients that are not in the recipe & not detected as any altering
        all_detected_terms = false_addition_terms + real_addition_terms + false_deletion_terms + real_deletion_terms + replacement_terms
        # split the detected terms into tokens once per review, not once per ingredient
        all_detected_tokens = set(' '.join(all_detected_terms).split())
        new_ingre_not_matched = [ingre for ingre in ingre_not_recipe if (prefix+ingre) not in all_detected_tokens]

        # append all the above results
        replacement_list.append(replacement_terms)
        real_addition_list.append(real_addition_terms)
        false_addition_list.append(false_addition_terms)
        real_deletion_list.append(real_deletion_terms)
        false_deletion_list.append(false_deletion_terms)
        new_ingre_not_matched_list.append(new_ingre_not_matched)

    result_df = pd.DataFrame({
        'recipe_id': recipe_id_list,
        'review_id': review_id_list,
        'review_text': review_list,
        'clean_text': clean_text_list,
        'recipe': recipe_list,
        'ingre_flag': contain_ingre_flag_list,
        'ingre_in_recipe': ingre_in_recipe_list,
        'ingre_not_recipe': ingre_not_recipe_list,
        'addition_flag': addition_flag_list,
        'deletion_flag': deletion_flag_list,
        'replacement_flag': replacement_flag_list,
        'real_addition_terms': real_addition_list,
        'false_addition_terms': false_addition_list,
        'real_deletion_terms': real_deletion_list,
        'false_deletion_terms': false_deletion_list,
        'replacement_terms': replacement_list,
        'new_ingre_not_matched': new_ingre_not_matched_list,
        'full_match': full_match_list,
        'partial_match': partial_match_list,
        'synonym_match': synonym_match_list,
        'parent_match': parent_match_list,
        'child_match': child_match_list,
        'new_spice': new_spice_list
    })

    return result_df

## Get the results for different sequence settings

In [14]:
# Note: the sequence setting is the second argument (`seq`) of text_processing.
# Each run processes the first 10000 reviews and is saved right after it finishes.

# sequence 1
result_df_1 = text_processing(df.head(10000), 1)
result_df_1.to_csv('OUTPUT_OF_FINAL_CODE/result_altering_seq1.csv')

# sequence 2
result_df_2 = text_processing(df.head(10000), 2)
result_df_2.to_csv('OUTPUT_OF_FINAL_CODE/result_altering_seq2.csv')

# sequence 3
result_df_3 = text_processing(df.head(10000), 3)
result_df_3.to_csv('OUTPUT_OF_FINAL_CODE/result_altering_seq3.csv')

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




# Create the stats DataFrames for the seq 1, 2 and 3 results, and output the combined stats DataFrame

In [17]:
# read in the result files; the names match what the text_processing runs above
# write (result_altering_seq*.csv)
result_seq1 = pd.read_csv('OUTPUT_OF_FINAL_CODE/result_altering_seq1.csv')
result_seq2 = pd.read_csv('OUTPUT_OF_FINAL_CODE/result_altering_seq2.csv')
result_seq3 = pd.read_csv('OUTPUT_OF_FINAL_CODE/result_altering_seq3.csv')

# result columns that are counted; after the CSV round trip each cell is a string
# and is parsed back with literal_eval
STATS_SOURCE_COLUMNS = [
    'ingre_in_recipe',
    'ingre_not_recipe',
    'real_addition_terms',
    'false_addition_terms',
    'real_deletion_terms',
    'false_deletion_terms',
    'replacement_terms',
    'new_ingre_not_matched',
    'full_match',
    'partial_match',
    'synonym_match',
    'parent_match',
    'child_match',
    'new_spice',
]


def build_stats_df(result_df):
    """
    Count the entries of every list-valued result column, per review.

    Parameters
    ----------
    result_df : pandas.DataFrame
        A result file read back from CSV; needs `review_id` and every column in
        STATS_SOURCE_COLUMNS.

    Returns
    -------
    pandas.DataFrame
        `review_id` plus one `num_<column>` count column per source column, in the
        order of STATS_SOURCE_COLUMNS.
    """
    stats = {'review_id': result_df.review_id}
    for column in STATS_SOURCE_COLUMNS:
        stats['num_' + column] = [len(literal_eval(ls)) for ls in result_df[column]]
    return pd.DataFrame(stats)


# The raw review frame `df` is deliberately not overwritten here, so the
# text_processing cell above can still be re-run after this cell.

# result for seq1:
stats_df_seq1 = build_stats_df(result_seq1)
stats_df_seq1.to_csv('OUTPUT_OF_FINAL_CODE/stats_altering_seq1.csv')

# result for seq2:
stats_df_seq2 = build_stats_df(result_seq2)
stats_df_seq2.to_csv('OUTPUT_OF_FINAL_CODE/stats_altering_seq2.csv')

# result for seq3:
stats_df_seq3 = build_stats_df(result_seq3)
stats_df_seq3.to_csv('OUTPUT_OF_FINAL_CODE/stats_altering_seq3.csv')


# combine the three stats tables on review_id; the count columns get the suffixes _1, _2, _3
merge_stats_1_2 = pd.merge(stats_df_seq1, stats_df_seq2, how='left', on='review_id',
                           suffixes=('_1', '_2'))
# every seq3 column is suffixed up front (including the key), so no column names
# overlap in this merge and the duplicated key column is dropped afterwards
merge_stats_1_2_3 = pd.merge(merge_stats_1_2, stats_df_seq3.add_suffix('_3'), how='left',
                             left_on='review_id', right_on='review_id_3').drop(['review_id_3'], axis=1)
merge_stats_1_2_3.to_csv('OUTPUT_OF_FINAL_CODE/stats_altering_combined.csv')
