In [1]:
import re
import pandas as pd
import nltk
import unicodedata
import csv
import itertools
from tqdm import trange

In [2]:
# lemmatization dict
lemma_list = pd.read_csv('/Users/nessyliu/Desktop/RA/lemma_list.csv')
lemma_dict = lemma_list.set_index('word_list').to_dict()['lemma_list']

# Read in the datasets
df = pd.read_csv('/Users/nessyliu/Desktop/RA/AllReviews_26thNov2019.csv')
df_ingredients_raw = pd.read_csv('/Users/nessyliu/Desktop/RA/part_2/Ingredients.csv')
df_cluster = pd.read_excel('/Users/nessyliu/Desktop/RA/part_2/Cluster_names.xlsx')

In [3]:
df = df.head(30000)

In [4]:
cluster_name_orig_list = df_cluster.cluster_name.tolist() # list of cluster names with spaces, e.g. apple juice

# create dict to map recipe_id to ingredient_ids
df_ingredients = df_ingredients_raw.groupby('recipe_id')['ingredient_id'].apply(list).reset_index(name ='ingredients')
dict_recipe_ingredients = dict(zip(df_ingredients.recipe_id, df_ingredients.ingredients))

# create dict to map ingredient_id to cluster_name
df_cluster = df_cluster.replace(' ', '_', regex=True)
dict_ingredient_clustername = dict(zip(df_cluster.ingredient_id, df_cluster.cluster_name))

# list of cluster names with underscore, e.g. apple_juice, so that when check 'apple', won't confuse with 'apple juice'
cluster_name_list = df_cluster.cluster_name.tolist() 

recipe_id_list = df.recipe_id.tolist()
review_list = df.review_text.tolist()

In [5]:
# Map for expanding contractions in case of negations
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "cannot": "can not",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mayn't": "may not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
}

contain_ingre_flag_list = []
addition_flag_list = []
deletion_flag_list = []
replacement_flag_list = []

ingre_in_recipe_list = []
ingre_not_recipe_list = []
ngram_in_list = []
ngram_not_list = []
recipe_list = []

addition_list = []
deletion_list = []
replacement_list = []

clean_text_list = []

In [6]:
def deal_with_ingredient(text, recipe):
    # tokenize the review text
    sentences = nltk.sent_tokenize(text)
    tokens_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tokens = list(itertools.chain.from_iterable(tokens_sentences))
    flag = False
    
    # detect ingredients and get ngrams
    ingre_in_recipe = []
    ingre_not_recipe = []
    ngram_in = []
    ngram_not = []
    ngrams = list(nltk.everygrams(tokens, 4, 4))
    ngrams = [list(ngram) for ngram in ngrams] 
    detected_ingredients = [token for token in tokens if token in cluster_name_list] 
    if len(detected_ingredients) > 0:
        flag = True
    for ingre in detected_ingredients:
        if ingre in recipe:
            ingre_in_recipe.append(ingre)
            ngram_in.append([ngram for ngram in ngrams if ingre in ngram])
        else:
            ingre_not_recipe.append(ingre)
            ngram_not.append([ngram for ngram in ngrams if ingre in ngram])
    
    return flag, ingre_in_recipe, ingre_not_recipe, ngram_in, ngram_not

In [7]:
def text_processing(df):
    """
    text processing
    """
    for i in trange(0,len(df)):
    #for i in range(0,len(df)):
        
        recipe_id = df.iloc[i]['recipe_id']
        try:
            recipe = [dict_ingredient_clustername[ingre_id] for ingre_id in dict_recipe_ingredients[recipe_id]]
        except:
            recipe = [] # if the recipe id is not in the recipe_ingredient map list
        
        text = df.iloc[i]['review_text']
        
        # Convert to lower case
        text = text.lower()

        # Normalize the accented characters
        text = normalize_accented_characters(text)

        # Expand contractions, e.g. didn't --> did not
        text = expand_contractions(text, CONTRACTION_MAP)
        
        # Lemmatize the text, e.g. removed --> remove
        text = lemmatization(text)

        # Concatenate ingredient phrases in the reviews (which appear in the cluster name list)
        text = concat_phrase(text)
        
        clean_text_list.append(text)

        
        # check whether review contain ingredient words in cluster name list, 
        # get the ingredients in the recipe and the ingredients not in the recipe
        # also get the ngrams containing the detected ingredient words
        flag, ingre_in_recipe, ingre_not_recipe, ngram_in, ngram_not = deal_with_ingredient(text, recipe)
        contain_ingre_flag_list.append(flag)
        ingre_in_recipe_list.append(ingre_in_recipe)
        ingre_not_recipe_list.append(ingre_not_recipe)
        ngram_in_list.append(ngram_in)
        ngram_not_list.append(ngram_not)
        
        recipe_list.append(recipe)
        
        
        # Extract terms indicating altering
        addition_terms = addition_extraction(text)
        deletion_terms = deletion_extraction(text)
        replacement_terms = replacement_extraction(text)
        
        if len(addition_terms) > 0:
            addition_flag_list.append(True)
        else:
            addition_flag_list.append(False)

        if len(deletion_terms) > 0:
            deletion_flag_list.append(True)
        else:
            deletion_flag_list.append(False)
            
        if len(replacement_terms) > 0:
            replacement_flag_list.append(True)
        else:
            replacement_flag_list.append(False)
        
        replacement_list.append(replacement_terms)
        addition_list.append(addition_terms)
        deletion_list.append(deletion_terms)



def concat_phrase(text):
    # concatenate the ingredient phrases in the cluster name list
    for cluster in cluster_name_orig_list:
        text = re.sub(r'\b' + cluster + r'\b', cluster.replace(' ', '_'), text)
    
    # concatenate some important phrases for the ease of detection
    text = re.sub(r'\binstead of\b', 'instead_of', text)
    text = re.sub(r'\bleave out\b', 'leave_out', text)
    text = re.sub(r'\bleave off\b', 'leave_off', text)
    text = re.sub(r'\bbother with\b', 'bother_with', text)
    return text


def normalize_accented_characters(text):
    """
    In case of Unicode characters
    """
    # text = unicodedata.normalize('NFKD', text)
    text = unicodedata.normalize('NFC', text)
    text = text.replace("“", '"').replace("”", '"').replace("’", "'").replace("‘", "'").replace('–', '-')
    text = re.sub('ı', 'I', text)
    return text


def expand_contractions(text, contraction_mapping):
    """
    Expand contractions in case of negations, e.g. isn’t -> is not
    """
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


def lemmatization(text_before):
    """
    Lemmatize the text
    """
    text_before = text_before.lower()
    text_after = " ".join(str(lemma_dict.get(word, word)) for word in text_before.split())
    return text_after


def replacement_extraction(text):
    """
    extract Replacement terms
    """
    sentences = nltk.sent_tokenize(text)
    tokens_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tagged_sentences = [tagger.tag(sent) for sent in tokens_sentences]
    DESC_list = []
    for sent_tagged in tagged_sentences:  # each sentence
        sent_tree = key_phrases_cp.parse(sent_tagged)  # chunk each sentence based on grammar structure
        DESC_in_this_sent = []
        for subtree in sent_tree.subtrees():
            if subtree.label() == "REPLACEMENT_TERM":
                string = str(subtree)
                string = re.sub('\n', '', string)
                string = re.sub('\s+', ' ', string)
                string = re.sub('REPLACEMENT_TERM ', '', string)
                string = re.sub('\/[A-Z]+\$*', '', string)
                string = re.sub('\(', '', string)
                string = re.sub('\)', '', string)

                DESC_in_this_sent.append(string)
        if DESC_in_this_sent != []:
            DESC_list.extend(DESC_in_this_sent)
    DESC_list = list(set(DESC_list))
    return DESC_list


def addition_extraction(text):
    """
    extract Addition terms
    """
    sentences = nltk.sent_tokenize(text)
    tokens_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tagged_sentences = [tagger.tag(sent) for sent in tokens_sentences]
    DESC_list = []
    for sent_tagged in tagged_sentences:  # each sentence
        sent_tree = key_phrases_cp.parse(sent_tagged)  # chunk each sentence based on grammar structure
        DESC_in_this_sent = []
        for subtree in sent_tree.subtrees():
            if subtree.label() == "ADDITION_TERM":
                string = str(subtree)
                string = re.sub('\n', '', string)
                string = re.sub('\s+', ' ', string)
                string = re.sub('ADDITION_TERM ', '', string)
                string = re.sub('\/[A-Z]+\$*', '', string)
                string = re.sub('\(', '', string)
                string = re.sub('\)', '', string)

                DESC_in_this_sent.append(string)
        if DESC_in_this_sent != []:
            DESC_list.extend(DESC_in_this_sent)
    DESC_list = list(set(DESC_list))
    return DESC_list


def deletion_extraction(text):
    """
    extract Deletion terms
    """
    sentences = nltk.sent_tokenize(text)
    tokens_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tagged_sentences = [tagger.tag(sent) for sent in tokens_sentences]
    DESC_list = []
    for sent_tagged in tagged_sentences:  # each sentence
        sent_tree = key_phrases_cp.parse(sent_tagged)  # chunk each sentence based on grammar structure
        DESC_in_this_sent = []
        for subtree in sent_tree.subtrees():
            if subtree.label() == "DELETION_TERM":
                string = str(subtree)
                string = re.sub('\n', '', string)
                string = re.sub('\s+', ' ', string)
                string = re.sub('DELETION_TERM ', '', string)
                string = re.sub('\/[A-Z]+\$*', '', string)
                string = re.sub('\(', '', string)
                string = re.sub('\)', '', string)

                DESC_in_this_sent.append(string)
        if DESC_in_this_sent != []:
            DESC_list.extend(DESC_in_this_sent)
    DESC_list = list(set(DESC_list))
    return DESC_list



# Custom our own tagger for POS tagging, and use python's default tagger as backup
custom_tagger = {
    'replace': 'REPLACEMENT', 'substitute': 'REPLACEMENT', 'instead_of': 'REPLACEMENT',
    'delete': 'DELETION', 'remove': 'DELETION', 'omit': 'DELETION', 'subtract': 'DELETION',
    'leave_out': 'DELETION', 'leave_off': 'DELETION',
    'add': 'ADDITION',
    'bother_with': 'BOTHERWITH',
    'no': 'NO',
    'not': 'NOT',
    'that': 'DT',
    'with': 'IN',
    'even': 'EVEN'}

ingredient_tagger = {}
for ingre in cluster_name_list:
    ingredient_tagger[ingre] = 'INGREDIENT'
    
custom_tagger_combined = {**custom_tagger, **ingredient_tagger}

default_tagger = nltk.data.load("/Users/nessyliu/nltk_data/taggers/maxent_treebank_pos_tagger/english.pickle")
tagger = nltk.tag.UnigramTagger(model=custom_tagger_combined, backoff=default_tagger)

# Define grammar structures for extracting negation phrases and then creativity words
key_phrases_grammar = r"""
    DELETION_TERM:
        # Extract "not have + ingredient"
        # e.g. "I didn't have any Beef"
        {<NOT><HAVE><DT>?<RB>?<INGREDIENT>}
        
        # Extract "have no + ingredient"
        # e.g. "I had no Beef"
        {<HAVE><NO><INGREDIENT>}
        
        # Extract "deletion + ingredient"
        # e.g. "I removed Apple", "I deleted the Beef", "I left off the Beef"
        {<DELETION><[A-Z]*>*<INGREDIENT>}
        
        # Extract negation terms with "bother with"
        # e.g. "I didn't bother with the Beef"
        {<NOT><BOTHERWITH><[A-Z]*>*<INGREDIENT>}
        
    REPLACEMENT_TERM:
        # e.g. replace the Apple with some Banana
        {<REPLACEMENT><[A-Z]*>*<INGREDIENT><IN><DT>?<[A-Z]*>*<INGREDIENT>}
        
        # e.g. didn't use Apple, replace with Banana
        {<INGREDIENT><[A-Z]*>*<REPLACEMENT><IN><[A-Z]*>*<INGREDIENT>}
    
    ADDITION_TERM:
        # e.g. I added some Milk
        {<ADDITION><[A-Z]*>*<INGREDIENT>}
"""
key_phrases_cp = nltk.RegexpParser(key_phrases_grammar)

In [8]:
text = "I didn't have the celery so replace with a shredded carrot which broke down."
replacement_extraction(text)

['celery so replace with a shredded carrot']

In [9]:
text_processing(df)

100%|██████████| 30000/30000 [59:33<00:00,  8.40it/s]  


In [10]:
result_df = pd.DataFrame({
    'recipe_id': recipe_id_list,
    'review_text': review_list,
    'clean_text': clean_text_list,
    'recipe': recipe_list,
    'flag': contain_ingre_flag_list,
    'ingre_in_recipe': ingre_in_recipe_list,
    'ingre_not_recipe': ingre_not_recipe_list,
    'ngram_in': ngram_in_list,
    'ngram_not': ngram_not_list,
    'addition_flag': addition_flag_list,
    'deletion_flag': deletion_flag_list,
    'replacement_flag': replacement_flag_list,
    'addition_terms': addition_list,
    'deletion_terms': deletion_list,
    'replacement_terms': replacement_list
})


result_df.to_csv('/Users/nessyliu/Desktop/RA/part_2/result_df.csv')

review_with_ingre_df = result_df.loc[result_df['flag'] == True]
review_with_ingre_df.to_csv('/Users/nessyliu/Desktop/RA/part_2/review_with_ingre_df.csv')

In [11]:
print(len(review_with_ingre_df))
print(len(result_df))

18262
30000


In [15]:
list(itertools.chain.from_iterable(replacement_list))

['substitute lime for lemon_juice',
 'replace brandy for gin',
 'substitute vodka for the gin',
 'instead_of seasoning with salt and pepper i seasoned with italian_bread',
 'substitute the water with a nice dark belgian beer',
 'substitute chicken_liver for beef_liver',
 'substitute balsamic_vinegar for the red_wine because i is out i also used regular cream_of_mushroom_soup',
 'instead_of spinach as well as parmigiana cheese',
 'instead_of the brown_sugar in the egg',
 'replace the fish seasoning with kosher_salt',
 'replace the sour_cream with nonfat plain greek_yogurt',
 'substitute chipotle_pepper for the ancho_chile_pepper',
 'substitute beef_broth for the water',
 'substitute red_wine vinegar for the white_vinegar but that is simply a matter of curiosity and not a complaint against the white_vinegar',
 'substitute chicken for the beef and add a little more seasoning',
 'substitute some pumpkin_puree for some of the mashed banana',
 'substitute the rosemary with thyme',
 'replace 

In [16]:
list(itertools.chain.from_iterable(deletion_list))

['leave_out the bbq sauce',
 'leave_out the rice',
 'remove course shell of salt',
 'leave_out the salt',
 'leave_out the chili powders and increased the black pepper',
 'leave_out the hot_sauce',
 'leave_out the tomato_juice',
 'leave_out the cilantro',
 'leave_out the cilantro',
 'remove the bitter',
 'omit the process of soaking the liver and milk',
 'leave_out the milk',
 'leave_out garlic_powder',
 'leave_out the mustard',
 'leave_out the mustard',
 'leave_out the ketchup and bell pepper',
 'leave_out the corn_syrup',
 'leave_out the maple_syrup',
 'leave_out the parsley',
 'omit the dill',
 'leave_out the ancho chili pepper',
 'leave_out the soy_sauce',
 'omit the final step with the flour',
 'omit the all-purpose flour and water',
 'remove the bay_leaf',
 'remove roast and deglazed pot with 1 c red_wine',
 'leave_out the green_onion',
 'leave_out the garlic_salt and salt',
 'omit the honey',
 'remove the honey and vanilla_extract',
 'remove from the grill and hit all sides with 

In [17]:
list(itertools.chain.from_iterable(addition_list))

['add the bbq sauce',
 'add 1/2 jar of bbq sauce',
 'add much water b/c the 2 cans of beer',
 'add bbq sauce',
 'add an onion',
 'add something green like bell pepper',
 'add one chopped onion',
 'add to it by saute onion',
 'add some water',
 'add some crushed red pepper',
 'add it as it seem to take forever to cook the rice',
 'add 1/2 cup milk',
 'add minced garlic',
 'add about a cup to a cup and a half of water to get the rice',
 'add sugar',
 'add some peppers and onions to browned ground_beef',
 'add some mustard',
 'add some garlic',
 'add 1/2 cup of fresh lemon_juice',
 'add some salt',
 'add fat but i will not go there and just rate this as make with beef',
 'add curing salt',
 'add more seasoning to the flour',
 'add a knob of butter to olive oil_for_frying',
 'add tablespoon of the keg steak',
 'add a fair amount of salt and pepper to the potato flake/crushed cracker',
 'add pepper',
 'add the cheese to the mixture after you add the ground_beef',
 'add lot of garlic_powder 