# Data Preprocessing 2

In [29]:
from dependency import parent_dir
from common.save import make_dir, save_pickle, load_pickle, print_time
from common.basics import *
import glob
import warnings
warnings.filterwarnings('ignore')

import copy
import pandas as pd

## 1. Combine and clean the source data (additional east-asian recipes)

In [10]:
#Asian Food Network Data
# NOTE(review): this module-level `temp` is never read in the visible cells — candidate for removal, confirm.
temp = pd.DataFrame()
path_to_json = '/opt/eda/data/asiarecipes/' # Root directory holding the scraped Asian recipe files; read_data appends the site sub-directory to it

In [27]:
def read_data(path_to_json, web, file_format = '*.json'):
    """Load every JSON-lines file of one scraped site into a single DataFrame.

    Parameters
    ----------
    path_to_json : str
        Root directory of the scraped recipes. It is concatenated with ``web``
        as-is, so it has to end with a path separator.
    web : str
        Sub-directory of the site to read, e.g. ``'asianfoodnetwork_data/'``.
    file_format : str, default '*.json'
        Glob pattern selecting the files inside that sub-directory.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, stacked in sorted file-name order with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    json_pattern = os.path.join(path_to_json + web, file_format)

    # glob yields files in a filesystem-dependent order; sorting keeps the row
    # order of the combined frame reproducible between runs and machines.
    file_list = sorted(glob.glob(json_pattern))
    if not file_list:
        # pd.concat([]) would raise a ValueError too, but without saying which
        # path was empty.
        raise ValueError('No files found matching %r' % json_pattern)

    # Each file holds one JSON document per line.
    frames = [pd.read_json(file, lines=True) for file in file_list]
    return pd.concat(frames, ignore_index=True)

# Load the Asian Food Network scrape and keep only the three columns used below.
data1 = read_data(path_to_json, web = 'asianfoodnetwork_data/')
data1 = data1[['name','ingredients','instructions']]
# Preview the first two rows.
data1.head(2)

Unnamed: 0,name,ingredients,instructions
0,Instant Noodle Pizza,"[4 packets instant noodles (maggie, chicken)\r...",[Cook Noodles.\r\nCook noodles in boiling wate...
1,Singkong Sawut Gunung (Cassave Mountain),"[1 kg cassava&nbsp;\r\n, 10 g salt garam&nbsp;...","[Grate and Steam .\r\nGrate the cassava, let i..."


In [23]:
#Rasamalaysia Data

# NOTE(review): '*.josn' looks like a typo for '*.json', but the preview below shows rows
# were loaded with this pattern, so the files on disk presumably carry that extension — confirm.
data2 = read_data(path_to_json, web = 'rasamalaysia_data/', file_format = '*.josn')
# This source uses singular column names ('ingredient', 'instruction').
data2 = data2[['name','ingredient','instruction']]
data2.head(2)

Unnamed: 0,name,ingredient,instruction
0,Spicy Honey Baked Salmon,"[1 lb salmon, salt, ground black pepper, 1 ...","[Preheat oven to 400F., Rinse the salmon with ..."
1,Pandan Chiffon Cake Recipe,"[8 medium egg yolks, 2 oz fine sugar, 2 oz Pa...",


In [21]:
#Nyonyacooking Dataset
data3 = read_data(path_to_json, web = 'nyonyacooking_data/')
# This source mixes a singular 'ingredient' column with a plural 'instructions' column.
data3 = data3[['name','ingredient','instructions']]
data3.head(2)

Unnamed: 0,name,ingredient,instructions
0,Fruit Cake - Super Moist,"[280 g dried fruits and nuts, 130 g water, wat...",[Combine all the assorted fruits and chopped n...
1,Silken Tofu with Sweet Minced Meat,"[1 tofu (silken), 0.5 tbsp oil, 0.5 yellow o...",[Place tofu on the plate and add weight on it ...


In [25]:
#The Meat Men Data
data4 = read_data(path_to_json, web = 'themeatmen_data/')
# This source already names its recipe-name column 'title'.
data4 = data4[['title','ingredients','instructions']]
# Keep only the part of the title before the first ' –' (space + en dash).
# NOTE(review): x.split assumes every title is a string; a missing title would raise — confirm.
data4['title'] = data4['title'].apply(lambda x: x.split(' –')[0])
data4.head(2)

Unnamed: 0,title,ingredients,instructions
0,Prawn Paste Chicken Cutlet,"[1: 4 boneless chicken legs, 2: OKI premium su...",[1: Tenderize and flatten boneless chicken leg...
1,Ngoh Hiang,"[1: 500g pork belly (minced), 2: 250g prawns (...","[1: Mix 500g pork belly (minced), 250g prawns ..."


In [56]:
# Bring the first three sources onto the column names data4 already uses:
# title / ingredients / instructions. Each frame renames only its own columns.
data1 = data1.rename(columns={"name": "title"})
data2 = data2.rename(
    columns={
        "name": "title",
        "ingredient": "ingredients",
        "instruction": "instructions",
    }
)
data3 = data3.rename(columns={"name": "title", "ingredient": "ingredients"})

# Stack the four sources in their original order under a fresh running index.
dfs = [data1, data2, data3, data4]

east_asian = pd.concat(dfs, ignore_index=True)
east_asian.head(2)

Unnamed: 0,title,ingredients,instructions
0,Instant Noodle Pizza,"[4 packets instant noodles (maggie, chicken)\r...",[Cook Noodles.\r\nCook noodles in boiling wate...
1,Singkong Sawut Gunung (Cassave Mountain),"[1 kg cassava&nbsp;\r\n, 10 g salt garam&nbsp;...","[Grate and Steam .\r\nGrate the cassava, let i..."


In [88]:
def clean_line(l):
    """Normalise a list of raw text lines (ingredients or instruction steps).

    Every string entry is HTML-unescaped, lower-cased, stripped of
    parenthesised remarks, reduced to an allowed character set and tidied up
    (whitespace collapsed, stray punctuation removed).

    Parameters
    ----------
    l : list
        Raw lines. Anything that is not a list (e.g. ``None`` or a NaN cell)
        is treated as "no lines".

    Returns
    -------
    list of str
        One cleaned string per string entry of ``l``, in order. Entries that
        are not strings are skipped; strings that become empty are kept.
    """
    import html  # local import so the function does not rely on the notebook's import cell

    result = []
    if not isinstance(l, list):
        return result

    for line in l:
        if not isinstance(line, str):
            # e.g. None inside a scraped list; .lower() would raise on it
            continue

        # decode HTML entities first, otherwise '&nbsp;' survives as '&nbsp'
        # because '&' is an allowed character below
        line = html.unescape(line)

        # all lowercase
        line = line.lower()

        # drop parenthesised remarks together with one leading space
        line = re.sub(r" ?\([^)]+\)", "", line)

        # keep letters, digits and a small punctuation set; everything else
        # becomes a space. Inside the class, ')-/' is a range covering ) * + , - . /
        line = re.sub(r"[^a-z0-9+()-/?&'!.,]", ' ', line)

        # remove extra spaces
        line = re.sub(' +',' ',line).strip()

        line = line.replace(' .', '.')
        line = line.replace(' !', '!')
        line = line.replace(')', '')
        line = line.replace('*', '')
        line = line.replace('..', '.')
        # Done twice on purpose: the removals above can leave a double space,
        # in which case the first pass exposes a second ' - ' to remove.
        line = line.replace(' - ', '')
        line = line.replace(' - ', '')

        result.append(line)

    return result

# Lower-case each title and replace every character outside the allowed set with a space
# (same character class as in clean_line).
# NOTE(review): str(x) turns a missing title (NaN) into the literal text 'nan' — confirm this is acceptable.
east_asian.title = east_asian.title.apply(lambda x: re.sub(r"[^a-z0-9+()-/?&'!.,]", ' ', str(x).lower()))
# Normalise every ingredient list line by line.
east_asian['ingredients'] = east_asian['ingredients'].apply(lambda x: clean_line(x))

# Unit words (and a few filler tokens) that are dropped when they directly
# follow a number, e.g. "2 tbsp oil" -> "oil". Duplicates are harmless.
units = [r'tbsp',r'pkt',r'g',r'tsp',r'x',r'cups',r'oz',r'mrs',r'can',r'kg',r'packets',r'packet',r'cup',r'&nbsp',
         r'lb', r'pkg',r'lbs',r'qt',r'lrg',r'grams',r'sm',
         r'cans',r'bottle',r'and',r'cubes',r'o',r',',r'handful',r'pcs',r'tspr',r'cups',r'can',r'teaspoons', r'teaspoon',
         r'container',r't',r'bag',r'gram',r'jar',r'c',r'lg',r'ml',r'ounces',r'ounce',r'box',r'- inch', r'tablespoon',r'tablespoons']

def clean_prefix(ingr):
    """Strip quantities and units from already-cleaned text lines.

    Parameters
    ----------
    ingr : iterable of str or None
        Lines as produced by ``clean_line``. ``None`` yields an empty list.

    Returns
    -------
    list of str
        The lines with periods, "<number> <unit>" prefixes, fractions and all
        remaining digits removed. Lines that end up empty are dropped, as are
        entries that are not strings.
    """
    cleaned = []
    if ingr is None:
        return cleaned

    # Build the "<number> <unit>" matcher once per call instead of once per
    # line. The units are escaped so they are always matched literally.
    unit_after_number = re.compile(
        r'(\d+)\s*(%s)\b' % '|'.join(re.escape(unit) for unit in units)
    )

    for ans in ingr:
        if not isinstance(ans, str):
            # re.sub would raise on None / NaN entries
            continue

        # strip
        ans = re.sub(' +',' ',ans).strip()

        # remove periods (this also turns "0.5" into "05", which is dropped
        # with the other digits below)
        ans = ans.replace('.', '')

        # remove a unit that follows a number, keeping the number for now
        ans = unit_after_number.sub(r'\1', ans)

        # remove fractions such as 1/2
        ans = re.sub(r'(\d+)/(\d+)', '', ans)

        # remove the remaining numbers
        ans = re.sub(r'\d+', '', ans)

        # strip again
        ans = re.sub(' +',' ',ans).strip()

        if ans:
            cleaned.append(ans)

    return cleaned

# Drop quantities and units from the already-normalised ingredient lines.
east_asian['ingredients'] = east_asian['ingredients'].apply(clean_prefix)

# Instructions go through the same two passes: normalise, then strip numbers and units.
east_asian['instructions'] = (
    east_asian['instructions']
    .apply(clean_line)
    .apply(clean_prefix)
)

# Every scraped recipe gets the single-element tag list ['south-east-asian'].
east_asian['tags'] = 'south-east-asian'
east_asian['tags'] = east_asian['tags'].apply(str.split)

#east_asian.to_json('/opt/eda/data/pplm_data/east_asian.json')

In [85]:
# Reload the cleaned recipes from disk and renumber the rows from 0; this replaces the in-memory frame.
# NOTE(review): the to_json call that would write this file is commented out in the cell above — presumably it was exported in an earlier session; confirm.
east_asian = pd.read_json('/opt/eda/data/pplm_data/east_asian.json').reset_index().drop(columns = ['index'])
east_asian.head(2)

Unnamed: 0,title,ingredients,instructions,tags
0,instant noodle pizza,"[instant noodles, seasoning powder, taiwan sau...",[cook noodles cook noodles in boiling water un...,[south-east-asian]
1,singkong sawut gunung (cassave mountain),"[cassava&nbsp, salt garam&nbsp, palm sugar, su...","[grate and steam grate the cassava, let it sit...",[south-east-asian]


### Full dataset
Combine the original Recipe1M data that carries regional tags with the additional East Asian recipes.

In [53]:
# Stack the tagged recipes (data_df) on top of the scraped recipes (east_asian), keeping the four shared columns.
# NOTE(review): data_df is only created in the "Multi-label Experiment" cells further down,
# so this cell fails on a fresh top-to-bottom run — confirm the intended cell order.
data_full = pd.concat([data_df[['title','ingredients','instructions', 'tags']],east_asian[['title','ingredients','instructions', 'tags']]])
# Renumber the rows 0..n-1 after the concat.
data_full = data_full.reset_index(drop=True)

#data_full.to_json('/opt/eda/data/data_full.json')

## 2. Multi-label Experiment

In [6]:
# NOTE(review): load_pickle comes from common.save; if it wraps pickle.load it must only
# be pointed at trusted files — confirm.
data, newdata = load_pickle(filename='/opt/eda/data/pplm_data/data_0210.pickle')

# Regional label -> raw tags that are folded into it.
# NOTE(review): the key 'afican' looks like a typo for 'african'; it is kept as-is because
# it ends up as a label in the output and other artefacts may depend on it — confirm.
# NOTE(review): 'moroccan' and 'egyptian' are listed under two regions each; in `equal`
# the later region wins ('afican' and 'middle-eastern' respectively).
regional_tag = {
    'chinese': ['chinese', 'chinese-new-year', 'beijing'],

    'korean': ['korean'],

    'japanese': ['japanese'],

    'south-east-asian': ['malaysian', 'indonesian', 'vietnamese', 'thai', 'cambodian', 'laotian', 'filipino'],

    'north-american': ['north-american', 'northeastern-united-states', 'canadian', 'ontario',
                       'u . s .', 'midwestern', 'southern-united-states', 'southern'],

    'latin-american': ['south-american', 'south-west-pacific', 'hawaiian', 'colombian', 'cuban', 'costa-rican',
                       'guatemalan', 'honduran', 'brazilian', 'ecuadorean',
                       'peruvian', 'argentine', 'chilean', 'venezuelan', 'caribbean', 'mexican', 'central-american'],

    'greek': ['greek'],

    'indian': ['bangladeshi', 'indian', 'nepalese'],

    'italian': ['italian'],

    'french': ['french', 'cajun'],

    'australian': ['australian', 'polynesian', 'new-zealand', 'australian and new zealander'],

    'british': ['british-columbian', 'hungarian', 'irish', 'welsh', 'scottish', 'uk and ireland'],

    'european': ['belgian', 'portuguese', 'iceland', 'turkish', 'moroccan', 'rosh-hashanah', 'jewish-ashkenazi',
                 'jewish-sephardi', 'polish', 'finnish', 'czech', 'german', 'dutch', 'russian', 'georgian'],

    'afican': ['african', 'somalian', 'south-african', 'moroccan', 'ethiopian',
               'egyptian', 'nigerian', 'sudanese', 'congolese', 'libyan', 'angolan'],

    'middle-eastern': ['middle-eastern', 'iraqi', 'pakistani', 'iranian-persian', 'persian', 'egyptian'],
}

# Flat list of every raw regional tag (kept for any later use).
regionZ = [t for category, tag in regional_tag.items() for t in tag]
# Raw tag -> regional label; its keys are exactly the entries of regionZ.
equal = {l: i for i, lst in regional_tag.items() for l in lst}

# Split the recipes into those with at least one regional tag and the rest.
labeled, unlabeled = [], []
for recipe in tqdm.tqdm(data):
    newtags = set()
    if 'tags' in recipe:
        for raw_tag in recipe['tags']:
            # Strip the ' recipe' suffix BEFORE the lookup, so that e.g.
            # 'thai recipe' becomes 'south-east-asian' instead of being kept
            # verbatim. The dict lookup also replaces the linear list scan.
            tag = raw_tag.replace(' recipe', '')
            if tag in equal:
                newtags.add(equal[tag])
    # Work on a copy so the entries of `data` stay untouched.
    recipe_ = copy.deepcopy(recipe)
    if newtags:
        # sorted() keeps the label order stable between runs
        recipe_['tags'] = sorted(newtags)
        labeled.append(recipe_)
    else:
        unlabeled.append(recipe_)

904401it [00:38, 23285.84it/s]


In [8]:
# Tabulate the labeled recipes built in the relabelling cell (one row per recipe).
data_df = pd.DataFrame(labeled)
data_df.head(2)

Unnamed: 0,ingredients,instructions,recipe1m_idx,tags,title,url
0,"[cool whip, water, cracker crust, cubed seedle...",dissolve jello in boiling water. allow to cool...,4,[north-american],cool n easy creamy watermelon pie,http://www.food.com/recipe/cool-n-easy-creamy-...
1,"[lemon juice, beef, cornstarch, garlic, shredd...","in a large skillet, toast the coconut over med...",5,"[latin-american, north-american]",easy tropical beef skillet,http://www.food.com/recipe/easy-tropical-beef-...


## 3. South East Asian Recipes

In [8]:
# Keep the recipes whose tag list contains 'south-east-asian' (they may carry other labels too).
sea_data = data_df[data_df['tags'].apply(lambda x: 'south-east-asian' in x)]
sea_data

Unnamed: 0,ingredients,instructions,recipe1m_idx,tags,title,url
30,"[cooking oil, shrimp, water, garlic, pepper, s...",wash shrimp and cut off whiskers. drain. place...,507,[south-east-asian],adobo style shrimp,http://www.food.com/recipe/adobo-style-shrimp-...
99,"[peppercorns black pepper, water, laurel leave...",cut the pork belly into serving pieces then co...,1731,[south-east-asian],lechon kawali,http://www.food.com/recipe/lechon-kawali-crisp...
130,"[ground pepper, fish sauce, rice wine vinegar,...","combine 1 tbls onions, 1 tbls sesame oil, 1 ts...",2224,[south-east-asian],cold soba noodles w/vietnamese pork,http://www.food.com/recipe/cold-soba-noodles-w...
134,"[ground pepper, white fish fillets, madras cur...",divide fish into 4 equal portions. combine sal...,2281,[south-east-asian],crackling rice paper wrapped fish,http://www.food.com/recipe/crackling-rice-pape...
151,"[vegetable oil, cherry tomatoes, red capsicum,...",place noodles in large heatproof bowl cover wi...,2531,[south-east-asian],thai beef patties with noodle salad,http://www.food.com/recipe/thai-beef-patties-w...
170,"[garlic, black peppercorns, bay leaves, chicke...","combine the chicken, soy sauce, vinegar, garli...",2773,[south-east-asian],adobo chicken with ginger,http://www.food.com/recipe/adobo-chicken-with-...
184,"[plain flour, water, pepper, limes, baking pow...",mix freshly ground pepper and salt together se...,3041,[south-east-asian],vietnamese salt and pepper eggplant,http://www.food.com/recipe/vietnamese-salt-and...
306,"[butter, brown sugar, vegetable oil, coriander...","cut chicken into pieces, place in a bowl. blen...",5073,[south-east-asian],satay chicken curry,http://www.food.com/recipe/satay-chicken-curry...
314,"[white mushrooms, szechuan hot bean sauce, sze...",cut chicken into small dice and dredge in corn...,5151,"[chinese, south-east-asian]",ma la chicken,http://www.food.com/recipe/ma-la-chicken-303652
399,"[fish sauce, onion, chicken breasts, red chile...",heat a large skillet over medium heat. place t...,6223,[south-east-asian],penang curry for two,http://www.food.com/recipe/penang-curry-for-tw...


In [76]:
#sea_data.to_json('/opt/eda/data/pplm_data/ingredients_eval/sea_prevalence_source_data.json')
#data_df.to_json('/opt/eda/data/recipe55k_regional.json')

In [68]:
def construct(title, ingredients, instructions):
    """Serialise one recipe into a single tagged training string.

    Parameters
    ----------
    title : str
        Recipe title.
    ingredients : iterable of str
        Ingredient names; each is written followed by ``$``.
    instructions : str or list/tuple of str
        The directions, either as one string or as a sequence of steps. A
        sequence is joined with single spaces.

    Returns
    -------
    str or None
        The serialised recipe, or ``None`` when a field has an unusable type
        (e.g. a missing title or ingredient list).
    """
    try:
        # Rows with instructions stored as a list of steps used to hit a
        # TypeError on `list + str` and silently came back as None.
        if isinstance(instructions, (list, tuple)):
            instructions = " ".join(instructions)

        title_text = "<start-title>" + title + "<end-title><start-ingredients>"

        ingredients_text = "".join(ingre + "$" for ingre in ingredients)
        ingredients_text += "<end-ingredients><start-directions>"

        # NOTE(review): the section opens with <start-directions> but closes with
        # <end-instructions>; the tokens are left unchanged because downstream
        # consumers may depend on them — confirm.
        instructions_text = instructions + "<end-instructions>|<endoftext>|"

        return title_text + ingredients_text + instructions_text

    except TypeError:
        # Only bad field types are tolerated; any other error now propagates
        # instead of being hidden.
        return None

# Smoke-test the serialiser on the second row of data_full.
construct(data_full.title.iloc[1], data_full.ingredients.iloc[1], data_full.instructions.iloc[1])

'<start-title>dilly macaroni salad recipe<end-title><start-ingredients>cubed american cheese$salad dressing$celery$vinegar$dry dill weed$salt$pimento$elbow macaroni$green pepper$<end-ingredients><start-directions>cook macaroni according to package directions drain well. cold. combine macaroni, cheese cubes, celery, green pepper and pimento. blend together mayonnaise or possibly salad dressing, vinegar, salt and dill weed add in to macaroni mix. toss lightly. cover and refrigeratewell. serve salad in lettuce lined bowl if you like. makes 6 servings.<end-instructions>|<endoftext>|'