In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import re
%matplotlib inline

In [19]:
# Allow up to 100 columns and 100 rows when a DataFrame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [20]:
# Load the pickled recipe data into `df`.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
with open('test_df.pkl', mode='rb') as pickle_handle:
    df = pkl.load(pickle_handle)

In [21]:
def convert_time(bake_time):

    """
    Converts a duration string such as '1 h 30 m', '45 m' or '2 h' to minutes.

    Parameters:
        bake_time (str): duration made of an optional '<hours> h' part followed by
                         an optional '<minutes> m' part
    Returns:
        minutes (float): total duration in minutes
    Raises:
        ValueError: if bake_time contains neither an hour part nor a minute part
    """

    parsed = re.fullmatch(r'\s*(?:(\d+)\s*h)?\s*(?:(\d+)\s*m)?\s*', bake_time)

    # Both groups are optional in the pattern, so an empty/blank string also
    # "matches"; reject it explicitly instead of silently returning 0.
    if parsed is None or (parsed.group(1) is None and parsed.group(2) is None):
        raise ValueError(f'Unrecognised time format: {bake_time!r}')

    # Plain integer arithmetic avoids the clock limits of datetime parsing
    # (minutes above 59 or hours above 23 are valid durations).
    hours, mins = (int(part) if part else 0 for part in parsed.groups())

    return float(hours * 60 + mins)

## Convert all columns to numerical values

In [22]:
# Ratings and serving counts only need a dtype cast.
df['rating'] = df['rating'].astype(float)
df['servings'] = df['servings'].astype(int)

# Strip the "calories" label, then impute missing values with the mean of the
# known calorie values. The mean is taken over this column's own non-null
# entries: df.dropna() would also discard rows that are only missing some other
# field (e.g. num_ratings or prep_time, which are filled further down) and skew
# the average.
df['calories'] = df['calories'].str.replace('calories', '', regex=False).str.strip()
known_calories = df['calories'].dropna().astype(int)
df['calories'] = df['calories'].fillna(known_calories.mean()).astype(int)

# Count columns carry a text label and may abbreviate thousands with a "k" suffix.
# NOTE(review): a fractional value such as "1.5k" would become "1.5000" and fail
# the int cast below.
df['made_it'] = df['made_it'].str.replace('made it', '', regex=False).str.strip().str.replace('k', '000', regex=False).astype(int)
df['num_photos'] = df['num_photos'].str.replace(' photos', '', regex=False).str.replace('k', '000', regex=False).astype(int)
df['num_reviews'] = df['num_reviews'].str.replace(' reviews', '', regex=False).str.replace('k', '000', regex=False).astype(int)

# Missing rating counts fall back to the "made it" count of the same recipe.
df['num_ratings'] = df['num_ratings'].str.replace(' Ratings', '', regex=False)
df['num_ratings'] = df['num_ratings'].fillna(df['made_it'])
df['num_ratings'] = df['num_ratings'].astype(int)

# Missing prep times are treated as zero minutes before conversion.
df['prep_time'] = df['prep_time'].fillna('0 m').str.strip()
df['prep_time_minutes'] = df['prep_time'].apply(convert_time)

#### Preprocess oven temperatures

In [23]:
def _extract_oven_temp(instruction, min_temp=220):
    """
    Returns the first number greater than min_temp found in instruction, or 0
    when instruction is not a string or contains no such number.

    Parameters:
        instruction (str): raw oven temperature text for one recipe
        min_temp (int): numbers at or below this value are ignored
                        (presumably to skip Celsius equivalents - TODO confirm)
    Returns:
        temperature (int): the extracted temperature, or 0 if none was found
    """
    if not isinstance(instruction, str):
        return 0
    for digits in re.findall(r'\d+', instruction):
        if int(digits) > min_temp:
            return int(digits)
    return 0


# Exactly one value per recipe keeps `temps` aligned with the rows of df, even
# when the text holds several qualifying numbers or none at all.
temps = [_extract_oven_temp(text) for text in df['oven_temp']]

df['oven_temp'] = temps

In [24]:
# Keep the cleaned numeric fields together with the raw ingredient lists.
base_columns = [
    'name', 'rating', 'num_ratings', 'num_reviews', 'made_it',
    'servings', 'calories', 'num_photos', 'oven_temp',
    'prep_time_minutes', 'ingredients',
]
df = df[base_columns]

## Process ingredient strings

In [25]:
def get_ingredient(ingredient, recipes):

    """
    Finds, for every recipe, the first ingredient line that mentions the requested ingredient.

    Matching is a plain substring test, so a short name also matches longer
    words that contain it.

    Parameters:
        ingredient (str): name of the ingredient
        recipes (list): list of ingredient lines (list of str) for each recipe
    Returns:
        ingredient_list (list): one entry per recipe, in input order: the first line
                                containing the ingredient, or the string '0' when no
                                line of that recipe contains it
    """

    return [
        next((line for line in row if ingredient in line), '0')
        for row in recipes
    ]

In [26]:
def get_measurement(ingredient):

    """
    Extracts the leading numeric amount from each ingredient line.

    Parameters:
        ingredient (list): ingredient lines (str), one per recipe, each starting with
                           the amount followed by whitespace (e.g. '225 g butter');
                           the placeholder '0' marks a recipe without the ingredient
    Returns:
        amounts (list): a single list matching the length of the input list
                        containing the amount (float) of the ingredient for each recipe
    Raises:
        ValueError: if a line is blank or does not start with a number
    """

    amounts = []
    for row in ingredient:
        # Only the first whitespace-separated token is the amount; the unit
        # and ingredient name that follow are discarded.
        tokens = row.split(maxsplit=1)
        if not tokens:
            raise ValueError(f'Empty ingredient line: {row!r}')
        amounts.append(float(tokens[0]))
    return amounts

In [27]:
# One entry per recipe, taken from the 'ingredients' column in row order.
recipes_ingredients = df['ingredients'].tolist()

In [28]:
# Number of ingredient lines in each recipe, in row order.
num_ingredients = list(map(len, recipes_ingredients))

In [29]:
# Store the per-recipe ingredient count as a new column (assigned positionally).
df['num_ingredients'] = num_ingredients

In [30]:
# (search term, unit) pairs; each pair becomes a column named "<term with underscores>_<unit>".
# NOTE(review): the search term is matched as a substring of an ingredient line, so
# e.g. 'flour' also matches lines for 'almond flour' / 'wheat flour' - confirm intended.
# NOTE(review): 'confectioner sugar' only matches lines containing exactly that text;
# verify against the spelling used in the data.
ingredient_names = [
    ('butter', 'g'),
    ('shortening', 'g'),
    ('oil', 'ml'),
    ('margarine', 'g'),
    ('white sugar', 'g'),
    ('brown sugar', 'g'),
    ('semisweet chocolate chips', 'g'),
    ('white chocolate chips', 'g'),
    ('milk chocolate chips', 'g'),
    ('dark chocolate chips', 'g'),
    ('mint chocolate chips', 'g'),
    ('flour', 'g'),
    ('almond flour', 'g'),
    ('wheat flour', 'g'),
    ('vanilla extract', 'ml'),
    ('salt', 'g'),
    ('cinnamon', 'g'),
    ('baking soda', 'g'),
    ('walnuts', 'g'),
    ('pecans', 'g'),
    ('almonds', 'g'),
    ('macadamia', 'g'),
    ('confectioner sugar', 'g'),
    ('baking powder', 'g'),
    ('water', 'ml'),
    ('cocoa powder', 'g'),
    ('egg', 'whole'),
]

In [31]:
# Add one amount column per (search term, unit) pair.
for ingredient_name, unit in ingredient_names:
    matching_lines = get_ingredient(ingredient_name, recipes_ingredients)
    column_label = '_'.join(ingredient_name.split()) + '_' + unit
    df[column_label] = get_measurement(matching_lines)

In [32]:
# Fix the column order and leave out the raw 'ingredients' lists, which have
# been replaced by the per-ingredient amount columns.
ordered_columns = [
    'name', 'rating', 'num_ratings', 'num_reviews', 'made_it', 'servings',
    'calories', 'num_photos', 'oven_temp', 'prep_time_minutes',
    'num_ingredients',
    'butter_g', 'shortening_g', 'oil_ml', 'margarine_g',
    'white_sugar_g', 'brown_sugar_g',
    'semisweet_chocolate_chips_g', 'white_chocolate_chips_g',
    'milk_chocolate_chips_g', 'dark_chocolate_chips_g',
    'mint_chocolate_chips_g',
    'flour_g', 'almond_flour_g', 'wheat_flour_g',
    'vanilla_extract_ml', 'salt_g', 'cinnamon_g', 'baking_soda_g',
    'walnuts_g', 'pecans_g', 'almonds_g', 'macadamia_g',
    'confectioner_sugar_g', 'baking_powder_g', 'water_ml',
    'cocoa_powder_g', 'egg_whole',
]
df = df[ordered_columns]

In [33]:
# Preview the first rows to check the extracted ingredient amounts.
df.head()

Unnamed: 0,name,rating,num_ratings,num_reviews,made_it,servings,calories,num_photos,oven_temp,prep_time_minutes,num_ingredients,butter_g,shortening_g,oil_ml,margarine_g,white_sugar_g,brown_sugar_g,semisweet_chocolate_chips_g,white_chocolate_chips_g,milk_chocolate_chips_g,dark_chocolate_chips_g,mint_chocolate_chips_g,flour_g,almond_flour_g,wheat_flour_g,vanilla_extract_ml,salt_g,cinnamon_g,baking_soda_g,walnuts_g,pecans_g,almonds_g,macadamia_g,confectioner_sugar_g,baking_powder_g,water_ml,cocoa_powder_g,egg_whole
0,Giant Chocolate Chip Cookie,4.687126,668,548,998,16,399,255,375,0.0,10,225.0,0.0,0.0,0.0,150.0,165.0,335.0,0.0,0.0,0.0,0.0,280.0,0.0,0.0,5.0,6.0,0.0,5.0,115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,Best Chocolate Chip Cookies,4.612473,14608,10000,32000,24,298,3000,350,60.0,11,225.0,0.0,0.0,0.0,200.0,220.0,335.0,0.0,0.0,0.0,0.0,375.0,0.0,0.0,10.0,3.0,0.0,5.0,115.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,2.0
2,Outrageous Chocolate Chip Cookies,4.702152,2649,2000,4000,18,207,494,350,25.0,11,115.0,0.0,0.0,0.0,100.0,75.0,170.0,0.0,0.0,0.0,0.0,125.0,0.0,0.0,3.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Mom's Chocolate Chip Cookies,4.596047,1265,1000,1000,24,233,175,375,0.0,9,225.0,0.0,0.0,0.0,50.0,165.0,335.0,0.0,0.0,0.0,0.0,280.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,Stephen's Chocolate Chip Cookies,4.622283,368,299,472,48,146,39,350,50.0,15,225.0,0.0,0.0,0.0,200.0,220.0,110.0,120.0,160.0,0.0,0.0,310.0,0.0,0.0,5.0,3.0,0.0,5.0,40.0,35.0,30.0,0.0,0.0,5.0,0.0,0.0,2.0


In [34]:
# Amount columns that are scaled to a per-serving basis.
ingredient_cols = [
    'butter_g',
    'shortening_g',
    'oil_ml',
    'margarine_g',
    'white_sugar_g',
    'brown_sugar_g',
    'egg_whole',
    'semisweet_chocolate_chips_g',
    'white_chocolate_chips_g',
    'milk_chocolate_chips_g',
    'dark_chocolate_chips_g',
    'mint_chocolate_chips_g',
    'flour_g',
    'almond_flour_g',
    'wheat_flour_g',
    'vanilla_extract_ml',
    'salt_g',
    'cinnamon_g',
    'baking_soda_g',
    'walnuts_g',
    'pecans_g',
    'almonds_g',
    'macadamia_g',
    'confectioner_sugar_g',
    'baking_powder_g',
    'water_ml',
    'cocoa_powder_g',
]

In [35]:
# Express every ingredient amount per serving by dividing each row by its serving count.
# Re-running this cell divides again, so run it once per fresh `df`.
df[ingredient_cols] = df[ingredient_cols].div(df['servings'], axis=0)

In [36]:
# List the column names available for feature engineering.
df.columns

Index(['name', 'rating', 'num_ratings', 'num_reviews', 'made_it', 'servings',
       'calories', 'num_photos', 'oven_temp', 'prep_time_minutes',
       'num_ingredients', 'butter_g', 'shortening_g', 'oil_ml', 'margarine_g',
       'white_sugar_g', 'brown_sugar_g', 'semisweet_chocolate_chips_g',
       'white_chocolate_chips_g', 'milk_chocolate_chips_g',
       'dark_chocolate_chips_g', 'mint_chocolate_chips_g', 'flour_g',
       'almond_flour_g', 'wheat_flour_g', 'vanilla_extract_ml', 'salt_g',
       'cinnamon_g', 'baking_soda_g', 'walnuts_g', 'pecans_g', 'almonds_g',
       'macadamia_g', 'confectioner_sugar_g', 'baking_powder_g', 'water_ml',
       'cocoa_powder_g', 'egg_whole'],
      dtype='object')

# Feature Engineering

In [37]:
def _sum_columns(frame, columns):
    """Adds the named columns of frame together element-wise, left to right."""
    total = frame[columns[0]]
    for column in columns[1:]:
        total = total + frame[column]
    return total


# Group related per-serving ingredient amounts into aggregate features.
df['fats'] = _sum_columns(df, ['butter_g', 'shortening_g', 'oil_ml', 'margarine_g'])
df['sugars'] = _sum_columns(df, ['white_sugar_g', 'brown_sugar_g', 'confectioner_sugar_g'])
df['flours'] = _sum_columns(df, ['flour_g', 'almond_flour_g', 'wheat_flour_g'])
df['nuts'] = _sum_columns(df, ['walnuts_g', 'pecans_g', 'almonds_g', 'macadamia_g'])
df['chocolate_chips'] = _sum_columns(df, [
    'semisweet_chocolate_chips_g',
    'white_chocolate_chips_g',
    'milk_chocolate_chips_g',
    'dark_chocolate_chips_g',
    'mint_chocolate_chips_g',
])

In [42]:
def _safe_ratio(numerator, denominator, nan_to_zero=False):
    """
    Divides numerator by denominator element-wise and replaces +inf results
    (a positive value divided by zero) with 0.

    Parameters:
        numerator (pd.Series): dividend values
        denominator (pd.Series): divisor values
        nan_to_zero (bool): when True, NaN results (such as 0 / 0) are replaced
                            with 0 as well
    Returns:
        ratio (pd.Series): the element-wise ratio with the replacements applied
    """
    ratio = numerator / denominator
    ratio = ratio.mask(ratio == np.inf, 0)
    if nan_to_zero:
        ratio = ratio.fillna(0)
    return ratio


df['fats_sugars_ratio'] = _safe_ratio(df['fats'], df['sugars'])
df['fats_flours_ratio'] = _safe_ratio(df['fats'], df['flours'])
# The +inf guard applies here as well: a recipe without flour would otherwise
# carry an infinite ratio into the saved frame.
df['sugars_flours_ratio'] = _safe_ratio(df['sugars'], df['flours'])
# Recipes with neither chips nor nuts give 0 / 0; those NaN values are set to 0 too.
df['chips_nuts_ratio'] = _safe_ratio(df['chocolate_chips'], df['nuts'], nan_to_zero=True)
df['white_brown_ratio'] = _safe_ratio(df['white_sugar_g'], df['brown_sugar_g'])
# NOTE(review): only chips_nuts_ratio replaces NaN; the other ratios keep NaN for
# 0 / 0 rows - confirm whether they should be zero-filled as well.

In [54]:
# Persist the fully engineered frame for later use.
df.to_pickle('df_all_cols.pkl')