In [1]:
from model.Corpus import Corpus
from model.Model import Model
from model.Embeddings import Embeddings
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

from pprint import pprint
import csv
from scrapers.ScraperResult import ScraperResult
from page_processors import lasso_inward

# Settings shared by every corpus / embedding / model built in this notebook.
global_hyperparameters = dict(
    # Dimensions in the GloVe embedding
    embedding_dim=100,
    # Enforced max and min length of an example
    # (the original note says "character length" — TODO confirm the unit)
    max_length=60,
    trunc_type="post",
    padding_type="post",
    oov_tok="<OOV>",
    test_portion=0.1,
)


In [2]:
# Ingredient pipeline: corpus -> embeddings -> model, each step fed by the previous one.
# TODO (original note): should tokenize the labels; there aren't 1191 unique ones

ingredients_hyperparameters = dict(num_epochs=6, training_size=89500)

# Corpus is built from the CSV path plus both hyperparameter dicts, then initialized.
ingredient_corpus = Corpus(
    "training_data/augmented_classified_ingredients.csv", global_hyperparameters, ingredients_hyperparameters
).initialize()

# Embeddings receive the GloVe vector file name and the corpus created above.
ingredient_embeddings = Embeddings("glove.6B.100d.txt", ingredient_corpus, global_hyperparameters).activate()

# The trailing 1191 is presumably the label/class count the TODO above refers to —
# confirm against Model's signature.
ingredient_classifier = Model(
    ingredient_corpus, ingredient_embeddings, global_hyperparameters, ingredients_hyperparameters, 1191
).run()


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [3]:
# Recipe pipeline: corpus -> embeddings -> model, each step fed by the previous one.
recipes_hyperparameters = dict(num_epochs=4, training_size=76000)

# Corpus is built from the CSV path plus both hyperparameter dicts, then initialized.
recipe_corpus = Corpus(
    "training_data/augmented_training_labels.csv", global_hyperparameters, recipes_hyperparameters
).initialize()

# Embeddings receive the GloVe vector file name and the corpus created above.
recipe_embeddings = Embeddings("glove.6B.100d.txt", recipe_corpus, global_hyperparameters).activate()

# The trailing 5 is presumably the number of output classes — confirm against Model's signature.
recipe_classifier = Model(
    recipe_corpus, recipe_embeddings, global_hyperparameters, recipes_hyperparameters, 5
).run()


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [19]:
# PageProcessor is used by the scraping loop; Cache supplies the ingredient and
# measure lookups passed to ScraperResult.map_payload_ingredients.
# NOTE: another cell in this notebook (execution count 23) defines a Cache class
# inline and rebinds instance_cache, so which Cache is in effect depends on the
# order in which the cells were run.
from scrapers.PageProcessor import PageProcessor
from model.Cache import Cache
instance_cache = Cache()

In [7]:
# Run every link through PageProcessor and collect one ScraperResult payload per link.
total_payload = []

# Defined here but not read by the loop below, which walks `bbc` instead.
new_links = [
    "https://www.bbcgoodfood.com/recipes/espresso-martini",
    "https://www.seriouseats.com/the-martini-recipe"
]

# NOTE: `bbc` is not created in this cell; it comes from the cell that calls
# CocktailLinkScraper().bbc(), so that cell has to be executed first.
with open('newlabels.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')

    for link in bbc:
        print(link)

        # No error handling here: a failure on one link stops the whole loop.
        data = PageProcessor(link, recipe_classifier, recipe_corpus, global_hyperparameters, writer)
        data.soupify()

        # Four lasso_inward passes that differ only in their fourth argument (1..4).
        # From each result take element [0] and keep the first item of every entry.
        extracted = [
            [entry[0] for entry in lasso_inward(data.classifications, 4, 1, fourth_arg, 2)[0]]
            for fourth_arg in range(1, 5)
        ]

        # The last path segment of the URL is the first ScraperResult argument,
        # followed by the four extracted lists in order.
        this_recipe = ScraperResult(link.split("/")[-1], *extracted)
        this_recipe.map_payload_ingredients(
            instance_cache.get_ingredients(),
            instance_cache.get_measures(),
            ingredient_corpus,
            ingredient_classifier,
            global_hyperparameters,
        )
        total_payload.append(this_recipe.get_payload())


pprint(total_payload)


https://www.bbcgoodfood.com/recipes/hurricane-cocktail
local
https://www.bbcgoodfood.com/recipes/sex-beach-cocktail
local
https://www.bbcgoodfood.com/recipes/jessica-rabbit-cocktail
local
https://www.bbcgoodfood.com/recipes/cajun-style-prawn-cocktail
local
https://www.bbcgoodfood.com/recipes/white-rabbit-cocktail
local
https://www.bbcgoodfood.com/recipes/gimlet-cocktail
local
https://www.bbcgoodfood.com/recipes/zombie-cocktail
local
https://www.bbcgoodfood.com/recipes/grasshopper-cocktail
local
https://www.bbcgoodfood.com/recipes/cosmopolitan-cocktail
local
https://www.bbcgoodfood.com/recipes/pink-lady
local
https://www.bbcgoodfood.com/recipes/lime-prawn-cocktail-pitta-salad
local
https://www.bbcgoodfood.com/recipes/rainbow-prawn-cocktails
local
https://www.bbcgoodfood.com/recipes/mezcalita-verde-cocktail
local
https://www.bbcgoodfood.com/recipes/strawberry-daiquiri-cocktail-fancies
local
https://www.bbcgoodfood.com/recipes/stuffed-cocktail-eggs
local
https://www.bbcgoodfood.com/recipe

KeyboardInterrupt: 

In [6]:
# Gather the recipe links that the scraping loop consumes.
from scrapers.CocktailLinkScraper import CocktailLinkScraper
cs = CocktailLinkScraper()

# alton_brown() is given the path of an HTML file under scraper_working_data/.
ab = cs.alton_brown("scraper_working_data/ab-cocktails.html")
# `bbc` is what the scraping loop iterates (`for link in bbc`), so this cell must
# run before that loop. The recorded output shows ten BBC Good Food search-page
# URLs printed while this call ran.
bbc = cs.bbc()


https://www.bbcgoodfood.com/search/recipes/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/2/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/3/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/4/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/5/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/6/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/7/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/8/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/9/?q=Cocktail+recipes&sort=-relevance
https://www.bbcgoodfood.com/search/recipes/page/10/?q=Cocktail+recipes&sort=-relevance


In [8]:
# Re-display the payloads accumulated in total_payload by the scraping loop.
pprint(total_payload)

[{'ingredients': ['50ml dark rum',
                  '50ml white rum',
                  '1 passion fruit',
                  '1 orange , juiced',
                  '1 lemon , juiced',
                  '50ml sugar syrup',
                  '2 tsp grenadine',
                  '4 cocktail cherries',
                  '2 orange slices'],
  'mapped_ingredients': [['50ml dark rum', 50, 'dark%20rum', 237],
                         ['50ml white rum', 50, 'white%20rum', 738],
                         ['1 passion fruit', 1, 'strawberry', 521],
                         ['1 orange , juiced', 1, 'orange%20juice', 325],
                         ['1 lemon , juiced', 1, 'lemon%20juice', 39],
                         ['50ml sugar syrup', 50, 'simple%20syrup', 102],
                         ['2 tsp grenadine', 2, 'grenadine', 107],
                         ['4 cocktail cherries', 4, 'cherry', 15],
                         ['2 orange slices', 2, 'orange', 51]],
  'steps': ['Fill a cocktail shaker with

                  'slices from 1/2 lemon'],
  'mapped_ingredients': [['100ml Campari', 100, 'campari', 10],
                         ['ice cubes', 1, 'ice%20cube', 115],
                         ['200ml dry white wine', 200, 'white%20wine', 74],
                         ['slices from 1/2 lemon', 1, 'lemon', 76]],
  'steps': ['Divide the Campari between 2 wine glasses, add 2-3 ice cubes to '
            'each glass, then top with the dry white wine.',
            'Add the lemon wedges, stir to gently combine the flavours and '
            'serve.'],
  'timing': ['Prep: 5 mins'],
  'title': 'bicyclette',
  'yield': ['Serves 2']},
 {'ingredients': ['salt',
                  '0.4',
                  '40ml Blanco tequila',
                  '10ml lime juice',
                  '5ml lemon juice',
                  '10ml Worcestershire sauce',
                  '3 dashes pepper sauce (Tabasco is fine)',
                  'smoked sea salt , finely ground'],
  'mapped_ingredients': [['salt', 1,

In [23]:
class Cache:
    """Per-instance memo for database lookup tables.

    Each getter runs its SQL through ``database.Query.Query`` the first time it
    is called and stores the result on the instance (``self.ingredients`` /
    ``self.measures``); later calls return the stored value.
    """

    def __init__(self):
        """Start with nothing cached; attributes are created on first use."""

    def _fetch_once(self, attribute, sql):
        """Return ``self.<attribute>``, running ``sql`` only if it is not set yet."""
        # Imported on every call, ahead of the cache check, exactly as each
        # getter did originally.
        from database.Query import Query

        if hasattr(self, attribute):
            return getattr(self, attribute)

        setattr(self, attribute, Query(sql).get_data())
        return getattr(self, attribute)

    def get_ingredients(self):
        """Return the rows of (singular_name, id) selected from ``ingredient``."""
        return self._fetch_once(
            'ingredients',
            "select singular_name, id from ingredient;",
        )

    def get_measures(self):
        """Return the name columns of ``measure`` rows where ``is_common = 1``."""
        return self._fetch_once(
            'measures',
            "select short_name, long_name, plural_name, long_plural from measure where is_common = 1;",
        )
    
    
# Rebind instance_cache to a fresh instance of the Cache class defined in this cell.
instance_cache = Cache()
# A new instance has nothing cached yet, so this call runs the measure query
# and prints the rows it returns.
print(instance_cache.get_measures())

[('clove', 'Clove', 'cloves', 'cloves'), ('cup', 'Cup', 'cups', 'cups'), ('dash', 'Dash', 'dashes', 'dashes'), ('fillet', 'Fillet', 'fillets', 'fillets'), ('floz', 'Fluid Ounce', 'floz', 'fluid ounces'), ('gal', 'Gallon', 'gal', 'gallons'), ('g', 'Gram', 'g', 'grams'), ('heart', 'Heart', 'Hearts', 'Hearts'), ('kg', 'Kilogram', 'kg', 'kilograms'), ('leaf', 'Leaf', 'leaves', 'leaves'), ('loaf', 'Loaf', 'loaves', 'loaves'), ('oz', 'Ounce', 'oz', 'ounces'), ('pinch', 'Pinch', 'pinches', 'pinches'), ('lb', 'Pound', 'lbs', 'pounds'), ('qt', 'Quart', 'qts', 'quarts'), ('rib', 'Rib', 'Ribs', 'Ribs'), ('sprig', 'Sprig', 'sprigs', 'sprigs'), ('stick', 'Stick', 'Sticks', 'Sticks'), ('tbsp', 'Tablespoon', 'tbsp', 'tablespoons'), ('tsp', 'Teaspoon', 'tsp', 'teaspoons'), ('to taste', 'To Taste', 'to taste', 'to taste'), ('wedge', 'Wedge', 'wedges', 'wedges'), ('slice', 'Slice', 'Slices', 'Slices'), ('pt', 'Pint', 'pints', 'pints'), ('packet', 'Packet', 'packets', 'packets'), ('head', 'Head', 'heads'

In [36]:
import re







('loaf', 'Loaf', 'loaves', 'loaves')


In [18]:


# Print the first measurement unit found in the string `s`.
#
# Every pattern has the same shape: the unit spelling must be preceded by a
# digit or a space and followed by a space. Units are tried in the order listed
# and only the first hit is printed; nothing is printed when none match.
#
# Fix: the "ml" alternatives are now wrapped in a group like the other units.
# Without the group, `|` split the pattern into three independent alternatives,
# so the lookbehind applied only to "ml", the lookahead only to "milliliters",
# and a bare "milliliter" matched anywhere in the string.
#
# NOTE: `s` and `re` are not defined in this cell; both must already exist in
# the kernel (`re` is imported in a separate cell).
UNIT_PATTERNS = (
    ("ml", r"(?<=[0-9]| )(ml|milliliter|milliliters)(?= )"),
    ("tsp", r"(?<=[0-9]| )(tsp|teaspoon|teaspoons)(?= )"),
    ("tbsp", r"(?<=[0-9]| )(tbsp)(?= )"),
    ("g", r"(?<=[0-9]| )(g)(?= )"),
)

for unit, pattern in UNIT_PATTERNS:
    if re.search(pattern, s):
        print(unit)
        break