In [None]:
from model.Corpus import Corpus
from model.Model import Model
from model.Embeddings import Embeddings
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

from pprint import pprint
import csv
from scrapers.ScraperResult import ScraperResult
from page_processors import *

# Settings shared by both classifiers and by every inference cell below.
global_hyperparameters = dict(
    embedding_dim=100,      # dimensionality of the GloVe vectors
    max_length=60,          # fixed example length; used as pad_sequences(maxlen=...)
    trunc_type="post",      # used as pad_sequences(truncating=...)
    padding_type="post",    # used as pad_sequences(padding=...)
    oov_tok="<OOV>",        # presumably the tokenizer's out-of-vocabulary marker -- confirm in Corpus
    test_portion=0.1,       # presumably the held-out fraction -- confirm in Corpus/Model
)


In [None]:
# Build the ingredient classifier: corpus -> GloVe embeddings -> trained model.

# TODO: tokenize the labels; there are not actually 1191 unique ones.
# NOTE(review): presumably the size of the model's output layer -- confirm in Model.
ingredient_label_slots = 1191

ingredients_hyperparameters = dict(num_epochs=6, training_size=89500)

# Step 1: load the labelled ingredient examples.
ingredient_corpus_loader = Corpus(
    "training_data/augmented_classified_ingredients.csv",
    global_hyperparameters,
    ingredients_hyperparameters,
)
ingredient_corpus = ingredient_corpus_loader.initialize()

# Step 2: attach the pre-trained GloVe vectors to that corpus.
ingredient_embeddings_loader = Embeddings(
    "glove.6B.100d.txt",
    ingredient_corpus,
    global_hyperparameters,
)
ingredient_embeddings = ingredient_embeddings_loader.activate()

# Step 3: construct the model and run it; later cells use the result of
# .run() as `ingredient_classifier` (e.g. `ingredient_classifier.model.predict`).
ingredient_model_builder = Model(
    ingredient_corpus,
    ingredient_embeddings,
    global_hyperparameters,
    ingredients_hyperparameters,
    ingredient_label_slots,
)
ingredient_classifier = ingredient_model_builder.run()


In [None]:
# Build the recipe-line classifier: corpus -> GloVe embeddings -> trained model.

# NOTE(review): presumably the number of output classes -- confirm in Model.
recipe_label_slots = 5

recipes_hyperparameters = dict(num_epochs=4, training_size=76000)

# Step 1: load the labelled page-line examples.
recipe_corpus_loader = Corpus(
    "training_data/augmented_training_labels.csv",
    global_hyperparameters,
    recipes_hyperparameters,
)
recipe_corpus = recipe_corpus_loader.initialize()

# Step 2: attach the pre-trained GloVe vectors to that corpus.
recipe_embeddings_loader = Embeddings(
    "glove.6B.100d.txt",
    recipe_corpus,
    global_hyperparameters,
)
recipe_embeddings = recipe_embeddings_loader.activate()

# Step 3: construct the model and run it; the scraping cells pass the
# result of .run() on as `recipe_classifier`.
recipe_model_builder = Model(
    recipe_corpus,
    recipe_embeddings,
    global_hyperparameters,
    recipes_hyperparameters,
    recipe_label_slots,
)
recipe_classifier = recipe_model_builder.run()


In [None]:
# NOTE(review): these imports live mid-notebook; consider moving them to the
# import cell at the top so a fresh "Run All" surfaces missing dependencies early.
from scrapers.PageProcessor import PageProcessor
from model.Cache import Cache
# Single Cache instance shared by the cells below, which call
# instance_cache.get_ingredients() to obtain the known ingredients.
instance_cache = Cache()

In [None]:
# Scrape each recipe URL, classify its lines, split them into four groups,
# map the ingredients, and collect one payload per recipe in `total_payload`.
total_payload = []

new_links = [
    "https://www.bbcgoodfood.com/recipes/espresso-martini",
    "https://www.seriouseats.com/the-martini-recipe",
]

with open('newlabels.csv', 'w', newline='') as csvfile:
    # The CSV writer is handed to PageProcessor; what it writes is decided there.
    writer = csv.writer(csvfile, delimiter=',')

    for link in new_links:
        print(link)

        # No exception handling here: a failure on one link propagates and
        # stops the whole run.
        data = PageProcessor(
            link,
            recipe_classifier,
            recipe_corpus,
            global_hyperparameters,
            writer,
        )
        data.soupify()

        # One lasso_inward pass per group (third argument 1..4), keeping only
        # the first field of every hit in the first element of its result.
        # NOTE(review): 1..4 presumably are recipe-classifier label ids -- confirm.
        grouped_lines = [
            [hit[0] for hit in lasso_inward(data.classifications, 4, 1, group, 2)[0]]
            for group in (1, 2, 3, 4)
        ]

        # The last path segment of the URL serves as the recipe name.
        this_recipe = ScraperResult(link.split("/")[-1], *grouped_lines)
        this_recipe.map_payload_ingredients(
            instance_cache.get_ingredients(),
            ingredient_corpus,
            ingredient_classifier,
            global_hyperparameters,
        )
        total_payload.append(this_recipe.get_payload())


pprint(total_payload)


In [None]:
# Debug view of the classifications on whatever `data` currently holds. Right
# after the scraping loop that is the PageProcessor for the last link; the
# process_page cell further down rebinds `data`, so this depends on run order.
print(data.classifications)

In [None]:
# Process a single page through the function-style `process_page` API, build
# its ScraperResult, map the ingredients, and append the payload.
#
# The URL is bound once and used both for fetching and for naming the recipe.
# Previously the name came from `link`, a loop variable leaked from the
# scraping cell, which only matched because this URL was the last one there.
martini_url = "https://www.seriouseats.com/the-martini-recipe"

data = process_page(
    martini_url,
    recipe_classifier,
    recipe_corpus,
    global_hyperparameters["max_length"],
    global_hyperparameters["padding_type"],
    global_hyperparameters["trunc_type"]
)

# One lasso_inward pass per group (third argument 1..4), keeping only the
# first field of every hit in the first element of its result.
# NOTE(review): 1..4 presumably are recipe-classifier label ids -- confirm.
this_recipe = ScraperResult(
    martini_url.split("/")[-1],  # last path segment serves as the recipe name
    *[
        [hit[0] for hit in lasso_inward(data, 4, 6, group, 7)[0]]
        for group in (1, 2, 3, 4)
    ]
)

# Snapshot of the payload taken before the ingredient mapping runs.
this_payload = this_recipe.get_payload()
this_payload_mapped = this_recipe.map_payload_ingredients(
    instance_cache.get_ingredients(),
    ingredient_corpus,
    ingredient_classifier,
    global_hyperparameters
)

# NOTE: appends on every execution, so re-running this cell adds the same
# recipe to `total_payload` again.
total_payload.append(this_recipe.get_payload())

In [None]:
# Show what map_payload_ingredients returned in the previous cell.
pprint(this_payload_mapped)

In [None]:
# For every collected recipe, classify each ingredient string and replace it
# in place with [original_text, matched_ingredient_name, predicted_id].
#
# Predictions are computed per recipe. Previously they were computed once from
# total_payload[0] and reused for every recipe, which mislabelled all later
# recipes and raised IndexError when one had more ingredients than the first.
#
# NOTE: this mutates total_payload, so the cell is not idempotent; rebuild
# total_payload before running it a second time.
# NOTE(review): assumes the entries are still raw strings at this point --
# confirm that map_payload_ingredients does not rewrite them.

# Fetched once and materialised so it can be scanned repeatedly.
known_ingredients = list(instance_cache.get_ingredients())

for j in range(len(total_payload)):
    ingredient_texts = total_payload[j]["ingredients"]

    # Nothing to classify; skip tokenising/predicting on an empty batch.
    if ingredient_texts:
        tst_snt = ingredient_corpus.tokenizer.texts_to_sequences(ingredient_texts)
        tst_snt = pad_sequences(
            tst_snt,
            maxlen=global_hyperparameters["max_length"],
            padding=global_hyperparameters["padding_type"],
            truncating=global_hyperparameters["trunc_type"]
        )
        pred = ingredient_classifier.model.predict(tst_snt)

        for i, ingredient_text in enumerate(ingredient_texts):
            pred_id = np.argmax(pred[i])
            # First known ingredient whose second field equals the predicted id.
            matched_ing = next(
                (known for known in known_ingredients if known[1] == pred_id),
                None,
            )
            if matched_ing is None:
                raise IndexError(
                    "no cached ingredient has id %r (predicted for %r)"
                    % (pred_id, ingredient_text)
                )
            ingredient_texts[i] = [ingredient_text, matched_ing[0], pred_id]

    pprint(ingredient_texts)



In [None]:
from scrapers.CocktailLinkScraper import CocktailLinkScraper
# Run the Alton Brown link scraper against an HTML file under
# scraper_working_data/ and print whatever it returns.
ab_listing_path = "scraper_working_data/ab-cocktails.html"

cs = CocktailLinkScraper()
ab = cs.alton_brown(ab_listing_path)
print(ab)
