In [376]:
import requests
import json
from bs4 import BeautifulSoup
import data_config
from glob import glob
from datetime import datetime

# Loading database

In [377]:
import mysql.connector
from mysql.connector import Error

In [378]:
mydb = mysql.connector.connect(host='localhost',
                                     user='root',
                                     port="8889",
                                     password='Verrygood_123!?',
                                     database='recipedb')

mycursor = mydb.cursor(buffered=True)

### ingredient

In [379]:
nutritionix_path = data_config.data_raw_path/data_config.nutritionix_dir
prefix = '240315'
file_paths = list(nutritionix_path.glob(f'{prefix}*.json'))

now = datetime.now() # current date and time
etl_date = now.strftime("%y%m%d")

sql = """INSERT INTO ingredient (name, img_link, serving_unit, serving_size, amount_g,\
          calories, total_fat_g, cholesterol_mg, sodium_mg, carbonhydrate_g, protein_g,\
          calcium_mg, iron_mg, potassium_mg, vitamin_d_mcg, caffeine_mg, etl_date) \
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"""

In [380]:
for file_path in file_paths:
    vals = []
    with open(file_path) as json_file:
        json_data = json.load(json_file)
        for key_name, ingredient in json_data.items():
            name = ingredient['name'].lower()
            img_src = ingredient['img_src']
            serving_unit = ingredient['serving_unit'].split(' ')[0].lower()
            serving_size = float(ingredient['serving_size'])
            amount = float(ingredient['amount'][:-1])
            calories = float(ingredient['calories'])
            total_fat = float(ingredient['total_fat'][:-1])
            cholesterol = float(ingredient['cholesterol'][:-2])
            sodium = float(ingredient['sodium'][:-2])
            carbonhydrate = float(ingredient['carbonhydrate'][:-1])
            protein = float(ingredient['protein'][:-1])
            calcium = float(ingredient['calcium'][:-2])
            iron = float(ingredient['iron'][:-2])
            potassium = float(ingredient['potassium'][:-2])
            vitamin_d = float(ingredient['vitamin_d'][:-3])
            caffeine = float(ingredient['caffeine'][:-2])
            vals += [(name,img_src,serving_unit,serving_size,amount,
                      calories,total_fat,cholesterol,sodium,carbonhydrate,protein,
                      calcium,iron,potassium,vitamin_d, caffeine, etl_date)]
            
        mycursor.executemany(sql, vals)
        print(f"Success proceeding file {file_path.name}!")


Success proceeding file 240315_152226-common-food-p1.json!
Success proceeding file 240315_152226-common-food-p10.json!
Success proceeding file 240315_152226-common-food-p11.json!
Success proceeding file 240315_152226-common-food-p12.json!
Success proceeding file 240315_152226-common-food-p13.json!
Success proceeding file 240315_152226-common-food-p14.json!
Success proceeding file 240315_152226-common-food-p15.json!
Success proceeding file 240315_152226-common-food-p16.json!
Success proceeding file 240315_152226-common-food-p17.json!
Success proceeding file 240315_152226-common-food-p18.json!
Success proceeding file 240315_152226-common-food-p19.json!
Success proceeding file 240315_152226-common-food-p2.json!
Success proceeding file 240315_152226-common-food-p3.json!
Success proceeding file 240315_152226-common-food-p4.json!
Success proceeding file 240315_152226-common-food-p5.json!
Success proceeding file 240315_152226-common-food-p6.json!
Success proceeding file 240315_152226-common-f

### area

In [381]:
themeal_area_path = data_config.data_raw_path/data_config.the_meal_db_dir/'area'
prefix = '240306'
file_path = list(themeal_area_path.glob('240306*.json'))[0]

now = datetime.now() # current date and time
etl_date = now.strftime("%y%m%d")

sql = """INSERT INTO area (id, name, etl_date) \
        VALUES (%s, %s, %s)"""
with open(file_path) as json_file:
    json_data = json.load(json_file)['meals']
area_sort = sorted([i['strArea'].lower() for i in json_data])
vals = list(zip(list(range(1, len(area_sort)+1)), area_sort, [etl_date]*len(area_sort)))

mycursor.executemany(sql, vals)

### category

In [382]:
themeal_area_path = data_config.data_raw_path/data_config.the_meal_db_dir/'category'
prefix = '240306'
file_path = list(themeal_area_path.glob('240306*.json'))[0]

now = datetime.now() # current date and time
etl_date = now.strftime("%y%m%d")

sql = """INSERT INTO category (id, name, img_link, description, etl_date) VALUES (%s, %s, %s, %s, %s)"""

vals = []
with open(file_path) as json_file:
    json_data = json.load(json_file)['categories']
    for category in json_data:
        cat_id = int(category["idCategory"])
        cat_name = category["strCategory"].lower()
        link_src = category["strCategoryThumb"]
        desc = category["strCategoryDescription"]
        vals += [(cat_id,cat_name,link_src,desc,etl_date)]

mycursor.executemany(sql, vals)

### recipe

In [383]:
from fractions import Fraction
import re

In [384]:
themeal_recipe_path = data_config.data_raw_path/data_config.the_meal_db_dir/'recipe'
prefix = '240306'
file_paths = list(themeal_recipe_path.glob(f'{prefix}*.json'))

now = datetime.now() # current date and time
etl_date = now.strftime("%y%m%d")

recipe_sql = """INSERT INTO recipe (id, name, area, instruction, category_id, img_link, tags, youtube_link, source_link, etl_date) \
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

recipe_category_sql = """INSERT INTO recipe_category (recipe_id, category_id, category_pred,  etl_date) \
        VALUES (%s, %s, %s, %s)"""

measure_sql = """INSERT INTO measure (recipe_id, ingredient_name_in_meal, map_perc, map_ingredient_id, \
        map_ingredient_name, measure_quantity, measure_unit, etl_date ) \
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""

In [385]:
from fuzzywuzzy import fuzz

def map_string(db, s, threshold=90):
    best_score =  0
    for id_s, map_s in db:
        score = fuzz.token_sort_ratio(s, map_s)
        if score >= best_score:
            best_score = score
            best_s = map_s
            best_id = id_s
    if best_score > threshold:
        return best_s, best_id, best_score
    else:
        return None, None, score     
    
mycursor.execute("SELECT id, name FROM ingredient")
ingredient_name_db = mycursor.fetchall()
ingredient_name_db_sort = sorted(ingredient_name_db, key= lambda x: len(x[1]), reverse=True)

In [386]:
measure_unit_pattern = "|".join(data_config.measure_unit_db + [i+'s' for i in data_config.measure_unit_db ])
miss_ingredients = set()
for path_file in file_paths:
    with open(path_file) as json_file:
        json_data = json.load(json_file)['meals']
        if not(json_data):
            continue
        recipe_vals = []
        recipe_category_vals = []
        measure_vals = []
        for recipe in json_data:
            recipe_id = 'TM_'+recipe['idMeal']
            recipe_name = recipe['strMeal'].lower()
            area = recipe['strArea'].lower()
            category_pred = recipe['strCategory'].lower()
            instruction = recipe['strInstructions']
            img_link = recipe['strMealThumb']
            tags = recipe['strTags']
            youtube_link = recipe['strYoutube']
            source_link = recipe['strSource']
            measure_ing = []
            flag_continue = False
            for i in range(1,21):
                ingredient = recipe['strIngredient'+str(i)]
                if not(ingredient):
                    break
                ingredient = re.sub('[,.:?/-]','',ingredient).lower()
                map_ingredient, map_id_ingredient, score = map_string(ingredient_name_db_sort, ingredient, threshold=65)
                if not(map_ingredient):
                    miss_ingredients.add(ingredient)
                    flag_continue = True
                    break

                measures = re.split('(.*[a-zA-Z])/',recipe['strMeasure'+str(i)])
                if len(measures) == 1:
                    measure_desc = measures[0]
                else:
                    measure_desc = measures[1]

                extract_number = re.sub("[\u00BC\u00BE\u2150\u2044\u215E\u2189]+", lambda x: ' ' + str(Fraction(unicodedata.numeric(x.group()))), measure_desc)
                extract_number = extract_number.split('-')[0]
                match = re.search(r'\(.*\)',extract_number)
                if match:
                    extract_number = match.group()
                extract_numbers = re.findall(r'([0-9]+[./]*[0-9]*) *([0-9]+[./]*[0-9]*)*', extract_number)
                measure_quantity = 0
                for numbers in extract_numbers:
                        for number in numbers:
                            if number == '':
                                continue
                            else:
                                measure_quantity += float(Fraction(number))
                if measure_quantity == 0:
                    measure_quantity = None

                measure_units = re.findall("("+measure_unit_pattern+")",ingredient)
                if len(measure_units):
                    measure_unit = measure_units[0]
                else:
                    measure_unit = None
                    
                measure_ing += [(recipe_id, ingredient, score, map_id_ingredient,
                            map_ingredient, measure_quantity, measure_unit, etl_date)]
                
            if flag_continue:
                continue
                    
            recipe_vals += [(recipe_id, recipe_name, area, instruction, category_id, img_link, tags, youtube_link, source_link, etl_date)]
            
            map_category_sql = f"select id from category where LOWER(name) = '{category_pred.lower()}' LIMIT 1"
            mycursor.execute(map_category_sql)
            category_id = mycursor.fetchone()[0]
            recipe_category_vals += [(recipe_id, category_id, category_pred, etl_date)]
            
            measure_vals += measure_ing
            
        mycursor.executemany(recipe_sql, recipe_vals)
        mycursor.executemany(recipe_category_sql, recipe_category_vals)
        mycursor.executemany(measure_sql, measure_vals)
        print(f"Success proceeding file {path_file.name}!")      

Success proceeding file 240306_111816-a-recipe-themeal.json!
Success proceeding file 240306_111816-b-recipe-themeal.json!
Success proceeding file 240306_111816-c-recipe-themeal.json!
Success proceeding file 240306_111816-d-recipe-themeal.json!
Success proceeding file 240306_111816-e-recipe-themeal.json!
Success proceeding file 240306_111816-f-recipe-themeal.json!
Success proceeding file 240306_111816-g-recipe-themeal.json!
Success proceeding file 240306_111816-h-recipe-themeal.json!
Success proceeding file 240306_111816-i-recipe-themeal.json!
Success proceeding file 240306_111816-j-recipe-themeal.json!
Success proceeding file 240306_111816-k-recipe-themeal.json!
Success proceeding file 240306_111816-l-recipe-themeal.json!
Success proceeding file 240306_111816-m-recipe-themeal.json!
Success proceeding file 240306_111816-n-recipe-themeal.json!
Success proceeding file 240306_111816-o-recipe-themeal.json!
Success proceeding file 240306_111816-p-recipe-themeal.json!
Success proceeding file 

### the food

In [387]:
thefood_recipe_path = data_config.data_raw_path/data_config.the_food_dir/'recipe'
prefix = '240316'
file_paths = list(thefood_recipe_path.glob(f'{prefix}*.json'))

now = datetime.now() # current date and time
etl_date = now.strftime("%y%m%d")

recipe_sql = """INSERT INTO recipe (id, name, instruction, img_link, source_link, etl_date, rating, total_user_rated, time_cooking) \
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"""

measure_sql = """INSERT INTO measure (recipe_id, ingredient_name_in_meal, map_perc, map_ingredient_id, \
        map_ingredient_name, measure_quantity, measure_unit, etl_date ) \
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""

In [388]:
def map_string_the_food(db, s, threshold=90):
    best_score =  0
    for id_s, map_s in db:
        score = fuzz.token_set_ratio(map_s, s.lower())
        if score > best_score:
            best_score = score
            best_s = map_s
            best_id = id_s
    if best_score > threshold:
        return best_s, best_id, best_score
    else:
        return None, None, score     
    
mycursor.execute("SELECT id, name FROM ingredient")
ingredient_name_db = mycursor.fetchall()
ingredient_name_db_sort = sorted(ingredient_name_db, key= lambda x: len(x[1]), reverse=True)

In [389]:
measure_unit_pattern = "|".join(data_config.measure_unit_db + [i+'s' for i in data_config.measure_unit_db ])
for path_file in file_paths:
    with open(path_file) as json_file:
        json_data = json.load(json_file)
        if not(json_data):
            continue
        recipe_vals = []
        recipe_category_vals = []
        measure_vals = []
        for recipe in json_data.values():
            recipe_id = 'TF_'+recipe['recipe_link'].split('-')[-1]
            recipe_name = ' '.join(recipe['recipe_link'].split('/')[-1].split('-')[:-1]).lower()
            instruction = recipe['instruction']
            img_link = recipe['recipe_img']
            source_link = recipe['recipe_link']
            rating_ref = recipe['rating']
            rating = re.search(r'([.0-9]*) stars',rating_ref).group(1)
            total_user_rated = re.search(r'([0-9]*) ratings',rating_ref).group(1)
            time_cooking = recipe['time']
            measure_ing = []
            flag_continue = False
            ingredients = recipe['ingredient']
            for ingredient in ingredients:
                ingredient = ingredient.lower()
                map_ingredient, map_id_ingredient, score = map_string_the_food(ingredient_name_db, ingredient, threshold=80)
                if not(map_ingredient):
                    flag_continue = True
                    break
                if len(ingredient.split('\n')) == 1:
                    measure_quantity = None
                    measure_unit = None
                else:

                    measure_extract = re.search(r'(.*)\n ?(\(.*\))?',ingredient).group(2)
                    if not(measure_extract):
                        measure_extract = ingredient.split('\n')[0]
                    measure_extract = measure_extract.split('-')[0]
                    measure_extract = measure_extract.replace('⁄','/')
                    extract_numbers = re.findall(r'([0-9]+[./]*[0-9]*) *([0-9]+[./]*[0-9]*)*', measure_extract)
                    measure_quantity = 0
                    for numbers in extract_numbers:
                        for number in numbers:
                            if number == '':
                                continue
                            else:
                                measure_quantity += float(Fraction(number))
                    if measure_quantity == 0:
                        measure_quantity = None

                    measure_unit = re.findall("("+measure_unit_pattern+")",ingredient)
                    if len(measure_unit):
                        measure_unit = measure_unit[0]
                    else:
                        measure_unit = None   

                measure_ing += [(recipe_id, ingredient, score, map_id_ingredient,
                                map_ingredient, measure_quantity, measure_unit, etl_date)]

            if flag_continue:
                continue
                    
            recipe_vals += [(recipe_id, recipe_name, instruction, img_link,  source_link, etl_date, rating, total_user_rated, time_cooking)]
            
            measure_vals += measure_ing
            
        mycursor.executemany(recipe_sql, recipe_vals)
        mycursor.executemany(recipe_category_sql, recipe_category_vals)
        mycursor.executemany(measure_sql, measure_vals)
        print(f"Success proceeding file {path_file.name}!")
        


Success proceeding file 240316_160511-recipes-p1.json!
Success proceeding file 240316_160511-recipes-p10.json!
Success proceeding file 240316_160511-recipes-p11.json!
Success proceeding file 240316_160511-recipes-p12.json!
Success proceeding file 240316_160511-recipes-p13.json!
Success proceeding file 240316_160511-recipes-p2.json!
Success proceeding file 240316_160511-recipes-p3.json!
Success proceeding file 240316_160511-recipes-p4.json!
Success proceeding file 240316_160511-recipes-p5.json!
Success proceeding file 240316_160511-recipes-p6.json!
Success proceeding file 240316_160511-recipes-p7.json!
Success proceeding file 240316_160511-recipes-p8.json!
Success proceeding file 240316_160511-recipes-p9.json!


In [390]:
mydb.close()

# theMealDb
themealdb.com

In [13]:
# recipe
recipe_response_API = requests.get('https://www.themealdb.com/api/json/v1/1/search.php?f=a')
recipe_data = recipe_response_API.text
recipe_json = json.loads(recipe_data)


# receipt

In [33]:
from datetime import datetime
date_time = datetime.now().strftime("%y%m%d_%H%M%S")



'240306_094610'

In [34]:
# category
recipe_response_API = requests.get('https://www.themealdb.com/api/json/v1/1/search.php?f=a')
recipe_data = recipe_response_API.text
recipe_json = json.loads(recipe_data)

In [55]:
# category
category_response_API = requests.get('https://www.themealdb.com/api/json/v1/1/categories.php')
category_data = category_response_API.text
category_json = json.loads(category_data)

In [57]:
# ingredient
ingredient_response_API = requests.get('https://www.themealdb.com/api/json/v1/1/list.php?i=list')
ingredient_data = ingredient_response_API.text
ingredient_json = json.loads(ingredient_data)

In [66]:

def create_image_ingredient_link(ing_name):
    ing_name = ing_name.replace(' ',r'%20')
    img_link = f"https://www.themealdb.com/images/ingredients/{ing_name}.png" 
    return img_link

In [23]:
# area
area_response_API = requests.get('https://www.themealdb.com/api/json/v1/1/list.php?a=list')
area_data = area_response_API.text
area_json = json.loads(area_data)

In [None]:
# image
image_response_API = requests.get('https://www.themealdb.com/api/json/v1/1/list.php?a=list')
image_data = image_response_API.text
image_json = json.loads(image_data)

# food.com

In [5]:
f_recipe_response_API = requests.get('https://www.food.com/recipe')

In [72]:
soup = BeautifulSoup(f_recipe_response_API.content, 'html.parser')

In [73]:
soup


<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<script>
      const oArr = (window.location.host||'').split('.');
      const oArrLength = oArr.length;
      // Set up the cookie Domain with the format .domain.com
      const userConsentCD = '.' + oArr[oArrLength - 2] + '.' + oArr[oArrLength - 1];
      let queryParams = new URLSearchParams(location.search);
      let countryLoc = queryParams.get("countryLoc");
      let stateLoc = queryParams.get("stateLoc");
      if(countryLoc && stateLoc) {
          const date = new Date();
          date.setTime(date.getTime()+(10000));
          const expires = '; expires=' + date.toGMTString();
          document.cookie = "__ds_loc_country=" + countryLoc + ";domain=" + userConsentCD + expires;
          document.cookie = "__ds_loc_state=" + stateLoc + ";domain=" + userConsentCD + expires;
      }
      // Set up user config with the One Trust ID and 

In [75]:
soup.select("body > div.fd-site > div.fd-site-wrapper > div.container-sm-md.gk-tile-content > div.tile-stream.clearfix.fdStream")

[]

In [83]:
soup.findAll('div', attrs={'class':'cta'})[0]

<div class="cta">
<i class="icon-fdc-loading fa-spin"></i>
</div>

In [40]:
soup.find('div', class_='inner-wrapper').get_text()

AttributeError: 'NoneType' object has no attribute 'get_text'

In [6]:
soup.find('div', {" data.id": "recipe-45809"})

NameError: name 'soup' is not defined

In [27]:
from lxml import html
tree = html.fromstring(f_recipe_response_API.text)  

In [9]:
f_recipe_response_API.text

'\n\n<!DOCTYPE html>\n<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#" xmlns="http://www.w3.org/1999/xhtml">\n\n\n\n\n\n<head>\n\n\n\n\n\n\n\n\n\n\n\n\n\n    <script>\n      const oArr = (window.location.host||\'\').split(\'.\');\n      const oArrLength = oArr.length;\n      // Set up the cookie Domain with the format .domain.com\n      const userConsentCD = \'.\' + oArr[oArrLength - 2] + \'.\' + oArr[oArrLength - 1];\n      let queryParams = new URLSearchParams(location.search);\n      let countryLoc = queryParams.get("countryLoc");\n      let stateLoc = queryParams.get("stateLoc");\n      if(countryLoc && stateLoc) {\n          const date = new Date();\n          date.setTime(date.getTime()+(10000));\n          const expires = \'; expires=\' + date.toGMTString();\n          document.cookie = "__ds_loc_country=" + countryLoc + ";domain=" + userConsentCD + expires;\n          document.cookie = "__ds_loc_state=" + stateLoc + ";domain=" + userConsentCD + ex

In [17]:
buyers = tree.xpath('/html/body/div[1]/div[1]/div[3]/div[2]/div[1]')


In [101]:
htmlstring(tree.xpath('//h2')[0])

b'<h2 class="title">Loading...</h2>\n                    '

In [90]:
a = tree.cssselect('/html/body/div[1]/div[1]/div[3]')

SelectorSyntaxError: Expected selector, got <DELIM '/' at 0> (<string>)

In [55]:
from lxml.etree import tostring as htmlstring
htmlstring(a[0])

b'<div class="cta">\n                        <i class="icon-fdc-loading fa-spin"/>\n                    </div>\n                '

In [43]:
a[0].xpath(".//*")[2].tag

'style'

# yazio

https://www.yazio.com/en/foods/beer-light.html

In [102]:
f_recipe_response_API = requests.get('https://www.yazio.com/en/foods/beer-light.html')

In [103]:
tree = html.fromstring(f_recipe_response_API.text)  

In [115]:
# calories
tree.xpath('//*[@id="serving-calculator-numbers-calories"]/@data-value')[0]

text()='My Button

'95.7'

In [135]:
# calories
tree.xpath('/html/body/main/section[1]/div[2]/div[2]/div[2]/span//text()')

# carbohydrats
tree.xpath('/html/body/main/section[1]/div[2]/div[3]/div[2]/span//text()')

# protein
tree.xpath('/html/body/main/section[1]/div[2]/div[4]/div[2]/span//text()')

# fat
tree.xpath('/html/body/main/section[1]/div[2]/div[5]/div[2]/span//text()')

['0.0']

In [120]:
# ingredient
tree.xpath('//h1[@class="text-left mt-16 mb-32"]//text()')

['Beer, light']

In [143]:
# value of amount
tree.xpath('//input[@class="amount-input input js-max-length"]/@value')

# serving 
tree.xpath('//*[@id="serving-select-list"]/span[1]//text()')

['bottle (330.0 ml)']

In [166]:
# try to list all link source of web
# layer 1
a = requests.get('https://www.yazio.com/en/foods')
t = html.fromstring(a.text)  


TypeError: Type 'list' cannot be serialized.

In [176]:
# get link of all group
t.xpath('//div[@class="img-title-wrapper d-flex flex-column justify-content-center mt-16 align-self-start"]//a/@href')

['https://www.yazio.com/en/foods/alcoholic-drinks-beverages',
 'https://www.yazio.com/en/foods/baking-ingredients',
 'https://www.yazio.com/en/foods/cakes-pies',
 'https://www.yazio.com/en/foods/candy-sweets',
 'https://www.yazio.com/en/foods/cereals-grain-products',
 'https://www.yazio.com/en/foods/cheese',
 'https://www.yazio.com/en/foods/dips-spreads',
 'https://www.yazio.com/en/foods/dishes-meals',
 'https://www.yazio.com/en/foods/fast-food',
 'https://www.yazio.com/en/foods/fish-fish-products',
 'https://www.yazio.com/en/foods/fruits-fruit-products',
 'https://www.yazio.com/en/foods/herbs-spices',
 'https://www.yazio.com/en/foods/legumes',
 'https://www.yazio.com/en/foods/meat-meat-products',
 'https://www.yazio.com/en/foods/milk-dairy-products',
 'https://www.yazio.com/en/foods/miscellaneous',
 'https://www.yazio.com/en/foods/non-alcoholic-drinks-beverages',
 'https://www.yazio.com/en/foods/nuts-seeds',
 'https://www.yazio.com/en/foods/oatmeal-muesli-cereal',
 'https://www.yazio.

In [251]:
# layer 2 
# if not .html => not the last layer
# repeat until reach the path is .html
a2 = requests.get('https://www.yazio.com/en/foods/cheese')
t2 = html.fromstring(a2.text) 


In [252]:
t2.xpath('(//div[@class="row list-row "])[1]//a/@href')

['https://www.yazio.com/en/foods/cream-cheese.html',
 'https://www.yazio.com/en/foods/cream-cheese-fat-free.html',
 'https://www.yazio.com/en/foods/cream-cheese-low-fat.html',
 'https://www.yazio.com/en/foods/neufchatel-cheese.html',
 'https://www.yazio.com/en/foods/ricotta-cheese.html',
 'https://www.yazio.com/en/foods/requeijao-cremoso-light-catupiry.html']

['https://www.yazio.com/en/foods/cream-cheese',
 'https://www.yazio.com/en/foods/hard-cheese',
 'https://www.yazio.com/en/foods/sliced-cheese',
 'https://www.yazio.com/en/foods/soft-cheese']

In [209]:
t2.xpath("/html/body/main/div[2]/div[3]/div/ul[1]/li[1]/div[2]/div/div/@class")

['list-item-block table-cell vertical-middle']

In [213]:
htmlstring(t2.xpath('//div[@class="list-container "]')[0])

b'<div class="list-container ">\n<div class="row list-row ">\n<ul class="col-sm-6 list-primary list-circle-big">\n<li>\n<div class="list-item-image-wrapper">\n<picture data-alt="Cream Cheese" data-default-src="https://images.yazio.com/creamcheese.jpg?w=80&amp;h=80&amp;cf&amp;q=90">\n<source srcset="https://images.yazio.com/creamcheese.jpg?w=80&amp;h=80&amp;cf&amp;q=90,&#10;                                    https://images.yazio.com/creamcheese.jpg?w=160&amp;h=160&amp;cf&amp;q=90 2x">\n<img src="" alt="Cream Cheese" title="Cream Cheese" class="list-item-image img-circle" width="80" height="80"/>\n</source></picture>\n</div>\n<div class="list-item-group">\n<div class="list-item-table table">\n<div class="list-item-block table-cell vertical-middle">\n<a href="https://www.yazio.com/en/foods/cream-cheese" class="link ">\nCream Cheese </a>\n<p class="list-item-description">\ncontains 6 foods\n</p>\n</div>\n<span class="list-item-text table-cell vertical-middle icon icon-yz_20-001-right-open

# Selenium
scrap data 
"https://fddb.info/db/en/search/?cat=site-en&search=Cheddar+Cheese"

In [2]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time

In [51]:
options = webdriver.ChromeOptions()
options.add_argument('--disable-cookies')

driver = webdriver.Chrome(options=options)

string = 'Cheddar Cheese'
string = string.replace(' ', '+')
driver.get('https://fddb.info/db/en/search/?cat=site-en&search='+string)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="CybotCookiebotDialogBodyButtonDecline"]'))).click()
# driver.find_element_by_xpath('//*[@id="CybotCookiebotDialogBodyButtonDecline"]').click()
driver.implicitly_wait(5)
driver.find_element("xpath",'//*[@class="rla"][1]').click()
print(a)
# driver.quit()

<selenium.webdriver.remote.webelement.WebElement (session="827a51fe6aea2db3f4af32eec905df16", element="f.68B2A4171D640C55FC2C0B12145E10D6.d.4EAFB832E9CC2EAAD2955B448EB95CA1.e.47")>


In [None]:
//*[@id="content"]/div[3]/div[1]/div/div/table[2]/tbody/tr[2]/td[2]/div

[//*[@id="content"]//tr]

In [65]:
f_recipe_response_API = requests.get('https://www.nutritionix.com/database/common-foods?page=1')

In [74]:
tree = html.fromstring(f_recipe_response_API.text)  

In [73]:
f_recipe_response_API.text

'<!doctype html>\n<html lang="en-us" state-class="cssClass">\n<head>\n  <base href="/">\n\n  <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">\n  <meta http-equiv="Content-Language" content="en">\n\n  <title ng-bind="MetaTags.title || \'Nutritionix\'">Nutritionix</title>\n  <meta property="og:title" content="{{MetaTags.title || \'Nutritionix\'}}"/>\n  <meta name="twitter:title" content="{{MetaTags.title || \'Nutritionix\'}}"/>\n  <meta name="p:domain_verify" content="137b225f91bb8e3a501ae281806e9b14"/>\n\n  <meta ng-if="MetaTags.description" name="description" content="{{MetaTags.description}}">\n  <meta ng-if="MetaTags.description" property="og:description" content="{{MetaTags.description}}"/>\n  <meta ng-if="MetaTags.description" name="twitter:description" content="{{MetaTags.description}}"/>\n\n  <meta ng-if="MetaTags.keywords" name="keywords" content="{{MetaTags.keywords}}">\n\n  <meta ng-repeat="(key, value) in MetaTags.properties"\n        nam

In [36]:
//*[@id="content"]/div[3]/div[1]/div/div/table[2]/tbody/tr[2]/td[2]/div

SyntaxError: invalid syntax (2309997718.py, line 1)

# nutritionix.com

In [23]:
options = webdriver.ChromeOptions()
options.add_argument('--disable-cookies')

driver = webdriver.Chrome(options=options)

string = '1'
string = string.replace(' ', '+')
driver.get('https://www.nutritionix.com/database/common-foods?page='+string)
elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@class="item-row item-hover ng-scope"]')))

# elements[0].click()


In [27]:
elements[3].get_attribute("href")

'/food/oreo'

In [129]:
elements

[<selenium.webdriver.remote.webelement.WebElement (session="393be40407dc45c1c89ce1fbc0dfbf2c", element="f.5E9F99736173D22D954A39EB2E9DC7F3.d.C9B836BB7C29EA377431E7017812CC24.e.37")>,
 <selenium.webdriver.remote.webelement.WebElement (session="393be40407dc45c1c89ce1fbc0dfbf2c", element="f.5E9F99736173D22D954A39EB2E9DC7F3.d.C9B836BB7C29EA377431E7017812CC24.e.38")>,
 <selenium.webdriver.remote.webelement.WebElement (session="393be40407dc45c1c89ce1fbc0dfbf2c", element="f.5E9F99736173D22D954A39EB2E9DC7F3.d.C9B836BB7C29EA377431E7017812CC24.e.39")>,
 <selenium.webdriver.remote.webelement.WebElement (session="393be40407dc45c1c89ce1fbc0dfbf2c", element="f.5E9F99736173D22D954A39EB2E9DC7F3.d.C9B836BB7C29EA377431E7017812CC24.e.40")>,
 <selenium.webdriver.remote.webelement.WebElement (session="393be40407dc45c1c89ce1fbc0dfbf2c", element="f.5E9F99736173D22D954A39EB2E9DC7F3.d.C9B836BB7C29EA377431E7017812CC24.e.41")>,
 <selenium.webdriver.remote.webelement.WebElement (session="393be40407dc45c1c89ce1fbc

In [8]:
#elements[0]
elements[0].find_element(By.XPATH, '//*[@class="item-photo"]').get_attribute('src')


'https://nix-tag-images.s3.amazonaws.com/1814_thumb.jpg'

In [16]:
options = webdriver.ChromeOptions()
options.add_argument('--disable-cookies')
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)

driver.get('https://www.nutritionix.com/food/greek-yogurt')
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@class="fc-button fc-cta-consent fc-primary-button"]'))).click()

tab_index = driver.find_element(By.XPATH, '//*[@class="nf-unitQuantityBox nf-modifier-field"]')
# action = ActionChains(driver)
# action.move_to_element(tab_index).click().send_keys(Keys.DELETE).send_keys('1').send_keys(Keys.ENTER)
# action.perform()

driver.implicitly_wait(2)

food_label = driver.find_element(By.XPATH, '//*[@class="nf"]')
serving_size = food_label.find_element(By.XPATH, '//input[@type="hidden"]').get_attribute('value')
serving_unit = food_label.find_element(By.XPATH, '//*[@class="nf-serving-unit-name "]').text.split('\n')[0]
second_amount = food_label.find_element(By.XPATH, '//*[@itemprop="servingSize"]').text.split('\n')[0]
second_unit = food_label.find_element(By.XPATH, '//*[@class="sr-only"]').text.split('\n')[0]
calories = food_label.find_element(By.XPATH, '//*[@itemprop="calories"]').text.split('\n')[0] 
food_name = food_label.find_element(By.XPATH, '//*[@class="nf-item-name block"]').text.split('\n')[0] 
total_fat = food_label.find_element(By.XPATH, '//*[@itemprop="fatContent"]').text.split('\n')[0] 
cholesterol = food_label.find_element(By.XPATH, '//*[@itemprop="cholesterolContent"]').text.split('\n')[0] 
sodium = food_label.find_element(By.XPATH, '//*[@itemprop="sodiumContent"]').text.split('\n')[0] 
carbonhydrate = food_label.find_element(By.XPATH, '//*[@itemprop="carbohydrateContent"]').text.split('\n')[0] 
protein = food_label.find_element(By.XPATH, '//*[@itemprop="proteinContent"]').text.split('\n')[0] 
calcium = food_label.find_element(By.XPATH, '//*[@itemprop="calciumContent"]').text.split('\n')[0] 
iron = food_label.find_element(By.XPATH, '//*[@itemprop="ironContent"]').text.split('\n')[0] 
potassium = food_label.find_element(By.XPATH, '//*[@itemprop="potassiumContent"]').text.split('\n')[0] 
vitamin_d = food_label.find_element(By.XPATH, '//*[@itemprop="vitaminDContent"]').text.split('\n')[0] 
caffeine = food_label.find_element(By.XPATH, '//*[@itemprop="caffeineContent"]').text.split('\n')[0] 


d={'name': food_name,
  'serving_unit':serving_unit,
  'serving_size': serving_size,
  'unit':second_unit,
  'amount': second_amount,
  'calories': calories,
  'total_fat': total_fat,
  'cholesterol': cholesterol,
  'sodium': sodium,
  'carbonhydrate': carbonhydrate,
  'protein': protein,
  'calcium': calcium,
  'iron': iron,
  'potassium': potassium,
  'vitamin_d': vitamin_d,
  'caffeine': caffeine
}

total_height = food_label.size["height"] + 1000
total_width = food_label.size['width']
driver.set_window_size(total_width, total_height)  # the trick
# Javascript expression to scroll to a particular element
# arguments[0] refers to the first argument that is later passed
# in to execute_script method
js_code = "arguments[0].scrollIntoView();"
driver.execute_script(js_code, food_label)


WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@class="btn btn-default btn-xs"]'))).click()
food_label.screenshot('ff_test2.png')



True

In [214]:
food_label.find_elements(By.XPATH, '//*[@class="nf-line"]')[1].text

'Total Fat 0.7g\ngrams\n1%\nDaily Value'

# FOOD.COM
https://www.food.com/search/

In [31]:
# list of recipe
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
import time


options = webdriver.ChromeOptions()
options.add_argument('--disable-cookies')

driver = webdriver.Chrome(options=options)

driver.get('https://www.food.com/search/')

users = set()
recipes = set()
recipe_db = {}
for i in range(2): # 2000
    driver.get('https://www.food.com/search/')
    elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@class="fd-tile fd-recipe  "]')))
    while len(elements) <= (i+1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@class="fd-tile fd-recipe  "]')))
    cook_time = elements[i].find_elements(By.XPATH, '//*[@class="cook-time"]')[i].text 
    recipe_link = elements[i].find_elements(By.XPATH, '//*[@class="inner-wrapper"]/a')[i].get_attribute('href') # recipe
    recipe_img = elements[i].find_elements(By.XPATH, '//img')[i].get_attribute('src') # img
    user_link = elements[i].find_elements(By.XPATH, '//*[@class="author"]//a')[i].get_attribute('href') # user link
    users.add(user_link)
    if recipe_link not in recipes:
        elements[i].click()

        serving_tab = driver.find_element(By.XPATH, '//*[@class="value svelte-1o10zxc"]')
        action = ActionChains(driver)
        action.move_to_element(serving_tab).click().send_keys(Keys.DELETE).send_keys('1').send_keys(Keys.ENTER)
        action.perform()

        igr_elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@style="display: contents"]')))
        igrs = []
        for i in range(1, len(igr_elements)):
            igd = igr_elements[i].text
            igrs += [igd]
            
        instruction = driver.find_element(By.XPATH, '//*[@class="direction-list svelte-1dqq0pw"]').text

        reviews = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="reviews"]//*[@class="post svelte-omstw2"]')))
        user_elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="reviews"]//*[@class="post svelte-omstw2"]//*[@class="post__avatar svelte-omstw2"]/a')))
        for u in user_elements:
            user = u.get_attribute('href')
            users.add(user)
            
        rating = driver.find_element(By.XPATH, '//*[@class="layout__item rating-badge svelte-1dqq0pw"]//a').get_attribute('aria-label')
        
        recipe_id = recipe_link.split('-')[-1]
        
        recipe_db[recipe_id] = {
            'recipe_link' : recipe_link,
            'author': user_link,
            'ingredient': igrs,
            'rating': rating,
            'recipe_img': recipe_link,
            'time': cook_time,
            'serving_pp': 1
        }
# user
user_db = {}
for link in users:
    driver.get(link)
    reviews = {}
    while True:
        items = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@class="gk-aa-item"]')))
        for item in items:
            if 'reviewed' in item.text.split('\n')[0]:
                recipe_link = item.find_element(By.CLASS_NAME, "gk-aa-item-heading-info").find_elements(By.TAG_NAME, "a")[1].get_attribute('href')
                if recipe_link in recipes:
                    rating = item.find_element(By.XPATH,'//*[@class="fd-rating-percent"]').get_attribute('style')
                    recipe_id = recipe_link.split('-')[-1]
                    reviews[recipe_id] = rating
        try:
            WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH,'//*[@class="gk-aa-load-more"]'))).click()
        except TimeoutException:
            break
    user_id = link.split(r'/')[-1]
    user_db[user_id] = reviews



In [32]:
recipe_link
# elements[2].find_elements(By.CLASS_NAME, '//*[@class="cook-time"]')


'https://www.food.com/recipe/best-banana-bread-2886'