# Fetch all meals with nutrition facts and ingredients

In [93]:
from __future__ import unicode_literals
from bs4 import BeautifulSoup
from contextlib import closing
from requests import get
from requests.exceptions import RequestException
import matplotlib.pyplot as plt
import csv
import html5lib
import logging
import os
import pandas as pd
import numpy as np
import re
import sys
import time

In [198]:
import selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from datetime import datetime as dt
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [275]:
def get_driver():
    """Create and return a Chrome WebDriver that runs in headless mode."""
    headless_options = Options()
    headless_options.add_argument("--headless")
    return webdriver.Chrome(options=headless_options)

In [276]:
# Start the headless Chrome session. The ingredient-scraping loop further down
# loads every recipe page through this module-level `driver`.
driver = get_driver()

In [2]:
# Configure the root logger to append INFO-and-above records to a log file.
# NOTE(review): the path is absolute and machine-specific — confirm the
# directory exists on the machine running this notebook.
logging.basicConfig(filename = '/home/hardik/BMI-based-food-recommendation/scraping_logs.log', filemode='a', level=logging.INFO)

In [3]:
# Named logger that echoes INFO-and-above records to stdout.
logger = logging.getLogger('test')
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise attach another stdout handler each time and every
# record would be printed once per handler. Attach one only if none exists yet.
if not any(getattr(h, 'stream', None) is sys.stdout for h in logger.handlers):
    logger.addHandler(logging.StreamHandler(sys.stdout))

In [4]:
def get_html(url):
    """Fetch ``url`` with a GET request and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    try:
        # With stream=True the connection stays checked out until the body is
        # read or the response is closed. closing() releases it on every path,
        # including the non-200 one where the body is never consumed.
        with closing(get(url, stream=True, timeout=30)) as resp:
            if resp.status_code == 200:
                return resp.content
            logging.warning('GET %s returned status %s', url, resp.status_code)
            return None
    except RequestException as e:
        # Network-level failures are logged with their traceback and reported
        # to the caller as "no content".
        logging.exception(str(e))
        return None

In [163]:
def get_food_name_url(parsed_data_html):
    """Extract one record per search result from a food browse page.

    Args:
        parsed_data_html: Raw HTML of a browse page (``str`` or ``bytes``).
            ``None`` or empty input, e.g. from a failed download, is accepted.

    Returns:
        A list with one list per result row, laid out as
        ``[food_name, nutrition_link, nutrient_1, nutrient_2, ...]``.
        Empty when there is no HTML or the page has no result rows.
    """
    # A failed fetch yields None, which BeautifulSoup cannot parse; treat it
    # the same as a page without results instead of raising.
    if not parsed_data_html:
        return []

    # Convert html data into BeautifulSoup format using the built-in html parser
    parsed_data_bs = BeautifulSoup(parsed_data_html, "html.parser")

    # Every search hit is a <div class="row food_result"> wrapping one link
    food_list = []
    for each_row in parsed_data_bs.findAll('div', {'class': 'row food_result'}):
        link = each_row.find('a')
        if link is None:
            # Row without an anchor: nothing to follow, skip it.
            continue
        nutrition_link = link.get('href')
        food_name = link.find('div', {'class': 'result_name'})
        if nutrition_link is None or food_name is None:
            # Incomplete row (no target URL or no name): skip rather than crash.
            continue

        nuts = []
        for each_nutrient in link.findAll('div', {'class': 'nutrient_cell'}):
            tokens = each_nutrient.text.strip().split(' ')
            if len(tokens) != 1:
                # Multi-token cell: keep the first token without its final
                # character, followed by the last token.
                each_nut = tokens[0][:-1] + ' ' + tokens[-1]
            else:
                each_nut = tokens[0]
            nuts.append(each_nut)

        food_list.append([food_name.text.strip(), nutrition_link] + nuts)
    return food_list

In [172]:
# Browse-page URL prefixes, one per recipe group; the page number is appended
# when a page is fetched. Each assignment overwrites the previous one, so when
# these lines run together only the last ('Mostly meat') URL remains in `url`.
# NOTE(review): presumably the fetch loop was re-run once per group with the
# matching line active — confirm before running this file top to bottom.
url = 'https://www.eatthismuch.com/food/browse/?q=&type=recipe&group=Pasta&page='
url = 'https://www.eatthismuch.com/food/browse/?q=&type=recipe&group=Sandwiches&page='
url = 'https://www.eatthismuch.com/food/browse/?q=&type=recipe&group=Mostly%20meat&page='

In [174]:
# Walk browse pages 1-35 for the group currently selected in `url` and collect
# every result row into `food_list`.
food_list = []
for i in range(1,36):
    current_page_url = url + str(i)
    parsed_data_html = get_html(current_page_url)
    if parsed_data_html is None:
        # get_html yields None on a non-200 response or a request error;
        # handing that to the HTML parser would raise and abort the crawl.
        logging.warning('Skipping page %s: no HTML returned', current_page_url)
        continue
    food_list.extend(get_food_name_url(parsed_data_html))

In [166]:
# Tabulate the rows currently held in `food_list`, one row per food.
# NOTE(review): presumably `food_list` holds the Pasta crawl when this runs — verify.
pastas_list = pd.DataFrame(food_list)

In [171]:
# Tabulate the rows currently held in `food_list`, one row per food.
# NOTE(review): presumably `food_list` holds the Sandwiches crawl when this runs — verify.
sandwich_list = pd.DataFrame(food_list)

In [175]:
# Tabulate the rows currently held in `food_list`, one row per food.
# NOTE(review): presumably `food_list` holds the 'Mostly meat' crawl when this runs — verify.
meat_list = pd.DataFrame(food_list)

In [176]:
# Tag every row with its food group so the origin survives the concatenation
# of the three frames.
pastas_list['type'] = 'pasta'
sandwich_list['type'] = 'sandwich'
meat_list['type'] = 'meat'

In [177]:
# Stack the three per-group frames into one table (row labels are not yet renumbered).
food = pd.concat([pastas_list, sandwich_list, meat_list])

In [178]:
# Renumber the rows 0..n-1 in place; drop=True discards the per-group labels
# carried over from the concatenation instead of keeping them as a column.
food.reset_index(inplace=True, drop=True)

In [181]:
# Name the eight columns: name, link, five nutrient values, then the group tag.
# NOTE(review): assumes every scraped row carried exactly five nutrient cells — TODO confirm.
food.columns = ['food_name', 'food_url', 'Calories', 'Carbs', 'Fat', 'Protein', 'Fiber', 'type']

In [183]:
# Persist the combined food table (with header, without the row index) to the
# working directory.
food.to_csv('food_list.csv', header=True, index=False)

In [256]:
# One array per food row, in column order; position 1 is the 'food_url' value
# that the ingredient-scraping loop appends to the site root.
food_array = list(food.values)

In [271]:
def get_food_ingredients(parsed_data_html):
    """Pull the ingredient names out of a recipe page.

    Args:
        parsed_data_html: HTML source of a recipe page. ``None`` or empty
            input is accepted.

    Returns:
        The ingredient names joined with ``', '`` as a single string. An empty
        list is returned instead when there is no HTML, the page has no
        ingredients box, or an ingredient entry lacks its name element; that
        sentinel is kept so existing callers see the same values as before.
    """
    # Nothing to parse (e.g. a failed page load): report "no ingredients".
    if not parsed_data_html:
        return []

    parsed_data_bs = BeautifulSoup(parsed_data_html, "html.parser")

    # Ingredients are the <li> items inside <div class="ingredients_box">
    ingredients_box = parsed_data_bs.find('div', {'class': 'ingredients_box'})
    if ingredients_box is None:
        return []

    ingredient_list = []
    for each_ingredient in ingredients_box.findAll('li'):
        name_div = each_ingredient.find('div', {'class': 'print_name'})
        if name_div is None:
            # An entry without a name element invalidates the whole page result.
            return []
        ingredient_list.append(name_div.text)
    return ', '.join(ingredient_list)

In [277]:
# Visit every recipe page with the headless browser and record its ingredients
# as [food_url, ingredients] pairs in `ingredients_list`.
url = 'https://www.eatthismuch.com'

# Create the accumulator on a first run; keep it when this cell is re-run so an
# interrupted crawl can pick up where it stopped.
try:
    ingredients_list
except NameError:
    ingredients_list = []

# One pair is appended per processed row, so the number of pairs collected so
# far is the index of the next row to fetch (0 on a fresh run).
# NOTE(review): this replaces a hard-coded start index of 571 — confirm that
# `ingredients_list` is only ever filled by this loop.
for index in range(len(ingredients_list), len(food_array)):
    current_page_url = url + food_array[index][1]
    driver.get(current_page_url)
    # Fixed pause after navigation before reading the DOM — presumably to let
    # client-side rendering finish.
    time.sleep(5)
    parsed_data_html = driver.page_source
    ingredients = get_food_ingredients(parsed_data_html)
    ingredients_list.append([food_array[index][1], ingredients])

In [279]:
# Tabulate the collected [food_url, ingredients] pairs, one row per recipe page.
ingredient = pd.DataFrame(ingredients_list)

In [282]:
# Name the two columns; 'foodurl' is the key matched against food['food_url']
# when the two tables are joined.
ingredient.columns = ['foodurl', 'ingredients']

In [283]:
# Attach the ingredients to each food by matching on the recipe URL. pd.merge
# defaults to an inner join, so foods without a row in `ingredient` are dropped.
final_food = pd.merge(food, ingredient, left_on = 'food_url', right_on = 'foodurl')

In [285]:
# Remove the right-hand join key in place; it duplicates 'food_url' after the merge.
final_food.drop(['foodurl'], axis=1, inplace=True)

In [287]:
# Write the merged table (nutrition facts plus ingredients) to the working
# directory without the row index.
final_food.to_csv('food_data.csv', index=False)