<a href="https://colab.research.google.com/github/yuvalofek/Ingredient-Recommendation/blob/main/Ingredient_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating a ingredient pair dataset from The Food Bible book:
Referenced:
https://andrew-muller.medium.com/getting-text-from-epub-files-in-python-fbfe5df5c2da

I found an epub version of the book which I use for this scraping. I am assuming it is uploaded and the path to it is stored in the PATH variable down below. 

In [1]:
!pip install ebooklib



In [2]:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
from collections import defaultdict
import re
import random

from pprint import pprint
import sqlite3

In [3]:
PATH = '/content/flavorbible.epub'
food_bible = epub.read_epub(PATH)

In [4]:
chap_content = {}
# chapter 3 has all the ingredient pairing, so we go in and get the content for 
# everything with 3 in the chapter name
for item in food_bible.get_items_of_type(ebooklib.ITEM_DOCUMENT):
  name =item.get_name() 
  if '3' in name:
    chap_content[name] = BeautifulSoup(item.get_content(), 'html.parser')

Some notation the book uses:
<p class="btx1">KEY: Flavors mentioned in regular type are pairings suggested by one or more experts.</p>
<p class="btx1">Those in <strong>bold</strong> were recommended by a number of experts.</p>
<p class="btx1">Those in <strong>BOLD CAPS</strong> were very highly recommended by an even greater number of experts.</p>
<p class="btx13">Those in *<strong>BOLD CAPS</strong> with an asterisk (*) are “Holy Grail” pairings that are the most highly recommended by the greatest number of experts.</p>



## Setting the goals
Now that we have the data in a convenient form, we set some goals for the scrape:
Each edge should be given a number, indicating the strength of that connection as described in the KEY above.
I assign: 
* Regular pairing - 1
* Bold pairing - 2
* Bold & capitalized - 3
* Bold, capitalized, and asterisk - 5 (to bump it to a quantity of a "Holy Grail")

At the end of the scrape, we want to generate a bidirectional graph with weighted connections. For convenince, I think I will store this as a list of tuples in the form: (ingredient 1, ingredient 2, weight of connections)

## Weighing function 

In [5]:
# Functions to capture the different keys:
def weigh_edge(text):
  """ 
  Check conditions for the input text to see where it falls in the key
  """
  try:
    if '*' in text:
      # Asterisk --> Holy grail
      return 5
    if text.lower() != text:
      # Capitalized --> 3
      return 3
    if 'strong' in text.lower():
      # Bolded
      return 2
  except AttributeError:
    None
  return 1

In [6]:
# Test weigh_edge:
# normal 
print(weigh_edge('milk'))

## Only bolded -- 2
print(weigh_edge('<strong>bananas</strong>'))

## Bold and cap - 3
print(weigh_edge('<strong>BEEF,</strong>'))

## Holy grail - 5
print(weigh_edge('<strong>*PORK</strong>'))


# list - 1
print(weigh_edge(['apples','ginger','lemon','quince','sugar']))


1
2
3
5
1


## Initial scrape
Turns out the fomratting is not consistent and sometimes we get 'ul' that are meant to be the header

### We also need a way to get rid of the variable length - complex pairs
To do this we create a dict whose entries are (ingredient, next key). This allows us to create a singly-linked list build inside a dict object, which therefore allows for variable length sequences to be stored. 

If a next key does not exist, we store a None in the next_key to indicate the sequence is over. 

In [7]:
def chain_pairings(key, pair_list, pair_dict, used_keys):
  if len(pair_list)==1:
    pair_dict[key] = ((pair_list[0], None))
    return
  
  # get a key that wasn't used before and add it to used keys... not ideal but its something
  new_key = random.randint(0, 2**16)
  while new_key in used_keys:
    new_key = random.randint(0, 2**16)
  used_keys.add(new_key)

  # Chain the rest of the list
  pair_dict[key] = (pair_list[0], new_key)
  chain_pairings(new_key, pair_list[1:], pair_dict, used_keys)

In [8]:
def scrape_chapter(chapter, pairings, current_ingredient,variable_pairings, used_keys):  
  new = 2
  description_words = ['Season', 'Weight', 'Volume', 'Techniques'] ## Words used to describe current inggredient
  for tag in chapter.find_all('p'):
    class_name = tag.get('class')
    if class_name is not None:
      formatted_class = class_name[0].strip().lower()
      if formatted_class == 'lh1' or formatted_class == 'lh':
        # If this is the ingredient header, start a new entry in the dict and update
        # current ingredient
        current_ingredient = tag.get_text().lower().strip()
        new = 0
      elif formatted_class == 'ul':
        ingredient = tag.get_text().strip()

        if ':' in ingredient:
          # Noticed descriptions for the ingredients were also labeled ul, but they 
          # where followed by a : so we use that to ignore them. 
          continue
        
        # get edge weight
        weight = weigh_edge(ingredient)      
        
        ing_split_plus = ingredient.split('+')
        if len(ing_split_plus) > 1:
          # for more than pair ingredient combinations
          ingredient = [ing.strip() for ing in ing_split_plus]
          key = random.randint(0, 2**16)
          used_keys.add(key)
          chain_pairings(key, ingredient, variable_pairings, used_keys)
          ingredient = str(key)

        else: 
          # Drop astericks if they are there - helps make data uniform
          ingredient = re.sub('\*', '', ingredient).lower()
        
        if ingredient is None or ingredient == '':
          # If we get none of empty string, don't add to dict
          continue
        
        pairings.append((current_ingredient, ingredient, weight))
        new +=1
  return pairings, current_ingredient

In [9]:
# Test
pairings = list()
variable_pairings = dict()
used_keys = set()
current_ingredient = None
pairings, current_ingredient = scrape_chapter(chap_content['Text/FlavorBible_chap-3a.html'],
                                              pairings, current_ingredient,
                                              variable_pairings, used_keys)
pprint(pairings)

[('achiote seeds', 'beef', 1),
 ('achiote seeds', 'chicken', 1),
 ('achiote seeds', 'chiles', 1),
 ('achiote seeds', 'citrus (e.g., sour orange)', 1),
 ('achiote seeds', 'fish', 1),
 ('achiote seeds', 'game birds (e.g., duck, quail)', 1),
 ('achiote seeds', 'garlic', 1),
 ('achiote seeds', 'mexican cuisine, esp. yucatán oil', 3),
 ('achiote seeds', 'pork', 1),
 ('achiote seeds', 'shellfish, e.g., lobster, shrimp', 1),
 ('achiote seeds', 'shrimp', 1),
 ('achiote seeds', '10644', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'almonds', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'barley', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'breads', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'cardamom', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'chile pepper', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'cinnamon', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'cloves', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'coriander', 1),
 ('

In [10]:
# Get all the pairings!
pairings = list()
variable_pairings = dict()
used_keys = set()
for chapter in chap_content.values():
  pairings, current_ingredient = scrape_chapter(chapter, pairings, current_ingredient, variable_pairings, used_keys)

In [11]:
# Number of edges
len(pairings)

22219

### Counting Nodes

In [220]:
# We put all the ingredients into a set so we can count the nodes
ingredients = set()
for ing1, ing2, _ in pairings:
  try: 
    int(ing2)
    continue
  except ValueError:
    ingredients.add(ing1)

In [221]:
# Number of Nodes
len(ingredients)

602

### Splitting pairings into normal (pairs) and complex (for 3+ ingredients)

In [331]:
# Recovering only the two ingredient pairings:
complex_pairings = []
two_ing_pairings = []
for ing1, ing2, weight in pairings:
  try: 
    int(ing2) 
    complex_pairings.append((ing1, ing2))
  except ValueError:
    two_ing_pairings.append((ing1, ing2, weight))

In [332]:
two_ing_pairings

[('achiote seeds', 'beef', 1),
 ('achiote seeds', 'chicken', 1),
 ('achiote seeds', 'chiles', 1),
 ('achiote seeds', 'citrus (e.g., sour orange)', 1),
 ('achiote seeds', 'fish', 1),
 ('achiote seeds', 'game birds (e.g., duck, quail)', 1),
 ('achiote seeds', 'garlic', 1),
 ('achiote seeds', 'mexican cuisine, esp. yucatán oil', 3),
 ('achiote seeds', 'pork', 1),
 ('achiote seeds', 'shellfish, e.g., lobster, shrimp', 1),
 ('achiote seeds', 'shrimp', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'almonds', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'barley', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'breads', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'cardamom', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'chile pepper', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'cinnamon', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'cloves', 1),
 ('acidity (see sourness)\r\n  afghan cuisine', 'coriander', 1),
 ('acidity (see sourness)\r\n  afgh

In [333]:
len(two_ing_pairings)

20763

### Some cleaning of data

In [334]:
from collections import Counter

In [335]:
# Check the most common words in the ingredients:

words = []
for ing1, ing2, _ in two_ing_pairings:
  words.extend(ing1.split(' ') + ing2.split(' '))

word_counts = Counter(words)
word_counts.most_common(50)

[('—', 3368),
 ('(see', 3120),
 ('in', 2428),
 ('also', 2426),
 ('cuisine', 1780),
 ('general', 1713),
 ('and', 1358),
 ('esp.', 1007),
 ('general)', 656),
 ('(aka', 642),
 ('cheese,', 600),
 ('squash,', 560),
 ('oil', 541),
 ('beans,', 511),
 ('beef', 498),
 ('white', 498),
 ('oil,', 483),
 ('(e.g.,', 472),
 ('cream', 470),
 ('pork', 467),
 ('nuts', 464),
 ('fish', 434),
 ('chicken', 403),
 ('mushrooms', 399),
 ('garlic', 394),
 ('lemon', 379),
 ('black', 376),
 ('potatoes', 371),
 ('red', 358),
 ('ginger', 340),
 ('thyme', 340),
 ('seeds', 336),
 ('fennel', 336),
 ('juice', 324),
 ('vinegar,', 319),
 ('sauce', 315),
 ('tomatoes', 309),
 ('sweet', 307),
 ('greens', 307),
 ('leaf', 289),
 ('olive', 288),
 ('celery', 281),
 ('chile', 277),
 ('mint', 274),
 ('coconut', 273),
 ('pepper,', 272),
 ('peppers', 271),
 ('/', 268),
 ('basil', 267),
 ('cinnamon', 258)]

In [336]:
splitters = ['—', 'esp.', '(e.g.', 'eg', 'esp', 'and', 'or', 'aka', '/', 'see', 'also', 'e.g.']
deleters = ['also', 'in general']
length = 0


while length != len(two_ing_pairings):
  length = len(two_ing_pairings)
  for idx, (ing1, ing2, weight) in enumerate(two_ing_pairings):
    for splitter in splitters:
      ing1_ = re.sub(r'[.,!?:;\"\'-()]', '', ing1)
      ing2_ = re.sub(r'[.,!?:;\"\'-()]', '', ing2)

      if splitter in ing1_.split(' '):
        print('->', splitter, '<-')
        print('inp: ', ing1_)
        new_vals = ing1_.split(splitter)
        new_vals = [val.strip() for val in new_vals if val != '']
        print('Deleting...', two_ing_pairings[idx])
        del two_ing_pairings[idx]
        for val in new_vals:
          if val == '':
            continue
          delete_flag = False
          for delete in deleters:
            if delete == val.strip():
              delete_flag = True
              break
          if delete_flag:
            continue
          val_ = re.sub('\,|\;|\)|\(|-', '', val.strip())
          #print('~~', val)
          print('ADDING:', (val_, ing2_, weight))
          two_ing_pairings.append((val_, ing2_, weight))

      ## ing2
      if splitter in ing2_.split(' '):
        new_vals = ing2_.split(splitter)
        new_vals = [val.strip() for val in new_vals if val != '']
        print('Deleting...', two_ing_pairings[idx])
        del two_ing_pairings[idx]
        for val in new_vals:
          if val == '':
            continue
          delete_flag = False
          for delete in deleters:
            if delete == val.strip():
              delete_flag = True
              break
          if delete_flag:
            continue
          val_ = re.sub('\,|\;|\)|\(|-', '', val.strip())
          #print('~~', val)
          print('ADDING:', (ing1_, val_, weight))
          two_ing_pairings.append((ing1_, val_, weight))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ADDING: ('pecans', 'breads cookies pies', 1)
-> — <-
inp:  nuts — in general
Deleting... ('nuts — in general', 'grapes', 1)
ADDING: ('nuts', 'grapes', 1)
-> — <-
inp:  nuts — in general
Deleting... ('nuts — in general', 'rum', 1)
ADDING: ('nuts', 'rum', 1)
-> — <-
inp:  nuts — in general
Deleting... ('nuts — in general', 'arugula', 1)
ADDING: ('nuts', 'arugula', 1)
-> — <-
inp:  nuts — in general
Deleting... ('nuts — in general', 'lemon', 1)
ADDING: ('nuts', 'lemon', 1)
-> — <-
inp:  nuts — in general
Deleting... ('nuts — in general', 'avoid', 3)
ADDING: ('nuts', 'avoid', 3)
-> — <-
inp:  game — in general
Deleting... ('game — in general', 'asparagus white', 1)
ADDING: ('game', 'asparagus white', 1)
-> — <-
inp:  game — in general
Deleting... ('game — in general', 'curry paste thai yellow', 3)
ADDING: ('game', 'curry paste thai yellow', 3)
-> — <-
inp:  game — in general
Deleting... ('game — in general', 'pine nuts', 1)
A

In [337]:
len(two_ing_pairings)

32421

In [339]:
# Check the most common words in the ingredients:

words = []
for ing1, ing2, _ in two_ing_pairings:
  words.extend(ing1.split(' ') + ing2.split(' '))

word_counts = Counter(words)
word_counts.most_common(50)

[('cuisine', 2624),
 ('oil', 1089),
 ('fish', 795),
 ('nuts', 793),
 ('cream', 790),
 ('squash', 772),
 ('basil', 745),
 ('white', 729),
 ('beans', 713),
 ('greens', 679),
 ('mushrooms', 651),
 ('beef', 606),
 ('lemon', 605),
 ('potatoes', 589),
 ('chicken', 580),
 ('cheese', 563),
 ('pork', 555),
 ('peppers', 535),
 ('onions', 528),
 ('red', 516),
 ('garlic', 514),
 ('thyme', 504),
 ('fennel', 500),
 ('black', 497),
 ('juice', 459),
 ('seeds', 451),
 ('sauce', 449),
 ('pumpkin', 446),
 ('tomatoes', 427),
 ('sweet', 424),
 ('rice', 420),
 ('game', 418),
 ('ginger', 414),
 ('dishes', 414),
 ('coconut', 413),
 ('leaf', 412),
 ('olive', 409),
 ('anise', 406),
 ('lettuces', 391),
 ('cheese,', 381),
 ('fruit', 381),
 ('chile', 377),
 ('oranges', 374),
 ('shellfish', 372),
 ('green', 370),
 ('mint', 367),
 ('butter', 361),
 ('etc', 359),
 ('celery', 358),
 ('peas', 356)]

In [340]:
two_ing_pairings

[('achiote seeds', 'beef', 1),
 ('achiote seeds', 'chicken', 1),
 ('achiote seeds', 'chiles', 1),
 ('achiote seeds', 'fish', 1),
 ('achiote seeds', 'garlic', 1),
 ('achiote seeds', 'pork', 1),
 ('achiote seeds', 'shrimp', 1),
 ('african cuisine (south)', 'cinnamon', 1),
 ('african cuisine (south)', 'cloves', 1),
 ('african cuisine (south)', 'fenugreek', 1),
 ('african cuisine (south)', 'garlic', 1),
 ('african cuisine (south)', 'ginger', 1),
 ('african cuisine (south)', 'lamb', 1),
 ('african cuisine (south)', 'onions', 1),
 ('african cuisine (south)', 'peas', 1),
 ('african cuisine (south)', 'pumpkin', 1),
 ('african cuisine (south)', 'stews', 1),
 ('african cuisine (south)', 'tomatoes', 1),
 ('african cuisine (south)', 'turmeric', 1),
 ('african cuisine (west)', 'bananas', 1),
 ('african cuisine (west)', 'bell peppers', 1),
 ('african cuisine (west)', 'braised dishes', 1),
 ('african cuisine (west)', 'chicken', 1),
 ('african cuisine (west)', 'chile peppers', 1),
 ('african cuisine (

## Storing in SQLite

In [342]:
# Save to database
DB_PATH = '/content/drive/MyDrive/Colab Notebooks/ingredients_recommender/ingredients.db'
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()

In [343]:
# creating a table for the ingredients
c.execute("""
CREATE TABLE IF NOT EXISTS edges (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  ingred1 TEXT, 
  ingred2 TEXT, 
  weight INTEGER
);
""")
conn.commit()

In [344]:
c.execute("""
CREATE TABLE IF NOT EXISTS complex_edges (
  id INTEGER PRIMARY KEY,
  ingred TEXT, 
  key INTEGER
);
""")
conn.commit()

In [345]:
# Creating a table for the variable pairings
c.execute("""
CREATE TABLE IF NOT EXISTS long_pairings (
  key INTEGER PRIMARY KEY,
  ingred TEXT,  
  next_key INTEGER
);
""")
conn.commit()

In [346]:
c.executemany(""" INSERT INTO edges (ingred1, ingred2, weight) VALUES (?,?,?); """, two_ing_pairings)
conn.commit()

In [347]:
c.executemany(""" INSERT INTO complex_edges (ingred, key) VALUES (?,?); """, complex_pairings)
conn.commit()

In [348]:
c.executemany("""
  INSERT INTO long_pairings (key, ingred, next_key) VALUES (?,?, ?); 
  """,
  [(key, ing, nxt_key) for key, (ing, nxt_key) in variable_pairings.items()]
  )
conn.commit()

In [349]:
conn.close()