In [1]:
import pandas as pd
import ast
import re
import inflect
import json

In [39]:
# Helpers for cleaning up recipe titles and directions
def remove_extra_whitespace(input_string):
    """Normalise the whitespace in a string and lower-case it.

    Leading and trailing whitespace is stripped, every remaining run of
    whitespace is collapsed into a single space, and the result is
    returned in lower case.
    """
    trimmed = input_string.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed.lower()

def replace_non_alpha_parentheses(input_string):
    """Drop every character that is not an ASCII letter, a parenthesis or a space.

    Digits, punctuation, tabs, newlines and non-ASCII letters are all
    removed; nothing is inserted in their place.
    """
    return re.sub(r'[^a-zA-Z() ]', '', input_string)

def remove_null_chars(s):
    """Return a new list with NUL characters removed from each item of ``s``.

    Items that contain no ``\\x00`` are passed through unchanged; the
    input iterable itself is never modified.
    """
    return [item.replace('\x00', "") if '\x00' in item else item for item in s]
#-----------------------------#

def remove_non_alphabet(s):
    """Return a set of the strings in ``s`` with non-letter characters removed.

    Only ASCII letters and spaces survive. Because the result is a set,
    inputs that become identical after cleaning collapse into one entry.
    """
    cleaned = set()
    for word in s:
        cleaned.add(re.sub(r'[^a-zA-Z ]', '', word))
    return cleaned

def check_ingredients(s, ings):
    """Return the members of the set ``s`` that also appear in ``ings``.

    ``s`` must be a set; ``ings`` may be any iterable of candidate names.
    """
    return s.intersection(ings)

def get_ingredients(path='train.json'):
    """Build the ingredient vocabulary used to validate NER output.

    Reads a JSON file holding a list of recipe objects, each with an
    ``"ingredients"`` list of strings, and collects every distinct
    ingredient name. For each name one alternate grammatical-number form
    is added as well: the singular when ``inflect`` recognises the name
    as a plural, otherwise the plural.

    Args:
        path: Location of the JSON file. Defaults to ``'train.json'``,
            the file the notebook originally read.

    Returns:
        A set containing the original names plus their alternate forms.
    """
    engine = inflect.engine()
    # JSON is UTF-8 encoded; state it explicitly so the result does not
    # depend on the platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as file:
        recipes = json.load(file)

    ings = set()
    for recipe in recipes:
        ings.update(recipe["ingredients"])

    opposite = set()
    for name in ings:
        # singular_noun() returns False when the word is not a recognised
        # plural, in which case we fall back to the plural form.
        alt = engine.singular_noun(name) or engine.plural(name)
        if alt:
            opposite.add(alt)
    return ings | opposite

In [41]:
# Load the raw recipe table and convert NER, ingredients and directions
# from their Python-literal string form into real data structures
# (NER becomes a set; the other two keep whatever literal_eval returns).
df = pd.read_csv('recipes_data.csv').assign(
    NER=lambda frame: frame['NER'].apply(lambda raw: set(ast.literal_eval(raw))),
    ingredients=lambda frame: frame['ingredients'].apply(ast.literal_eval),
    directions=lambda frame: frame['directions'].apply(ast.literal_eval),
)

In [None]:
# Strip every non-letter character from the NER entities of each recipe.
df['NER'] = df['NER'].apply(remove_non_alphabet)

# Build the validation vocabulary of known ingredient names.
all_ingredients = get_ingredients()

# Keep only the NER entities found in the validation vocabulary, then
# discard recipes that are left without any recognised ingredient.
df['NER'] = df['NER'].apply(check_ingredients, args=(all_ingredients,))
has_ingredients = df['NER'] != set()
df = df[has_ingredients]

In [72]:
# Create dataframes for each table

# Ingredient table: one row per name in the validation vocabulary.
# Sorted because set iteration order differs between kernel sessions,
# which would otherwise make the row order irreproducible.
ingredients_df = pd.DataFrame(sorted(all_ingredients), columns=['name'])

# Recipe table. Built as one method chain so that every step produces a
# fresh frame; assigning columns onto a filtered or column-sliced frame
# triggers pandas' SettingWithCopyWarning.
recipes_df = (
    df.drop(columns=['source'])
    .rename(columns={'title': 'name', 'NER': 'ingredient_names'})
    # A missing title cannot be cleaned (re.sub rejects non-strings) and
    # a recipe without a name is unusable, so drop such rows up front.
    .dropna(subset=['name'])
    .assign(
        name=lambda frame: frame['name'].apply(
            lambda title: remove_extra_whitespace(replace_non_alpha_parentheses(title))
        )
    )
    # Titles made up solely of stripped characters end up empty.
    .loc[lambda frame: frame['name'] != '']
    .assign(
        directions=lambda frame: frame['directions'].apply(remove_null_chars),
        # Sets become sorted lists so the stored order is deterministic.
        ingredient_names=lambda frame: frame['ingredient_names'].map(sorted),
    )
    # Keep only recipe names that occur more than twice.
    .groupby('name')
    .filter(lambda group: len(group) > 2)
    # Renumber the surviving rows 0..n-1 and expose that number as the id.
    .reset_index(drop=True)
    .assign(id=lambda frame: frame.index)
    .loc[:, ['id', 'name', 'ingredient_names', 'ingredients', 'directions', 'link', 'site']]
)

# recipe_ingredients_df = recipes_df.copy()
# recipe_ingredients_df = recipe_ingredients_df.explode('NER')
# recipe_ingredients_df['recipe_id'] = recipe_ingredients_df.index
# recipe_ingredients_df = recipe_ingredients_df.drop(columns=['name', 'ingredients', 'directions', 'link', 'site']).rename(columns={'NER': 'ingredient'}).reset_index()
# recipe_ingredients_df = recipe_ingredients_df[['recipe_id', 'ingredient']]

# recipes_df = recipes_df.drop(columns=['NER'])

In [238]:
import json
import os
import re

import requests

# Ask the Nutritionix natural-language endpoint which foods it recognises
# in a free-text query and print their names.
api_url = 'https://trackapi.nutritionix.com/v2/natural/nutrients'

# Credentials are read from the environment so they are never stored in
# the notebook. Any key that was previously committed in this cell should
# be treated as compromised and rotated.
app_id = os.environ.get('NUTRITIONIX_APP_ID')
app_key = os.environ.get('NUTRITIONIX_APP_KEY')
query = '1 cup rice'

if not app_id or not app_key:
    print('Request skipped: set NUTRITIONIX_APP_ID and NUTRITIONIX_APP_KEY in the environment')
else:
    headers = {
        'Content-Type' : 'application/json',
        "x-app-id": app_id,
        "x-app-key": app_key
    }
    params = {
        "query": query
    }
    try:
        # A timeout keeps the cell from hanging forever on a dead connection.
        response = requests.post(api_url, json=params, headers=headers, timeout=10)
    except requests.RequestException as error:
        print(f'Request failed: {error}')
    else:
        if response.status_code == 200:
            data = response.text
            # Parse the body as JSON rather than scraping it with a regex,
            # which breaks on whitespace after the colon or escaped quotes.
            food_names = [
                food['food_name']
                for food in json.loads(data).get('foods', [])
                if 'food_name' in food
            ]
            print(food_names)
        else:
            print(f'Request failed: Error {response.status_code}')

Request failed: Error 401


In [125]:
from spellchecker import SpellChecker

# Demo: see what the spell checker suggests for a few misspelled
# ingredient names.
spell = SpellChecker()
misspelled = spell.unknown(['parsey', 'cheesse', 'broc'])

for word in misspelled:
    best_guess = spell.correction(word)    # the single most likely correction
    alternatives = spell.candidates(word)  # every correction considered likely
    print(best_guess)
    print(alternatives)

cheese
{'cheeses', 'cheese'}
bro
{'brok', 'bros', 'brock', 'bloc', 'croc', 'bro', 'roc', 'brow', 'bronc'}
parley
{'parley', 'parsec', 'parsley', 'parse'}
