# exploratory data analysis

In [119]:
import os
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

# Retrieve the API key
api_key = os.getenv('SPOONACULAR_API_KEY')

In [109]:
import requests
import json

url = "https://api.spoonacular.com/recipes/random?number=100&apiKey=" + api_key

# Make a GET request to the API
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the response to JSON
    data = response.json()
else:
    print(f"Failed to retrieve data: {response.status_code}")

In [110]:
df = pd.json_normalize(data, record_path=['recipes'])

In [111]:
df.to_pickle('../data/raw/recipes2.pkl')

In [112]:
df1 = pd.read_pickle('../data/raw/recipes1.pkl')
df2 = pd.read_pickle('../data/raw/recipes2.pkl')
df3 = pd.read_pickle('../data/raw/recipes.pkl')

df = pd.concat([df1, df2, df3], ignore_index=True)

# remove duplicates
df.drop_duplicates(subset=['id'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [113]:
df.to_pickle('../data/raw/recipes.pkl')

In [116]:
import numpy as np

# Replace '-1' with NaN for 'preparationMinutes' and 'cookingMinutes'
df['preparationMinutes'].replace(-1, np.nan, inplace=True)
df['cookingMinutes'].replace(-1, np.nan, inplace=True)

# Fill NaN values for 'preparationMinutes' and 'cookingMinutes' with the mean or median
df['preparationMinutes'].fillna(df['preparationMinutes'].median(), inplace=True)
df['cookingMinutes'].fillna(df['cookingMinutes'].median(), inplace=True)

# For 'author', replace NaN with 'Unknown'
df['author'].fillna('Unknown', inplace=True)

# You may want to set a default value for any other columns that have missing data
# For 'originalId', as it is a unique identifier, if it's NaN, you can leave it or set a default
df['originalId'].fillna('Unknown', inplace=True)

# If you wish to do this for all columns that are of type object and have NaN values:
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].fillna('Unknown')


In [123]:
import time

# Function to construct ingredient list as a string from the ingredient details
def construct_ingredient_list(ingredients):
    ingredient_list = []
    for ingredient in ingredients:
        ingredient_list.append(ingredient['name'])
    return ', '.join(ingredient_list)

# Function to get cuisine from Spoonacular API
def get_cuisine(title, ingredient_list):
    url = 'https://api.spoonacular.com/recipes/cuisine'
    headers = {'Content-Type': 'application/json'}
    payload = {
        'title': title,
        'ingredientList': ingredient_list,
        'apiKey': api_key
    }
    response = requests.post(url + '?apiKey=' + api_key + '&title=' + title + '&ingredientList=' + ingredient_list)
    if response.status_code == 200:
        return response.json().get('cuisine')
    else:
        # Handle possible API errors (like rate limits, etc.)
        print(f"API error: {response.text}")
        return None

# Loop over the dataframe and update cuisines
for index, row in df[df['cuisines'].apply(lambda x: len(x) == 0)].iterrows():
    time.sleep(1.3)
    ingredient_list = construct_ingredient_list(row['extendedIngredients'])
    print(ingredient_list)
    # cuisine = get_cuisine(row['title'], ingredient_list)
    # if cuisine:
    #     df.at[index, 'cuisines'] = [cuisine]
    # else:
    #     # Handle the case where the API does not return a cuisine
    #    print(f"No cuisine found for recipe titled '{row['title']}'")


carrots, cumin seed powder, coriander, cabbage, ground coriander, juice of lime, olive oil, cabbage, sunflower seeds
bread crumbs, cayenne pepper, celery, parsley, oysters, sprinkling of pepper, pernod, rock salt, salt, shallots, spinach, butter, worcestershire sauce
onion, brown rice, mushrooms, unrefined sunflower oil, water, sea salt, ground pepper
warm water, granulated sugar, sunflower oil, active yeast, beated egg, maple syrup, all purpose flour, kosher salt, sesame seeds, water, honey
beets, olive oil, balsamic vinegar, grapefruit, sprouts, pumpkin seeds, goat cheese
olive oil, onion, grape tomatoes, avocado, quality bread, springs chives, chive blossoms, salt, ground pepper
pork belly, pork butt shoulder, garlic, vinegar, brown sugar, brown sugar, soy sauce, bay leaf, rice wine, rice wine, rice wine, star anise, oregano, fermented black beans, peanuts, olive oil, paprika, water, fried bananas
blueberries, coconut milk, condensed milk, guava, peanuts
olive oil, chicken breasts, 

KeyboardInterrupt: 

In [115]:
for index, row in df.iterrows():
    if row['healthScore'] == 100:
        print(row)



vegetarian                                                               True
vegan                                                                    True
glutenFree                                                               True
dairyFree                                                                True
veryHealthy                                                              True
cheap                                                                   False
veryPopular                                                             False
sustainable                                                             False
lowFodmap                                                                True
weightWatcherSmartPoints                                                    3
gaps                                                                       no
preparationMinutes                                                         -1
cookingMinutes                                                  

In [63]:
# ask user for input
'''
diet = input("What diet do you follow? (vegan, vegetarian, pescatarian, paleo, ketogenic, whole30, none) ")
intolerances = input("Do you have any food intolerances? (dairy, egg, gluten, grain, peanut, seafood, sesame, shellfish, soy, sulfite, tree nut, wheat, none) ")
cuisine = input("What cuisine do you prefer? (african, american, british, cajun, caribbean, chinese, eastern european, european, french, german, greek, indian, irish, italian, japanese, jewish, korean, latin american, mediterranean, mexican, middle eastern, nordic, southern, spanish, thai, vietnamese, none) ")
type_of_recipe = input("What type of recipe do you want? (main course, side dish, dessert, appetizer, salad, bread, breakfast, soup, beverage, sauce, marinade, fingerfood, snack, drink, none) ")
max_time = input("How much time do you have? (minutes) ")
cost = input("are you broke? (true/false) ")
'''

SyntaxError: EOF while scanning triple-quoted string literal (530369632.py, line 7)

In [61]:
# remove recipes that don't match the user's preferences
'''
df = df[df['diets'].str.contains(diet, na=False)]
df = df[df['dishTypes'].str.contains(type_of_recipe, na=False)]
df = df[df['cuisines'].str.contains(cuisine, na=False)]
df = df[df['readyInMinutes'] <= int(max_time)]
df = df[df['cheap'] == bool(cost)]
'''

In [76]:
df.columns

Index(['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy',
       'cheap', 'veryPopular', 'sustainable', 'lowFodmap',
       'weightWatcherSmartPoints', 'gaps', 'preparationMinutes',
       'cookingMinutes', 'aggregateLikes', 'healthScore', 'creditsText',
       'sourceName', 'pricePerServing', 'extendedIngredients', 'id', 'title',
       'readyInMinutes', 'servings', 'sourceUrl', 'image', 'imageType',
       'summary', 'cuisines', 'dishTypes', 'diets', 'occasions',
       'instructions', 'analyzedInstructions', 'originalId',
       'spoonacularSourceUrl', 'license'],
      dtype='object')

In [77]:
ing = set()
for ingredients in df['diets']:
    for ig in ingredients:
        ing.add(ig)

In [78]:
print(list(ing))

['lacto ovo vegetarian', 'pescatarian', 'paleolithic', 'ketogenic', 'vegan', 'fodmap friendly', 'gluten free', 'whole 30', 'primal', 'dairy free']
