In [1]:
import pandas as pd
import numpy as np
import glob
import re


IMPORT DATA

In [3]:
# Load data
path = r'C:/Users/pc/Desktop/my_git/final_project/recipes/Old' 
files = glob.glob(path + "/*.csv")

all_files = []

for filename in files:
    df_sub = pd.read_csv(filename, index_col=None, header=0, encoding='utf-8')
    all_files.append(df_sub)
    # print(filename)

frame = pd.concat(all_files, axis=0, ignore_index=True)

In [4]:
df = frame.copy()

OVERVIEW DATA

In [5]:
df.describe()

Unnamed: 0,drink_name,recipe,recipe_url,url_of_image
count,7513,6866,6866,6845
unique,2744,1587,2343,2130
top,The 10 Best Light Beers to Drink in 2021,[],https://www.thespruceeats.com/best-light-beers...,//:0
freq,79,2741,79,762


In [6]:
df['recipe'].describe()

count     6866
unique    1587
top         []
freq      2741
Name: recipe, dtype: object

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7887 entries, 0 to 7886
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   drink_name    7513 non-null   object
 1   recipe        6866 non-null   object
 2   recipe_url    6866 non-null   object
 3   url_of_image  6845 non-null   object
dtypes: object(4)
memory usage: 246.6+ KB


CLEANING DATA

In [8]:
# Check wrong data
df[df['recipe'] == '[]'].count()

drink_name      2741
recipe          2741
recipe_url      2741
url_of_image    2725
dtype: int64

In [9]:
# Drop wrong data
df.drop(df[df['recipe'] == '[]'].index, inplace= True)

In [10]:
# Drop duplicated data
df.drop_duplicates(subset ='recipe',
                     keep = 'first', inplace = True)

In [11]:
# Check na values
df.isnull().sum()

drink_name      0
recipe          1
recipe_url      1
url_of_image    1
dtype: int64

In [12]:
# Drop all na values
df.dropna(axis=0, inplace=True)

In [13]:
# Check data after clean
df

Unnamed: 0,drink_name,recipe,recipe_url,url_of_image
0,French Cafe au Lait,"['1 part hot strong coffee (French roast)', '1...",https://www.thespruceeats.com/cafe-au-lait-rec...,https://www.thespruceeats.com/thmb/cRrwUsC6WNB...
1,Pour-Over Coffee,"['10 ounces spring water', '2/3 ounce ground c...",https://www.thespruceeats.com/pour-over-coffee...,https://www.thespruceeats.com/thmb/-XzLvlXHASj...
2,Cuban Coffee (Cafecito),"['1/4 cup ground coffee', '1 1/2 cups water', ...",https://www.thespruceeats.com/cuban-coffee-479...,https://www.thespruceeats.com/thmb/vJGEfrc8S_h...
3,The Perfect Cappuccino,['2 tablespoons finely ground dark roast coffe...,https://www.thespruceeats.com/how-to-make-capp...,https://www.thespruceeats.com/thmb/DWhiIPBDVVq...
4,New Orleans Coffee (Cafe Noir),"['4 tablespoons coffee', '2 tablespoons chicor...",https://www.thespruceeats.com/new-orleans-coff...,https://www.thespruceeats.com/thmb/bogvGb8qhPt...
...,...,...,...,...
7882,Boston Iced Tea,"['1 gallon water ', '1 cup white sugar ', '15 ...",https://www.allrecipes.com/recipe/32547/boston...,https://imagesvc.meredithcorp.io/v3/mm/image?u...
7883,Frozen Vanilla Chai Tea,"['1 cup water ', '2 chai tea bags ', '2 table...",https://www.allrecipes.com/recipe/238025/froze...,https://imagesvc.meredithcorp.io/v3/mm/image?u...
7884,Lemon Mint Iced Tea,"['1 quart water ', '¾ cup white sugar ', '6 r...",https://www.allrecipes.com/recipe/217166/lemon...,https://imagesvc.meredithcorp.io/v3/mm/image?u...
7885,Almond Tea,"['3 tablespoons instant iced tea powder ', '1 ...",https://www.allrecipes.com/recipe/86426/almond...,https://imagesvc.meredithcorp.io/v3/mm/image?u...


CLEANING RECIPE

In [17]:
df_clean = df.copy()

In [15]:
# Function to clean recipes in dataset
def clean_recipe(recipe):
    pat1 = r'[\'\"\(\),*:;.!?~\[\]\{\}]'
    pat2 = r'\\\w{2}\d'  
    pat3 = r'\\\w\d{4}'
    pat4 = r'\\\w\d{3}\w'
    clean_recipe = recipe.lower()    #.strip('-[]{}\'!?~')
    clean_recipe = re.sub(pat1,'',clean_recipe)
    clean_recipe = re.sub(pat2,'',clean_recipe)
    clean_recipe = re.sub(pat3,'',clean_recipe)
    clean_recipe = re.sub(pat4,'',clean_recipe)
    return clean_recipe

In [18]:
# Apply cleaning function on df_clean
df_clean['recipe'] = df_clean['recipe'].apply(lambda x: clean_recipe(x))

In [19]:
df_clean['recipe']

0       1 part hot strong coffee french roast 1 part s...
1          10 ounces spring water 2/3 ounce ground coffee
2       1/4 cup ground coffee 1 1/2 cups water 1/4 cup...
3       2 tablespoons finely ground dark roast coffee ...
4       4 tablespoons coffee 2 tablespoons chicory 1/4...
                              ...                        
7882    1 gallon water  1 cup white sugar  15  tea bag...
7883    1 cup water  2  chai tea bags  2 tablespoons w...
7884    1 quart water  ¾ cup white sugar  6  regular-s...
7885    3 tablespoons instant iced tea powder  1 cup w...
7886    409¼ cups water divided  ¼ cup white tapioca p...
Name: recipe, Length: 1586, dtype: object

In [None]:
# data = df_clean
# path = 'C:/Users/pc/Desktop/my_git/final_project/recipes/Old'
# data.to_csv(path + "/clean_recipe_new.csv", index=False)

In [20]:
# Create function to search recipe - Version 1
def searching_recipe(ingredients, data):
    """ Dataset: df_clean
        Input: Ingredients
        Flow: searching recipe in dataset
        Output: recipes that contain ingredients
    """
    recipe_list = []
    for recipe in data:
        check = all(item in recipe for item in ingredients)
        if check == True:
            recipe_list.append(recipe)
    return recipe_list

In [None]:
# Create function to search recipe - Version 2
def searching_recipe(ingredients,data):
    """ Dataset: df_clean
        Input: Ingredients
        Flow: searching recipe in dataset
        Output: recipes that contain ingredients
    """
    for i in range(len(data['recipe'])):
        check_recipe = all(item in data['recipe'][i] for item in ingredients)
        check_name = any(item in data['drink_name'][i] for item in ingredients)
        if check_recipe == True:
            recipe_list.append(data['recipe'][i])
        elif check_name == True:
            recipe_list.append(data['recipe'][i])
    return recipe_list

In [None]:
def searching_recipe(ingredients, data):
    """ Dataset: df_clean
        Input: Ingredients
        Flow: searching recipe in dataset
        Output: all informations that contain ingredients
    """
    recipe_list = []
    drink_name = []
    recipe_url = []
    image_url = []
    for i in range(len(data['recipe'])):
        check_recipe = all(item in data['recipe'][i] for item in ingredients)
        check_name = any(item in data['drink_name'][i] for item in ingredients)
        if check_recipe == True:
            recipe_list.append(data['recipe'][i])
            drink_name.append(data['drink_name'][i])
            recipe_url.append(data['recipe_url'][i])
            image_url.append(data['url_of_image'][i])
        elif check_name == True:
            recipe_list.append(data['recipe'][i])
            drink_name.append(data['drink_name'][i])
            recipe_url.append(data['recipe_url'][i])
            image_url.append(data['url_of_image'][i])
    
    return recipe_list, drink_name, recipe_url, image_url

In [None]:
from wordcloud import WordCloud
long_string = ','.join(df_clean['recipe'])
wordcloud = WordCloud(background_color = 'white',max_words=5000,contour_width=5,contour_color='steelblue')
wordcloud.generate(long_string)
wordcloud.to_image()