### Creating a customized dataset for the Custom Recipe Generator Based on Dietary Restrictions

> Loading the Epicurious and RecipeNLG datasets and merging them on the recipe title

In [5]:
import pandas as pd

# Load the Epicurious dataset
df1 = pd.read_csv(r"C:\Users\Priyansh Tyagi\Downloads\epi_r.csv")

# Load the RecipeNLG dataset. Only the three columns used by the merge are
# requested, so the remaining columns of the file are never parsed into memory.
df2 = pd.read_csv(
    r"C:\Users\Priyansh Tyagi\Downloads\archive\RecipeNLG_dataset.csv",
    usecols=['title', 'ingredients', 'directions'],
)

# Lower-case both title columns so the join below is case-insensitive
df1['title'] = df1['title'].str.lower()
df2['title'] = df2['title'].str.lower()

# Left join: every Epicurious row is kept; 'ingredients' and 'directions' are
# NaN wherever no RecipeNLG row carries the same title.
# NOTE(review): titles are presumably not unique in either source, so one
# Epicurious row may produce several merged rows -- confirm before relying on
# the row count of the saved file.
merged_df = pd.merge(df1, df2[['title', 'ingredients', 'directions']], on='title', how='left')

# Rows whose ingredients are still missing after the merge (kept for inspection)
missing_ingredients = merged_df[merged_df['ingredients'].isna()]

merged_df.to_csv('Recipe_Dataset.csv', index=False)

# The preview goes last: a notebook cell only displays its final expression,
# so a head() call in the middle of the cell is silently discarded.
merged_df.head()

> Loading the Merged dataset

In [2]:
# Read back the merged file from the same relative path that to_csv wrote it
# to, instead of a per-user absolute Downloads path that only exists on one machine.
data1 = pd.read_csv('Recipe_Dataset.csv')
data1.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey,ingredients,directions
0,"lentil, apple, and turkey wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"[""4 cups low-sodium vegetable or chicken stock...","[""1. Place the stock, lentils, celery, carrot,..."
1,boudin blanc terrine with red onion confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,potato and fennel soup hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,mahi-mahi in tomato olive sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,spinach noodle casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


### Data Cleaning

> Removing the columns that are not needed and keeping only the title, the nutrition values, the ingredients, and the directions

In [3]:
# Narrow the frame to the recipe title, the four nutrition fields and the two text fields
data1 = data1.loc[:, ['title', 'calories', 'protein', 'fat', 'sodium', 'ingredients', 'directions']]

data1.head(5)

Unnamed: 0,title,calories,protein,fat,sodium,ingredients,directions
0,"lentil, apple, and turkey wrap",426.0,30.0,7.0,559.0,"[""4 cups low-sodium vegetable or chicken stock...","[""1. Place the stock, lentils, celery, carrot,..."
1,boudin blanc terrine with red onion confit,403.0,18.0,23.0,1439.0,,
2,potato and fennel soup hodge,165.0,6.0,7.0,165.0,,
3,mahi-mahi in tomato olive sauce,,,,,,
4,spinach noodle casserole,547.0,20.0,32.0,452.0,,


> Dropping duplicate rows that share the same title

In [4]:
# For each title keep only its first occurrence and discard the later ones
data1 = data1.drop_duplicates(subset=['title'], keep='first')

data1.head(5)

Unnamed: 0,title,calories,protein,fat,sodium,ingredients,directions
0,"lentil, apple, and turkey wrap",426.0,30.0,7.0,559.0,"[""4 cups low-sodium vegetable or chicken stock...","[""1. Place the stock, lentils, celery, carrot,..."
1,boudin blanc terrine with red onion confit,403.0,18.0,23.0,1439.0,,
2,potato and fennel soup hodge,165.0,6.0,7.0,165.0,,
3,mahi-mahi in tomato olive sauce,,,,,,
4,spinach noodle casserole,547.0,20.0,32.0,452.0,,


> Checking the shape of the dataset before and after removing the rows that contain NaN values

In [5]:
data1.shape  # Before: (rows, columns) prior to dropping rows that contain NaN

(17733, 7)

In [6]:
# Discard every row in which at least one of the columns is NaN
data1 = data1.dropna(axis=0, how='any')

data1.head(5)

Unnamed: 0,title,calories,protein,fat,sodium,ingredients,directions
0,"lentil, apple, and turkey wrap",426.0,30.0,7.0,559.0,"[""4 cups low-sodium vegetable or chicken stock...","[""1. Place the stock, lentils, celery, carrot,..."
8,korean marinated beef,170.0,7.0,10.0,1272.0,"[""1/4 cup soy sauce"", ""1 tablespoon sugar"", ""2...","[""Stir together soy sauce, sugar, sesame oil, ..."
9,ham persillade with mustard potato salad and m...,602.0,23.0,41.0,1696.0,"[""6 long parsley sprigs, divided"", ""1 3/4 cups...","[""Chop enough parsley leaves to measure 1 tabl..."
14,peach mustard,134.0,4.0,3.0,1394.0,"[""1 large ripe peach"", ""2 tablespoons sugar"", ...","[""Using the tip of a paring knife, score an X ..."
16,sweet buttermilk spoon breads,146.0,4.0,5.0,160.0,"[""1 cup water"", ""2/3 cup buttermilk"", ""1/3 cup...","[""Butter and sugar six 2/3-to 3/4-cup ramekins..."


In [7]:
data1.shape  # After: (rows, columns) once rows that contain NaN have been dropped

(5662, 7)

> Checking whether any null values remain in the dataset

In [8]:
# Per-column count of missing entries (isna is the alias of isnull)
null_values = data1.isna().sum()

print(null_values)

title          0
calories       0
protein        0
fat            0
sodium         0
ingredients    0
directions     0
dtype: int64


> Converting the "ingredients" column from a stringified list into a single comma-separated text string

In [9]:
import ast

# Each 'ingredients' value is the text form of a list of strings, e.g.
# '["1 cup water", "2/3 cup buttermilk"]'. Parse it and join the items into
# one comma-separated string.
# ast.literal_eval accepts only Python literals, so text loaded from the CSV
# can never execute arbitrary code the way eval() would.
data1['ingredients'] = data1['ingredients'].apply(lambda x: ', '.join(ast.literal_eval(x)))
data1.head()

Unnamed: 0,title,calories,protein,fat,sodium,ingredients,directions
0,"lentil, apple, and turkey wrap",426.0,30.0,7.0,559.0,"4 cups low-sodium vegetable or chicken stock, ...","[""1. Place the stock, lentils, celery, carrot,..."
8,korean marinated beef,170.0,7.0,10.0,1272.0,"1/4 cup soy sauce, 1 tablespoon sugar, 2 teasp...","[""Stir together soy sauce, sugar, sesame oil, ..."
9,ham persillade with mustard potato salad and m...,602.0,23.0,41.0,1696.0,"6 long parsley sprigs, divided, 1 3/4 cups red...","[""Chop enough parsley leaves to measure 1 tabl..."
14,peach mustard,134.0,4.0,3.0,1394.0,"1 large ripe peach, 2 tablespoons sugar, 1 tea...","[""Using the tip of a paring knife, score an X ..."
16,sweet buttermilk spoon breads,146.0,4.0,5.0,160.0,"1 cup water, 2/3 cup buttermilk, 1/3 cup heavy...","[""Butter and sugar six 2/3-to 3/4-cup ramekins..."


> Adding a tag column based on ingredient keywords and some custom nutrition thresholds

In [10]:
import numpy as np

def assign_tag(row):
    """Return one dietary tag for a single recipe row.

    `row` is indexed by the keys 'ingredients', 'calories', 'protein', 'fat'
    and 'sodium'. The rules are tried from top to bottom and the first one
    that matches decides the tag; 'other' is returned when none matches.
    """
    text = row['ingredients'].lower()
    kcal, prot, fat_amount, salt = row['calories'], row['protein'], row['fat'], row['sodium']

    meat_words = ('chicken', 'beef', 'pork', 'fish')
    plant_words = ('tofu', 'lentils', 'beans')

    # Keyword rules run first: plain substring search in the lower-cased ingredients
    if any(word in text for word in meat_words):
        return 'non-veg'
    if any(word in text for word in plant_words):
        return 'vegan'

    # Nutrition rules, checked only when no keyword matched
    if kcal < 200 and prot > 10:
        return 'high protein'
    if fat_amount < 10 and kcal < 150:
        return 'low fat'
    if salt > 500:
        return 'high sodium'
    if kcal < 100:
        return 'low calorie'
    return 'other'

# Run assign_tag once per row (axis='columns' passes each row to the function)
# and store the returned label in a new 'tag' column
data1['tag'] = data1.apply(assign_tag, axis='columns')

data1.head(5)

Unnamed: 0,title,calories,protein,fat,sodium,ingredients,directions,tag
0,"lentil, apple, and turkey wrap",426.0,30.0,7.0,559.0,"4 cups low-sodium vegetable or chicken stock, ...","[""1. Place the stock, lentils, celery, carrot,...",non-veg
8,korean marinated beef,170.0,7.0,10.0,1272.0,"1/4 cup soy sauce, 1 tablespoon sugar, 2 teasp...","[""Stir together soy sauce, sugar, sesame oil, ...",high sodium
9,ham persillade with mustard potato salad and m...,602.0,23.0,41.0,1696.0,"6 long parsley sprigs, divided, 1 3/4 cups red...","[""Chop enough parsley leaves to measure 1 tabl...",non-veg
14,peach mustard,134.0,4.0,3.0,1394.0,"1 large ripe peach, 2 tablespoons sugar, 1 tea...","[""Using the tip of a paring knife, score an X ...",low fat
16,sweet buttermilk spoon breads,146.0,4.0,5.0,160.0,"1 cup water, 2/3 cup buttermilk, 1/3 cup heavy...","[""Butter and sugar six 2/3-to 3/4-cup ramekins...",low fat


In [12]:
data1.to_csv(path_or_buf='Recipe_Dataset_Final.csv', index=False)  # index=False: row labels are not written to the file

> Final Dataset

In [14]:
# Read the final dataset back from the same relative path that to_csv wrote it
# to, instead of a per-user absolute Downloads path that only exists on one machine.
data = pd.read_csv('Recipe_Dataset_Final.csv')
data.head()

Unnamed: 0,title,calories,protein,fat,sodium,ingredients,directions,tag
0,"lentil, apple, and turkey wrap",426.0,30.0,7.0,559.0,"4 cups low-sodium vegetable or chicken stock, ...","[""1. Place the stock, lentils, celery, carrot,...",non-veg
1,korean marinated beef,170.0,7.0,10.0,1272.0,"1/4 cup soy sauce, 1 tablespoon sugar, 2 teasp...","[""Stir together soy sauce, sugar, sesame oil, ...",high sodium
2,ham persillade with mustard potato salad and m...,602.0,23.0,41.0,1696.0,"6 long parsley sprigs, divided, 1 3/4 cups red...","[""Chop enough parsley leaves to measure 1 tabl...",non-veg
3,peach mustard,134.0,4.0,3.0,1394.0,"1 large ripe peach, 2 tablespoons sugar, 1 tea...","[""Using the tip of a paring knife, score an X ...",low fat
4,sweet buttermilk spoon breads,146.0,4.0,5.0,160.0,"1 cup water, 2/3 cup buttermilk, 1/3 cup heavy...","[""Butter and sugar six 2/3-to 3/4-cup ramekins...",low fat


In [16]:
data.shape  # (rows, columns) of the final dataset as read back from the CSV

(5662, 8)