<a href="https://colab.research.google.com/github/vaanchhitbaranwal-ux/vaanchhit/blob/main/recipe_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%load_ext autoreload
%autoreload 2

import argparse
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import pycrfsuite
import random
import re
import sys
import warnings
nltk.download('averaged_perceptron_tagger')
warnings.filterwarnings('ignore')

from selenium import webdriver
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# Put the repository's src/ directory (two levels above the working directory)
# on the import path so the d00_utils / d01_data / ... imports below resolve.
src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
# Guard the append: notebook cells get re-run, and an unconditional append
# would add the same entry to sys.path again on every execution.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from d00_utils import utils
from d01_data import clean_data
from d01_data.web_scraping import sr_scraping, marianos_insta_scraping
from d02_features.feature_creation import nyt_ingredients_crf_feature_creation
from d02_features.feature_creation import instacart_prod_crf_feature_creation
from d03_models.crf_model_recipes import crf_model_recipe_tagger
from d03_models.crf_model_baskets import crf_basket_feature_creation, crf_basket_dataset_creation
from d03_models.app_functions import *

In [None]:
# Read every CSV matching the Simply Recipes glob and concatenate them into one frame.
recipes_sr_orig = utils.read_multiple_csv_and_concat(
    '../../data/01_raw/simply_recipes/simply_recipes*'
)
# Drop the 'Unnamed: 0' column in place (presumably a row index saved with the CSVs).
recipes_sr_orig.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
# Run the project's intermediate cleaning step over the raw Simply Recipes frame.
recipes_sr_inter = clean_data.intermediate_clean_recipes_sr(recipes_sr_orig)

In [None]:
# Preview the first two cleaned recipe rows.
recipes_sr_inter.head(2)

In [None]:
# Read and concatenate the scraped Mariano's grocery price CSVs.
# NOTE(review): the glob says 'prod_aile*' -- confirm this matches the file names on disk
# (it may be a misspelling of 'prod_aisle*').
grocery_prices_orig = utils.read_multiple_csv_and_concat('../../data/01_raw/grocery_prices_marianos/prod_aile*')

In [None]:
# Preview the first two raw price rows.
grocery_prices_orig.head(2)

In [None]:
# Run the project's intermediate cleaning step over the raw price frame.
grocery_prices_inter = clean_data.intermediate_clean_marianos_prices(grocery_prices_orig)

In [None]:
# Preview the first two cleaned price rows.
grocery_prices_inter.head(2)

In [None]:
# Load the five Instacart (2017-05-01 snapshot) tables, in the order they are
# later handed to clean_data.combine_instacart_kaggle_datasets.
aisles, departments, order, order_products__prior, products = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/01_raw/instacart_2017_05_01/aisles.csv',
        '../../data/01_raw/instacart_2017_05_01/departments.csv',
        '../../data/01_raw/instacart_2017_05_01/orders.csv',
        '../../data/01_raw/instacart_2017_05_01/order_products__prior.csv',
        '../../data/01_raw/instacart_2017_05_01/products.csv',
    )
)

In [None]:
# Combine the five Instacart tables into a single baskets frame via the project helper.
instacart_tables = (aisles, departments, order, order_products__prior, products)
instacart_baskets = clean_data.combine_instacart_kaggle_datasets(*instacart_tables)
# Preview the first rows of the combined frame.
instacart_baskets.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the combined baskets frame.
instacart_baskets.info()

In [None]:
# Count distinct orders per user, then show the five users with the most orders.
orders_per_user = instacart_baskets.groupby('user_id')['order_id'].nunique()
(
    pd.DataFrame(orders_per_user)
    .sort_values('order_id', ascending=False)
    .head(5)
)

In [None]:
# Preview the first two cleaned recipe rows again before modelling.
recipes_sr_inter.head(2)

In [None]:
# Print the column dtypes and non-null counts of the cleaned recipes frame.
recipes_sr_inter.info()

In [None]:
# Reports the row count of the cleaned frame.
# NOTE(review): this equals the number of *unique* recipes only if the cleaning
# step already removed duplicates -- confirm.
print('Number of unique recipes: ', len(recipes_sr_inter))

In [None]:
# Load the hand-labelled NYT ingredient snapshot used as CRF training data.
nyt_ing = pd.read_csv('../../data/01_raw/nyt-ingredients-snapshot-2015.csv')
# The 'index' column is removed in place before features are built.
nyt_ing.drop('index', axis=1, inplace=True)
print('Number of Handlabeled Ingredients: ', nyt_ing.shape[0])
# Preview the first labelled rows.
nyt_ing.head()

In [None]:
# Replace every missing value with the literal string "missing", in place.
nyt_ing.fillna("missing", inplace=True)

In [None]:
# Build the CRF inputs from the labelled ingredients (project helper).
# Presumably X holds per-token feature sequences and y the matching label sequences -- confirm.
X, y = nyt_ingredients_crf_feature_creation(nyt_ing)

In [None]:
# Hold out 20% of the sequences for evaluation.
# NOTE(review): no random_state is passed, so the split (and the scores below) change between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Train a first CRF on the training split only, so the held-out split can score it.
initial_model_path = '../../data/04_models/crf_nyt_initial_model.model'
crf_params = {
    'c1': 0.1,                             # coefficient for L1 penalty
    'c2': 0.01,                            # coefficient for L2 penalty
    'max_iterations': 200,                 # maximum number of iterations
    'feature.possible_transitions': True,  # include transitions that are possible, but not observed
}

trainer = pycrfsuite.Trainer(verbose=True)

# Hand every (feature sequence, label sequence) pair of the training split to the trainer.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params(crf_params)

# train() writes the fitted model to the given file once training finishes.
trainer.train(initial_model_path)

# Read the saved model straight back in so it can be used for tagging.
tagger = pycrfsuite.Tagger()
tagger.open(initial_model_path)

In [None]:
# Tag every held-out feature sequence with the initial model.
y_pred = list(map(tagger.tag, X_test))

In [None]:
# Score the initial model on the held-out split.
# The binarizer is fitted ONCE, on the union of true and predicted label sequences,
# and then only transform() is applied to each side. Fitting separately on y_pred
# and y_test would give two independent column layouts whenever their label sets
# differ, so the report would compare mismatched columns.
mlb = MultiLabelBinarizer()
mlb.fit(list(y_test) + list(y_pred))
print(classification_report(y_true=mlb.transform(y_test),
                            y_pred=mlb.transform(y_pred),
                            # show the tag names instead of bare column indices
                            target_names=[str(label) for label in mlb.classes_]))

In [None]:
# Re-train the ingredient CRF on ALL labelled data (no hold-out) to produce the final model.
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished.
# The path previously pointed at '../../data/0/', which does not match the
# '04_models' directory every other model in this notebook is written to.
# NOTE(review): confirm that the code loading this model reads it from 04_models.
trainer.train('../../data/04_models/crf_ing_final.model')

In [None]:
# Run the project's recipe tagger over the cleaned recipes; it returns three dicts.
# Presumably each is keyed by recipe and holds its ingredients, link and tags respectively -- confirm.
recipe_ing_dict, recipe_links_dict, recipe_tags_dict = crf_model_recipe_tagger(recipes_sr_inter)

In [None]:
# Display the ingredient dict for inspection.
recipe_ing_dict

In [None]:
# Train the CRF that is saved under the Instacart-products model name.
# NOTE(review): in the cells visible here this uses the same X, y that were built from
# the NYT ingredient labels, while instacart_prod_crf_feature_creation is imported but
# never called -- confirm whether X, y were meant to be rebuilt before this cell.
instacart_model_path = '../../data/04_models/crf_instacart_products_final.model'
instacart_crf_params = {
    'c1': 0.1,                             # coefficient for L1 penalty
    'c2': 0.01,                            # coefficient for L2 penalty
    'max_iterations': 200,                 # maximum number of iterations
    'feature.possible_transitions': True,  # include transitions that are possible, but not observed
}

trainer = pycrfsuite.Trainer(verbose=True)

# Hand every (feature sequence, label sequence) pair to the trainer.
for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

trainer.set_params(instacart_crf_params)

# train() writes the fitted model to the given file once training finishes.
trainer.train(instacart_model_path)

In [None]:
# Load the previously saved baskets output; the cells below rely on its
# 'new_prod_list', 'product_name' and 'user_id' columns.
instacart_baskets_update = pd.read_csv('../../data/05_model_output/baskets_newprodlist_2.csv')

In [None]:
# Display the loaded frame for inspection.
instacart_baskets_update

In [None]:
# From a cursory look at the dataset there are a number of things marked as food that
# are not. Get rid of these so that they don't mess up the results.
non_food_labels = [
    '1', '100', '11', '118', '2', '24', '3', '3 cheese', '30', '328',
    '4', '5', '50', '6', '6 cheese', '60', '7', '70', '8', '85',
    '9', '95', '97', '98', 'a', 'a garlic butter sauce', 'nan',
]
new_prod_names = instacart_baskets_update['new_prod_list']

# Missing names must be tested with notna(): a comparison such as `!= np.nan` is True
# for every row (NaN never compares equal to anything), so it filters nothing, and
# read_csv already turns literal 'nan' cells into real NaN values.
mask = new_prod_names.notna() & ~new_prod_names.isin(non_food_labels)

instacart_baskets_filtered = instacart_baskets_update[mask]

In [None]:
# Summarise the filtered baskets: distinct CRF-derived names, distinct original names, distinct users.
crf_name_count = instacart_baskets_filtered['new_prod_list'].nunique()
original_name_count = instacart_baskets_filtered['product_name'].nunique()
user_count = instacart_baskets_filtered['user_id'].nunique()

print('Number of Products After Running Names through CRF Mode: ', crf_name_count)
print('Number of products in the original list: ', original_name_count)
print('Number of unique users: ', user_count)

In [None]:
# Collect the distinct user ids left after filtering, and show how many there are.
unique_user_ids = instacart_baskets_filtered['user_id'].unique()
instacart_users_lst = list(unique_user_ids)
len(instacart_users_lst)

In [None]:
# Draw up to 100,000 users at random and keep only their basket rows.
# random.sample raises ValueError when asked for more items than the population
# holds, so the request is clamped to the number of users actually available.
# NOTE(review): the sample is unseeded, so a different subset is drawn on every run.
sample_size = min(100000, len(instacart_users_lst))
random_usrids_100k = random.sample(instacart_users_lst, sample_size)
mask = instacart_baskets_filtered['user_id'].isin(random_usrids_100k)
baskets_100k = instacart_baskets_filtered.loc[mask]
print('Number of User IDs: ', baskets_100k.user_id.nunique())

In [None]:
# Display the sampled baskets frame for inspection.
baskets_100k

In [None]:
# Keep everything except the original product name and the user id.
dropped_columns = ['product_name', 'user_id']
baskets_complete = baskets_100k.drop(dropped_columns, axis=1)
# Preview the first rows of the reduced frame.
baskets_complete.head()

In [None]:
# Load the saved matrix and use its first, unnamed CSV column as the row index.
# Presumably this is an item-to-item similarity matrix (going by the file name) -- confirm.
data_matrix = pd.read_csv('../../data/05_model_output/data_matrix_sim.csv')
data_matrix.set_index('Unnamed: 0', inplace=True)

In [None]:
# Print the 11 largest values in the 'potato' row.
# NOTE(review): 11 is likely "the item itself plus its top 10 neighbours" -- confirm.
print(data_matrix.loc['potato'].nlargest(11))

In [None]:
# Interactive front end: collect a meal, a dietary preference and three liked foods,
# then ask the project's recommender for recipes.
print('Choose your meal by inputing either 1, 2 or 3')
# Surrounding whitespace is stripped so that an answer like "1 " is still recognised.
meal_input = input("Breakfast: Input 1 || || Lunch: Input 2 || Dinner: Input 3: ").strip()
print('\n')
print('Choose your dietary preferences by inputing either 1 or 2: ')
dietary_preference_input = input("Vegetarian: Input 1 || Omnivore: Input 2: ").strip()
print('Type in 3 foods you already like')
item1 = input("Item 1: ")
item2 = input("Item 2: ")
item3 = input("Item 3: ")
print('\n')
print('Searching for five recipe recommendations based both on your inputs and similair foods.')

# The prompt offers three meals, but choice "2" (Lunch) used to fall through to
# 'Dinner'. Map every advertised choice; unrecognised input still defaults to 'Dinner'.
# NOTE(review): assumes the recipe tags contain a 'Lunch' tag -- confirm against recipe_tags_dict.
meal_by_choice = {'1': 'Breakfast', '2': 'Lunch', '3': 'Dinner'}
meal = meal_by_choice.get(meal_input, 'Dinner')

# Anything other than "1" is treated as omnivore, i.e. no dietary restriction.
dietary_preference = 'Vegetarian' if dietary_preference_input == "1" else None

shopping_basket = [item1, item2, item3]
recipe_recommendations_app(shopping_basket, recipe_ing_dict, recipe_tags_dict, meal, dietary_preference, recipe_links_dict)