# This Notebook is used for generating the data for Meal planning for the new Millennium (MnM) problem


##1. Import all the necessary library

In [0]:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as url 
import pickle
import json
import multiprocessing
import pandas as pd 
import numpy as np

##2. Define EP_Recipe class to store all the data.

In [0]:
class EP_Recipe():
    title = None
    rating = None
    personal_rating = []
    calories = None
    sodium = None
    fat = None
    protein = None

    def get_title(self, page):
        return page.find('h1', {'itemprop': 'name'}).text

    def get_rating(self, page):
        try:
            return float(page.find_all('span', {'class': 'rating'})[-1].text.split('/')[0]) + 1
        except:
            return None

    def build_recipie(self, page):
        #super(EP_Recipe, self).build_recipie(page)
        self.title = self.get_title(page)
        self.rating = self.get_rating(page)
        self.calories = self.get_calories(page)
        self.sodium = self.get_sodium(page)
        self.fat = self.get_fat(page)
        self.protein = self.get_protein(page)
        self.personal_rating = self.get_personal_rating(page)

    def get_calories(self,page):
        try:
            return float(page.find('span',{'class':'nutri-data','itemprop':'calories'}).text)
        except:
            return None

    def get_sodium(self,page):
        try:
            return float(page.find('span',{'class':'nutri-data','itemprop':'sodiumContent'}).text.split(' ')[0])
        except:
            return None

    def get_fat(self, page):
        try:
            return float(page.find('span', {'class': 'nutri-data', 'itemprop': 'fatContent'}).text.split(' ')[0])
        except:
            return None

    def get_protein(self, page):
        try:
            return float(page.find('span', {'class': 'nutri-data', 'itemprop': 'proteinContent'}).text.split(' ')[0])
        except:
            return None
    
    def get_personal_rating(self, page):
        try: 
            p_ratings = page.findAll('img', {'class': 'fork-rating'})
            p_persons = page.find_all('span', {'class': 'credit'})
            p_r = []            
            for i in range(len(p_ratings)):
                temp = [p_persons[i].text.split('/')[0], int(p_ratings[i]['src'].split('/')[-1].split('_')[0]) + 1]
                p_r.append(temp)
            return p_r
        except:
            return None


    def __init__(self, page):
        print('attempting to build from: '+page)
        try:
            self.build_recipie(bs(url(page), 'html.parser'))
        except Exception as x:
            print('Could not build from %s, %s'%(page,x))
            


##3. Find all the addresses for recipes and store them in ep_urls

In [0]:
all_url = ['https://www.epicurious.com/recipes-menus/what-to-cook-this-weekend-february-22-24-gallery',
          'https://www.epicurious.com/recipes-menus/what-to-cook-this-weekend-february-8-10-gallery',
           "https://www.epicurious.com/ingredients/acorn-delicata-kabocha-spaghetti-squash-winter-recipes-gallery",
           'https://www.epicurious.com/recipes-menus/easy-dinner-recipes-for-cook90-gallery',
          'https://www.epicurious.com/recipes-menus/our-favorite-cook90-lunches-gallery',
          'https://www.epicurious.com/recipes-menus/make-ahead-weeknight-dinners-stew-soup-freezer-casserole-quick-easy-recipes-gallery']

ep_urls = set()
for i in all_url:        
    initializer = url(i)
    res = bs(initializer.read(),"html5lib")
    for div in res.findAll('div', {'class': 'gallery-slide-caption__dek-container'}):
            ep_urls.update([div.find('a')['href']]) 

##4. Scrape the website of recipes and generate the data. Store the data as recipes_data.json.


In [4]:
p = multiprocessing.Pool(4)
output = p.map(EP_Recipe,ep_urls)
pickle.dump(output,open('epi_recipes.final','wb'))

data = pickle.load(open('epi_recipes.final','rb'))
ar = []
for i in data:
    ar.append(i.__dict__)
pickle.dump(ar,open('epi_recipe_dict_form.dict','wb'))

with open('recipes_data.json', 'w') as fp:
    json.dump(ar, fp)

attempting to build from: https://www.epicurious.com/recipes/food/views/garlic-mojo-sauce
attempting to build from: https://www.epicurious.com/recipes/food/views/sheet-pan-steak-fajitas
attempting to build from: https://www.epicurious.com/recipes/food/views/pumpkin-icebox-pie-with-snickerdoodle-crust-56390159
attempting to build from: https://www.epicurious.com/recipes/food/views/kabocha-squash-pilaf-with-coconut
attempting to build from: https://www.epicurious.com/recipes/food/views/big-batch-seasoned-ground-beef
attempting to build from: https://www.epicurious.com/recipes/food/views/butternut-squash-and-chorizo-hash
attempting to build from: https://www.epicurious.com/recipes/food/views/sheet-pan-spaghetti-squash-puttanesca-56390005
attempting to build from: https://www.epicurious.com/recipes/food/views/sheet-pan-cumin-chicken-thighs-with-squash-fennel-and-grapes-56390000
attempting to build from: https://www.epicurious.com/recipes/food/views/sunflower-seed-risotto-with-squash-and-mu

##5. Read the data

In [5]:
df = pd.read_json('recipes_data.json')
df

Unnamed: 0,calories,fat,personal_rating,protein,rating,sodium,title
0,368.0,36.0,"[[foodienats from Tucson, AZ , 5], [MeganMaris...",1.0,5.0,329.0,Garlic Mojo Sauce
1,362.0,25.0,"[[xmatch from Charleston, WV , 5], [pmccallum ...",15.0,4.5,707.0,Butternut Squash and Chorizo Hash
2,633.0,54.0,"[[bitsylee from New York , 5], [fortheloveofba...",15.0,4.5,849.0,"Sunflower Seed ""Risotto"" with Squash and Mushr..."
3,931.0,66.0,"[[ejtuffley from Grosse Pointe, MI , 5]]",50.0,5.0,1123.0,Steak and Eggs with Saucy Beans
4,502.0,31.0,"[[shmecca1 from Los Angeles, CA , 3], [lmrice ...",12.0,4.5,1055.0,Thai Red Curry with Butternut Squash and Chick...
5,257.0,21.0,"[[yoshihaga22 from Laguna Beach, Ca. , 5], [s...",10.0,5.0,659.0,Kabocha Squash and Pork Stir-Fry
6,312.0,24.0,"[[annanacho from Chicago, IL , 5], [lisamichel...",7.0,5.0,331.0,"Chickpea Pancakes with Leeks, Squash, and Yogurt"
7,431.0,31.0,"[[themarten from Salt Lake City , 2], [krf fro...",18.0,3.5,817.0,Antipasto Salad
8,448.0,18.0,"[[paweldroz from Chicago , 4], [paweldroz from...",12.0,4.0,1080.0,Cold Sesame Noodles with Cucumber
9,71.0,5.0,"[[allgloryisfleeting from Friday Harbor, WA , ...",1.0,5.0,385.0,Sunday Stash Marinara Sauce


##6. Data Processing: Delete the row with NaN data and only consider the recipes with at least four reviews.

In [0]:
df = df.dropna(axis=0) #delete row with NaN 

In [0]:
df = df[df['personal_rating'].map(len) > 9] ##delete the recipes with less than 4 reviews

In [8]:
df

Unnamed: 0,calories,fat,personal_rating,protein,rating,sodium,title
4,502.0,31.0,"[[shmecca1 from Los Angeles, CA , 3], [lmrice ...",12.0,4.5,1055.0,Thai Red Curry with Butternut Squash and Chick...
5,257.0,21.0,"[[yoshihaga22 from Laguna Beach, Ca. , 5], [s...",10.0,5.0,659.0,Kabocha Squash and Pork Stir-Fry
10,539.0,24.0,"[[erifile from Boston, MA , 5], [ElaineGloeckl...",37.0,4.5,758.0,Hot Honey Pork Chops with Escarole and White B...
13,544.0,28.0,"[[teacherhansen from Minneapolis , 5], [brette...",24.0,4.5,840.0,"Skillet Phyllo Pie with Butternut Squash, Kale..."
15,486.0,34.0,"[[ninahaft from Oakland, CA , 5], [renientom f...",38.0,4.5,571.0,Baked Mustard-Crusted Salmon with Asparagus an...
16,578.0,23.0,"[[flagg , 5], [delaniepope from Grand Rapids ,...",34.0,4.5,1151.0,Shrimp with Herby White Beans and Tomatoes
17,656.0,36.0,"[[leenieloo from Mundeleien, IL , 5], [mariegr...",22.0,5.0,1089.0,Oven Risotto with Crispy Roasted Mushrooms
18,577.0,38.0,"[[ryerly from Austin, TX , 5], [duganmcdermon2...",23.0,5.0,839.0,Baked Feta and Greens with Lemony Yogurt
21,497.0,30.0,"[[evangaffney from Brooklyn, NY , 5], [moneyle...",29.0,4.5,1177.0,"Sheet-Pan Cumin Chicken Thighs with Squash, Fe..."
24,470.0,34.0,"[[hasmig4u from Ottawa, Canada , 5], [Annegioi...",28.0,4.5,989.0,Sheet-Pan Chicken with Tomatoes and Mozzarella


## 7. Analysize the user ratings data

In [9]:
p_r = pd.DataFrame(columns=['title', 'user', 'rating'])
p_r

Unnamed: 0,title,user,rating


In [0]:
count = 0
for i in range(df.shape[0]):
    for j in df.iloc[i,2]:
        p_r.loc[count] = [df.iloc[i,6],j[0], j[1]]
        count += 1

In [11]:
p_r.to_csv('recipes_users_ratings.csv')
p_r

Unnamed: 0,title,user,rating
0,Thai Red Curry with Butternut Squash and Chick...,"shmecca1 from Los Angeles, CA",3
1,Thai Red Curry with Butternut Squash and Chick...,"lmrice from kansas city, MO",4
2,Thai Red Curry with Butternut Squash and Chick...,rebeccadickson from NYC,5
3,Thai Red Curry with Butternut Squash and Chick...,lourdesjeanne from New York,4
4,Thai Red Curry with Butternut Squash and Chick...,"nbennison from Amherst, NH",5
5,Thai Red Curry with Butternut Squash and Chick...,"A Cook from New York, NY",4
6,Thai Red Curry with Butternut Squash and Chick...,catlivesupstairs,4
7,Thai Red Curry with Butternut Squash and Chick...,MarcyW1a,5
8,Thai Red Curry with Butternut Squash and Chick...,"sarah12jones212212312 from <a href=""http:",1
9,Thai Red Curry with Butternut Squash and Chick...,"Emily5260 from Washington, DC",4


In [12]:
#Count users' rating
cnt = p_r['user'].value_counts()
cnt

loriltx from Houston                          6
jellis14 from Arlington, MA                   6
bas614 from Boston                            6
lovestoputter                                 5
A Cook                                        5
blondoverblue from Sacramento, CA             5
jansan1 from Orange County, CA.               4
kpalmer747 from Sweden                        4
charface from Portland, OR                    4
krf from Bellevue, WA                         4
delooper from Durham, NC                      4
jansan1 from Orange County, CA                4
wheedle from Manhattan                        3
leenieloo from Mundelein, IL                  3
railim from BC, Canada                        3
vnzjunk from U.P. Michigan                    3
czf2007                                       3
pghgrl from brooklyn, ny                      3
mags425 from Colorado                         3
amyhenriott                                   3
moorloughmary from Edmonton, Alberta    

In [13]:
cnt.describe()

count    2103.000000
mean        1.084165
std         0.389016
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         6.000000
Name: user, dtype: float64

In [14]:
cnt.value_counts()

1    1971
2     108
3      12
4       6
5       3
6       3
Name: user, dtype: int64

In [15]:
#
user_reviews2 = p_r['user'].value_counts()[p_r['user'].value_counts() > 0].index
trun_recipes_user_review = p_r[p_r['user'].isin(user_reviews2)]
trun_recipes_user_review


Unnamed: 0,title,user,rating
0,Thai Red Curry with Butternut Squash and Chick...,"shmecca1 from Los Angeles, CA",3
1,Thai Red Curry with Butternut Squash and Chick...,"lmrice from kansas city, MO",4
2,Thai Red Curry with Butternut Squash and Chick...,rebeccadickson from NYC,5
3,Thai Red Curry with Butternut Squash and Chick...,lourdesjeanne from New York,4
4,Thai Red Curry with Butternut Squash and Chick...,"nbennison from Amherst, NH",5
5,Thai Red Curry with Butternut Squash and Chick...,"A Cook from New York, NY",4
6,Thai Red Curry with Butternut Squash and Chick...,catlivesupstairs,4
7,Thai Red Curry with Butternut Squash and Chick...,MarcyW1a,5
8,Thai Red Curry with Butternut Squash and Chick...,"sarah12jones212212312 from <a href=""http:",1
9,Thai Red Curry with Butternut Squash and Chick...,"Emily5260 from Washington, DC",4


In [16]:
#Construct a 2D matrix for the recipe-user-rating data
trun_recipes_user_review = trun_recipes_user_review.drop_duplicates(['user','title'])
trun_recipes_user_review_matrix = trun_recipes_user_review.pivot(index='user', columns='title', values='rating')
trun_recipes_user_review_matrix

title,10-Minute Sausage Skillet with Cherry Tomatoes and Broccolini,Arroz Caldo (Chicken Rice Porridge),Baked Feta and Greens with Lemony Yogurt,Baked Mustard-Crusted Salmon with Asparagus and Tarragon,Baked Penne with Green Chiles,Basil-Cashew-Lime Vermicelli Bowls with Pork and Green Beans,Beef and Potato Pasties,Black-Eyed Peas With Chard and Green Herb Smash,Bourbon Pumpkin Pie,Braised Chicken Thighs with Squash and Mustard Greens,...,Tomato and Cannellini Bean Soup,Tuna and Artichoke Cooler-Pressed Sandwiches,Twice-Baked Butternut Squash With Parmesan Cream and Candied Bacon,Vegetarian Brown Rice Salad With Parsnips and Whipped Ricotta,Veggie Burgers with Zucchini and Corn,Very Versatile Baked Beans With Cabbage,Vietnamese-Style Spaghetti Squash “Noodle” Bowls with Skirt Steak,White Bean Salad with Lemon and Cumin,White Chicken Chili,Winter Squash Soup with Gruyère Croutons
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"114Italianate from Johnstown, NY",,,,,,,,,,,...,,,,,,,,,,
"1mustang from Costa Mesa, CA",,,,,,,,,,,...,,,,,,,,,,
"1phoodphan from New York, NY",,,,,,,,,,,...,,,,,,,,,,
"26thpeggy from Warrenton, VA",,,,,,,,,,,...,,,,,,,,,,
2andielaine from Olympic Peninsula,,,,,,,,,,,...,,,,,,,,,5,
4artssake from SC,,,,,,,,,,,...,,,,,,,,,,
502forestdrive4876 from Washington DC,,,,,,,,,,,...,,,,,,,,,,
57mott from New York City,,,,,,,,,,,...,,,,,,,,,,
7677chapman from Denver,,,,,,,,2,,,...,,,,,,,,,,
8005 from ATL,,,,,,,,,,,...,,,,,,,,,,


## 8. Construct the final data set for recipes-user-rating.

In [17]:
# Considering most user only rate one recipe, the recipes-user-rating matrix is too sparse. Here we see 50 users as one user, by combine the 50 users' rating as one person's rating.
final_rating_data = pd.DataFrame(columns=trun_recipes_user_review_matrix.columns)
for i in range(trun_recipes_user_review_matrix.shape[0]//50):
    temp = trun_recipes_user_review_matrix.iloc[50*i:50*i+40].mean(skipna=True, axis=0)
    temp.name = 'user' + str(i)
    final_rating_data.loc[i] = temp
final_rating_data

title,10-Minute Sausage Skillet with Cherry Tomatoes and Broccolini,Arroz Caldo (Chicken Rice Porridge),Baked Feta and Greens with Lemony Yogurt,Baked Mustard-Crusted Salmon with Asparagus and Tarragon,Baked Penne with Green Chiles,Basil-Cashew-Lime Vermicelli Bowls with Pork and Green Beans,Beef and Potato Pasties,Black-Eyed Peas With Chard and Green Herb Smash,Bourbon Pumpkin Pie,Braised Chicken Thighs with Squash and Mustard Greens,...,Tomato and Cannellini Bean Soup,Tuna and Artichoke Cooler-Pressed Sandwiches,Twice-Baked Butternut Squash With Parmesan Cream and Candied Bacon,Vegetarian Brown Rice Salad With Parsnips and Whipped Ricotta,Veggie Burgers with Zucchini and Corn,Very Versatile Baked Beans With Cabbage,Vietnamese-Style Spaghetti Squash “Noodle” Bowls with Skirt Steak,White Bean Salad with Lemon and Cumin,White Chicken Chili,Winter Squash Soup with Gruyère Croutons
0,,,,,,,,2.0,4.0,,...,,5.0,,,,,,4.666667,5.0,3.0
1,5.0,,,,,5.0,,,1.0,,...,,,,,,,1.0,2.0,,5.0
2,,,,,,,,,4.0,,...,,,5.0,,,,,,5.0,5.0
3,,,5.0,,,,,,5.0,,...,,,,,3.0,,5.0,1.0,5.0,5.0
4,,,,,,,,,,,...,,,5.0,,3.0,5.0,,4.0,5.0,5.0
5,,,,,4.5,5.0,,5.0,,,...,4.0,5.0,,5.0,,,,2.0,,1.0
6,,,,5.0,5.0,,,,,,...,,,,3.0,4.0,5.0,,,5.0,
7,,,5.0,,,,,,,,...,5.0,,5.0,,5.0,5.0,5.0,,,
8,,,5.0,,3.0,,5.0,4.5,2.0,,...,,5.0,,,5.0,4.0,,,,
9,,,5.0,,,5.0,5.0,,5.0,4.0,...,4.0,,,,,,,,,


In [0]:
#Save the data to csv file for future use
final_rating_data.to_csv('final_rating_data.csv')

from google.colab import files
files.download('final_rating_data.csv') 