# This Notebook is used for generating the data for Meal planning for the new Millennium (MnM) problem


##1. Import all the necessary library

In [0]:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as url 
import pickle
import json
import multiprocessing
import pandas as pd 
import numpy as np

##2. Define EP_Recipe class to store all the data.

In [0]:
class EP_Recipe():
    title = None
    rating = None
    personal_rating = []
    calories = None
    sodium = None
    fat = None
    protein = None

    def get_title(self, page):
        return page.find('h1', {'itemprop': 'name'}).text

    def get_rating(self, page):
        try:
            return float(page.find_all('span', {'class': 'rating'})[-1].text.split('/')[0]) + 1
        except:
            return None

    def build_recipie(self, page):
        #super(EP_Recipe, self).build_recipie(page)
        self.title = self.get_title(page)
        self.rating = self.get_rating(page)
        self.calories = self.get_calories(page)
        self.sodium = self.get_sodium(page)
        self.fat = self.get_fat(page)
        self.protein = self.get_protein(page)
        self.personal_rating = self.get_personal_rating(page)

    def get_calories(self,page):
        try:
            return float(page.find('span',{'class':'nutri-data','itemprop':'calories'}).text)
        except:
            return None

    def get_sodium(self,page):
        try:
            return float(page.find('span',{'class':'nutri-data','itemprop':'sodiumContent'}).text.split(' ')[0])
        except:
            return None

    def get_fat(self, page):
        try:
            return float(page.find('span', {'class': 'nutri-data', 'itemprop': 'fatContent'}).text.split(' ')[0])
        except:
            return None

    def get_protein(self, page):
        try:
            return float(page.find('span', {'class': 'nutri-data', 'itemprop': 'proteinContent'}).text.split(' ')[0])
        except:
            return None
    
    def get_personal_rating(self, page):
        try: 
            p_ratings = page.findAll('img', {'class': 'fork-rating'})
            p_persons = page.find_all('span', {'class': 'credit'})
            p_r = []            
            for i in range(len(p_ratings)):
                temp = [p_persons[i].text.split('/')[0], int(p_ratings[i]['src'].split('/')[-1].split('_')[0]) + 1]
                p_r.append(temp)
            return p_r
        except:
            return None


    def __init__(self, page):
        print('attempting to build from: '+page)
        try:
            self.build_recipie(bs(url(page), 'html.parser'))
        except Exception as x:
            print('Could not build from %s, %s'%(page,x))
            


##3. Find all the addresses for recipes and store them in ep_urls

In [0]:
all_url = ['https://www.epicurious.com/recipes-menus/what-to-cook-this-weekend-february-22-24-gallery',
          'https://www.epicurious.com/recipes-menus/what-to-cook-this-weekend-february-8-10-gallery',
           "https://www.epicurious.com/ingredients/acorn-delicata-kabocha-spaghetti-squash-winter-recipes-gallery",
           'https://www.epicurious.com/recipes-menus/easy-dinner-recipes-for-cook90-gallery',
          'https://www.epicurious.com/recipes-menus/our-favorite-cook90-lunches-gallery',
          'https://www.epicurious.com/recipes-menus/make-ahead-weeknight-dinners-stew-soup-freezer-casserole-quick-easy-recipes-gallery']

ep_urls = set()
for i in all_url:        
    initializer = url(i)
    res = bs(initializer.read(),"html5lib")
    for div in res.findAll('div', {'class': 'gallery-slide-caption__dek-container'}):
            ep_urls.update([div.find('a')['href']]) 

##4. Scrape the website of recipes and generate the data. Store the data as recipes_data.json.


In [4]:
p = multiprocessing.Pool(4)
output = p.map(EP_Recipe,ep_urls)
pickle.dump(output,open('epi_recipes.final','wb'))

data = pickle.load(open('epi_recipes.final','rb'))
ar = []
for i in data:
    ar.append(i.__dict__)
pickle.dump(ar,open('epi_recipe_dict_form.dict','wb'))

with open('recipes_data.json', 'w') as fp:
    json.dump(ar, fp)

attempting to build from: https://www.epicurious.com/recipes/food/views/arroz-caldo-filipino-chicken-rice-porridge
attempting to build from: https://www.epicurious.com/recipes/food/views/veggie-burgers-with-zucchini-and-corn
attempting to build from: https://www.epicurious.com/recipes/food/views/10-minute-sausage-skillet-with-cherry-tomatoes-and-broccolini
attempting to build from: https://www.epicurious.com/recipes/food/views/kale-salad-with-roasted-butternut-squash-pomegranate-and-pumpkin-seeds
attempting to build from: https://www.epicurious.com/recipes/food/views/big-red-slow-cooker-pulled-pork
attempting to build from: https://www.epicurious.com/recipes/food/views/farro-salad-with-roasted-sweet-potatoes-red-onion-and-goat-cheese
attempting to build from: https://www.epicurious.com/expert-advice/sunday-stash-1-batch-of-sweet-potatoes-9-family-meals-article
attempting to build from: https://www.epicurious.com/recipes/food/views/easy-fried-rice-with-chicken-and-broccolini
Could not b

##5. Read the data

In [5]:
df = pd.read_json('recipes_data.json')
df

Unnamed: 0,calories,fat,personal_rating,protein,rating,sodium,title
0,1051.0,64.0,"[[kgerwin from Brooklyn, NY , 5], [firesprite ...",63.0,5.0,1212.0,Arroz Caldo (Chicken Rice Porridge)
1,,,"[[rdies , 3]]",,3.0,,Big Red Slow-Cooker Pulled Pork
2,503.0,34.0,"[[lfox18 from Ballston Lake, NY , 5], [hairbra...",29.0,4.5,1574.0,Vietnamese-Style Spaghetti Squash “Noodle” Bow...
3,646.0,52.0,,15.0,4.0,843.0,Butternut Squash Vegducken
4,139.0,9.0,"[[Mossi from Kitchener, ON , 5], [llaaaj from ...",3.0,4.5,539.0,Very Versatile Baked Beans With Cabbage
5,,,"[[pkwp from Winter Park, CO , 5], [suzwil from...",,4.5,,Sheet-Pan Crispy Fish Tacos with Chili-Roasted...
6,345.0,10.0,"[[csidell from Hoosier , 5], [smallkitchenbigp...",10.0,5.0,795.0,Honey-Garlic Noodles
7,135.0,4.0,[],2.0,1.0,624.0,Raw Butternut Squash Ribbon Salad with Orange ...
8,816.0,52.0,"[[robboo from Syr ny , 4], [bellabug , 4], [mo...",39.0,3.5,1015.0,Quick Sweet and Sour Chicken
9,188.0,5.0,"[[questrist , 3], [michaelwryan19573965 from P...",15.0,4.5,483.0,Tuna and Artichoke Cooler-Pressed Sandwiches


##6. Data Processing: Delete the row with NaN data and only consider the recipes with at least four reviews.

In [0]:
df = df.dropna(axis=0) #delete row with NaN 

In [0]:
df = df[df['personal_rating'].map(len) > 9] ##delete the recipes with less than 4 reviews

In [11]:
df

Unnamed: 0,calories,fat,personal_rating,protein,rating,sodium,title
0,1051.0,64.0,"[[kgerwin from Brooklyn, NY , 5], [firesprite ...",63.0,5.0,1212.0,Arroz Caldo (Chicken Rice Porridge)
2,503.0,34.0,"[[lfox18 from Ballston Lake, NY , 5], [hairbra...",29.0,4.5,1574.0,Vietnamese-Style Spaghetti Squash “Noodle” Bow...
4,139.0,9.0,"[[Mossi from Kitchener, ON , 5], [llaaaj from ...",3.0,4.5,539.0,Very Versatile Baked Beans With Cabbage
8,816.0,52.0,"[[robboo from Syr ny , 4], [bellabug , 4], [mo...",39.0,3.5,1015.0,Quick Sweet and Sour Chicken
9,188.0,5.0,"[[questrist , 3], [michaelwryan19573965 from P...",15.0,4.5,483.0,Tuna and Artichoke Cooler-Pressed Sandwiches
13,375.0,16.0,"[[cheching from Toronto, ON , 4], [bootssapphi...",9.0,4.5,794.0,Miso-Tahini Squash Soup with Brown Rice
16,820.0,47.0,"[[dlnordel from Gardena, CA , 5], [sarahmlee f...",28.0,4.5,713.0,Beef and Potato Pasties
19,708.0,41.0,"[[keritilman from Newport, RI , 2], [Maddie Hj...",20.0,3.5,733.0,Veggie Burgers with Zucchini and Corn
21,1032.0,67.0,"[[moongirl721 from New Troy, Michigan , 4], [c...",51.0,4.5,1592.0,Korean Fried Chicken
23,828.0,44.0,"[[msgbdg@gmail.com from Canton, MI , 5], [ajv1...",25.0,5.0,940.0,Spicy Black Bean and Corn Tacos


## 7. Analysize the user ratings data

In [14]:
p_r = pd.DataFrame(columns=['title', 'user', 'rating'])
p_r

Unnamed: 0,title,user,rating


In [0]:
count = 0
for i in range(df.shape[0]):
    for j in df.iloc[i,2]:
        p_r.loc[count] = [df.iloc[i,6],j[0], j[1]]
        count += 1

In [16]:
p_r.to_csv('recipes_users_ratings.csv')
p_r

Unnamed: 0,title,user,rating
0,Arroz Caldo (Chicken Rice Porridge),"kgerwin from Brooklyn, NY",5
1,Arroz Caldo (Chicken Rice Porridge),"firesprite from Oakland, CA",4
2,Arroz Caldo (Chicken Rice Porridge),"michael211 from Boston, MA",5
3,Arroz Caldo (Chicken Rice Porridge),"jeneaby from Alpharetta, Ga",5
4,Arroz Caldo (Chicken Rice Porridge),lisa_anna from New York,5
5,Arroz Caldo (Chicken Rice Porridge),haute.house@icloud.com from Paris France,5
6,Arroz Caldo (Chicken Rice Porridge),kayeshari,5
7,Arroz Caldo (Chicken Rice Porridge),theresa54 from San francisco,5
8,Arroz Caldo (Chicken Rice Porridge),"rvj711 from NoVA (Northern VA) from Manila, Ph...",4
9,Arroz Caldo (Chicken Rice Porridge),"lisadorenwendt from Wilmette, IL",5


In [17]:
#Count users' rating
cnt = p_r['user'].value_counts()
cnt

jellis14 from Arlington, MA                  6
loriltx from Houston                         6
bas614 from Boston                           6
A Cook                                       5
blondoverblue from Sacramento, CA            5
lovestoputter                                5
delooper from Durham, NC                     4
charface from Portland, OR                   4
jansan1 from Orange County, CA.              4
jansan1 from Orange County, CA               4
kpalmer747 from Sweden                       4
krf from Bellevue, WA                        4
amyhenriott                                  3
railim from BC, Canada                       3
leenieloo from Mundelein, IL                 3
vnzjunk from U.P. Michigan                   3
rstarner3877 from Columbia, SC               3
pghgrl from brooklyn, ny                     3
oliveoil62 from Olympia, WA                  3
mags425 from Colorado                        3
czf2007                                      3
wheedle from 

In [18]:
cnt.describe()

count    2102.000000
mean        1.084206
std         0.389104
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         6.000000
Name: user, dtype: float64

In [19]:
cnt.value_counts()

1    1970
2     108
3      12
4       6
5       3
6       3
Name: user, dtype: int64

In [20]:
#
user_reviews2 = p_r['user'].value_counts()[p_r['user'].value_counts() > 0].index
trun_recipes_user_review = p_r[p_r['user'].isin(user_reviews2)]
trun_recipes_user_review


Unnamed: 0,title,user,rating
0,Arroz Caldo (Chicken Rice Porridge),"kgerwin from Brooklyn, NY",5
1,Arroz Caldo (Chicken Rice Porridge),"firesprite from Oakland, CA",4
2,Arroz Caldo (Chicken Rice Porridge),"michael211 from Boston, MA",5
3,Arroz Caldo (Chicken Rice Porridge),"jeneaby from Alpharetta, Ga",5
4,Arroz Caldo (Chicken Rice Porridge),lisa_anna from New York,5
5,Arroz Caldo (Chicken Rice Porridge),haute.house@icloud.com from Paris France,5
6,Arroz Caldo (Chicken Rice Porridge),kayeshari,5
7,Arroz Caldo (Chicken Rice Porridge),theresa54 from San francisco,5
8,Arroz Caldo (Chicken Rice Porridge),"rvj711 from NoVA (Northern VA) from Manila, Ph...",4
9,Arroz Caldo (Chicken Rice Porridge),"lisadorenwendt from Wilmette, IL",5


In [21]:
#Construct a 2D matrix for the recipe-user-rating data
trun_recipes_user_review = trun_recipes_user_review.drop_duplicates(['user','title'])
trun_recipes_user_review_matrix = trun_recipes_user_review.pivot(index='user', columns='title', values='rating')
trun_recipes_user_review_matrix

title,10-Minute Sausage Skillet with Cherry Tomatoes and Broccolini,Arroz Caldo (Chicken Rice Porridge),Baked Feta and Greens with Lemony Yogurt,Baked Mustard-Crusted Salmon with Asparagus and Tarragon,Baked Penne with Green Chiles,Basil-Cashew-Lime Vermicelli Bowls with Pork and Green Beans,Beef and Potato Pasties,Black-Eyed Peas With Chard and Green Herb Smash,Bourbon Pumpkin Pie,Braised Chicken Thighs with Squash and Mustard Greens,...,Tomato and Cannellini Bean Soup,Tuna and Artichoke Cooler-Pressed Sandwiches,Twice-Baked Butternut Squash With Parmesan Cream and Candied Bacon,Vegetarian Brown Rice Salad With Parsnips and Whipped Ricotta,Veggie Burgers with Zucchini and Corn,Very Versatile Baked Beans With Cabbage,Vietnamese-Style Spaghetti Squash “Noodle” Bowls with Skirt Steak,White Bean Salad with Lemon and Cumin,White Chicken Chili,Winter Squash Soup with Gruyère Croutons
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"114Italianate from Johnstown, NY",,,,,,,,,,,...,,,,,,,,,,
"1mustang from Costa Mesa, CA",,,,,,,,,,,...,,,,,,,,,,
"1phoodphan from New York, NY",,,,,,,,,,,...,,,,,,,,,,
"26thpeggy from Warrenton, VA",,,,,,,,,,,...,,,,,,,,,,
2andielaine from Olympic Peninsula,,,,,,,,,,,...,,,,,,,,,5,
4artssake from SC,,,,,,,,,,,...,,,,,,,,,,
502forestdrive4876 from Washington DC,,,,,,,,,,,...,,,,,,,,,,
57mott from New York City,,,,,,,,,,,...,,,,,,,,,,
7677chapman from Denver,,,,,,,,2,,,...,,,,,,,,,,
8005 from ATL,,,,,,,,,,,...,,,,,,,,,,


## 8. Construct the final data set for recipes-user-rating.

In [22]:
# Considering most user only rate one recipe, the recipes-user-rating matrix is too sparse. Here we see 50 users as one user, by combine the 50 users' rating as one person's rating.
final_rating_data = pd.DataFrame(columns=trun_recipes_user_review_matrix.columns)
for i in range(trun_recipes_user_review_matrix.shape[0]//50):
    temp = trun_recipes_user_review_matrix.iloc[50*i:50*i+40].mean(skipna=True, axis=0)
    temp.name = 'user' + str(i)
    final_rating_data.loc[i] = temp
final_rating_data

title,10-Minute Sausage Skillet with Cherry Tomatoes and Broccolini,Arroz Caldo (Chicken Rice Porridge),Baked Feta and Greens with Lemony Yogurt,Baked Mustard-Crusted Salmon with Asparagus and Tarragon,Baked Penne with Green Chiles,Basil-Cashew-Lime Vermicelli Bowls with Pork and Green Beans,Beef and Potato Pasties,Black-Eyed Peas With Chard and Green Herb Smash,Bourbon Pumpkin Pie,Braised Chicken Thighs with Squash and Mustard Greens,...,Tomato and Cannellini Bean Soup,Tuna and Artichoke Cooler-Pressed Sandwiches,Twice-Baked Butternut Squash With Parmesan Cream and Candied Bacon,Vegetarian Brown Rice Salad With Parsnips and Whipped Ricotta,Veggie Burgers with Zucchini and Corn,Very Versatile Baked Beans With Cabbage,Vietnamese-Style Spaghetti Squash “Noodle” Bowls with Skirt Steak,White Bean Salad with Lemon and Cumin,White Chicken Chili,Winter Squash Soup with Gruyère Croutons
0,,,,,,,,2.0,4.0,,...,,5.0,,,,,,4.666667,5.0,3.0
1,5.0,,,,,5.0,,,1.0,,...,,,,,,,1.0,2.0,,5.0
2,,,,,,,,,4.0,,...,,,5.0,,,,,,5.0,5.0
3,,,5.0,,,,,,5.0,,...,,,,,3.0,,5.0,1.0,5.0,5.0
4,,,,,,,,,,,...,,,5.0,,3.0,5.0,,4.0,5.0,5.0
5,,,,,4.5,5.0,,5.0,,,...,4.0,5.0,,5.0,,,,2.0,,1.0
6,,,,5.0,5.0,,,,,,...,,,,3.0,4.0,5.0,,,5.0,
7,,,5.0,,,,,,,,...,5.0,,5.0,,5.0,5.0,5.0,,,
8,,,5.0,,3.0,,5.0,4.5,2.0,,...,,5.0,,,5.0,4.0,,,,
9,,,5.0,,,5.0,5.0,,5.0,4.0,...,4.0,,,,,,,,,


In [0]:
#Save the data to csv file for future use
final_rating_data.to_csv('final_rating_data.csv')
