# Import Data

In [15]:
import pandas as pd
import numpy as np
from itertools import chain
import seaborn as sns
import re
from matplotlib import pyplot as plt
plt.style.use('ggplot')

In [16]:
# Load the scraped recipes, parse the publication date, and turn the
# comma-separated ingredient string of every recipe into a list of strings.
badata = pd.read_csv('new_bonapp.csv')
badata = badata.assign(
    published=pd.to_datetime(badata['published']),
    ingredients=badata['ingredients'].map(lambda cell: cell.split(',')),
)

## Clean Ingredients
**This cleans the ingredient strings so that the various names used for the same ingredient collapse into one**

In [17]:
# Ingredient fragments matching this pattern are discarded outright (they are
# preparation notes such as "cut into ..." rather than ingredients).
_WASH_DROP_RE = re.compile(r'^preferably |^cut on |^be found |^into |^like |^on the |^ ?and/?o?r? |^lightly .* with .*a|^soaked|^thinly |^halved|^shaved for|^shaved into|^shaved with|^split|^sliced .*into|^sliced.*thick|^beaten |ground$|boneless$|^broken into|room temperature$|^cut into|^plus more ?f?o?r? ?.*|ed$')

# Punctuation, parenthesised asides, fraction glyphs and "12-" style prefixes.
_WASH_NUM_PUNCT_RE = re.compile(r'\.|\W+-|\W+—|\W+–|\d+-|\*|"|”|;.*| \(.*\)|\(|\)|½|¾|¼|⅓|⅔|⅛')

# Preparation / descriptive words, first and second pass.
_WASH_KITCHEN_1_RE = re.compile(r'coarsely |is avail.*|are avail.*|knob of | a mix of .*|^firm but |^firmly packed| slices| spears|^shredded |^torn|^and |granulated |^roasted |^asian$|fresh |leaves|skinless|cloves?|slices?d? |with .*stems|chilled |chopped |online|finely |grated |freshly |peeled |sprigs |pestle|such as .*|other |bar spoon |thinly |finely chopped|low.?sodium ')
_WASH_KITCHEN_2_RE = re.compile(r'coarsely |skin.?on | ?fillets?|center.?cut |^boneless |^new$| slices| spears|^heads? o?f? ?|^torn|^and|granulated |^crisp |^tart |^one |^ripe |t.i.*cut |^asian$|fresh |leaves|skinless|cloves?|slices?d? |with .*stems|chilled |chopped |online|finely |at .* markets.*|from a .*|grated |freshly |peeled |sprigs |pestle|such as .*|other |bar spoon |finely chopped|low.?sodium ')

# Quantities, units and container words, first and second pass.
_WASH_MEASURE_1_RE = re.compile(r'\d+|tbsp |cups? |oz |bottle |light |envelope |packages? |found in.*|bunch |online|small |medium |large |pinch of | cans? |pints? | ?inch pieces? |inch.*thick| lengthwise|dashes |^plus |tbsp. |oz. |teaspoons? |tsp |tablespoons? |ounces? |pounds? |^ |^\n')
_WASH_MEASURE_2_RE = re.compile(r'\d+|cups? |^dash |oz |^lb |bottle |^liter | is a .*| threads$|^like |^scoops? |cut into.*|^seadless | twist$| zest$|^shell.*on |handful o?f? ?|^bowls$|^clear$|envelope ?|packages? |bunch | stalks$|online|small |medium |large |^shaved |pinch of | cans? |pints? | ?inch pieces? | pieces$|inch.*thick|^inch.* | lengthwise|dashes |^plus |tbsp. |oz. |teaspoons? |tsp |tablespoons? |ounces? |pounds? |^ |^\n')

# Order matters: each pass can expose text (e.g. a leading space) that a
# later pass strips.
_WASH_PASSES = (
    _WASH_NUM_PUNCT_RE,
    _WASH_KITCHEN_1_RE,
    _WASH_MEASURE_1_RE,
    _WASH_NUM_PUNCT_RE,
    _WASH_KITCHEN_2_RE,
    _WASH_MEASURE_2_RE,
)


def washIngredients(recipe):
    """Return the recipe's ingredient strings lower-cased and stripped of noise.

    Fragments that look like preparation notes are dropped entirely; the
    survivors are lower-cased and then run through six regex substitution
    passes (punctuation/numbers, kitchen words, measurements, and the same
    three categories again) that delete every match.

    The patterns are written as raw strings: the same text in ordinary string
    literals contains invalid escape sequences such as ``\\d`` and ``\\(``,
    which Python reports as a warning. The regexes themselves are unchanged.

    Args:
        recipe: iterable of ingredient strings for one recipe.

    Returns:
        A new list of cleaned strings. Entries may be empty strings if
        everything in them was removed.
    """
    # NOTE(review): the drop filter runs on the original text, before
    # lower-casing, and its patterns are all lower-case -- so a fragment such
    # as "Cut into ..." is kept. Preserved as-is; confirm whether intended.
    cleaned = [ingred.lower() for ingred in recipe if not _WASH_DROP_RE.search(ingred)]
    for pattern in _WASH_PASSES:
        cleaned = [pattern.sub('', ele) for ele in cleaned]
    return cleaned
    
def chopIngredients(recipe):
    """Split compound ingredient entries into their individual ingredients.

    Each entry is stripped of surrounding whitespace, split on " or ", and
    every resulting piece is then split on " and ". The pieces are returned
    as one flat list in their original order.
    """
    pieces = []
    for entry in recipe:
        # str.split returns the whole string in a one-element list when the
        # separator is absent, so no explicit "contains" check is needed.
        for alternative in entry.strip().split(" or "):
            pieces.extend(alternative.split(" and "))
    return pieces

# Left-over fragments (connectors, "more for ...", dashes, ...) that are not
# ingredients and are discarded by peelIngredients.
_PEEL_DROP_RE = re.compile('^or |^more$|^more for|^for |^a |^find |^ainch|^into|^cut |^about|^any? |^in h|^and/or|^-|^–|^—')


def _pluralize_if_known(ingred, known_ingreds):
    """Return ingred + 's' or + 'es' if that form is in known_ingreds, else ingred."""
    for suffix in ('s', 'es'):
        if ingred + suffix in known_ingreds:
            return ingred + suffix
    return ingred


def peelIngredients(recipe, known_ingreds=None):
    """Drop non-ingredient fragments and normalise singulars to known plurals.

    Empty strings and fragments matching the drop pattern are removed; the
    rest are stripped of surrounding whitespace. An ingredient is then
    replaced by its ``+ 's'`` form (or, failing that, its ``+ 'es'`` form)
    when that plural occurs in ``known_ingreds``.

    Args:
        recipe: iterable of ingredient strings for one recipe.
        known_ingreds: optional container of every ingredient string seen
            across all recipes. When omitted it is built from the global
            ``badata['filtered_ingreds']`` column, which is the original
            behaviour. That fallback rebuilds the set on every call, so when
            mapping over many recipes build it once and pass it in, e.g.
            ``series.apply(peelIngredients, known_ingreds=the_set)``.

    Returns:
        A new list of ingredient strings.
    """
    if known_ingreds is None:
        known_ingreds = set(chain.from_iterable(badata['filtered_ingreds']))
    # The drop pattern is tested against the un-stripped text, as before.
    kept = [
        ingred.strip()
        for ingred in recipe
        if ingred != '' and not _PEEL_DROP_RE.search(ingred)
    ]
    return [_pluralize_if_known(ingred, known_ingreds) for ingred in kept]

def _flatten_ingredient(ingred):
    """Map one ingredient string to its canonical name (first matching rule wins)."""
    if re.search('lemons? |lemons?$', ingred):
        return 'lemon'
    if 'ground pepper' in ingred:
        return 'ground black pepper'
    if 'egg ' in ingred:
        return 'eggs'
    if 'lime' in ingred:
        return 'lime'
    if 'olive oil' in ingred:
        return 'olive oil'
    if re.search('all.* flour', ingred):
        return 'all-purpose flour'
    # Must be tested before the generic orange rule below.
    if 'blood orange' in ingred:
        return 'blood oranges'
    if re.search(' orange$| orange |orange w|orange .*u', ingred):
        return 'oranges'
    if re.search(' ?pears?$', ingred):
        return 'pears'
    if re.search(' kale', ingred):
        return 'kale'
    if 'anchovy' in ingred:
        return 'anchovy fillets'
    return ingred


def simmerIngredients(recipe):
    """Collapse ingredient variants to canonical names and drop duplicates.

    Flattens lemons, black pepper, eggs, limes, olive oil, flour, (blood)
    oranges, pears, kale and anchovies; every other ingredient passes through
    unchanged.

    Duplicates are removed while keeping first-occurrence order. Going
    through ``set`` instead made the order of the result vary from run to
    run, and with it the exported data.

    Args:
        recipe: iterable of ingredient strings for one recipe.

    Returns:
        A list of unique ingredient names in first-seen order.
    """
    return list(dict.fromkeys(_flatten_ingredient(ingred) for ingred in recipe))

In [18]:
# Run the cleaning stages in order over every recipe's ingredient list.
# The column is reassigned after each stage because peelIngredients reads
# badata['filtered_ingreds'] itself and must see the output of the stage
# before it.
badata["filtered_ingreds"] = badata["ingredients"].copy()
for stage in (washIngredients, chopIngredients, washIngredients,
              peelIngredients, simmerIngredients):
    badata['filtered_ingreds'] = badata['filtered_ingreds'].apply(stage)

## Expanding of Dataframe to Ingredients
**This expands each recipe into one row per ingredient, pairing the recipe's title with each of its ingredients**

Results in `df_ingred` dataframe

In [20]:
# Work on a copy so `badata` keeps its one-row-per-recipe shape.
df_ingred = badata.copy()

# One [row position, ingredient] pair for every ingredient of every recipe.
# NOTE(review): the positions are joined against the frame's index below,
# which assumes a default 0..n-1 index -- confirm.
list_ = [
    [row_pos, ingred]
    for row_pos, ingreds in enumerate(df_ingred.filtered_ingreds)
    for ingred in ingreds
]
mini_ingred = pd.DataFrame(list_, columns=['index', 'ingred'])

# Attach the recipe columns to every (recipe, ingredient) pair.
recipe_cols = df_ingred.drop(columns='filtered_ingreds')
df_ingred = recipe_cols.merge(mini_ingred, how="right", left_index=True, right_on='index')
df_ingred = df_ingred.drop(columns=["ingredients", "instructions", 'index'])
df_ingred = df_ingred.sort_values(['published', "dishtitle"]).reset_index()
df_ingred.columns = df_ingred.columns.str.lower()

In [21]:
# Write the one-row-per-ingredient frame to the Bon_App_Shiny data folder.
# The DataFrame index is written too (to_csv default).
df_ingred.to_csv(r"./Bon_App_Shiny/data/bonapp_df.csv")

In [22]:
# Write the one-row-per-recipe frame to the Bon_App_Shiny data folder.
# NOTE(review): the list-valued ingredient columns will be written as their
# string representation -- confirm the consumer parses them accordingly.
badata.to_csv(r"./Bon_App_Shiny/data/full_bonapp_df.csv")

## Parse Reviews
**This will use NLP to filter out unnecessary words and count the frequency of the remaining words**

In [None]:
## Did not get to this; would like to return to it at a later date

### Beginning Plots

In [None]:
# df_ingred.groupby('ingred')[['dishtitle']].count().sort_values(by='dishtitle', ascending=False)[:20]

In [None]:
# df_ingred.groupby('ingred')[['dishtitle']].count().sort_values(by='dishtitle')[100:130]

In [None]:
# Histogram of recipe publication dates.
# The date column is lower-case `published` (that is the name it is parsed
# under when the data is loaded); `badata['Published']` raises KeyError.
plt.hist(badata['published'])

In [None]:
# Recipes published after 1 December 2019, and a histogram of their dates.
# Uses the lower-case `published` column; the capitalised name does not
# exist in `badata` and raised KeyError.
df_2020 = badata[badata['published'] > pd.to_datetime("2019-12-01")]
plt.hist(df_2020['published'])

In [None]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import Figure, Histogram, Layout

# Interactive histogram of the publication dates in df_2020, binned across
# the full observed date range.
# The date column is lower-case `published`; the capitalised name raised
# KeyError.
min_ = df_2020['published'].min()
max_ = df_2020['published'].max()

data = [Histogram(x=df_2020['published'],
                  xbins=dict(start=min_,
                             end=max_))]
# NOTE(review): the title says "March" but df_2020 holds everything after
# 2019-12-01 -- confirm which is intended.
layout = Layout(title="Histogram of March Recipes",
                bargap=0.20)
fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False, image_width=600, image_height=400)

In [None]:
from plotly.graph_objs import Scatter

# Interactive scatter of recipe rating against publication date.
# Relies on `plotly`, `Figure` and `Layout` having been imported by an
# earlier cell.
# The date column is lower-case `published`; the capitalised name raised
# KeyError.
# NOTE(review): `Rating` is left as written -- if the rating column is also
# lower-case in `badata`, change it to 'rating'.
data = [Scatter(x=badata['published'], y=badata['Rating'], mode='markers')]
# The previous title ("Gross Income vs Budget") was left over from another
# dataset and did not describe this plot.
layout = Layout(title="Rating vs Publication Date")

fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)

In [None]:
# Mean of the rating column for each publication date, as a line plot.
ratings_by_date = df_ingred.groupby('published')['rating']
ratings_by_date.mean().plot()

In [None]:
# Number of non-null rating values for each publication date, as a line plot.
ratings_by_date = df_ingred.groupby('published')['rating']
ratings_by_date.count().plot()

In [None]:
# Number of ingredient rows for each publication date, as a line plot.
ingreds_by_date = df_ingred.groupby('published')['ingred']
ingreds_by_date.count().plot()

In [None]:
# Number of rows whose ingredient is exactly 'kale', per publication date.
is_kale = df_ingred['ingred'] == 'kale'
kale_rows = df_ingred.loc[is_kale]
kale_rows.groupby('published')['dishtitle'].count().plot()

In [None]:
# Number of recipes rated above 4, per publication date.
# `published` and `dishtitle` are lower-case in `badata` (they are used under
# those names when the data is loaded and sorted); the capitalised names
# raised KeyError.
# NOTE(review): `Rating` is left as written -- if the rating column is also
# lower-case in `badata`, change it to 'rating'.
badata.loc[badata["Rating"] > 4].groupby('published')['dishtitle'].count().plot()

In [None]:
# Distribution of recipe ratings.
# NOTE(review): `distplot` is deprecated in newer seaborn releases in favour
# of `histplot` / `displot` -- check the installed version before upgrading.
# NOTE(review): other cells use lower-case column names on `badata`; confirm
# whether this column is 'Rating' or 'rating'.
sns.distplot(badata['Rating'])

In [None]:
# Labelled 60-bin histogram of recipe publication dates.
# The date column is lower-case `published`; the capitalised name raised
# KeyError.
plt.hist(badata['published'], bins=60)
plt.xlabel('Year of Publication')
plt.ylabel('Count')
plt.title('Histogram of Recipes from 2015 - 2020', fontsize=20)

In [None]:
# Scatter of recipe rating against publication date.
# The date column is lower-case `published`; the capitalised name raised
# KeyError.
# NOTE(review): `Rating` is left as written -- if the rating column is also
# lower-case in `badata`, change it to 'rating'.
# NOTE(review): the labels mention 2015 only, but `badata` is not filtered to
# 2015 here -- confirm the labels or add the filter.
plt.scatter(badata['published'], badata["Rating"], alpha=.5)
plt.xlabel('2015 Months')
plt.ylabel('Ratings')
plt.title('Scatterplot of Rating per Month, 2015', fontsize=20)