# Import Data

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from matplotlib import pyplot as plt
plt.style.use('ggplot')

In [3]:
badata = pd.read_csv('bestbonapp.csv')
badata['Published'] = pd.to_datetime(badata['Published'])
badata["Ingredients"] = badata["Ingredients"].apply(lambda x: x.split(','))

### Parse Ingredients
**This creates a new column with parsed ingredients**

In [195]:
def pickIngredients(recipe):
        trimmed_lines = [ingred.lower() for ingred in recipe if not re.search('^preferably from |^sliced .*into|^sliced.*thick|ground$|boneless$|^broken into|room temperature$|^cut into|^plus more ?f?o?r? ?.*|ed$', ingred)]
        removed_num = [re.sub('\d+/?\.?\d* ?\d*-?|\d+-?—?–?\d* |\*|"| \(.*\)|\(|\)|\d?½ |\d*¾ |\d?¼ |\d?⅓ |\d?⅔|\d?⅛', '', ele) for ele in trimmed_lines]
        removed_measurements = [re.sub('cups? |slices?d? |small |medium |grated |freshly |peeled |cans? |pints? | inch pieces? |\.|-|inch.*thick| lengthwise|dashes |^plus |bar spoon |Tbsp. |oz. |finely chopped|teaspoons? |tablespoons? |ounces? |pounds? |^ ', '', ele) for ele in removed_num]
        return removed_measurements

In [271]:
badata["filtered_Ingreds"] = badata["Ingredients"].copy()
badata['filtered_Ingreds'] = badata['filtered_Ingreds'].apply(pickIngredients)
badata['filtered_Ingreds'] = badata['filtered_Ingreds'].apply(pickIngredients)

## Clean Ingredients
**This cleans ingredients further**

In [272]:
def washIngredients(recipe):
    dropped_ingreds = [ingred for ingred in recipe if ingred != '' and not re.search('^ |\n|^a |^about', ingred)]
    split_lines = [ingred.split(" or ") for ingred in dropped_ingreds if ingred.count(' or ') > 0]
    return split_lines

In [273]:
badata['filtered_Ingreds'] = badata['filtered_Ingreds'].apply(washIngredients)

In [275]:
badata['filtered_Ingreds'][0]

[['baby beets', 'radishes'],
 ['smoked salmon', 'gravlax'],
 ['neufchâtel', 'cream cheese'],
 ['wasa crispbread', 'toasted pumpernickel slices']]

## Expanding of Dataframe to Ingredients
**This will zip and repeat recipe titles for each of their ingredients** 

Results in `df_ingred` dataframe

In [267]:
df_ingred = badata.copy()

index=0
list_ = []
for item in df_ingred.filtered_Ingreds:
    list_.extend(map(lambda x: [index, x], item))
    index += 1
mini_ingred = pd.DataFrame(list_, columns=['index', 'ingred'])

In [268]:
df_ingred = pd.merge(df_ingred.drop('filtered_Ingreds', axis = 1), mini_ingred, \
                     how="right", left_index=True, right_on='index')
df_ingred.head()

Unnamed: 0,DishTitle,Ingredients,Instructions,Published,Rating,Reviews,index,ingred
0,Smoked Salmon Breakfast Salad with Crispbread,"[2 baby beets or radishes, thinly sliced, 8 t...","Toss beets, onion (if using), mesclun, lemon j...",2015-01-01,4.5,,0,"[baby beets, radishes]"
1,Smoked Salmon Breakfast Salad with Crispbread,"[2 baby beets or radishes, thinly sliced, 8 t...","Toss beets, onion (if using), mesclun, lemon j...",2015-01-01,4.5,,0,"[smoked salmon, gravlax]"
2,Smoked Salmon Breakfast Salad with Crispbread,"[2 baby beets or radishes, thinly sliced, 8 t...","Toss beets, onion (if using), mesclun, lemon j...",2015-01-01,4.5,,0,"[neufchâtel, cream cheese]"
3,Smoked Salmon Breakfast Salad with Crispbread,"[2 baby beets or radishes, thinly sliced, 8 t...","Toss beets, onion (if using), mesclun, lemon j...",2015-01-01,4.5,,0,"[wasa crispbread, toasted pumpernickel slices]"
4,Trout Roe and Turnip Toast,"[Dark rye bread, Cream cheese, Trout roe, Baby...",Spread toasted dark rye bread with cream chees...,2015-01-01,3.0,,1,"[baby white turnips, radishes]"


In [270]:
quick_list = df_ingred.groupby('ingred')['DishTitle'].count()#.sort_values(ascending=False)
quick_list

TypeError: unhashable type: 'list'

## Parse Reviews
**This will use NPL to filter unnecessary words, and frequency count remaining words**

## Analyze Adjustments in Covid Recipes

### Beginning Plots

In [95]:
# sns.distplot(badata['Rating'])

In [96]:
# plt.hist(badata['Published'])
# plt.xlabel('2015 Months')
# plt.ylabel('Count')
# plt.title('Histogram of Recipes per Month, 2015', fontsize=20)

In [97]:
# plt.scatter(badata['Published'], badata["Rating"], alpha=.5)
# plt.xlabel('2015 Months')
# plt.ylabel('Ratings')
# plt.title('Scatterplot of Rating per Month, 2015', fontsize=20)

In [396]:
# def fixIngred(recipe):
#         trimmed_lines = [ingred for ingred in recipe if not re.search('^preferably from |boneless$|^sliced crosswise into .*|^broken into|room temperature|^cut into|^plus more ?f?o?r? ?.*|ed$', ingred)]
#         removed_num = [re.sub('\d+/?\d* \d*-?|\d*-?—?–\d+ |\*| \(.*\)|\d?½ |\d*¾ |\d?¼ |\d?⅓ |\d?⅔', '', ele) for ele in trimmed_lines]
#         removed_measurements = [re.sub('cups? |cans? |pints? |inch pieces? | lengthwise|dashes |^plus |bar spoon |Tbsp. |oz. |finely chopped|teaspoons? ?|tablespoons? |ounces? |pounds? |^ ', '', ele) for ele in removed_num]
#         return removed_measurements