In [0]:
import requests
import csv
import numpy as np
import pandas as pd
import random

# Load Data

In [0]:
# Read the raw recipe table and drop the 'keto' and 'ingred_id' columns.
data = (
    pd.read_csv('./data/nlp_finalproj_data_withids.csv')
    .drop(columns=['keto', 'ingred_id'])
)

In [0]:
# Number of recipes per cuisine label, most frequent first.
cuisine_counts = data['cuisine'].value_counts()
cuisine_counts

Chinese           266
Caribbean         250
American          250
German            250
Middle Eastern    250
Thai              250
French            250
Mexican           250
Vietnamese        250
Mediterranean     250
Italian           250
Japanese          249
Korean            241
Indian            211
African           123
Name: cuisine, dtype: int64

# Encode and separate features

In [0]:
import nltk
# Tokenizer models used by word_tokenize / sent_tokenize.
nltk.download('punkt')
# Part-of-speech tagger model used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import re

from sklearn import preprocessing


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [0]:
def deduplicate(ingredient):
    """Map near-duplicate ingredient spellings onto one canonical name.

    The rules are regular-expression rewrites applied one after another, in
    the order listed, each working on the result of the previous one. Several
    patterns target already-stemmed text (e.g. 'lettuc', 'chees', 'pepp').
    '[^0-9]+' stands for any run of non-digit characters, so a rule such as
    '[^0-9]+ salt' swallows whatever qualifier precedes the word.

    ingredient: str, a single ingredient name.
    Returns the rewritten str (unchanged when no rule matches).
    """
    rewrite_rules = (
        # Preparation / packaging qualifiers are simply dropped.
        ('canned |fresh |cooked |dried |dry |ground |refrigerated |light |ready-to-use ', ''),
        ('lettuc[^0-9]+', 'lettuc'),
        ('[^0-9]+ salt', 'salt'),
        ('[^0-9]+ basil|basil [^0-9]+', 'basil'),
        ('[^0-9]+ bacon|bacon [^0-9]+', 'bacon'),
        ('[^0-9]+ thyme|thyme [^0-9]+', 'thyme'),
        ('[^0-9]+ cilantro|cilantro [^0-9]+', 'cilantro'),
        ('[^0-9]+ parsley|parsley [^0-9]+', 'parsley'),
        ('[^0-9]+ cinnamon|cinnamon [^0-9]+', 'cinnamon'),
        ('cream[^0-9]+chees|cream chees[^0-9]+', 'cream chees'),
        ('[^0-9]+parmesan[^0-9]+', 'parmesan chees'),
        ('[^0-9]+ peanut butt', 'peanut butt'),
        ('[^0-9]+ chili pepper', 'chili pepp'),
        ('[^0-9]+ nectar', 'nectar'),
        ('[^0-9]+ tofu', 'tofu'),
        ('[^0-9]+ tomato', 'tomato'),
        ('[^0-9]+ bell pepp', 'bell pepp'),
        ('egg roll wra[^0-9]+', 'egg roll wrap'),
    )
    for pattern, canonical in rewrite_rules:
        ingredient = re.sub(pattern, canonical, ingredient)
    return ingredient


def clean_data(igd_data):
    """Split the text form of a list into bare item strings.

    igd_data: str holding a stringified list, e.g. "['olive oil', 'egg']".
    Returns a list of str, one entry per comma-separated piece, with square
    brackets and quote characters removed (together with any spaces directly
    in front of a quote).

    Notes:
    - The split is on every ',' so an item that itself contains a comma is
      returned as several entries.
    - Apostrophes inside an item are removed too ("baker's" -> "bakers").
    - An empty list "[]" yields [''], not [].
    """
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\[ \] and backslash-space), which newer Python versions warn
    # about. This pattern matches exactly the same text.
    unwanted = re.compile(r" *['\"]|[\[\]]")
    return [unwanted.sub('', piece) for piece in igd_data.split(',')]


def stem_and_deduplicate(igd_data):
    """Porter-stem every string in igd_data, then normalise it with deduplicate.

    igd_data: iterable of str (each whole string is handed to the stemmer).
    Returns a list of str in the same order as the input.
    """
    stemmer = PorterStemmer()
    return [deduplicate(stemmer.stem(item)) for item in igd_data]

In [0]:
# Split and encode ingredients
# Parse each recipe's stringified ingredient list, then stem and canonicalise
# the names. (A first, unstemmed pass used to be computed here and was thrown
# away immediately; it has been removed.)
all_ingredients = [stem_and_deduplicate(clean_data(x)) for x in data['ingredients']]

# Fit one encoder on the flattened vocabulary of every recipe so that the same
# ingredient always receives the same integer code.
le_igd = preprocessing.LabelEncoder()
le_igd.fit(np.hstack(all_ingredients))

# One array of integer codes per recipe, in the recipe's ingredient order.
data['encoded_ingredients'] = [le_igd.transform(x) for x in all_ingredients]



In [0]:
# Concatenate the ingredients and process

# Split the process
ps = PorterStemmer()
# Drops square brackets, quote characters and the spaces directly in front of
# a quote. Written as a raw string (the earlier non-raw literal used invalid
# escape sequences) and compiled once instead of once per row.
strip_punct = re.compile(r" *['\"]|[\[\]]")
process = list()
for raw_process in data['process']:
    # Each recipe's process text is cut on ', [' so that every chunk lines up
    # with one ingredient; commas left inside a chunk become spaces.
    tem = raw_process.split(sep=', [')
    tokens = [strip_punct.sub('', chunk).replace(',', ' ') for chunk in tem]
    process.append([ps.stem(x) for x in tokens])

# Link them
# Pair the k-th process chunk with the k-th (stemmed) ingredient of the same
# recipe. Raises IndexError if a recipe has more process chunks than
# ingredients; surplus ingredients are left unpaired.
process_igds = []
for (n, left) in enumerate(process):
    right = all_ingredients[n]
    process_igds.append([p + ' ' + right[k] for (k, p) in enumerate(left)])

data['process ingredients'] = process_igds

# Spot-check one recipe: raw columns first, then the paired strings.
print("For recipe:", data['title'][5])
print("Before pairing:")
print(data['ingredients'][5])
print(data['process'][5])

tem = data['process ingredients'][5]
for t in tem:
    print(t)




 olive oil
lightly beaten egg
red red curry past
 sugar
cooked uncooked ( 250g-300g ) basmati ric
slice spring onion
frozen pea
 soy sauc
smoked flak mackerel
cut into half moon cucumb


In [0]:
def extract_methods(methods_data, stopwords):
    """Extract the stemmed method words from a recipe's instruction text.

    The text is lower-cased, tokenised and part-of-speech tagged; tokens
    tagged 'VB' or 'NN' are Porter-stemmed and de-duplicated, and stems found
    in `stopwords` are dropped.

    methods_data: the instruction text (converted with str() before use).
    stopwords: container of stems to exclude; tested with `in`.
    Returns a list of unique stems in sorted order.
    """
    tokens = word_tokenize(str(methods_data).lower())
    stemmer = PorterStemmer()
    methods = {
        stemmer.stem(word)
        for (word, tag) in nltk.pos_tag(tokens)
        if tag in ('VB', 'NN')  # TODO: still undecided whether 'NN' should be kept
    }
    # Iterate in sorted order: set iteration order for strings can differ
    # between interpreter sessions, which made the returned list (and anything
    # saved from it) non-reproducible.
    return [stem for stem in sorted(methods) if stem not in stopwords]

In [0]:
# Split and encode methods (from instructions)

# Stems handed to extract_methods as its stopword list.
freq_vb = ['put', 'add', 'use', 'keep', 'prepar', 'start']

# Replace each recipe's instruction text with the list of method stems
# extracted from it.
extracted_methods = []
for instruction_text in data['instructions']:
    extracted_methods.append(extract_methods(instruction_text, freq_vb))
data['instructions'] = extracted_methods

# Encode the instructions words
le_methods = preprocessing.LabelEncoder()
method_vocabulary = np.hstack(data['instructions'])
le_methods.fit(method_vocabulary)

data['encoded_instructions'] = [
    le_methods.transform(recipe_methods) for recipe_methods in data['instructions']
]


In [0]:
# Split and encode diets

all_diets = [stem_and_deduplicate(clean_data(x)) for x in data['diets']]

le_diets = preprocessing.LabelEncoder()
le_diets.fit(np.hstack(all_diets))

data['encoded_diets'] = [le_diets.transform(x) for x in all_diets]

diet_names = le_diets.classes_.tolist()

# Dict for search: diet name -> integer code. Built with enumerate so every
# class is included (a fixed range(10) used to drop the last of the classes).
code2diets = {name: code for code, name in enumerate(diet_names)}

# Indicator matrix: one row per recipe, one column per diet class; a cell is 1
# when the recipe carries that diet. Sized from the fitted classes rather than
# a hard-coded column count, and filled in a single pass over the recipes.
diet_info = np.zeros((len(data), len(diet_names)))
for (num, recipe_info) in enumerate(data['encoded_diets']):
    for diet in recipe_info:
        diet_info[int(num), int(diet)] = 1

# clean_data turns an empty diet list "[]" into '', which sorts first among
# the classes; give that column a readable name. Guarded so a real diet is
# never renamed if no recipe has an empty list.
if diet_names and diet_names[0] == '':
    diet_names[0] = 'no diet'

diet_info = pd.DataFrame(diet_info, columns=diet_names)
data = data.join(diet_info)


Unnamed: 0,no diet,dairy fre,fodmap friendli,gluten fre,ketogen,lacto ovo vegetarian,paleolith,pescatarian,primal,vegan,whole 30
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
3585,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3586,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3587,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3588,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
# Separate nutrition feature into three columns

# Removes quote characters (with the spaces directly in front of them), curly
# braces, ASCII letters, '|' and ':' so that only the comma-separated numbers
# remain. Raw string: the earlier non-raw literal used invalid escape
# sequences; this pattern matches the same text.
non_numeric = re.compile(r" *['\"]|[{}a-z|A-Z:]")

tem = []
for n in data['nutritions']:
    numberonly = non_numeric.sub('', n)
    tem.append(numberonly.split(','))

# Values are taken by position: assumes every entry lists protein, fat and
# carbs in that order — TODO confirm against the raw data.
protein_values = [float(x[0]) for x in tem]
fat_values = [float(x[1]) for x in tem]
carb_values = [float(x[2]) for x in tem]

data['protein'], data['fat'], data['carbs'] = protein_values, fat_values, carb_values


In [0]:
# Save the preprocessed data

# Written with to_csv's default settings, so the row index is stored as the
# first (unnamed) column of the file.
preprocessed_path = './data/nlp_finalproj_data_preprocessed.csv'
data.to_csv(preprocessed_path)