In [1]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
with open('train.json', encoding='utf-8') as f:
    d = json.load(f)
    f.close()

data_all = pd.DataFrame(d)

In [3]:
def num_ingre_each_recipe(list_ingrs_each_recipe):
    '''
    This method is to count the number of ingredients of each recipe
    '''
    return len(list_ingrs_each_recipe)

data_all['num_ingre_contained'] = data_all['ingredients'].apply(num_ingre_each_recipe)

# Only choose the recipes containing more than 3 ingres.
data_all = data_all[data_all.num_ingre_contained >= 3]

print(data_all.shape)  # the total amount of data is 39774


(39559, 4)


In [4]:
def datasets_cleaning(list_input):
    seven_up = re.compile(r"^7\sUp")  # the Regular Expression of '7 Up'
    hype = re.compile(r"-")

    deleted_str = []
    deleted_str.append(re.compile(r"\(.*\)"))
    deleted_str.append(re.compile(r"%"))
    deleted_str.append(re.compile(r"/"))
    deleted_str.append(re.compile(r"!"))
    deleted_str.append(re.compile(r"’"))
    deleted_str.append(re.compile(r"\."))
    deleted_str.append(re.compile(r"\d+\s"))
    deleted_str.append(re.compile(r"\b.*®"))
    deleted_str.append(re.compile(r","))
    deleted_str.append(re.compile(r"&"))
    deleted_str.append(re.compile(r"\b.*™"))
    deleted_str.append(re.compile(r"'"))

    useless_words = ["fat", "free", "ounc", "oz", "fine", "finely", "superfine", "crushed", "crush", "cut", 
                     "up", "age", "fashioned", "press", "refined", "squeeze", "refrigerated", "diced", 
                     "processed","nonfat", "packed", "firmly", "loosely", "gluten", "low", "high", "less", 
                     "sodium","reduced","organic", "store bought", "of", "the", "semi", "whole", "reduced",
                     "light", "softened","ground", "fresh", "natural", "flavored", "plain", "unsweetened",
                     "vegan","drained","bags", "squirt", "originals", "flavoured", "cook"]

    Brand_names = ["Bertolli", "Crocker", "Conimex", "Colman", "Crystal Farms", "DeLallo", "Domino",
                   "Doritos", "Earth Balance", "Elmlea", "Estancia", "Fisher", "Flora", "Foster Farms",
                   "Gourmet Garden", "Goya", "Green Giant", "Heinz", "Hellmann", "Hidden Valley",
                   "Honeysuckle White", "Imperial", "JOHNSONVILLE", "Jack Daniels", "Johnsonville",
                   "Jimmy Dean", "KRAFT", "Knorr", "Lipton", "Manischewitz", "McCormick", "Mazola",
                   "Old El Paso", "Pillsbury", "Progresso", "Pure Wesson", "Ragu", "San Marzano",
                   "Sargento", "Soy Vay", "Spice Islands", "Taco BELL", "Truvía", "Uncle Ben",
                   "Uncle Bens", "Velveeta", "Wish Bone", "Yoplait", "Zatarain", "Best Food", "Breyers",
                   "Campbell", "Hidden Valley", "Knorr", "McCormick", "Mizkan", "Progresso",
                   "Frank", "Red Gold"]

    useless_words = [r"%s\b" % useless_words[j]
                     for j in range(len(useless_words))]
    # print(useless_words)
    Brand_names = [r"%s\b" % Brand_names[i].lower()
                   for i in range(len(Brand_names))]

    deleted_str = deleted_str + useless_words + Brand_names

    for string in range(len(list_input)):
        list_input[string] = re.sub(seven_up, "7up", list_input[string])
        list_input[string] = re.sub(hype, " ", list_input[string])
        list_input[string] = re.sub(r"_", " ", list_input[string])
        list_input[string] = list_input[string].lower()
        for del_str in deleted_str:
            list_input[string] = re.sub(del_str, " ", list_input[string])
        list_input[string] = re.sub(r"^\s+", "", list_input[string])
        list_input[string] = re.sub(r"\s+$", "", list_input[string])
        list_input[string] = re.sub(r"\s+", "_", list_input[string])

    return list_input

data_all['ingredients'] = data_all['ingredients'].apply(datasets_cleaning)

In [5]:
# now we seperate the dataset into train, valid, test
y_all = data_all['cuisine'].tolist()
X_all = data_all['ingredients'].tolist()

Xtrain, Xtestval, ytrain, ytestval = train_test_split(X_all,y_all, test_size = 0.2, random_state = 42)
Xtest, Xval, ytest, yval = train_test_split(Xtestval, ytestval, test_size = 0.5, random_state = 42)

data_train = pd.DataFrame(columns=['cuisine','ingredients'])
data_train['cuisine'] = ytrain
data_train['ingredients'] = Xtrain # Creat a DataFrame based on train data(size:31647 *2)

In [6]:
data_train['num_ingre_contained'] = data_train['ingredients'].apply(num_ingre_each_recipe)
data_train['ingre_string'] = data_train['ingredients'].str.join(' ')

In [7]:
data_train.head(5)

Unnamed: 0,cuisine,ingredients,num_ingre_contained,ingre_string
0,southern_us,"[tomatoes, cider_vinegar, pimentos, sharp_ched...",12,tomatoes cider_vinegar pimentos sharp_cheddar_...
1,mexican,"[guajillo_chiles, spanish_onion, tomatillos, o...",24,guajillo_chiles spanish_onion tomatillos orang...
2,british,"[water, vegetable_oil, cinnamon_sticks, demera...",14,water vegetable_oil cinnamon_sticks demerara_s...
3,british,"[beef_drippings, all_purpose_flour, milk, eggs...",6,beef_drippings all_purpose_flour milk eggs sal...
4,mexican,"[grated_parmesan_cheese, corn, monterey_jack, ...",6,grated_parmesan_cheese corn monterey_jack mayo...


In [8]:
list_corpus = data_train['ingre_string'].tolist()
vectorizer = CountVectorizer()
vectorizer.fit(list_corpus)
Xtrain = vectorizer.transform(data_train['ingre_string']).toarray()
feature_names = np.array(vectorizer.get_feature_names())
print(feature_names)
print(len(feature_names)) 

['a_taste_thai_rice_noodles' 'abalone' 'abbamele' ...
 'ziti_pasta_and_drain' 'zucchini' 'zucchini_blossoms']
5736


In [9]:
#print(list(feature_names))

In [10]:
print(Xtrain)
print(np.shape(Xtrain))
print(np.array(ytrain))
print(np.shape(ytrain))
# the size of array is 31647*2387 ,31647 is the size of train data, 2385 is the number of features.

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(31647, 5736)
['southern_us' 'mexican' 'british' ... 'italian' 'japanese' 'southern_us']
(31647,)


In [11]:
# we use this method to update our splited words and then combine them into a sentence once again.
data_val = pd.DataFrame(columns=['ingredients'])
data_val['ingredients'] = Xval 
data_val['num_ingre_contained'] = data_val['ingredients'].apply(num_ingre_each_recipe)
data_val['ingre_string'] = data_val['ingredients'].str.join(' ')
Xval = vectorizer.transform(data_val['ingre_string']).toarray()

data_test = pd.DataFrame(columns=['ingredients'])
data_test['ingredients'] = Xtest 
data_test['num_ingre_contained'] = data_test['ingredients'].apply(num_ingre_each_recipe)
data_test['ingre_string'] = data_test['ingredients'].str.join(' ')
Xtest = vectorizer.transform(data_test['ingre_string']).toarray()

In [12]:
print(Xval)
print(np.shape(Xval))
print(np.array(yval))
print(np.shape(yval))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(3956, 5736)
['chinese' 'italian' 'russian' ... 'italian' 'italian' 'french']
(3956,)


In [13]:
print(Xtest)
print(np.shape(Xtest))
print(np.array(ytest))
print(np.shape(ytest))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(3956, 5736)
['chinese' 'indian' 'italian' ... 'mexican' 'british' 'thai']
(3956,)


In [14]:
'''svd = TruncatedSVD(n_components=500)
Xtrain_t = np.transpose(Xtrain)
X_train_svd = svd.fit(Xtrain_t).transform(Xtrain_t)'''

'svd = TruncatedSVD(n_components=500)\nXtrain_t = np.transpose(Xtrain)\nX_train_svd = svd.fit(Xtrain_t).transform(Xtrain_t)'

In [15]:
recipe_ingredient = Xtrain.copy()
all_cuisine= sorted(list(set(ytrain.copy())))
all_ingredient = list(feature_names)
all_ingredient_set = set(all_ingredient)
n_recipes = recipe_ingredient.shape[0]
n_ingredient = recipe_ingredient.shape[1]

def sim(s1, s2):
    '''
    compute the similarity of two ingredients
    '''
    return len(s1.intersection(s2)) / np.sqrt((len(s1)*len(s2)))

def sim_ingredient_matrix():
    sim_ingredient = np.zeros((n_ingredient,n_ingredient))
    set_recipe_for_ingre = {} #get the set of recipes for one ingredient
    for i in range(n_ingredient):
        set_recipe_for_ingre[i] = set()
        for r in range(n_recipes):
            if recipe_ingredient[r,i]!=0:
                set_recipe_for_ingre[i].add(r)
                
    for i in range(n_ingredient):
        for ii in range(i+1,n_ingredient):
            inter_have_or_not = list(set_recipe_for_ingre[i] & set_recipe_for_ingre[ii])
            if inter_have_or_not:
                sim_ingredient[i,ii] = sim(set_recipe_for_ingre[i],set_recipe_for_ingre[ii])
                sim_ingredient[ii,i] = sim_ingredient[i,ii]
            
    return(sim_ingredient)

sim_ingredient = sim_ingredient_matrix()

In [16]:
n_ingredient_top_similar = {}
for i in range(n_ingredient):
    n_ingredient_top_similar[i] = np.argsort(-sim_ingredient[i,:])

In [17]:
def compute_value_r_i(partial_recipe,feature_value_part):
    #print('\nwe recommend these ingredients based on %s:' % partial_recipe)
    #rank =[]
    feature_value_part =  feature_value_part
    feature_value  = np.zeros(n_ingredient)
    partial_recipe_set = set(partial_recipe)
    num_j= {}
    value_j = {}

    for n in partial_recipe_set:
        if n in all_ingredient_set:
            index_n = all_ingredient.index(n)
            
            for j in n_ingredient_top_similar[index_n]:
                if sim_ingredient[j,index_n] !=0 :
                    num_j[j]= num_j.get(j,0) + 1
                    value_j[j] = value_j.get(j,0) +sim_ingredient[j,index_n]*feature_value_part[index_n]
                else:
                    break
    for key in num_j.keys():
        feature_value[key] = value_j[key]/num_j[key]

    return feature_value

In [23]:
np.random.seed(seed=42)
data_val_2=  data_val.copy()
n_partial =  6 # number of ingredients in partial recipe
data_val_2 = data_val_2[data_val_2.num_ingre_contained >= n_partial]


In [24]:
def choose_random(ingredients):
    list_got = np.random.permutation(ingredients)[0:n_partial]
    return list_got

data_val_2['ingredients'] = data_val_2['ingredients'].apply(choose_random)
data_val_2['ingre_string'] = data_val_2['ingredients'].str.join(' ')
Xval_2 = vectorizer.transform(data_val_2['ingre_string']).toarray()
data_val_2['Tvalue'] = list(Xval_2)

In [25]:
data_val_2

Unnamed: 0,ingredients,num_ingre_contained,ingre_string,Tvalue
0,"[sugar, black_fungus, kecap_manis, oyster_sauc...",11,sugar black_fungus kecap_manis oyster_sauce be...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[parmesan_cheese, unsalted_butter, extra_virgi...",10,parmesan_cheese unsalted_butter extra_virgin_o...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[active_dry_yeast, sour_cream, salt, all_purpo...",9,active_dry_yeast sour_cream salt all_purpose_f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,"[white_onion, coarse_salt, lime_juice, serrano...",7,white_onion coarse_salt lime_juice serrano_chi...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[crumbled_gorgonzola, olive_oil, active_dry_ye...",17,crumbled_gorgonzola olive_oil active_dry_yeast...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
5,"[tomatoes, ginger, salt, cumin_seed, oil, red_...",14,tomatoes ginger salt cumin_seed oil red_chili_...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,"[chopped_pecans, butter, sugar, salt, peaches,...",15,chopped_pecans butter sugar salt peaches bakin...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,"[minced_garlic, unsalted_butter, lemon_juice, ...",9,minced_garlic unsalted_butter lemon_juice jumb...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10,"[sugar, milk, large_eggs, cinnamon, sweetened_...",8,sugar milk large_eggs cinnamon sweetened_conde...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11,"[ginger, amchur, atta, potatoes, salt, butter]",11,ginger amchur atta potatoes salt butter,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [26]:
def preject_value(data_val_2_ingre, value):
    return(compute_value_r_i(data_val_2_ingre, value))

data_val_2['pre_value'] = data_val_2.apply(lambda pre: preject_value
                                                     (pre['ingredients'], pre['Tvalue']), axis=1)


In [27]:
from sklearn.metrics import mean_absolute_error
mae  = mean_absolute_error(list(data_val_2['Tvalue']),list(data_val_2['pre_value']))
print(mae)

0.01132599103649055
