In [0]:
import json
import pandas as pd
import numpy as np
import re
import nltk
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

from sklearn import svm
import time

from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.datasets import make_classification

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:

with open('train.json', encoding='utf-8') as f:
    d = json.load(f)
    f.close()

data_all = pd.DataFrame(d)


def num_ingre_each_recipe(list_ingrs_each_recipe):
    '''
    This method is to count the number of ingredients of each recipe
    '''
    return len(list_ingrs_each_recipe)


data_all['num_ingredients_contained'] = data_all['ingredients'].apply(
    num_ingre_each_recipe)
print(data_all.shape)  # the total amount of data is 39774

# Only choose the recipes containing more than 3 ingres.
data_all = data_all[data_all.num_ingredients_contained >= 3]

# there are some special words like numbers are meaningless.
# Hence, the first thing is to remove these from the datasets.
# other consideration can all be solved by sklearn package
# (Make sure the word is in lower case; Delete symbols; Delete letters standing alone; Delete double spaces)

numbers = re.compile(r"\d")   # the Regular Expression of numbers
seven_up = re.compile(r"^7\sUp")  # the Regular Expression of '7 Up'


def datasets_cleaning(list_input):
    for string in range(len(list_input)):
        list_input[string] = re.sub(numbers, "", list_input[string])
        list_input[string] = re.sub(seven_up, "7up", list_input[string])
    return list_input


#data_all['ingredients'] = data_all['ingredients'].apply(datasets_cleaning)
    
# now we seperate the dataset into train, valid, test
y_all = data_all['cuisine'].tolist()
X_all = data_all['ingredients'].tolist()

Xtrain, Xtestval, ytrain, ytestval = train_test_split(X_all,y_all, test_size = 0.2, random_state = 42)
Xtest, Xval, ytest, yval = train_test_split(Xtestval, ytestval, test_size = 0.5, random_state = 42)

data_train = pd.DataFrame(columns=['cuisine','ingredients'])
data_train['cuisine'] = ytrain
data_train['ingredients'] = Xtrain # Creat a DataFrame based on train data(size:31647 *2)

dict_cuisine = {}
for key in ytrain:
    dict_cuisine[key] = dict_cuisine.get(key, 0) + 1
print(dict_cuisine)  # the distribution of cuisine in ytrain


# then we want to split the name of each ingredients into several single words.
def tokenize_list(list_input):
    new_list = [val.split(" ") for val in list_input]
    list_get = [item for sublist in new_list for item in sublist]
    return list_get


data_train['ingre_splited'] = data_train['ingredients'].apply(tokenize_list)



# there are lot of similar words after spliting the name of ingredients like "apple" and "apples"
# we use stemming algorithm to get the same stem

from nltk.stem import PorterStemmer
porter = PorterStemmer()


def token_stemming(list_input):
    list_get = [porter.stem(word) for word in list_input]
    return list_get


# we use this method to update our splited words and then combine them into a sentence.
data_train['ingre_splited'] = data_train['ingre_splited'].apply(token_stemming)
data_train['ingre_string'] = data_train['ingre_splited'].str.join(' ')


# we generate the corpus based on the train data.
list_corpus = data_train['ingre_string'].tolist()


# Some words are stop words like 'a' 'is' 'or' 'not', we can then delete them from features
# Some words are meaningless adj., we can also delete them
# Some words are the brand names of ingredient, we can delete them as well

# add these meaningless words into the set of stop words, deleting all of them in the following TD-IDF model


def add_deleted_words():
    '''
    Some words are stop words like 'a' 'is' 'or' 'not', we can then delete them from features
    Some words are meaningless adj., we can also delete them
    Some words are the brand names of ingredient, we can delete them as well
    add these meaningless words into the set of stop words, deleting all of them in the following TD-IDF model
    '''

    # Here are all the words I(Quentin) could find to try and simplify the ingredient names
    list_delete_words = ["fat", "free", "oz", "fine", "finely", "superfine", "crushed", "crush", "cut", "up", "age",
                   "fashioned", "press", "refined", "squeeze", "refrigerated", "smoked", "sweet", "diced", "processed",
                   "nonfat", "packed", "firmly", "loosely", "gluten", "low", "high", "less", "sodium", "reduced",
                   "organic", "store bought", "of", "the", "semi", "condensed", "whole", "reduced", "light", "softened"
                   "ground", "fresh", "black", "natural", "flavored", "plain", "unsweetened", "vegan", "nonfat"]
    list_delete_words = tokenize_list(list_delete_words) # Split the Brand names.
    
    # All the brand names I(Quentin) found, we should delete the partial names including the name of ingredient.
    # for example, we only delete "Vay" rather than"Soy Vay",  delete "bell" rather than"Taco bell"
    Brand_names = ["Bertolli", "Crocker", "Conimex", "Colman", "Crystal Farms", "DeLallo", "Domino",
                   "Doritos", "Earth Balance", "Elmlea", "Estancia", "Fisher", "Flora", "Foster Farms",
                   "Gourmet Garden", "Goya", "Green Giant", "Heinz", "Hellmann", "Hidden Valley",
                   "Honeysuckle White", "Imperial", "JOHNSONVILLE", "Jack Daniels", "Johnsonville",
                   "Jimmy Dean", "KRAFT", "Knorr", "Lipton", "Manischewitz", "McCormick", "Mazola",
                   "Old El Paso", "Pillsbury", "Progresso", "Pure Wesson", "Ragu", "San Marzano",
                   "Sargento", "Vay", "Spice Islands", "BELL", "Truvía", "Uncle Ben",
                   "Velveeta", "Wish Bone", "Yoplait", "Zatarain", "Best Food", "Breyers",
                   "Campbell", "Hidden Valley", "Knorr", "McCormick", "Mizkan", "Progresso",
                   "Frank", "Red Gold"]
    
    Brand_names = tokenize_list(Brand_names) # Split the Brand names.
    Brand_names = [Brand_names[i].lower() for i in range(len(Brand_names))]  # lower case brand names
    
    # The following is the original codes, the problem is that the list_of_words[i] is actually a sing word
    #------------------------------------------------------------------------------------------------------
    # which can not match full words of brand, so we need to also split the brand names into single words.
    #def RemoveWords(list_of_words):
        #for i in range(len(list_of_words)):
            #for word_remove in dict_remove:          
                #list_of_words[i]  = re.sub("\\b" + word_remove + "\\b", "", list_of_words[i]) #Only match full words
        #return(list_of_words)
    #unique_words = RemoveWords(unique_words)
    #------------------------------------------------------------------------------------------------------

    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    #  use package including all stop words as part of our words waited for deleting

    list_delete_words = list_delete_words + Brand_names + [string for string in ENGLISH_STOP_WORDS]
    
    list_delete_words_stem = [porter.stem(word) for word in list_delete_words]
    
    list_delete_words = list_delete_words + list_delete_words_stem
    
    return sorted(list(set(list_delete_words)))

delete_words = add_deleted_words()


vectorizer = TfidfVectorizer(stop_words=delete_words) # it comes from the built-in method in skilearn. TD-IDF

vectorizer.fit(list_corpus)
# from the output of this code, we can see the some defult setting
# lowercase==True means it will automatically transform all sing words into lower case
# token_pattern='(?u)\\b\\w\\w+\\b' means it will only contain the words rather than a sing letter or some special symbol.

vector = vectorizer.transform(data_train['ingre_string'])
feature_names = np.array(vectorizer.get_feature_names())
print(feature_names)
print(len(feature_names)) # count how many feature we constructed by the bag of words model(TD-IDF).


sorted_by_idf = np.argsort(vectorizer.idf_) 
# get the index of features which are ordered by their idf values

print(feature_names[sorted_by_idf[:30]]) 
# idf values means how frequent it exists in recipes. now we present the most frequent 30 features.

# As these features are so common that it may make a little contribution to the prediction.
# We consider to delete them by adding these features into our customized stop words.

for word in feature_names[sorted_by_idf[:30]]:
    delete_words.append(word)
    
    
 # again, we generate the features using TD-IDF
vectorizer = TfidfVectorizer(stop_words = delete_words)
vectorizer.fit(list_corpus)

Xtrain = vectorizer.transform(data_train['ingre_string']).toarray()
feature_names = np.array(vectorizer.get_feature_names())
print(feature_names)
print(len(feature_names)) # count how many feature we constructed by the bag of words model(TD-IDF).


print(Xtrain)
print(np.shape(Xtrain))
print(np.array(ytrain))
print(np.shape(ytrain))


# we use this method to update our splited words and then combine them into a sentence once again.
data_val = pd.DataFrame(columns=['ingredients'])
data_val['ingredients'] = Xval 
data_val['ingre_splited'] = data_val['ingredients'].apply(tokenize_list)
data_val['ingre_splited'] = data_val['ingre_splited'].apply(token_stemming)
data_val['ingre_string'] = data_val['ingre_splited'].str.join(' ')
Xval = vectorizer.transform(data_val['ingre_string']).toarray()


data_test = pd.DataFrame(columns=['ingredients'])
data_test['ingredients'] = Xtest 
data_test['ingre_splited'] = data_test['ingredients'].apply(tokenize_list)
data_test['ingre_splited'] = data_test['ingre_splited'].apply(token_stemming)
data_test['ingre_string'] = data_test['ingre_splited'].str.join(' ')
Xtest = vectorizer.transform(data_test['ingre_string']).toarray()   

import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_components=200, random_state = 42)

Xtrain_svd = tsvd.fit(Xtrain).transform(Xtrain)
Xtest_svd = tsvd.transform(Xtest)
Xval_svd = tsvd.transform(Xval)

print(Xtrain_svd)
print(np.shape(Xtrain_svd))









(39774, 4)
{'southern_us': 3394, 'mexican': 5157, 'british': 636, 'filipino': 605, 'italian': 6209, 'thai': 1196, 'jamaican': 418, 'french': 2081, 'greek': 951, 'japanese': 1130, 'brazilian': 379, 'moroccan': 633, 'irish': 540, 'indian': 2417, 'spanish': 795, 'cajun_creole': 1224, 'chinese': 2163, 'vietnamese': 656, 'korean': 677, 'russian': 386}
['10' '100' '14' ... 'ziti' 'zucchini' 'épice']
2426
['salt' 'oil' 'pepper' 'garlic' 'onion' 'ground' 'oliv' 'sugar' 'sauc'
 'tomato' 'chicken' 'water' 'chees' 'butter' 'flour' 'egg' 'clove'
 'powder' 'chop' 'dri' 'juic' 'chili' 'veget' 'cilantro' 'milk' 'rice'
 'cream' 'ginger' 'lemon' 'corn']
['10' '100' '14' ... 'ziti' 'zucchini' 'épice']
2396
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(31647, 2396)
['southern_us' 'mexican' 'british' ... 'italian' 'japanese' 'southern_us']
(31647,)
[[ 0.19607073 -0.00058468 -0.08090434 ... -0.029

In [0]:
start = time.clock()
clf = svm.SVC( gamma=0.2,C=0.7, decision_function_shape='ovo')
clf.fit(Xtrain_svd, ytrain)
elapsed = (time.clock() - start)
print("Time used:",elapsed)

Time used: 285.967134


In [0]:
start = time.clock()
clf = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial',max_iter=1000).fit(Xtrain_svd,ytrain)
elapsed = (time.clock() - start)
print("Time used:",elapsed)

Time used: 34.40326200000004


In [0]:
start = time.clock()
clf = PassiveAggressiveClassifier(C=0.1,max_iter=2000, tol=1e-3,shuffle=False,)
# clf.fit(Xtrain_svd, ytrain)

print(Xtrain.shape)
clf.fit(Xtrain, ytrain)

elapsed = (time.clock() - start)
print("Time used:",elapsed)

(31647, 2396)
Time used: 54.46043099999997


In [0]:
p_result=clf.predict(Xtest)


print(np.sum(ytest==p_result)/len(p_result))
# print(np.bincount(yval))
print(clf.score(Xtest,ytest))

0.7591001011122346
0.7591001011122346


In [0]:
clf = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial').fit(X_train_svd,ytrain)