In [542]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [543]:
with open('train.json', encoding='utf-8') as f:
    d = json.load(f)
    f.close()

data_all = pd.DataFrame(d)
# From the file Quentin offered, we can see that if we reorder the ingredient names by its first charactor, it is 
# noted that there are some special numbers(like '1','10','15') or unit names('oz') which are meaningless.
# Hence, the first thing is to remove these from the datasets.

numbers = re.compile(r"\d.*\d\s?")   # the Regular Expression of numbers(like '1','10','15')
num_percent = re.compile(r"\d.*%\s") # the Regular Expression of percentage numbers like 1%
oz = re.compile(r"\soz\.")           # the Regular Expression of 'oz.'
seven_up = re.compile(r"^7\sUp")     # the Regular Expression of '7 Up'

def datasets_cleaning(list_input):
    for ingre_no in range(len(list_input)):
        list_input[ingre_no] = re.sub(numbers, "",list_input[ingre_no])
        list_input[ingre_no] = re.sub(num_percent,"",list_input[ingre_no])
        list_input[ingre_no] = re.sub(oz,"",list_input[ingre_no])
        list_input[ingre_no] = re.sub(seven_up,"7up",list_input[ingre_no])
    return list_input

data_all['ingredients'] = data_all['ingredients'].apply(datasets_cleaning)

In [544]:
# now we seperate the dataset into train, valid, test
y_all = data_all['cuisine'].tolist()
X_all = data_all['ingredients'].tolist()

Xtrain, Xtestval, ytrain, ytestval = train_test_split(X_all,y_all, test_size = 0.2, random_state = 42)
Xtest, Xval, ytest, yval = train_test_split(Xtestval, ytestval, test_size = 0.5, random_state = 42)

data_train = pd.DataFrame(columns=['cuisine','ingredients'])
data_train['cuisine'] = ytrain
data_train['ingredients'] = Xtrain # Creat a DataFrame based on train data(size: 31819*2)

In [545]:
data_train
dict1 = {}
for key in ytrain:
    dict1[key] = dict1.get(key, 0) + 1
print(dict1)

{'mexican': 5102, 'indian': 2401, 'filipino': 619, 'moroccan': 655, 'korean': 664, 'irish': 516, 'southern_us': 3472, 'french': 2096, 'vietnamese': 681, 'japanese': 1139, 'italian': 6271, 'thai': 1224, 'spanish': 807, 'chinese': 2163, 'british': 647, 'brazilian': 383, 'greek': 926, 'russian': 400, 'cajun_creole': 1218, 'jamaican': 435}


In [546]:
# then we want to split the name of each ingredients into several single words.
def tokenize_list(list_input):
    new_list =[val.split(" ") for val in list_input]
    list_get = [item for sublist in new_list for item in sublist]
    return list_get
# example_origin = data_train['ingredients'][0]
# example_new = tokenize_list(example_origin)
# print(example_origin)
# print(example_new)
data_train['ingre_splited'] = data_train['ingredients'].apply(tokenize_list)

In [547]:
data_train.head(5)

Unnamed: 0,cuisine,ingredients,ingre_splited
0,mexican,"[shredded cheddar cheese, chicken meat, choppe...","[shredded, cheddar, cheese, chicken, meat, cho..."
1,indian,"[fresh cilantro, purple onion, ground coriande...","[fresh, cilantro, purple, onion, ground, coria..."
2,filipino,"[sugar, garlic, onions, vinegar, green chilies...","[sugar, garlic, onions, vinegar, green, chilie..."
3,moroccan,"[raw pistachios, purple onion, couscous, dried...","[raw, pistachios, purple, onion, couscous, dri..."
4,mexican,"[tomatoes, pepper, salsa, sliced green onions,...","[tomatoes, pepper, salsa, sliced, green, onion..."


In [548]:
data_train['ingre_string'] = data_train['ingre_splited'].str.join(' ') # combines the splited words into a sentence

In [549]:
data_train.head(5)

Unnamed: 0,cuisine,ingredients,ingre_splited,ingre_string
0,mexican,"[shredded cheddar cheese, chicken meat, choppe...","[shredded, cheddar, cheese, chicken, meat, cho...",shredded cheddar cheese chicken meat chopped o...
1,indian,"[fresh cilantro, purple onion, ground coriande...","[fresh, cilantro, purple, onion, ground, coria...",fresh cilantro purple onion ground coriander g...
2,filipino,"[sugar, garlic, onions, vinegar, green chilies...","[sugar, garlic, onions, vinegar, green, chilie...",sugar garlic onions vinegar green chilies grou...
3,moroccan,"[raw pistachios, purple onion, couscous, dried...","[raw, pistachios, purple, onion, couscous, dri...",raw pistachios purple onion couscous dried apr...
4,mexican,"[tomatoes, pepper, salsa, sliced green onions,...","[tomatoes, pepper, salsa, sliced, green, onion...",tomatoes pepper salsa sliced green onions ched...


In [550]:
list_corpus = data_train['ingre_string'].tolist() # generate the corpus based on the train data.

In [551]:
list_corpus[0:5]

['shredded cheddar cheese chicken meat chopped onion tomatoes green onions black olives dried parsley flour tortillas diced tomatoes sour cream tomato sauce chili powder salsa dried oregano',
 'fresh cilantro purple onion ground coriander ground turmeric ground ginger vegetable oil brown rice flour mustard seeds whole wheat flour rice vinegar cumin seed ground cumin water garlic cayenne pepper white sugar',
 'sugar garlic onions vinegar green chilies ground black pepper coconut cream chicken fish sauce cooking oil coconut milk',
 'raw pistachios purple onion couscous dried apricot lemon juice olive oil salt harissa chopped parsley',
 'tomatoes pepper salsa sliced green onions cheddar cheese garlic powder baked tortilla chips taco sauce non-fat sour cream iceberg lettuce ground round kidney beans whole kernel corn, drain']

In [552]:
vectorizer = TfidfVectorizer() # it comes from the built-in method in skilearn. TD-IDF
vector = vectorizer.fit_transform(list_corpus)
names = vectorizer.get_feature_names()

In [553]:
print(len(names)) # count how many feature we constructed by the bag of words model(TD-IDF).

2867


In [554]:
# Some words are meaningless like 'a' 'is' 'or' 'not', we can then delete them from features.
custom_stop_words = []
for word in ENGLISH_STOP_WORDS:
    custom_stop_words.append(word)

In [555]:
# we can generate features once again, but no stop words('is','or','and'...) in the features.
vectorizer_use_sw = TfidfVectorizer(stop_words=custom_stop_words)
vector_use_sw = vectorizer_use_sw.fit_transform(list_corpus)
names_use_sw = vectorizer_use_sw.get_feature_names()

In [556]:
print(len(names_use_sw)) # count how many feature we constructed if we delete the stop words.

2831


In [557]:
# Then some words are same like 'apple' and 'apples', we want to only keep 'apple'
import nltk # we use the nltk package

nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('dogs')) # it is a toy example.


dog


[nltk_data] Downloading package wordnet to /Users/ymyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ymyang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [558]:
def token_lemmatize(list_input):
    list_get =[lemmatizer.lemmatize(word) for word in list_input]
    return list_get

# we use this method to update our splited words and then combine them into a sentence once again.
data_train['ingre_splited'] = data_train['ingre_splited'].apply(token_lemmatize)
data_train['ingre_string'] = data_train['ingre_splited'].str.join(' ')

In [559]:
data_train.head(5)

Unnamed: 0,cuisine,ingredients,ingre_splited,ingre_string
0,mexican,"[shredded cheddar cheese, chicken meat, choppe...","[shredded, cheddar, cheese, chicken, meat, cho...",shredded cheddar cheese chicken meat chopped o...
1,indian,"[fresh cilantro, purple onion, ground coriande...","[fresh, cilantro, purple, onion, ground, coria...",fresh cilantro purple onion ground coriander g...
2,filipino,"[sugar, garlic, onions, vinegar, green chilies...","[sugar, garlic, onion, vinegar, green, chilies...",sugar garlic onion vinegar green chilies groun...
3,moroccan,"[raw pistachios, purple onion, couscous, dried...","[raw, pistachio, purple, onion, couscous, drie...",raw pistachio purple onion couscous dried apri...
4,mexican,"[tomatoes, pepper, salsa, sliced green onions,...","[tomato, pepper, salsa, sliced, green, onion, ...",tomato pepper salsa sliced green onion cheddar...


In [560]:
# we update the corpus once again.
list_corpus = data_train['ingre_string'].tolist()

In [561]:
list_corpus[0:5]

['shredded cheddar cheese chicken meat chopped onion tomato green onion black olive dried parsley flour tortilla diced tomato sour cream tomato sauce chili powder salsa dried oregano',
 'fresh cilantro purple onion ground coriander ground turmeric ground ginger vegetable oil brown rice flour mustard seed whole wheat flour rice vinegar cumin seed ground cumin water garlic cayenne pepper white sugar',
 'sugar garlic onion vinegar green chilies ground black pepper coconut cream chicken fish sauce cooking oil coconut milk',
 'raw pistachio purple onion couscous dried apricot lemon juice olive oil salt harissa chopped parsley',
 'tomato pepper salsa sliced green onion cheddar cheese garlic powder baked tortilla chip taco sauce non-fat sour cream iceberg lettuce ground round kidney bean whole kernel corn, drain']

In [562]:
# again, we generate the features using TD-IDF(a kind of the bag of words model)
vectorizer_use_sw_lem = TfidfVectorizer(stop_words = custom_stop_words)
vector_use_sw_lem = vectorizer_use_sw_lem.fit_transform(list_corpus)
names_use_sw_lem = vectorizer_use_sw_lem.get_feature_names()

In [563]:
print(len(names_use_sw_lem)) # we count how many features we still have.

2662


In [564]:
feature_names = np.array(names_use_sw_lem) 
print(feature_names)
print(vector_use_sw_lem.toarray()) 
# the size of array is 31819*2662 ,31819 is the size of train data, 2662 is the number of features.

['7up' 'abalone' 'abbamele' ... 'ziti' 'zucchini' 'épices']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [565]:
sorted_by_idf = np.argsort(vectorizer_use_sw_lem.idf_) 
# get the index of features which are ordered by their idf values

print(feature_names[sorted_by_idf[:30]]) 
# idf values means how frequent it exists in recipes. now we present the most frequent 30 features.

# As these features are so common that it may make a little contribution to the prediction.
# We consider to delete them by adding these features into our customized stop words.

for word in feature_names[sorted_by_idf[:30]]:
    custom_stop_words.append(word)
    
# again, we generate the features using TD-IDF(a kind of the bag of words model)
vectorizer_use_sw_lem = TfidfVectorizer(stop_words = custom_stop_words)
vector_use_sw_lem = vectorizer_use_sw_lem.fit_transform(list_corpus)
names_use_sw_lem = vectorizer_use_sw_lem.get_feature_names()

['salt' 'oil' 'pepper' 'garlic' 'onion' 'fresh' 'ground' 'olive' 'sugar'
 'sauce' 'black' 'tomato' 'water' 'chicken' 'cheese' 'butter' 'flour'
 'egg' 'green' 'red' 'clove' 'white' 'powder' 'juice' 'chopped' 'leaf'
 'vegetable' 'cilantro' 'milk' 'rice']


In [566]:
feature_names = np.array(names_use_sw_lem) 
print(feature_names)
print(vector_use_sw_lem.toarray())
print(np.shape(vector_use_sw_lem.toarray()))
# the size of array is 31819*2632 ,31819 is the size of train data, 2632 is the number of features.

['7up' 'abalone' 'abbamele' ... 'ziti' 'zucchini' 'épices']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(31819, 2632)


In [567]:
# Then, I believe it would be much better if we continue to using SVD/PCA method to reduce the dimension of feature