In [0]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

from sklearn import svm
import time

from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.datasets import make_classification

In [0]:
with open('train.json', encoding='utf-8') as f:
    d = json.load(f)
    f.close()

data_all = pd.DataFrame(d)

def num_ingre_each_recipe(list_ingrs_each_recipe):
    '''
    This method is to count the number of ingredients of each recipe
    '''
    return len(list_ingrs_each_recipe)


data_all['num_ingre_contained'] = data_all['ingredients'].apply(num_ingre_each_recipe)

# Only choose the recipes containing more than 3 ingres.
data_all = data_all[data_all.num_ingre_contained >= 3]

print(data_all.shape)  # the total amount of data is 39774

def datasets_cleaning(list_input):
    seven_up = re.compile(r"^7\sUp")  # the Regular Expression of '7 Up'
    hype = re.compile(r"-")

    deleted_str = []
    deleted_str.append(re.compile(r"\(.*\)"))
    deleted_str.append(re.compile(r"%"))
    deleted_str.append(re.compile(r"/"))
    deleted_str.append(re.compile(r"!"))
    deleted_str.append(re.compile(r"’"))
    deleted_str.append(re.compile(r"\."))
    deleted_str.append(re.compile(r"\d+\s"))
    deleted_str.append(re.compile(r"\b.*®"))
    deleted_str.append(re.compile(r","))
    deleted_str.append(re.compile(r"&"))
    deleted_str.append(re.compile(r"\b.*™"))
    deleted_str.append(re.compile(r"'"))

    useless_words = ["fat", "free", "ounc", "oz", "fine", "finely", "superfine", "crushed", "crush", "cut", 
                     "up", "age", "fashioned", "press", "refined", "squeeze", "refrigerated", "diced", 
                     "processed","nonfat", "packed", "firmly", "loosely", "gluten", "low", "high", "less", 
                     "sodium","reduced","organic", "store bought", "of", "the", "semi", "whole", "reduced",
                     "light", "softened","ground", "fresh", "natural", "flavored", "plain", "unsweetened",
                     "vegan","drained","bags", "squirt", "originals", "flavoured", "cook"]

    Brand_names = ["Bertolli", "Crocker", "Conimex", "Colman", "Crystal Farms", "DeLallo", "Domino",
                   "Doritos", "Earth Balance", "Elmlea", "Estancia", "Fisher", "Flora", "Foster Farms",
                   "Gourmet Garden", "Goya", "Green Giant", "Heinz", "Hellmann", "Hidden Valley",
                   "Honeysuckle White", "Imperial", "JOHNSONVILLE", "Jack Daniels", "Johnsonville",
                   "Jimmy Dean", "KRAFT", "Knorr", "Lipton", "Manischewitz", "McCormick", "Mazola",
                   "Old El Paso", "Pillsbury", "Progresso", "Pure Wesson", "Ragu", "San Marzano",
                   "Sargento", "Soy Vay", "Spice Islands", "Taco BELL", "Truvía", "Uncle Ben",
                   "Uncle Bens", "Velveeta", "Wish Bone", "Yoplait", "Zatarain", "Best Food", "Breyers",
                   "Campbell", "Hidden Valley", "Knorr", "McCormick", "Mizkan", "Progresso",
                   "Frank", "Red Gold"]

    useless_words = [r"%s\b" % useless_words[j]
                     for j in range(len(useless_words))]
    # print(useless_words)
    Brand_names = [r"%s\b" % Brand_names[i].lower()
                   for i in range(len(Brand_names))]

    deleted_str = deleted_str + useless_words + Brand_names

    for string in range(len(list_input)):
        list_input[string] = re.sub(seven_up, "7up", list_input[string])
        list_input[string] = re.sub(hype, " ", list_input[string])
        list_input[string] = re.sub(r"_", " ", list_input[string])
        list_input[string] = list_input[string].lower()
        for del_str in deleted_str:
            list_input[string] = re.sub(del_str, " ", list_input[string])
        list_input[string] = re.sub(r"^\s+", "", list_input[string])
        list_input[string] = re.sub(r"\s+$", "", list_input[string])
        list_input[string] = re.sub(r"\s+", "_", list_input[string])

    return list_input


data_all['ingredients'] = data_all['ingredients'].apply(datasets_cleaning)


# now we seperate the dataset into train, valid, test
y_all = data_all['cuisine'].tolist()
X_all = data_all['ingredients'].tolist()

Xtrain, Xtestval, ytrain, ytestval = train_test_split(X_all,y_all, test_size = 0.2, random_state = 42)
Xtest, Xval, ytest, yval = train_test_split(Xtestval, ytestval, test_size = 0.5, random_state = 42)

data_train = pd.DataFrame(columns=['cuisine','ingredients'])
data_train['cuisine'] = ytrain
data_train['ingredients'] = Xtrain # Creat a DataFrame based on train data(size:31647 *2)

data_train['num_ingre_contained'] = data_train['ingredients'].apply(num_ingre_each_recipe)
data_train['ingre_string'] = data_train['ingredients'].str.join(' ')

list_corpus = data_train['ingre_string'].tolist()
list_corpus[0:3]

vectorizer = CountVectorizer()
vectorizer.fit(list_corpus)
Xtrain = vectorizer.transform(data_train['ingre_string']).toarray()
feature_names = np.array(vectorizer.get_feature_names())
print(feature_names)
print(len(feature_names)) 


print(Xtrain)
print(np.shape(Xtrain))
print(np.array(ytrain))
print(np.shape(ytrain))


# we use this method to update our splited words and then combine them into a sentence once again.
data_val = pd.DataFrame(columns=['ingredients'])
data_val['ingredients'] = Xval 
data_val['num_ingre_contained'] = data_val['ingredients'].apply(num_ingre_each_recipe)
data_val['ingre_string'] = data_val['ingredients'].str.join(' ')
Xval = vectorizer.transform(data_val['ingre_string']).toarray()


data_test = pd.DataFrame(columns=['ingredients'])
data_test['ingredients'] = Xtest 
data_val['num_ingre_contained'] = data_val['ingredients'].apply(num_ingre_each_recipe)
data_test['ingre_string'] = data_test['ingredients'].str.join(' ')
Xtest = vectorizer.transform(data_test['ingre_string']).toarray()

print(Xval)
print(np.shape(Xval))
print(np.array(yval))
print(np.shape(yval))

import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_components=200, random_state = 42)

Xtrain_svd = tsvd.fit(Xtrain).transform(Xtrain)
Xtest_svd = tsvd.transform(Xtest)
Xval_svd = tsvd.transform(Xval)

print(Xtrain_svd)
print(np.shape(Xtrain_svd))




(39559, 4)
['a_taste_thai_rice_noodles' 'abalone' 'abbamele' ...
 'ziti_pasta_and_drain' 'zucchini' 'zucchini_blossoms']
5736
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(31647, 5736)
['southern_us' 'mexican' 'british' ... 'italian' 'japanese' 'southern_us']
(31647,)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(3956, 5736)
['chinese' 'italian' 'russian' ... 'italian' 'italian' 'french']
(3956,)
[[ 0.53527582 -0.44411132 -0.21906304 ... -0.04335137  0.33862042
   0.12597848]
 [ 1.60547664 -0.35081945 -0.01419527 ... -0.08386646 -0.26094328
   0.29388734]
 [ 1.76422587 -0.34706205 -0.19966902 ... -0.00374093 -0.0494686
  -0.05759293]
 ...
 [ 0.00870534 -0.00372915  0.00192617 ... -0.03469206 -0.00734346
  -0.0079372 ]
 [ 0.677149   -0.42375212  0.90784308 ...  0.00789553  0.0081455
  -0.05333466]
 [ 0.22518311  0.20697599  0.14519699 ... 

In [0]:
start = time.clock()
clf = svm.SVC( gamma=0.2,C=0.7, decision_function_shape='ovo')
clf.fit(Xtrain_svd, ytrain)
elapsed = (time.clock() - start)
print("Time used:",elapsed)

Time used: 390.762283


In [0]:
start = time.clock()
clf = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial',max_iter=2000).fit(Xtrain_svd,ytrain)
elapsed = (time.clock() - start)
print("Time used:",elapsed)

Time used: 44.28662600000007


In [0]:
start = time.clock()
clf = PassiveAggressiveClassifier(C=0.1,max_iter=2000, tol=1e-3,shuffle=False,)
# clf.fit(Xtrain_svd, ytrain)
print(Xtrain.shape)
clf.fit(Xtrain, ytrain)

elapsed = (time.clock() - start)
print("Time used:",elapsed)

(31647, 5736)
Time used: 197.3306990000001


In [0]:
# X_val_svd = svd.fit(Xtrain).transform(Xval)
p_result=clf.predict(Xtest_svd)


print(np.sum(ytest==p_result)/len(p_result))
# print(np.bincount(yval))
print(clf.score(Xtest_svd,ytest))

0.6870576339737108
0.6870576339737108
