In [1]:
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfTransformer
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

# Read the whole training set into memory; `data` becomes whatever
# top-level JSON value train.json holds.
with open('train.json') as train_file:
    data = json.load(train_file)

In [2]:
# Number of records loaded from train.json.
len(data)

39774

In [3]:
# Peek at the first record to see which fields a recipe carries.
data[0]

{u'cuisine': u'greek',
 u'id': 10259,
 u'ingredients': [u'romaine lettuce',
  u'black olives',
  u'grape tomatoes',
  u'garlic',
  u'pepper',
  u'purple onion',
  u'seasoning',
  u'garbanzo beans',
  u'feta cheese crumbles']}

In [4]:
# Check the Python type json.load produced for a single record.
type(data[0])

dict

In [5]:
# Cuisine label of the first record. The parenthesized form prints the
# same thing on Python 2 and is also valid on Python 3.
print(data[0]['cuisine'])

greek


<h3>From here on, I want to build a count matrix that can be fed directly to TF-IDF</h3>

<a href = "http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html">click  for more details </a>

<pre>
Also, the count matrix will be of size (# of cuisines, # of ingredients), and the element at position (i, j) is the count of ingredient j in cuisine i.
</pre>


In [6]:
# One pass over the recipes builds three structures:
#   dict_cuisine_ingredient: cuisine -> {ingredient -> number of occurrences in that cuisine}
#   all_cuisine:             each cuisine once, in first-seen order
#   all_ingredient:          every ingredient occurrence (duplicates included)
dict_cuisine_ingredient = {}
all_cuisine = []
all_ingredient = []

for recipe in data:
    cuisine_name = recipe['cuisine']

    # First time this cuisine shows up: start its counter and record the name.
    if cuisine_name not in dict_cuisine_ingredient:
        dict_cuisine_ingredient[cuisine_name] = {}
        all_cuisine.append(cuisine_name)
    ingredient_counts = dict_cuisine_ingredient[cuisine_name]

    for ingredient in recipe['ingredients']:
        ingredient_counts[ingredient] = ingredient_counts.get(ingredient, 0) + 1
        all_ingredient.append(ingredient)

        
    


<h3> Count of Cuisines and Ingredients </h3>

In [7]:
# Number of distinct cuisines (one top-level key per cuisine).
len(dict_cuisine_ingredient)

20

In [8]:
# Keep only the unique names. Going through set() discards the original
# order: the resulting list order is whatever the set iterates in.
# Later cells use positions in these lists as matrix row/column indices,
# so the lists must not be rebuilt or reordered after this point.
all_cuisine = list(set(all_cuisine)) 
all_ingredient = list(set(all_ingredient))

In [9]:
# Number of unique cuisines, then number of unique ingredients.
# print(...) with a single argument works on both Python 2 and Python 3.
print(len(all_cuisine))
print(len(all_ingredient))

20
6714


In [10]:
# Show every cuisine label; the list position is the row index used later.
# print(...) with a single argument works on both Python 2 and Python 3.
print(all_cuisine)

[u'irish', u'mexican', u'chinese', u'filipino', u'vietnamese', u'moroccan', u'brazilian', u'japanese', u'british', u'greek', u'indian', u'jamaican', u'french', u'spanish', u'russian', u'cajun_creole', u'thai', u'southern_us', u'korean', u'italian']


In [11]:
# Show only the first 20 ingredient names rather than the whole list.
# print(...) with a single argument works on both Python 2 and Python 3.
print(all_ingredient[:20])

[u'low-sodium fat-free chicken broth', u'sweetened coconut', u'baking chocolate', u'egg roll wrappers', u'bottled low sodium salsa', u'vegan parmesan cheese', u'clam sauce', u'mahlab', u'(10 oz.) frozen chopped spinach, thawed and squeezed dry', u'figs', u'caramels', u'broiler', u'jalapeno chilies', u'(15 oz.) refried beans', u'brioche buns', u'broccoli romanesco', u'flaked oats', u'anise extract', u'whole wheat pastry flour', u'ravva']


<h3> Now if we want to see how many ingredients are used in indian cuisine </h3>

In [12]:
# Number of distinct ingredients seen in Indian recipes (a lot of them).
# print(...) with a single argument works on both Python 2 and Python 3.
print(len(dict_cuisine_ingredient['indian']))

1664


<h3> Now lets do the count matrix from the dictionary </h3>

In [13]:
# Dense (cuisine x ingredient) count matrix: row order follows all_cuisine,
# column order follows all_ingredient; ingredients a cuisine never uses stay 0.
count_matrix = np.zeros((len(all_cuisine), len(all_ingredient)))

for row, cuisine in enumerate(all_cuisine):
    counts = dict_cuisine_ingredient[cuisine]
    # Fill one full row at a time instead of cell by cell.
    count_matrix[row, :] = [counts.get(name, 0) for name in all_ingredient]



In [14]:
# Many entries are zero, so the matrix is a good candidate for a sparse
# representation (converted in the following cell).
count_matrix

array([[  0.,   1.,   1., ...,   0.,   0.,   2.],
       [  4.,   0.,   0., ...,   0.,   1.,  52.],
       [  1.,   1.,   0., ...,   0.,   0.,  29.],
       ..., 
       [  8.,   0.,   0., ...,   0.,   0.,  17.],
       [  0.,   0.,   0., ...,   0.,   0.,   4.],
       [  4.,   0.,   1., ...,   0.,   1.,  85.]])

In [15]:
# Rebind count_matrix to a CSR sparse matrix built from the dense array;
# from here on the name refers to the sparse version.
count_matrix = sparse.csr_matrix(count_matrix)

<h2> Now lets do TF IDF </h2>
<br>
<a href ="https://en.wikipedia.org/wiki/Tf%E2%80%93idf"> more information here </a>

In [16]:
# Each cuisine row plays the role of a "document" and each ingredient
# column the role of a "term". Fit the IDF weights on the counts, then
# apply them to the same counts.
transformer = TfidfTransformer()
transformer.fit(count_matrix)
tf_idf = transformer.transform(count_matrix)

In [17]:
# Show the repr of the TF-IDF result (shape, dtype and storage format).
tf_idf

<20x6714 sparse matrix of type '<type 'numpy.float64'>'
	with 29179 stored elements in Compressed Sparse Row format>

In [18]:
# Convert the sparse TF-IDF result back to a dense ndarray; this dense
# array is what gets handed to PCA further down.
tf_idf_matrix = tf_idf.toarray()

In [19]:
# Sanity check: expect (number of cuisines, number of unique ingredients).
tf_idf_matrix.shape

(20L, 6714L)

<h3> Lets apply clustering on it </h3>
<pre>
Clustering directly in 6714 dimensions would be expensive, so let's run PCA first and reduce the data to 5 dimensions.
</pre>

In [20]:
from sklearn.decomposition import PCA

In [21]:
# Project every cuisine's TF-IDF row onto a small number of principal
# components; the clustering cells below work on this reduced array.
N_PCA_COMPONENTS = 5
pca = PCA(n_components=N_PCA_COMPONENTS)
reduced_data = pca.fit_transform(tf_idf_matrix)

In [22]:
reduced_data  # PCA has taken the data from 20 x 6714 down to 20 x 5

array([[-0.3190509 ,  0.39170113,  0.10008381, -0.03017281,  0.06326038],
       [-0.07486998, -0.32682025,  0.05442174, -0.27504495, -0.15209605],
       [ 0.56009452,  0.16113335, -0.24069736,  0.05060211, -0.07985692],
       [ 0.20270626,  0.03607879,  0.05282143, -0.10025187, -0.11798547],
       [ 0.56358491, -0.05305991,  0.40653846,  0.19919508,  0.0884989 ],
       [-0.17645412, -0.40208961, -0.12582193, -0.06495178,  0.28841227],
       [-0.09842421, -0.17693367,  0.11179003, -0.13459559, -0.2077101 ],
       [ 0.46796668,  0.22767811, -0.39121186, -0.06937381,  0.15523455],
       [-0.33068868,  0.4254544 ,  0.10836234, -0.01319504,  0.09759842],
       [-0.25779666, -0.32178728, -0.1759282 ,  0.31458899,  0.07152844],
       [-0.00186022, -0.21125909,  0.00524111, -0.46856591,  0.32072498],
       [-0.02508152, -0.01287297,  0.02645654, -0.21063903, -0.16790973],
       [-0.3306661 ,  0.16313439,  0.01846612,  0.20312366,  0.08064937],
       [-0.25956645, -0.28793704, -0.1

In [23]:
from sklearn.cluster import KMeans

In [24]:
# Hard-cluster the cuisines into 3 groups in the PCA-reduced space;
# random_state is pinned so the initialisation is repeatable.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(reduced_data)

In [25]:
# Cluster id assigned to each row of reduced_data (same order as all_cuisine).
kmeans.labels_

array([1, 0, 2, 2, 2, 0, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 2, 1, 2, 0])

In [26]:
# Row position of 'indian' in all_cuisine (and therefore in the matrices).
all_cuisine.index('indian')

10

In [27]:
# Cluster assigned to Indian cuisine. Look the row up by name rather than
# hard-coding its position, because the order of all_cuisine comes from a
# set and is not guaranteed to stay the same between runs or environments.
kmeans.labels_[all_cuisine.index('indian')]

0

In [28]:
# Pair every cuisine name with the k-means cluster id of its row.
labels = {(cuisine, kmeans.labels_[row]) for row, cuisine in enumerate(all_cuisine)}

In [29]:
# Display the (cuisine, cluster id) pairs.
labels

{(u'brazilian', 0),
 (u'british', 1),
 (u'cajun_creole', 0),
 (u'chinese', 2),
 (u'filipino', 2),
 (u'french', 1),
 (u'greek', 0),
 (u'indian', 0),
 (u'irish', 1),
 (u'italian', 0),
 (u'jamaican', 0),
 (u'japanese', 2),
 (u'korean', 2),
 (u'mexican', 0),
 (u'moroccan', 0),
 (u'russian', 1),
 (u'southern_us', 1),
 (u'spanish', 0),
 (u'thai', 2),
 (u'vietnamese', 2)}

<pre>
Looking at the labels above, Spanish, Jamaican, Mexican and Indian cuisines, which are mostly considered spicy, are placed in the same cluster (label 0).

Similarly, Chinese, Filipino, Thai and Vietnamese cuisines end up together under label 2, and so on.
</pre>

A great kernel showing the same analysis with great visualization is
<a href = "https://www.kaggle.com/alonalevy/whats-cooking/cultural-diffusion-by-recipes">shown here </a> 

<h3> More clustering - This time soft clustering </h3>

In [30]:
#Now lets see some results from mixture model
from sklearn.mixture import GMM

In [31]:
# Soft clustering: fit a 3-component mixture model on the PCA-reduced data.
# NOTE(review): GMM is the legacy class name in sklearn.mixture; newer
# scikit-learn releases provide GaussianMixture instead - confirm against
# the installed version before upgrading.
clusterer = GMM(n_components=3, random_state=0)
clusterer.fit(reduced_data)

In [32]:
# predict_proba gives, for every cuisine, the probability of belonging to
# each of the 3 mixture components (one row per cuisine, one column per component).
clusterer.predict_proba(reduced_data)
# The probabilities come out very close to 0 or 1; note that these are the
# same points the model was fitted on (presumably why they are so confident).

array([[  1.77468776e-008,   9.99999982e-001,   1.43928959e-011],
       [  9.99999952e-001,   4.39222469e-021,   4.76979501e-008],
       [  1.18372137e-012,   3.96052407e-113,   1.00000000e+000],
       [  1.97029199e-002,   2.03631961e-037,   9.80297080e-001],
       [  2.31099249e-014,   5.42477616e-116,   1.00000000e+000],
       [  9.99999998e-001,   6.45424546e-021,   2.09860028e-009],
       [  9.99997556e-001,   5.26467573e-016,   2.44422883e-006],
       [  2.32039430e-012,   8.38224542e-103,   1.00000000e+000],
       [  5.90087636e-009,   9.99999994e-001,   7.60822012e-012],
       [  9.99999998e-001,   1.76268850e-016,   2.12313789e-009],
       [  9.99999999e-001,   2.68093592e-028,   1.20105336e-009],
       [  9.99878924e-001,   8.71037806e-017,   1.21076199e-004],
       [  1.53770126e-003,   9.98462296e-001,   2.46469718e-009],
       [  9.99999991e-001,   6.54228249e-012,   9.21501206e-009],
       [  2.22561980e-006,   9.99997774e-001,   8.69557127e-010],
       [  