## Data Science Methodology With Decision Trees and Clustering

This notebook is a Python and pandas version of https://gist.github.com/polong-lin/272fa27c135da785e29b5ef3628692ae

### Download the data file

In [40]:
import os
import pprint

from sklearn.cluster import KMeans
import pandas as pd
import requests


# Local cache path and remote location of the recipes dataset.
fname = 'recipes.csv'
data_url = 'https://ibm.box.com/shared/static/5wah9atr5o1akuuavl2z9tkjzdinr1lv.csv'

# Download only when there is no local copy yet, so re-running the cell is cheap.
if not os.path.isfile(fname):
    response = requests.get(data_url, timeout=60)
    # Fail loudly on an HTTP error instead of silently leaving the file missing
    # and letting the later read fail with a confusing message.
    response.raise_for_status()
    # response.content is bytes, so the file must be opened in binary mode;
    # the context manager closes the handle even if the write raises.
    with open(fname, 'wb') as fp:
        fp.write(response.content)
    print('Data file downloaded')

### Load data in a dataframe

In [27]:
# Read the file fetched by the download cell; `fname` is defined there, so the
# path is spelled out in one place only.
# The first CSV column has no header and pandas loads it as a regular column
# named 'Unnamed: 0' (visible in the preview below) -- it looks like a saved
# row index rather than an ingredient.
df = pd.read_csv(fname)
df.head()

Unnamed: 0,country,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,Vietnamese,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,Vietnamese,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
2,Vietnamese,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
3,Vietnamese,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
4,Vietnamese,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No


### Number of rows and columns

In [21]:
# df.shape is (number of rows, number of columns). print() with a single
# argument behaves the same on Python 2 and Python 3.
n_rows, n_columns = df.shape
print('Rows: {}'.format(n_rows))
print('Columns: {}'.format(n_columns))

Rows: 57691
Columns: 384


### Filter ingredients

In [23]:
# The regex is searched anywhere in the column name, so 'rice' also matches
# 'licorice'. print() with one argument works on both Python 2 and Python 3.
print(df.filter(regex='rice').columns.values)

['brown_rice' 'licorice' 'rice']


In [14]:
# Columns whose name contains 'wasabi'. print() with one argument works on
# both Python 2 and Python 3.
print(df.filter(regex='wasabi').columns.values)

['wasabi']


In [15]:
# Columns whose name contains 'soy'. print() with one argument works on both
# Python 2 and Python 3.
print(df.filter(regex='soy').columns.values)

['soy_sauce' 'soybean' 'soybean_oil']


### Recipes frequency table

In [17]:
# Number of recipes per raw label. The labels are not normalised yet: casing
# varies and the same cuisine shows up under several names (for example
# 'Italian' / 'Italy' and 'Mexican' / 'Mexico').
df.country.value_counts()

American                 40150
Mexico                    1754
Italian                   1715
Italy                     1461
Asian                     1176
French                     996
east_asian                 951
Canada                     774
korean                     767
Mexican                    622
western                    450
Southern_SoulFood          346
India                      324
Jewish                     320
Spanish_Portuguese         291
Mediterranean              289
UK-and-Ireland             282
Indian                     274
France                     268
MiddleEastern              248
Central_SouthAmerican      241
Germany                    237
Eastern-Europe             235
Chinese                    226
Greek                      225
English_Scottish           204
Caribbean                  183
Thai                       164
Scandinavia                158
Cajun_Creole               146
                         ...  
Scandinavian                92
chinese 

### Clean data

In [30]:
# Lower-cased country name -> cuisine label used for recipes of that country.
# Built once rather than on every call.
_COUNTRY_TO_CUISINE = {
    'china': 'chinese',
    'france': 'french',
    'germany': 'german',
    'india': 'indian',
    'israel': 'jewish',
    'italy': 'italian',
    'japan': 'japanese',
    'korea': 'korean',
    'mexico': 'mexican',
    'scandinavia': 'scandinavian',
    'thailand': 'thai',
    'vietnam': 'vietnamese',
}


def rename_country(value):
    """Fold a country name into the matching cuisine label.

    Args:
        value: A country/cuisine label. Keys are matched exactly, so the
            caller is expected to lower-case the label first.

    Returns:
        The cuisine label when ``value`` is a known country name, otherwise
        ``value`` unchanged (labels that are already cuisine names, or that
        have no mapping, pass through).
    """
    return _COUNTRY_TO_CUISINE.get(value, value)

# The CSV's unnamed first column is loaded as an ordinary column called
# 'Unnamed: 0'. Drop it so it is not counted, summed or clustered as if it were
# an ingredient. errors='ignore' keeps the cell safe to re-run.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

# Normalise the labels: lower-case first so the mapping keys match, then fold
# country names into their cuisine name.
df['country'] = df['country'].str.lower().map(rename_country)

# Encode ingredient presence as integers so it can be summed and averaged.
# A new frame is bound instead of mutating in place.
df = df.replace({'Yes': 1, 'No': 0})

### Most popular ingredients

In [31]:
# Count, per ingredient, the recipes that use it, most common first.
# 'country' is removed before the comparison: comparing its strings with 0
# only works on Python 2 and raises TypeError on Python 3.
ingredients = df.drop('country', axis=1)
(ingredients > 0).sum().sort_values(ascending=False)

egg                 21114
wheat               20896
butter              20814
onion               18205
garlic              17465
milk                12925
vegetable_oil       11189
cream               10194
tomato               9978
olive_oil            9923
black_pepper         9893
pepper               9282
vanilla              9040
cayenne              8303
vinegar              8097
cane_molasses        7770
bell_pepper          6006
cinnamon             5639
parsley              5579
chicken              5483
lemon_juice          5099
beef                 4946
corn                 4836
cocoa                4807
scallion             4798
bread                4596
ginger               4396
mustard              4125
rice                 3888
basil                3845
                    ...  
chamomile               3
roasted_almond          3
rapeseed                3
hop                     3
holy_basil              3
long_pepper             2
strawberry_juice        2
raw_beef    

### Canadian Food

In [35]:
# Cuisines with fewer recipes than this are left out of the comparison.
MIN_RECIPES = 50

# Recipes per cuisine, restricted to the well-represented ones, ordered by name.
recipes_df = df.country.value_counts().to_frame(name='total')
recipes_df = recipes_df[recipes_df.total >= MIN_RECIPES].sort_index()
countries = recipes_df.index.values

# Per cuisine: how many recipes contain each ingredient ...
sum_df = df[df.country.isin(countries)].groupby('country').sum()
# ... divided row-wise by that cuisine's recipe count (aligned on the country
# index), which gives the share of its recipes that use the ingredient.
norm_df = sum_df.div(recipes_df.total, axis=0)

norm_df.loc['canada'].sort_values(ascending=False).head()

wheat     0.395349
butter    0.381137
egg       0.354005
onion     0.343669
garlic    0.270026
Name: canada, dtype: float64

### Top 3 ingredients for each country

In [36]:
# For every cuisine, show its three most common ingredients as a percentage of
# that cuisine's recipes. print() with one argument works on Python 2 and 3.
for country in norm_df.index:
    print(country)
    print(norm_df.loc[country].sort_values(ascending=False).head(3) * 100)
    print('')

african
onion        53.043478
olive_oil    52.173913
garlic       49.565217
Name: african, dtype: float64

american
butter    41.158157
egg       40.513076
wheat     39.840598
Name: american, dtype: float64

asian
soy_sauce    49.622800
ginger       48.616932
garlic       47.946354
Name: asian, dtype: float64

cajun_creole
onion      69.863014
cayenne    56.164384
garlic     48.630137
Name: cajun_creole, dtype: float64

canada
wheat     39.534884
butter    38.113695
egg       35.400517
Name: canada, dtype: float64

caribbean
onion            51.366120
garlic           50.819672
vegetable_oil    31.147541
Name: caribbean, dtype: float64

central_southamerican
garlic     56.846473
onion      54.356846
cayenne    51.867220
Name: central_southamerican, dtype: float64

chinese
soy_sauce    68.552036
ginger       53.393665
garlic       52.941176
Name: chinese, dtype: float64

east_asian
garlic       55.205047
soy_sauce    50.368034
scallion     49.526814
Name: east_asian, dtype: float64

ea

### Similar cuisines

In [41]:
# A fixed seed makes the clusters reproducible between runs; the k-means++
# initialisation is randomised otherwise.
# NOTE(review): max_iter=10 is far below scikit-learn's default of 300 --
# confirm that the fit converges on this data.
kmeans = KMeans(init='k-means++', n_clusters=9, n_init=10, max_iter=10,
                random_state=0)
kmeans.fit(norm_df)
labels = kmeans.labels_

# Collect cuisines by cluster label. Names are taken from norm_df's own index so
# they are guaranteed to line up with the fitted rows, and labels are cast to
# plain ints so the printed keys do not depend on NumPy's scalar repr.
groups = {}
for country, label in zip(norm_df.index, labels):
    groups.setdefault(int(label), []).append(country)

pprint.pprint(groups)

{0: ['indian'],
 1: ['american',
     'canada',
     'eastern-europe',
     'easterneuropean_russian',
     'english_scottish',
     'french',
     'german',
     'irish',
     'jewish',
     'scandinavian',
     'southern_soulfood',
     'uk-and-ireland',
     'western'],
 2: ['greek', 'italian', 'mediterranean'],
 3: ['east_asian', 'korean'],
 4: ['cajun_creole', 'central_southamerican', 'mexican', 'southwestern'],
 5: ['african', 'middleeastern', 'moroccan', 'north-african'],
 6: ['thai', 'vietnamese'],
 7: ['asian', 'chinese', 'japanese'],
 8: ['caribbean', 'portugal', 'south-america', 'spain', 'spanish_portuguese']}
