In [None]:
import pandas as pd
import numpy as np
import csv
import zipfile
import re

import random
import matplotlib as mpl
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from scipy.misc import imread
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from wordcloud import WordCloud, STOPWORDS


%matplotlib inline
# Path is relative to the notebook's working directory.
filepath = "../resources/food.csv.zip"
# The archive handle stays open: later cells call zf.open('food.csv') again.
zf = zipfile.ZipFile(filepath)
# Tab-separated dump. dtype=object disables numeric inference, so numeric columns
# are cast with .astype(float) wherever they are used further down.
food = pd.read_csv(zf.open('food.csv'), parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',10)

food

In [None]:
def printCategoryInfo(dataframe, min_values=2):
    """Print word-occurrence counts for the name and category columns.

    For each of ``product_name``, ``categories_en``, ``categories_tags`` and
    ``categories`` the non-missing cell values are joined into one text and
    passed to ``printWordOccurences``.

    Args:
        dataframe: frame providing the four columns named above.
        min_values: minimum occurrence count a word needs to be printed.
    """
    sections = [
        ("Product names:", "product_name"),
        ("Categories_en:", "categories_en"),
        ("Categories_tags:", "categories_tags"),
        ("Categories:", "categories"),
    ]
    for position, (heading, column) in enumerate(sections):
        if position:
            print("-----------")
        print(heading)
        # Join the raw cell values instead of using str(Series): the Series
        # repr also contains index labels, NaN markers and a Name/dtype
        # footer (and may abbreviate long cells), all of which would end up
        # in the word counts.
        text = " ".join(dataframe[column].dropna().astype(str))
        printWordOccurences(text, min_values)

In [None]:
def grey_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour callback for WordCloud.recolor: a random grey as an HSL string.

    All arguments are ignored; the lightness is drawn from the module-level
    ``random`` generator, uniformly from the integers 50..100.
    """
    lightness = random.randint(50, 100)
    return 'hsl(0, 0%%, %d%%)' % lightness

def draw_wordcloud(text):
    """Show a grey-toned word cloud of ``text`` on a black background.

    At most 1000 words are drawn; the figure is 14 wide by 18 high and the
    axes are hidden. Colours come from ``grey_color``.
    """
    cloud = WordCloud(max_words=1000, background_color='#000000').generate(text)
    greyscale_cloud = cloud.recolor(color_func=grey_color, random_state=3)

    figure = plt.figure()
    figure.set_figwidth(14)
    figure.set_figheight(18)

    plt.imshow(greyscale_cloud)
    plt.axis('off')
    plt.show()

In [None]:
# from stackoverflow
def getWordScores(text):
    """Count the whitespace-separated words of ``text``, ignoring case.

    Returns a dict mapping each lower-cased word to its number of
    occurrences, in order of first appearance.
    """
    counts = {}
    tokens = (raw.lower() for raw in text.strip().split())
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def printWordOccurences(text, min_word_count=0):
    """Print ``(word, count)`` tuples for the words of ``text``, rarest first.

    Words are counted with ``getWordScores``; only words occurring at least
    ``min_word_count`` times are printed. Ties keep first-appearance order.
    """
    counts = getWordScores(text)
    ranked = sorted(counts.items(), key=lambda pair: pair[1])
    for pair in ranked:
        if pair[1] < min_word_count:
            continue
        print(pair)

In [None]:
# get a list of all candy tags and combinations of candy tags, get all foods who's tags are in that list.
# bad approach because it's slow and if something contains candy tags + non-candy tags (e.g. 'desserts, water')
# it falls through the cracks. Sadly I can't figure out how to do
# "for candy_tag in candy_tags: get items where candy_tag in food['categories_tags']"

from itertools import combinations

candy_tags = ["en:sugary-snacks", "en:desserts", "en:candies", "en:chocolates", "en:pies", "en:puddings",
 "en:biscuits-and-cakes", "en:confectioneries", "en:chocolate-chip-cookies", "en:chocolate-block",
 "en:jelly-crystals", "en:carrot-cake", "en:chocolate-covered-muesli-bar"]
# Work on a copy: extending augmented_tags in place must not also grow
# candy_tags, otherwise the 3-combinations below are built from the base tags
# *plus* all already-joined pairs (91 items instead of 13).
augmented_tags = list(candy_tags)

# Add every in-order pair and triple of base tags, joined the same way as the
# comma-separated 'categories_tags' cells.
for i in range(2, 4):
    augmented_tags += [",".join(comb) for comb in combinations(candy_tags, i)]

#augmented_tags
# Exact match of the whole 'categories_tags' cell against the tag list.
tagged_candies = food[food['categories_tags'].isin(augmented_tags)]

#tagged_candies = [item for item in food if not set(augmented_tags).isdisjoint(item['categories_tags'])]

tagged_candies

In [None]:
# Complement of tagged_candies: rows whose 'categories_tags' cell is not in
# augmented_tags (rows with a missing cell therefore land here too).
food_not_tagged_as_candy = food[~food['categories_tags'].isin(augmented_tags)]

food_not_tagged_as_candy

In [None]:
# Candies with 47 < carbohydrates < 57 per 100g (both bounds exclusive).
_carbs = tagged_candies['carbohydrates_100g'].astype(float)
candies_around_50_carbs = tagged_candies[(_carbs > 47) & (_carbs < 57)]

candies_around_50_carbs

#candies_below = tagged_candies[tagged_candies['carbohydrates_100g'].astype(float) < 20]
#candies_below
#candies_above = tagged_candies[tagged_candies['carbohydrates_100g'].astype(float) > 65]
#candies_above

In [None]:
_sugars = tagged_candies['sugars_100g'].astype(float)

# Candies with 45 < sugars < 55 per 100g. Both bounds are applied to the
# sugar column of tagged_candies; previously the upper bound was applied to
# candies_around_50_carbs, which silently discarded the lower bound.
candies_around_50_sugar = tagged_candies[(_sugars > 45) & (_sugars < 55)]

candies_around_50_sugar

#candies_below = tagged_candies[tagged_candies['sugars_100g'].astype(float) < 30]
#candies_below
# Candies with more than 65 sugars per 100g; this is the value the cell displays.
candies_above = tagged_candies[_sugars > 65]
candies_above

In [None]:
# Candies with less than 5 fat per 100g (the only bound this cell applies).
_fat = tagged_candies['fat_100g'].astype(float)
candies_around_x_fats = tagged_candies[_fat < 5]

#candies_around_x_fats = candies_around_x_fats[candies_around_x_fats['carbohydrates_100g'].astype(float) > 30]


candies_around_x_fats

In [None]:
# Candies with 22 < sugars < 30 and carbohydrates > 30 per 100g.
_sugars = tagged_candies['sugars_100g'].astype(float)
_carbs = tagged_candies['carbohydrates_100g'].astype(float)
_selected = (_sugars > 22) & (_sugars < 30) & (_carbs > 30)
candies_around_x_sugar = tagged_candies[_selected]


candies_around_x_sugar

In [None]:
# Candies with 22 < carbohydrates < 30 per 100g.
_carbs = tagged_candies['carbohydrates_100g'].astype(float)
candies_around_x_carbs = tagged_candies[(_carbs > 22) & (_carbs < 30)]

candies_around_x_carbs

In [None]:
import os

import plotly.plotly as py
import plotly.graph_objs as go

# Credentials must never be committed with the notebook: read them from the
# environment instead. Signing in is only needed for uploads via py.iplot, so
# the notebook keeps running when the variables are not set.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    py.sign_in(_plotly_username, _plotly_api_key)
else:
    print("PLOTLY_USERNAME / PLOTLY_API_KEY not set - skipping plotly sign-in; py.iplot uploads will not work")

def _mean_per_100g(frame, column):
    """Mean of ``column`` in ``frame`` after casting its values to float."""
    return frame[column].astype(float).mean()


# Per-nutrient means for tagged candies versus everything else.
candy_sugar = _mean_per_100g(tagged_candies, "sugars_100g")
others_sugar = _mean_per_100g(food_not_tagged_as_candy, "sugars_100g")

candy_fat = _mean_per_100g(tagged_candies, "fat_100g")
others_fat = _mean_per_100g(food_not_tagged_as_candy, "fat_100g")

# Energy is scaled down by 1000 so it fits on the same axis (labelled MJ below).
candy_energy = _mean_per_100g(tagged_candies, "energy_100g") / 1000
others_energy = _mean_per_100g(food_not_tagged_as_candy, "energy_100g") / 1000

candy_carbs = _mean_per_100g(tagged_candies, "carbohydrates_100g")
others_carbs = _mean_per_100g(food_not_tagged_as_candy, "carbohydrates_100g")

candy_salt = _mean_per_100g(tagged_candies, "salt_100g")
others_salt = _mean_per_100g(food_not_tagged_as_candy, "salt_100g")

candy_nutrition_score = _mean_per_100g(tagged_candies, "nutrition-score-fr_100g")
others_nutrition_score = _mean_per_100g(food_not_tagged_as_candy, "nutrition-score-fr_100g")

x_names = ["sugar", "fat", "energy (MJ)", "carbohydrates", "salt", "nutrition score"]
y_values_candy = [candy_sugar, candy_fat, candy_energy, candy_carbs, candy_salt, candy_nutrition_score]
y_values_others = [others_sugar, others_fat, others_energy, others_carbs, others_salt, others_nutrition_score]

# Grouped bars: one bar per nutrient for each of the two groups.
trace1 = go.Bar(x=x_names, y=y_values_candy, name='Candy')
trace2 = go.Bar(x=x_names, y=y_values_others, name='Other Foods')

data = [trace1, trace2]
layout = go.Layout(
    barmode='group',
    height=700,
    margin=go.Margin(l=50, r=50, b=150, t=100, pad=4),
)

fig = go.Figure(data=data, layout=layout)
# comment out figure-creation to save API access volume
#py.iplot(fig, filename='candy_others_comparison_bars', bbox_inches='tight')

In [None]:
# Histogram of salt per 100g over the tagged candies.
# NOTE(review): the axis title mentions "steps of 5", but no bin size is set on
# the trace - confirm the bins plotly picks match the label.
trace1 = go.Histogram(
    x=tagged_candies["salt_100g"].astype(float),
)

layout = go.Layout(
    title='Salt (in g) in Candy',
    xaxis=dict(
        title='salt in 100g (steps of 5)'
    ),
    yaxis=dict(
        title='Number of candy in interval'
    ),
    bargap=0.01
)
data = [trace1]

fig = go.Figure(data=data, layout=layout)
# Upload/render call is disabled; the figure is only built.
#py.iplot(fig, filename='candy_salt_histo')

In [None]:
# Histogram of sugars per 100g over the tagged candies.
# NOTE(review): the axis title mentions "steps of 5", but no bin size is set on
# the trace - confirm the bins plotly picks match the label.
trace1 = go.Histogram(
    x=tagged_candies["sugars_100g"].astype(float),
)

layout = go.Layout(
    title='Sugar (in g) in Candy',
    xaxis=dict(
        title='sugar in 100g (steps of 5)'
    ),
    yaxis=dict(
        title='Number of candy in interval'
    ),
    bargap=0.01
)
data = [trace1]

fig = go.Figure(data=data, layout=layout)
# Upload/render call is disabled; the figure is only built.
#py.iplot(fig, filename='candy_sugar_histo')

In [None]:
# Histogram of fat per 100g over the tagged candies (bin size not set here).
trace1 = go.Histogram(
    x=tagged_candies["fat_100g"].astype(float),
)

layout = go.Layout(
    title='Fat (in g) in Candy',
    xaxis=dict(
        title='fat in 100g'
    ),
    yaxis=dict(
        title='Number of candy in interval'
    ),
    bargap=0.01
)
data = [trace1]

fig = go.Figure(data=data, layout=layout)
# Upload/render call is disabled; the figure is only built.
#py.iplot(fig, filename='candy_fat_histo')

In [None]:
# Histogram of carbohydrates per 100g over the tagged candies.
# NOTE(review): the axis title mentions "steps of 5", but no bin size is set on
# the trace - confirm the bins plotly picks match the label.
trace1 = go.Histogram(
    x=tagged_candies["carbohydrates_100g"].astype(float),
)

layout = go.Layout(
    title='Carbohydrates (in g) in Candy',
    xaxis=dict(
        title='carbohydrates in 100g (steps of 5)'
    ),
    yaxis=dict(
        title='Number of candy in interval'
    ),
    bargap=0.01
)
data = [trace1]

fig = go.Figure(data=data, layout=layout)
# Upload/render call is disabled; the figure is only built.
#py.iplot(fig, filename='candy_carbs_histo')

In [None]:
# Histogram of carbohydrates per 100g over all foods not tagged as candy,
# for comparison with the candy histogram (bin size not set here).
trace1 = go.Histogram(
    x=food_not_tagged_as_candy["carbohydrates_100g"].astype(float),
)

layout = go.Layout(
    title='Carbohydrates in Others',
    xaxis=dict(
        title='carbohydrates in 100g'
    ),
    yaxis=dict(
        title='Number of items in interval'
    ),
    bargap=0.01
)
data = [trace1]

fig = go.Figure(data=data, layout=layout)
# Upload/render call is disabled; the figure is only built.
#py.iplot(fig, filename='others_carbs_histo')

In [None]:
## WEEK 4 ##

# Scatter of fat (x) against sugars (y) per 100g, one marker per tagged candy.
trace = go.Scatter(
    x = tagged_candies["fat_100g"].astype(float),
    y = tagged_candies["sugars_100g"].astype(float),
    mode = 'markers'
)

# Both axes are clamped to 0..100; points outside that range are not visible.
layout = go.Layout(
    title= 'Sweets: Fat to Sugar ratio',
    xaxis= dict(
        range=[0, 100],
        title="Fat in g / 100g"
    ),
    yaxis=dict(
        range=[0, 100],
        title="Sugars in g / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='candy_fat_sugar_ratio_scatter')

In [None]:
# Scatter of fat (x) against sugars (y) per 100g for foods not tagged as candy.
trace = go.Scatter(
    x = food_not_tagged_as_candy["fat_100g"].astype(float),
    y = food_not_tagged_as_candy["sugars_100g"].astype(float),
    mode = 'markers'
)

layout = go.Layout(
    title= 'Non-sweets: Fat to Sugar ratio',
    xaxis= dict(
        range=[0, 100],
        title="Fat in g / 100g"
    ),
    yaxis=dict(
        range=[0, 100],
        title="Sugars in g / 100g"
    )
)

data = [trace]

# Figure creation is disabled here as well, so `fig` keeps whatever value an
# earlier cell assigned to it.
#fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='non-candy_fat_sugar_ratio_scatter')

In [None]:
# Fat-to-sugar ratio per product. assign() builds a new frame instead of
# writing a column into a filtered slice of `food`, which avoids pandas'
# SettingWithCopyWarning. A zero sugar value gives inf/NaN rather than an error.
food_not_tagged_as_candy = food_not_tagged_as_candy.assign(**{
    'fat/sugar': food_not_tagged_as_candy['fat_100g'].astype(float) / food_not_tagged_as_candy['sugars_100g'].astype(float)
})

# Scatter of the fat/sugar ratio (x) against salt per 100g (y).
trace = go.Scatter(
    x = food_not_tagged_as_candy['fat/sugar'],
    y = food_not_tagged_as_candy["salt_100g"].astype(float),
    mode = 'markers'
)

layout = go.Layout(
    title= 'Non-sweets: (Fat to Sugar) to salt ratio',
    xaxis= dict(
        range=[0, 100],
        title="(Fat to Sugar) in g / 100g"
    ),
    yaxis=dict(
        range=[0, 100],
        title="salt in g / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='non-candy_fat-sugar_salt_ratio_scatter')

In [None]:
# non-candy in the area where candy is on the same graph: chocolate[~1000], (fruit-based) beverages[~5000] sugary [~2000]
# Keeps non-candy rows with fat/sugar <= 1 and salt < 0.4 per 100g; relies on the
# 'fat/sugar' column added to food_not_tagged_as_candy by an earlier cell.
low_ratio_non_candy = food_not_tagged_as_candy[food_not_tagged_as_candy['fat/sugar'] <= 1]
low_ratio_non_candy = low_ratio_non_candy[low_ratio_non_candy["salt_100g"].astype(float) < 0.4]

#printCategoryInfo(low_ratio_non_candy, 200)

In [None]:
# Fat-to-sugar ratio per candy. assign() builds a new frame instead of writing
# a column into a filtered slice of `food`, which avoids pandas'
# SettingWithCopyWarning. A zero sugar value gives inf/NaN rather than an error.
tagged_candies = tagged_candies.assign(**{
    'fat/sugar': tagged_candies['fat_100g'].astype(float) / tagged_candies['sugars_100g'].astype(float)
})

# Scatter of the fat/sugar ratio (x) against salt per 100g (y).
trace = go.Scatter(
    x = tagged_candies['fat/sugar'],
    y = tagged_candies["salt_100g"].astype(float),
    mode = 'markers'
)

layout = go.Layout(
    title= 'Sweets: (Fat to Sugar) to salt ratio',
    xaxis= dict(
        range=[0, 100],
        title="(Fat to Sugar) in g / 100g"
    ),
    yaxis=dict(
        range=[0, 100],
        title="salt in g / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='candy_fat-sugar_salt_ratio_scatter')

In [None]:
# Candies with more than 0.47 salt per 100g, kept for manual inspection.
fat_sugar_outlier = tagged_candies[tagged_candies['salt_100g'].astype(float) > 0.47]
#print(fat_sugar_outlier)

In [None]:
# outlier to the right
# Looks up one specific data point by exact value (sugars == 11.7, fat == 67);
# exact float equality is intentional here because a single known row is wanted.
candies_at_x_sugar_fat = tagged_candies[tagged_candies['sugars_100g'].astype(float) == 11.7]
candies_at_x_sugar_fat = candies_at_x_sugar_fat[candies_at_x_sugar_fat['fat_100g'].astype(float) == 67]

#candies_at_x_sugar_fat # Quernons d'ardoise Chocolat et nougatine


In [None]:
# Scatter of the fat/sugar ratio (x) against energy per 100g (y) for foods not
# tagged as candy; relies on the 'fat/sugar' column added by an earlier cell.
trace = go.Scatter(
    x = food_not_tagged_as_candy['fat/sugar'],
    y = food_not_tagged_as_candy["energy_100g"].astype(float),
    mode = 'markers'
)

# The y axis is clamped to 0..5000 to keep extreme energy values out of view.
# energy_100g is given in kJ (the bar chart divides it by 1000 to get MJ), so
# the axis is labelled kJ rather than J.
layout = go.Layout(
    title= 'Non-Sweets: (Fat to Sugar) to energy ratio',
    xaxis= dict(
        range=[0, 100],
        title="(Fat to Sugar) in g / 100g"
    ),
    yaxis=dict(
        range=[0, 5000],
        title="energy in kJ / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='non-candy_fat-sugar_energy_ratio_scatter_limited')

In [None]:
# Non-candy rows whose energy_100g exceeds 2,000,000, kept for manual
# inspection (presumably data-entry errors - TODO confirm).
high_energy = food_not_tagged_as_candy[food_not_tagged_as_candy["energy_100g"].astype(float) > 2000000]
#print(high_energy)

In [None]:
# Scatter of the fat/sugar ratio (x) against energy per 100g (y) for tagged
# candies; relies on the 'fat/sugar' column added by an earlier cell.
trace = go.Scatter(
    x = tagged_candies['fat/sugar'],
    y = tagged_candies["energy_100g"].astype(float),
    mode = 'markers'
)

# energy_100g is given in kJ (the bar chart divides it by 1000 to get MJ), so
# the axis is labelled kJ rather than J. The y range is left to plotly.
layout = go.Layout(
    title= 'Sweets: (Fat to Sugar) to energy ratio',
    xaxis= dict(
        range=[0, 100],
        title="(Fat to Sugar) in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="energy in kJ / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='candy_fat-sugar_energy_ratio_scatter')

In [None]:
# Candies with less than 4 fat per 100g; base set for the low-fat sub-clusters
# examined in the following cells.
candies_below_4_fat = tagged_candies[tagged_candies['fat_100g'].astype(float) < 4]

# printCategoryInfo(candies_below_4_fat)  # Jelly / Pudding

#TODO: split gained info into subgroups 
#(what differentiates jelly and pudding? Salt or sth?, then get stuff on other values (include the sugar side as well))


In [None]:
# <4 fat, <25 sugars, candy: pudding  (+ some haribo & mint)
# NOTE(review): the variable name says "40_to_60" but the filter keeps sugars <= 25;
# the same name is reassigned by the later "40-60 sugars candy" cell. The
# commented-out call below also inspects the unfiltered candies_below_4_fat.
sugars_40_to_60_below_4_fat = candies_below_4_fat[candies_below_4_fat["sugars_100g"].astype(float) <= 25]

#printCategoryInfo(candies_below_4_fat)

In [None]:
# <4 fat, <25 sugars, non_candy: drinks, yogurts, fruits
_fat = food_not_tagged_as_candy["fat_100g"].astype(float)
_sugars = food_not_tagged_as_candy["sugars_100g"].astype(float)
sugars_25sugars_below_4_fat_noncandy = food_not_tagged_as_candy[(_fat < 4) & (_sugars < 25)]

#printCategoryInfo(sugars_25sugars_below_4_fat_noncandy, 20)

In [None]:
# <4 fat, 40-60 sugars candy: Haribo
# Both sugar bounds are inclusive.
_sugars = candies_below_4_fat["sugars_100g"].astype(float)
sugars_40_to_60_below_4_fat = candies_below_4_fat[(_sugars <= 60) & (_sugars >= 40)]

# printCategoryInfo(sugars_40_to_60_below_4_fat)

In [None]:
# <4 fat, 40-60 sugars non-candy: beverages, jelly, bonbons
# Fat bound is exclusive, both sugar bounds are inclusive.
_fat = food_not_tagged_as_candy["fat_100g"].astype(float)
_sugars = food_not_tagged_as_candy["sugars_100g"].astype(float)
_selected = (_fat < 4) & (_sugars <= 60) & (_sugars >= 40)
sugars_40_to_60_below_4_fat_noncandy = food_not_tagged_as_candy[_selected]

#printCategoryInfo(sugars_40_to_60_below_4_fat_noncandy, 20)

# seems like I missed a bunch of tags :xxxx might want to revisit that, then keep looking for clusters to work with

In [None]:
# 30-42 fat, 40-60 sugar  candy: chocolate
# All four bounds are exclusive.
_fat = tagged_candies['fat_100g'].astype(float)
_sugars = tagged_candies['sugars_100g'].astype(float)
_selected = (_fat > 30) & (_fat < 42) & (_sugars > 40) & (_sugars < 60)
candies_at_x_sugar_fat = tagged_candies[_selected]

#printCategoryInfo(candies_at_x_sugar_fat, 20)

In [None]:
# 30-42 fat, 40-60 sugar  non-candy: chocolate
# Same window as the candy cell, applied to the foods not tagged as candy
# (the result nevertheless reuses the candies_at_x_sugar_fat name).
_fat = food_not_tagged_as_candy['fat_100g'].astype(float)
_sugars = food_not_tagged_as_candy['sugars_100g'].astype(float)
_selected = (_fat > 30) & (_fat < 42) & (_sugars > 40) & (_sugars < 60)
candies_at_x_sugar_fat = food_not_tagged_as_candy[_selected]

#printCategoryInfo(candies_at_x_sugar_fat, 20)

In [None]:
### Machine Learning Stuff ###
from sklearn_pandas import DataFrameMapper

#pd.to_numeric(tagged_candies, errors='coerce')
#tagged_candies[['sugars_100g','fat_100g', 'salt_100g', 'energy_100g']] = tagged_candies[['sugars_100g','fat_100g', 'salt_100g', 'energy_100g']].apply(lambda x: pd.to_numeric(x, errors='coerce'))

# TODO: use .apply to split tags and find all candies, then use sklearn-pandas to start the machine learning work

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# 100 trees, seed for rnd-ness is 22
forest = RandomForestClassifier(n_estimators = 100, random_state = 22)

_feature_columns = ["sugars_100g", "fat_100g", "salt_100g", "energy_100g"]

# Re-read only the needed columns; the archive member handle is closed as soon
# as the frame has been loaded.
with zf.open('food.csv') as _food_csv:
    prediction_features = pd.read_csv(_food_csv, parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',
                                      usecols=_feature_columns + ["categories_en"])

# turn the feature columns into floats; values that cannot be parsed become NaN
# and are dropped together with the missing ones below
for _column in _feature_columns:
    prediction_features[_column] = pd.to_numeric(prediction_features[_column], errors='coerce')

# remove rows with NA values (in any feature or in the category)
prediction_features = prediction_features.dropna()
# print(prediction_features)

# put labels in separate dataframe
_category_labels = prediction_features["categories_en"]
prediction_features = prediction_features.drop('categories_en', axis=1)

# 1.0 where the category text contains the substring 'chocolate', else 0.0; the
# frame keeps the row index of prediction_features.
# NOTE(review): the match is case-sensitive, while later cells look for "Choc" -
# confirm which spelling the labels should follow.
labels_to_predict = pd.DataFrame({
    'is_chocolate': _category_labels.str.contains('chocolate', regex=False).astype(float)
})

# TODO: need to split this into 3 sets: training, validation(to tweak), test
X_train, X_test, y_train, y_test = train_test_split(prediction_features, labels_to_predict, random_state=22)

In [None]:
# build tree (fit) and get cross-val using training data

X_train_len = len(X_train)
y_train_len = len(y_train)

# One (n, 1) column per feature; double brackets keep the column axis.
fat_data = X_train[["fat_100g"]].values
sugars_data = X_train[["sugars_100g"]].values
salt_data = X_train[["salt_100g"]].values
energy_data = X_train[["energy_100g"]].values

# Feature matrix in the order fat, sugars, salt, energy; labels flattened to 1-D.
train_data = np.hstack([fat_data, sugars_data, salt_data, energy_data])
final_labels_train = np.ravel(y_train)

forest = forest.fit(train_data, final_labels_train)

# 5-fold cross-validated accuracy on the training data
scores = cross_val_score(forest, train_data, final_labels_train, scoring='accuracy', cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# get f1-score based on test data
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score


X_test_len = len(X_test)

# One (n, 1) column per feature, in the same order as used for training.
fat_test = X_test[["fat_100g"]].values
sugars_test = X_test[["sugars_100g"]].values
salt_test = X_test[["salt_100g"]].values
energy_test = X_test[["energy_100g"]].values

final_data_test = np.hstack([fat_test, sugars_test, salt_test, energy_test])
final_labels_test = np.ravel(y_test)

y_predicted = forest.predict(final_data_test)
#y_accurancy = np.mean(final_labels_test == y_predicted) * 100
#print("Accuracy of test data: {0:.1f}%".format(y_accurancy))

# print(fat_test)

# Scores on the held-out test split.
recall = recall_score(y_test, y_predicted)
precision = precision_score(y_test, y_predicted)
f_score = f1_score(y_test, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

In [None]:
def predict_chocolate(df):
    """Predict the chocolate label for the rows of ``df`` with the global ``forest``.

    Only the four feature columns are converted to numbers (unparsable values
    become NaN); rows with a missing feature are dropped before predicting,
    so the result can be shorter than ``df``. ``df`` itself is not modified.

    Args:
        df: frame with ``fat_100g``, ``sugars_100g``, ``salt_100g`` and
            ``energy_100g`` columns.

    Returns:
        The array returned by ``forest.predict``, one entry per kept row.
    """
    print("removing columns with NaN, might prevent aligning original df with predictions; pls remove NaN columns ahead of time")
    # Column order must match the order used when the forest was fitted.
    feature_columns = ["fat_100g", "sugars_100g", "salt_100g", "energy_100g"]
    features = df[feature_columns].apply(pd.to_numeric, errors='coerce').dropna()

    return forest.predict(features.values)

#print(predict_chocolate(food))

_feature_columns = ["fat_100g", "sugars_100g", "salt_100g", "energy_100g"]

# Numeric feature columns, rows with a missing/unparsable feature removed, so
# that predictions line up one-to-one with the rows of cleansed_candies.
cleansed_candies = tagged_candies.copy()
for _column in _feature_columns:
    cleansed_candies[_column] = pd.to_numeric(cleansed_candies[_column], errors='coerce')
cleansed_candies = cleansed_candies.dropna(subset=_feature_columns)

# Predict once and reuse the result for all three counts.
y_predicted = predict_chocolate(cleansed_candies)
assert len(y_predicted) == len(cleansed_candies), "predictions are not aligned with cleansed_candies"

# True where the category text contains "Choc" (missing categories count as False).
_is_choc = cleansed_candies['categories_en'].str.contains("Choc", regex=False, na=False).values

print("predicts: ", y_predicted.sum())

print("chcocs: ", int(_is_choc.sum()))

# Sum the predictions of the rows that really are chocolate; previously every
# prediction was tested against the last category left over from the loop above.
print("true positives: ", y_predicted[_is_choc].sum())
# good scores but poor performance in predicting chocolate; good scores come from high amounts of true negatives.
# Not great tbh fam

In [None]:
# print precision, recall and f1_score of sweets-only prediction

# 1 where the category text contains "Choc", else 0 (missing categories count as 0).
y_test_cleansed_candies = (
    cleansed_candies['categories_en']
    .str.contains("Choc", regex=False, na=False)
    .astype(int)
    .values
)

# y_predicted must be the candy-only prediction made from cleansed_candies;
# re-run that cell first if another cell has overwritten y_predicted since.
recall = recall_score(y_test_cleansed_candies, y_predicted)
precision = precision_score(y_test_cleansed_candies, y_predicted)
f_score = f1_score(y_test_cleansed_candies, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

In [None]:
# All foods whose category text contains "Choc", with numeric-looking columns
# converted and rows lacking any of the four nutrient values removed.
chocs_only = food.dropna(subset=["categories_en"])
chocs_only = chocs_only[chocs_only['categories_en'].str.contains("Choc")]
chocs_only = chocs_only.apply(pd.to_numeric, errors='ignore')
chocs_only = chocs_only.dropna(subset=["fat_100g", "sugars_100g", "salt_100g", "energy_100g"])

# Scatter of sugars (x) against energy (y) per 100g.
trace = go.Scatter(
    x = chocs_only['sugars_100g'],
    y = chocs_only["energy_100g"].astype(float),
    mode = 'markers'
)

# energy_100g is given in kJ (the bar chart divides it by 1000 to get MJ), so
# the axis is labelled kJ rather than J.
layout = go.Layout(
    title= 'Chocolates: sugar to energy',
    xaxis= dict(
        range=[0, 100],
        title="sugar in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="energy in kJ / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='choc-sugar_energy_ratio_scatter')

In [None]:
# Scatter of fat (x) against salt (y) per 100g over chocs_only.
trace = go.Scatter(
    x = chocs_only['fat_100g'],
    y = chocs_only["salt_100g"].astype(float),
    mode = 'markers'
)

# Only the x axis is clamped; the y range is left to plotly.
layout = go.Layout(
    title= 'Chocolates: fat to salt',
    xaxis= dict(
        range=[0, 100],
        title="fat in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="salt in g / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='choc-fat_salt_ratio_scatter')

In [None]:
# Scatter of fat (x) against salt (y) per 100g over the complete food table,
# as a reference for the chocolate-only plot.
trace = go.Scatter(
    x = food['fat_100g'].astype(float),
    y = food["salt_100g"].astype(float),
    mode = 'markers'
)

layout = go.Layout(
    title= 'Food: fat to salt',
    xaxis= dict(
        range=[0, 100],
        title="fat in g / 100g"
    ),
    yaxis=dict(
        range=[0, 100],
        title="salt in g / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='food-fat_salt_ratio_scatter')

In [None]:
# Subset of chocs_only whose category text contains "Dark".
dark_chocs = chocs_only[chocs_only['categories_en'].str.contains("Dark")]

print(len(dark_chocs))
#print(dark_chocs)

# All chocolates in light pink as background ...
trace1 = go.Scatter(
    x = chocs_only['sugars_100g'],
    y = chocs_only["energy_100g"].astype(float),
    mode = 'markers',
    name = "All Chocolates",
    marker = dict(
        color = 'rgba(255, 182, 193, .9)'
    )
)

# ... with the dark chocolates drawn on top in dark red.
trace2 = go.Scatter(
    x = dark_chocs['sugars_100g'],
    y = dark_chocs["energy_100g"].astype(float),
    mode = 'markers',
    name = "Dark Chocolates",
    marker = dict(
        #size = 10,
        color = 'rgba(152, 0, 0, .8)'
       # line = dict(
       #     width = 2,
       #     color = 'rgb(0, 0, 0)'
       # )
    )
)

# energy_100g is given in kJ (the bar chart divides it by 1000 to get MJ), so
# the axis is labelled kJ rather than J.
layout = go.Layout(
    title= 'Dark Chocolates: sugar to energy',
    xaxis= dict(
        range=[0, 100],
        title="sugar in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="energy in kJ / 100g"
    )
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='dark_choc-sugar_energy_ratio_scatter')

In [None]:
# NOTE(review): this cell rebuilds the same two-trace figure as the cell that
# defines dark_chocs - consider removing one of the two.
# All chocolates in light pink as background ...
trace1 = go.Scatter(
    x = chocs_only['sugars_100g'],
    y = chocs_only["energy_100g"].astype(float),
    mode = 'markers',
    name = "All Chocolates",
    marker = dict(
        color = 'rgba(255, 182, 193, .9)'
    )
)

# ... with the dark chocolates drawn on top in dark red.
trace2 = go.Scatter(
    x = dark_chocs['sugars_100g'],
    y = dark_chocs["energy_100g"].astype(float),
    mode = 'markers',
    name = "Dark Chocolates",
    marker = dict(
        #size = 10,
        color = 'rgba(152, 0, 0, .8)'
       # line = dict(
       #     width = 2,
       #     color = 'rgb(0, 0, 0)'
       # )
    )
)

# energy_100g is given in kJ (the bar chart divides it by 1000 to get MJ), so
# the axis is labelled kJ rather than J.
layout = go.Layout(
    title= 'Dark Chocolates: sugar to energy',
    xaxis= dict(
        range=[0, 100],
        title="sugar in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="energy in kJ / 100g"
    )
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='dark_choc-sugar_energy_ratio_scatter')

In [None]:
# predcols: columns used to predict [str list]
# label_cond: substr that has to be part of label for labels_to_predict [str]
# returns tuple (forest, y_test, X_test)
def train_forest(predcols, label_cond):
    """Train a random forest that predicts whether a food's category contains ``label_cond``.

    The columns in ``predcols`` plus ``categories_en`` are re-read from the
    open archive ``zf``, the feature columns are converted to floats, rows
    with any missing value are dropped, and the data is split into a training
    and a test part. The forest is fitted on the training part and its 5-fold
    cross-validated accuracy on that part is printed.

    Args:
        predcols: names of the columns used as features [str list].
        label_cond: substring that has to occur in ``categories_en`` for a
            row to be labelled 1.0 (case-sensitive) [str].

    Returns:
        Tuple ``(forest, y_test, X_test)``: the fitted classifier, the test
        labels (one-column frame named ``"is_" + label_cond``) and the test
        features.
    """
    predcols = list(predcols)
    cols_to_load = predcols + ["categories_en"]

    forest = RandomForestClassifier(n_estimators = 100, random_state = 22)

    # Close the archive member handle as soon as the frame has been loaded.
    with zf.open('food.csv') as food_csv:
        prediction_features = pd.read_csv(food_csv, parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',
                                          usecols=cols_to_load)

    # turn the feature columns into floats; unparsable values become NaN and
    # are dropped together with the missing ones below
    for column in predcols:
        prediction_features[column] = pd.to_numeric(prediction_features[column], errors='coerce')

    # remove rows with NA values (in any feature or in the category)
    prediction_features = prediction_features.dropna()

    # put labels in separate dataframe
    category_labels = prediction_features["categories_en"]
    prediction_features = prediction_features.drop('categories_en', axis=1)

    # 1.0 where the category text contains label_cond, else 0.0; the frame
    # keeps the row index of prediction_features
    predict_labels_id = "is_" + label_cond
    labels_to_predict = pd.DataFrame({
        predict_labels_id: category_labels.str.contains(label_cond, regex=False).astype(float)
    })

    # TODO: need to split this into 3 sets: training, validation(to tweak), test
    X_train, X_test, y_train, y_test = train_test_split(prediction_features, labels_to_predict, random_state=22)

    # feature matrix with columns in predcols order; labels flattened to 1-D
    train_data = X_train[predcols].values
    final_labels_train = np.ravel(y_train)

    forest = forest.fit(train_data, final_labels_train)

    # determine cross validation score
    scores = cross_val_score(forest, train_data, final_labels_train, scoring='accuracy', cv = 5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    return (forest, y_test, X_test)

def predict(forest, df, predcols):
    """Predict labels for the rows of ``df`` using the feature columns ``predcols``.

    Only the columns in ``predcols`` are converted to numbers (unparsable
    values become NaN); rows with a missing feature are dropped before
    predicting, so the result can be shorter than ``df``. ``df`` itself is
    not modified.

    Args:
        forest: fitted estimator exposing ``predict(X)``.
        df: frame containing at least the columns in ``predcols``.
        predcols: feature column names, in the order used for fitting.

    Returns:
        Whatever ``forest.predict`` returns for the kept rows.
    """
    print("removing columns with NaN, might prevent aligning original df with predictions; pls remove NaN columns ahead of time")
    features = df[list(predcols)].apply(pd.to_numeric, errors='coerce').dropna()

    return forest.predict(features.values)

In [None]:
# Train and score a "Dark choco" classifier on six nutrient columns.
# Note: this cell overwrites the globals y_predicted, recall, precision and
# f_score that earlier cells also assign.
predcols = ["sugars_100g", "energy_100g", "salt_100g", "fat_100g", "proteins_100g", "carbohydrates_100g"]
dark_choc_forest, dark_choc_y_test, dark_choc_X_test = train_forest(predcols, "Dark choco")

y_predicted = predict(dark_choc_forest, dark_choc_X_test, predcols)

# Scores on the test split returned by train_forest.
recall = recall_score(dark_choc_y_test, y_predicted)
precision = precision_score(dark_choc_y_test, y_predicted)
f_score = f1_score(dark_choc_y_test, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

In [None]:
"""
Chocolate predictor:

all food:
recall: 0.639705882353
precision:  0.692307692308
f1_score:  0.664968152866
    
sweets:
recall: 0.200680272109
precision:  0.983333333333
f1_score:  0.333333333333"""

"""
Dark Choc sugar/energy:

recall: 0.489010989011
precision:  0.585526315789
f1_score:  0.532934131737

sugar/energy/salt/fat:

recall: 0.678362573099
precision:  0.738853503185
f1_score:  0.707317073171

-> all worse if we remove salt
-> precision up, rest worse if we add sodium or remove fat

"""

In [None]:
# Plain-text dump of the food table (truncated according to the display options set in the first cell).
print(food)