In [82]:
import plotly.plotly as py
import plotly.graph_objs as go
py.sign_in('Odin123', '5Ba7VLNrZsqtUXUatHiZ')

import pandas as pd
import numpy as np
import csv
import zipfile
import re

import random
import matplotlib as mpl
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from scipy.misc import imread
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from wordcloud import WordCloud, STOPWORDS


%matplotlib inline
filepath = "../resources/food.csv.zip"
zf = zipfile.ZipFile(filepath)
food = pd.read_csv(zf.open('food.csv'), parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',10)

# get a list of all candy tags and combinations of candy tags, get all foods who's tags are in that list.
# bad approach because it's slow and if something contains candy tags + non-candy tags (e.g. 'desserts, water')
# it falls through the cracks. Sadly I can't figure out how to do
# "for candy_tag in candy_tags: get items where candy_tag in food['categories_tags']"

from itertools import combinations

candy_tags = ["en:sugary-snacks", "en:desserts", "en:candies", "en:chocolates", "en:pies", "en:puddings", 
 "en:biscuits-and-cakes", "en:confectioneries", "en:chocolate-chip-cookies", "en:chocolate-block", 
 "en:jelly-crystals", "en:carrot-cake", "en:chocolate-covered-muesli-bar"]
augmented_tags = candy_tags

for i in range(2, 4):
    augmented_tags += [",".join(map(str, comb)) for comb in combinations(candy_tags, i)]

#augmented_tags
tagged_candies = food[food['categories_tags'].isin(augmented_tags)]

#tagged_candies = [item for item in food if not set(augmented_tags).isdisjoint(item['categories_tags'])]


In [83]:
cols_to_load = ["sugars_100g", "energy_100g", "salt_100g", "fat_100g", "proteins_100g", "carbohydrates_100g", "categories_en", "code"]
clean_food = pd.read_csv(zf.open('food.csv'), parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8', 
                                      usecols=cols_to_load)
clean_food = clean_food.apply(pd.to_numeric, errors='ignore')
 # remove columns with NA values
clean_food = clean_food.dropna()

In [84]:
### Machine Learning Stuff ###
from sklearn_pandas import DataFrameMapper
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
#pd.to_numeric(tagged_candies, errors='coerce')
#tagged_candies[['sugars_100g','fat_100g', 'salt_100g', 'energy_100g']] = tagged_candies[['sugars_100g','fat_100g', 'salt_100g', 'energy_100g']].apply(lambda x: pd.to_numeric(x, errors='coerce'))

# TODO: use .apply to split tags and find all candies, use sklean-pandas to start some machine learning stuffs

In [85]:
def predict_chocolate(df):
    print("removing columns with NaN, might prevent aligning original df with predictions; pls remove NaN columns ahead of time")
    # turn everything you can into floats and remove columns with NA values
    df = df.apply(pd.to_numeric, errors='ignore')
    df = df.dropna(subset=["fat_100g", "sugars_100g", "salt_100g", "energy_100g"])

    length = len(df["fat_100g"].values)
    
    fat_data = df["fat_100g"].values.reshape(length, 1)
    sugars_data = df["sugars_100g"].values.reshape(length, 1)
    salt_data = df["salt_100g"].values.reshape(length, 1)
    energy_data = df["energy_100g"].values.reshape(length, 1)
    
    # combine training data and flatten label-column to list
    final_data = np.hstack([fat_data, sugars_data, salt_data, energy_data])
        
    
    return forest.predict(final_data)


In [86]:
def plot_attributes(id_string, name, use_only_chocs=True):
    if use_only_chocs:
        selected_chocs = chocs_only[chocs_only['categories_en'].str.contains(id_string)]
    else:
        cols_to_load = ["sugars_100g", "energy_100g", "salt_100g", "fat_100g", "proteins_100g", "carbohydrates_100g", 'categories_en']
        food = pd.read_csv(zf.open('food.csv'), parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8', 
                                      usecols=cols_to_load)
        # turn all objects (aside from categories) into floats
        food = food.apply(pd.to_numeric, errors='ignore')
        # remove columns with NA values
        food = food.dropna()
        
        selected_chocs = food[food['categories_en'].str.contains(id_string)]  # shouldnt be called selected_chocs since its all food
        
    print(len(selected_chocs))
    #print(dark_chocs)

    trace1 = go.Scatter(
        x = selected_chocs['sugars_100g'],
        y = [85 for x in range(len(selected_chocs['sugars_100g']))],
        mode = 'markers',
        name = "x: sugar per 100g",
        marker = dict(
            color = 'rgba(152, 0, 0, .8)'
        )
    )
    
    trace2 = go.Scatter(
        x = selected_chocs["fat_100g"],
        y = [70 for x in range(len(selected_chocs['fat_100g']))],
        mode = 'markers',
        name = "x: fat per 100g",
        marker = dict(
            color = 'rgba(152, 152, 0, .8)'
        )
    )
    
    trace3 = go.Scatter(
        x = selected_chocs["salt_100g"],
        y = [55 for x in range(len(selected_chocs['salt_100g']))],
        mode = 'markers',
        name = "x: salt per 100g",
        marker = dict(
            color = 'rgba(188, 113, 255, .9)'
        )
    )
    
    trace4 = go.Scatter(
        x = selected_chocs["carbohydrates_100g"],
        y = [40 for x in range(len(selected_chocs['carbohydrates_100g']))],
        mode = 'markers',
        name = "x: carbohydrates per 100g",
        marker = dict(
            color = 'rgba(100, 100, 255, .9)'
        )
    )
    
    trace5 = go.Scatter(
        x = selected_chocs['proteins_100g'],
        y = [25 for x in range(len(selected_chocs['proteins_100g']))],
        mode = 'markers',
        name = "x: proteins per 100g",
        marker = dict(
            #size = 10,
            color = 'rgba(79, 190, 96, .9)'
           # line = dict(
           #     width = 2,
           #     color = 'rgb(0, 0, 0)'
           # )
        )
    )

    layout = go.Layout(
        title= name +  ': attribute clusters',
        xaxis= dict(
            range=[0, 100],
            title="x-axis"
        ),
        yaxis=dict(
            range=[0, 100],
            title="y-axis"
        )
    )

    data = [trace1, trace2, trace3, trace4, trace5]

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename=name + '-attribute_scatter')

In [87]:
# predcols: columns used to predict [str list]
# label_cond: substr that has to be part of label for labels_to_predict [str]
# forest_count: number of trees to use in forest
# custom_df: dataframe to use, None for standard food.csv
# returns tuple (forest, y_test, X_test)
def train_forest(predcols, label_cond, forest_count = 100, custom_df = None):
    cols_to_load = predcols + ["categories_en"]
    
    forest = RandomForestClassifier(n_estimators = forest_count, random_state = 22)
    
    if custom_df is None:
        prediction_features = pd.read_csv(zf.open('food.csv'), parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8', 
                                      usecols=cols_to_load)
    else:
        prediction_features = custom_df
    
    #print(prediction_features)
    
    # turn all objects (aside from categories) into floats
    prediction_features = prediction_features.apply(pd.to_numeric, errors='ignore')

    # remove columns with NA values
    prediction_features = prediction_features.dropna()
    # print(prediction_features)

    # put labels in separate dataframe
    labels_to_predict = prediction_features["categories_en"]
    prediction_features.drop('categories_en', axis=1, inplace=True)

    # turn labels_to_predict from a string-dataframe into a numbers-dataframe (indicating whether the word 'chocolate' is in categories with 1 or 0)
    indeces = []
    values = []
    for index, value in labels_to_predict.iteritems():
        indeces.append(index)
        if label_cond in value:
            values.append(1.0)
        else:
            values.append(0.0)

    predict_labels_id = "is_" + label_cond

    values = np.asarray(values)  # turn values into np array
    labels_dict = {predict_labels_id : pd.Series(values, indeces)} 

    labels_to_predict = pd.DataFrame(labels_dict)

    #for index, value in labels_to_predict[predict_labels_id].iteritems():
    #    print(value)

    # TODO: need to split this into 3 sets: training, validation(to tweak), test
    X_train, X_test, y_train, y_test = train_test_split(prediction_features, labels_to_predict, random_state=22)


    X_train_len = len(X_train[predcols[0]].values)
    y_train_len = len(y_train[predict_labels_id].values)
    
    train_data = []
    for attribute in predcols:
        train_data.append(X_train[attribute].values.reshape(X_train_len, 1))

    # combine training data and flatten label-column to list
    train_data = np.hstack(train_data)
    final_labels_train = np.ravel(y_train)

    forest = forest.fit(train_data, final_labels_train)

    # determine cross validation score
    scores = cross_val_score(forest, train_data, final_labels_train, scoring='accuracy', cv = 5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    return (forest, y_test, X_test)

#def predict(forest, df, predcols):
#    print("removing columns with NaN, might prevent aligning original df with predictions; pls remove NaN columns ahead of time")
#    # turn everything you can into floats and remove columns with NA values
#    df = df.apply(pd.to_numeric, errors='ignore')
#    df = df.dropna(subset=predcols)

#    length = len(df[predcols[0]].values)
    
#    pred_data = []
#    for attribute in predcols:
#        pred_data.append(df[attribute].values.reshape(length, 1))
    
    # combine training data and flatten label-column to list
#    final_data = np.hstack(pred_data)
        
#    return forest.predict(final_data)

In [88]:
def is_in_food_list(items, item):
    for i in items:
        if i["code"] == item["code"]:
            return True
    return False

def predict(forest, df, predcols):
    length = len(df[predcols[0]].values)
    
    pred_data = []
    for attribute in predcols:
        pred_data.append(df[attribute].values.reshape(length, 1))
    
    # combine training data and flatten label-column to list
    final_data = np.hstack(pred_data)
        
    return forest.predict(final_data)

# call with list of forests, df, list of predcols (one for each forest!)
# should return a list with all items that were predicted to be a kind of chocolate by the forests
def predict_chocolate_unified(forests, df, predcols_list):
    pred_data = []
    for i in range(len(forests)):
        predicted_by_this_forest = predict(forests[i], df, predcols_list[i])
        for j in range(len(df)):
            if (predicted_by_this_forest[j] == 1):
                if not is_in_food_list(pred_data, df.iloc[j]):
                    pred_data.append(df.iloc[j])
            
    print(pred_data)
    return pred_data




In [89]:
# 100 trees, seed for rnd-ness is 22
forest = RandomForestClassifier(n_estimators = 100, random_state = 22)

prediction_features = pd.read_csv(zf.open('food.csv'), parse_dates=True, dtype=object, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8', 
                                  usecols=["sugars_100g", 
                                           "fat_100g",
                                           "salt_100g",
                                           "energy_100g",
                                           "categories_en",
                                          ])

# turn all objects (aside from categories) into floats
prediction_features = prediction_features.apply(pd.to_numeric, errors='ignore')

# remove columns with NA values
prediction_features = prediction_features.dropna()
# print(prediction_features)

# put labels in separate dataframe
labels_to_predict = prediction_features["categories_en"]
prediction_features.drop('categories_en', axis=1, inplace=True)

# turn labels_to_predict from a string-dataframe into a numbers-dataframe (indicating whether the word 'chocolate' is in categories with 1 or 0)
indeces = []
values = []
for index, value in labels_to_predict.iteritems():
    indeces.append(index)
    if 'chocolate' in value:
        values.append(1.0)
    else:
        values.append(0.0)

values = np.asarray(values)  # turn values into np array
labels_dict = {'is_chocolate' : pd.Series(values, indeces)} 

labels_to_predict = pd.DataFrame(labels_dict)

for index, value in labels_to_predict['is_chocolate'].iteritems():
    pass#print(value)

# TODO: need to split this into 3 sets: training, validation(to tweak), test
X_train, X_test, y_train, y_test = train_test_split(prediction_features, labels_to_predict, random_state=22)

In [90]:
# build tree (fit) and get cross-val using training data

X_train_len = len(X_train["fat_100g"].values)
y_train_len = len(y_train["is_chocolate"].values)

fat_data = X_train["fat_100g"].values.reshape(X_train_len, 1)
sugars_data = X_train["sugars_100g"].values.reshape(X_train_len, 1)
salt_data = X_train["salt_100g"].values.reshape(X_train_len, 1)
energy_data = X_train["energy_100g"].values.reshape(X_train_len, 1)

# combine training data and flatten label-column to list
train_data = np.hstack([fat_data, sugars_data, salt_data, energy_data])
final_labels_train = np.ravel(y_train)

forest = forest.fit(train_data, final_labels_train)

# determine cross validation score
scores = cross_val_score(forest, train_data, final_labels_train, scoring='accuracy', cv = 5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.98 (+/- 0.01)


In [91]:
# get f1-score based on test data
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score


X_test_len = len(X_test["fat_100g"].values)

fat_test = X_test["fat_100g"].values.reshape(X_test_len, 1)
sugars_test = X_test["sugars_100g"].values.reshape(X_test_len, 1)
salt_test = X_test["salt_100g"].values.reshape(X_test_len, 1)
energy_test = X_test["energy_100g"].values.reshape(X_test_len, 1)

final_data_test = np.hstack([fat_test, sugars_test, salt_test, energy_test])
final_labels_test = np.ravel(y_test)

y_predicted = forest.predict(final_data_test)
for i in y_predicted:
    pass#print(i)
#y_accurancy = np.mean(final_labels_test == y_predicted) * 100
#print("Accuracy of test data: {0:.1f}%".format(y_accurancy))

# print(fat_test)

recall = recall_score(y_test, y_predicted)
precision = precision_score(y_test, y_predicted)
f_score = f1_score(y_test, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

recall: 0.639705882353
precision:  0.692307692308
f1_score:  0.664968152866


In [92]:
#print(predict_chocolate(food))

cleansed_candies = tagged_candies.apply(pd.to_numeric, errors='ignore')
cleansed_candies = cleansed_candies.dropna(subset=["fat_100g", "sugars_100g", "salt_100g", "energy_100g"])

acc = 0
for i in (predict_chocolate(cleansed_candies)):
    acc += i
    
print("predicts: ", acc)

acc = 0
for index, value in cleansed_candies['categories_en'].iteritems():
    if "Choc" in value:
        acc += 1

print("chcocs: ", acc)

acc = 0
y_predicted = predict_chocolate(cleansed_candies)
for i in (y_predicted):
    if "Choc" in value:
        acc += i
print("true positives: ", acc)
# good scores but poor performance in predicting chocolate; good scores come from high amounts of true negatives.
# Not great tbh fam

removing columns with NaN, might prevent aligning original df with predictions; pls remove NaN columns ahead of time
predicts:  60.0
chcocs:  294
removing columns with NaN, might prevent aligning original df with predictions; pls remove NaN columns ahead of time
true positives:  60.0


In [93]:
# print precision, recall and f1_score of sweets-only prediction

y_test_cleansed_candies = cleansed_candies['categories_en'].copy()
for index, value in y_test_cleansed_candies.iteritems():
    if "Choc" in value:
        y_test_cleansed_candies[index] = "1"
    else:
        y_test_cleansed_candies[index] = "0"

        
y_test_cleansed_candies = y_test_cleansed_candies.astype(int).values
        
# problem: yfind out what y_predicted contains
    
recall = recall_score(y_test_cleansed_candies, y_predicted)
precision = precision_score(y_test_cleansed_candies, y_predicted)
f_score = f1_score(y_test_cleansed_candies, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

recall: 0.200680272109
precision:  0.983333333333
f1_score:  0.333333333333


In [94]:
chocs_only = food.dropna(subset=["categories_en"])
chocs_only = chocs_only[chocs_only['categories_en'].str.contains("Choc")]
chocs_only = chocs_only.apply(pd.to_numeric, errors='ignore')
chocs_only = chocs_only.dropna(subset=["fat_100g", "sugars_100g", "salt_100g", "energy_100g"])

trace = go.Scatter(
    x = chocs_only['sugars_100g'],
    y = chocs_only["energy_100g"].astype(float),
    mode = 'markers'
)

layout = go.Layout(
    title= 'Chocolates: sugar to energy',
    xaxis= dict(
        range=[0, 100],
        title="sugar in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="energy in J / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='choc-sugar_energy_ratio_scatter')

In [95]:
trace = go.Scatter(
    x = chocs_only['fat_100g'],
    y = chocs_only["salt_100g"].astype(float),
    mode = 'markers'
)

layout = go.Layout(
    title= 'Chocolates: fat to salt',
    xaxis= dict(
        range=[0, 100],
        title="fat in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="salt in g / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='choc-fat_salt_ratio_scatter')

In [96]:
trace = go.Scatter(
    x = food['fat_100g'].astype(float),
    y = food["salt_100g"].astype(float),
    mode = 'markers'
)

layout = go.Layout(
    title= 'Food: fat to salt',
    xaxis= dict(
        range=[0, 100],
        title="fat in g / 100g"
    ),
    yaxis=dict(
        range=[0, 100],
        title="salt in g / 100g"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='food-fat_salt_ratio_scatter')

In [97]:
dark_chocs = chocs_only[chocs_only['categories_en'].str.contains("Dark")]
white_chocs = chocs_only[chocs_only['categories_en'].str.contains("White")]

print(len(dark_chocs))
#print(dark_chocs)

trace1 = go.Scatter(
    x = chocs_only['sugars_100g'],
    y = chocs_only["energy_100g"].astype(float),
    mode = 'markers',
    name = "All Chocolates",
    marker = dict(
        color = 'rgba(255, 182, 193, .9)'
    )
)

trace2 = go.Scatter(
    x = white_chocs['sugars_100g'],
    y = white_chocs["energy_100g"].astype(float),
    mode = 'markers',
    name = "white Chocolates",
    marker = dict(
        #size = 10,
        color = 'rgba(152, 0, 0, .8)'
       # line = dict(
       #     width = 2,
       #     color = 'rgb(0, 0, 0)'
       # )
    )
)

layout = go.Layout(
    title= 'White Chocolates: sugar to energy',
    xaxis= dict(
        range=[0, 100],
        title="sugar in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="energy in J / 100g"
    )
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='white_choc-sugar_energy_ratio_scatter')

724


In [98]:
trace1 = go.Scatter(
    x = chocs_only['sugars_100g'],
    y = chocs_only["energy_100g"].astype(float),
    mode = 'markers',
    name = "All Chocolates",
    marker = dict(
        color = 'rgba(255, 182, 193, .9)'
    )
)

trace2 = go.Scatter(
    x = dark_chocs['sugars_100g'],
    y = dark_chocs["energy_100g"].astype(float),
    mode = 'markers',
    name = "Dark Chocolates",
    marker = dict(
        #size = 10,
        color = 'rgba(152, 0, 0, .8)'
       # line = dict(
       #     width = 2,
       #     color = 'rgb(0, 0, 0)'
       # )
    )
)

layout = go.Layout(
    title= 'Dark Chocolates: sugar to energy',
    xaxis= dict(
        range=[0, 100],
        title="sugar in g / 100g"
    ),
    yaxis=dict(
        #range=[0, 100],
        title="energy in J / 100g"
    )
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
#py.iplot(fig, filename='dark_choc-sugar_energy_ratio_scatter')

In [99]:
# predict dark chocolate
dark_predcols = ["sugars_100g", "energy_100g", "salt_100g", "fat_100g", "proteins_100g", "carbohydrates_100g"]
dark_choc_forest, dark_choc_y_test, dark_choc_X_test = train_forest(dark_predcols, "Dark choco")

y_predicted = predict(dark_choc_forest, dark_choc_X_test, dark_predcols)

recall = recall_score(dark_choc_y_test, y_predicted)
precision = precision_score(dark_choc_y_test, y_predicted)
f_score = f1_score(dark_choc_y_test, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

Accuracy: 0.99 (+/- 0.00)
recall: 0.752747252747
precision:  0.796511627907
f1_score:  0.774011299435


In [100]:
# predict white chocolate
white_predcols = ["sugars_100g", "energy_100g", "salt_100g", "fat_100g", "proteins_100g", "carbohydrates_100g"]
white_choc_forest, white_choc_y_test, white_choc_X_test = train_forest(white_predcols, "White choco")

y_predicted = predict(white_choc_forest, white_choc_X_test, white_predcols)

recall = recall_score(white_choc_y_test, y_predicted)
precision = precision_score(white_choc_y_test, y_predicted)
f_score = f1_score(white_choc_y_test, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

Accuracy: 1.00 (+/- 0.00)
recall: 0.133333333333
precision:  1.0
f1_score:  0.235294117647


In [101]:
# predict milk chocolate
milk_predcols = ["sugars_100g", "salt_100g", "fat_100g", "proteins_100g"]
milk_choc_forest, milk_choc_y_test, milk_choc_X_test = train_forest(milk_predcols, "Milk choco", forest_count = 500)

y_predicted = predict(milk_choc_forest, milk_choc_X_test, milk_predcols)

recall = recall_score(milk_choc_y_test, y_predicted)
precision = precision_score(milk_choc_y_test, y_predicted)
f_score = f1_score(milk_choc_y_test, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

Accuracy: 0.99 (+/- 0.00)
recall: 0.503546099291
precision:  0.628318584071
f1_score:  0.55905511811


In [102]:
predict_chocolate_unified([milk_choc_forest], clean_food, [milk_predcols])

[code                                        0009542018054
categories_en    Sugary snacks,Chocolates,Milk chocolates
energy_100g                                          2400
fat_100g                                               40
sugars_100g                                            50
proteins_100g                                           5
salt_100g                                           0.222
Name: 191, dtype: object, code             0034000123803
categories_en          Spreads
energy_100g               2260
fat_100g                  32.4
sugars_100g               51.4
proteins_100g             8.11
salt_100g                0.343
Name: 1257, dtype: object, code                                                 0034000171156
categories_en    Sugary snacks,Chocolates,Milk chocolates,Choco...
energy_100g                                                   2090
fat_100g                                                      31.6
sugars_100g                                            

[code                                        0009542018054
 categories_en    Sugary snacks,Chocolates,Milk chocolates
 energy_100g                                          2400
 fat_100g                                               40
 sugars_100g                                            50
 proteins_100g                                           5
 salt_100g                                           0.222
 Name: 191, dtype: object, code             0034000123803
 categories_en          Spreads
 energy_100g               2260
 fat_100g                  32.4
 sugars_100g               51.4
 proteins_100g             8.11
 salt_100g                0.343
 Name: 1257, dtype: object, code                                                 0034000171156
 categories_en    Sugary snacks,Chocolates,Milk chocolates,Choco...
 energy_100g                                                   2090
 fat_100g                                                      31.6
 sugars_100g                          

In [103]:
#plot_attributes("White choco", "White Chocolate")
#plot_attributes("Milk choco", "Milk Chocolate")
#plot_attributes("Dark choco", "Dark Chocolate")
#plot_attributes("", "All Foods", use_only_chocs=False)

In [104]:
"""
Chocolate predictor:

all food:
recall: 0.639705882353
precision:  0.692307692308
f1_score:  0.664968152866
    
sweets:
recall: 0.200680272109
precision:  0.983333333333
f1_score:  0.333333333333"""

"""
Dark Choc sugar/energy:

recall: 0.489010989011
precision:  0.585526315789
f1_score:  0.532934131737

sugar/energy/salt/fat:

recall: 0.678362573099
precision:  0.738853503185
f1_score:  0.707317073171


-> all worse if we remove salt
-> precision up, rest worse if we add sodium or remove fat

White Choc all-the-things:
recall: 0.133333333333
precision:  1.0
f1_score:  0.235294117647

Milk Choc all but carbohydrates with 500 trees instead of 100:  <BEST f1 MILK>
recall: 0.486842105263
precision:  0.660714285714
f1_score:  0.560606060606

Milk Choc all but carbohydrates & energy with 500 trees instead of 100:  <SIDEGRADE>
recall: 0.503546099291
precision:  0.628318584071
f1_score:  0.55905511811


"""

'\nDark Choc sugar/energy:\n\nrecall: 0.489010989011\nprecision:  0.585526315789\nf1_score:  0.532934131737\n\nsugar/energy/salt/fat:\n\nrecall: 0.678362573099\nprecision:  0.738853503185\nf1_score:  0.707317073171\n\n\n-> all worse if we remove salt\n-> precision up, rest worse if we add sodium or remove fat\n\nWhite Choc all-the-things:\nrecall: 0.133333333333\nprecision:  1.0\nf1_score:  0.235294117647\n\nMilk Choc all but carbohydrates with 500 trees instead of 100:  <BEST f1 MILK>\nrecall: 0.486842105263\nprecision:  0.660714285714\nf1_score:  0.560606060606\n\nMilk Choc all but carbohydrates & energy with 500 trees instead of 100:  <SIDEGRADE>\nrecall: 0.503546099291\nprecision:  0.628318584071\nf1_score:  0.55905511811\n\n\n'

In [105]:
mult_whites = clean_food.copy()

counter = 0
for index, row in mult_whites.iterrows():
    if "White choco" in row['categories_en']:
        counter += 1
print("pre: ", counter)
# create a df that contains all food but every time there's a white chocolate it is added 20 more times

white_choc_templist = []
for index, row in mult_whites.iterrows():
    if "White choco" in row['categories_en']:
        white_choc_templist.extend([row for i in range(20)])

for i in white_choc_templist:
    i["code"] = int(i["code"]) + 1000
        
mult_whites = mult_whites.append(white_choc_templist, ignore_index=True) # could multiply white_choc_templist here instead of earlier as well

counter = 0
for index, row in mult_whites.iterrows():
    if "White choco" in row['categories_en']:
        counter += 1
print("post: ", counter)



pre:  75
post:  1575


In [106]:
white_choc_forest, white_choc_y_test, white_choc_X_test = train_forest(white_predcols, "White choco", forest_count = 100, custom_df = mult_whites)

y_predicted = predict(white_choc_forest, white_choc_X_test, white_predcols)

recall = recall_score(white_choc_y_test, y_predicted)
precision = precision_score(white_choc_y_test, y_predicted)
f_score = f1_score(white_choc_y_test, y_predicted)

print("recall:" , recall)
print("precision: ", precision)
print("f1_score: ", f_score)

KeyError: 'carbohydrates_100g'

In [None]:
# predict_proba(X)   - predicts probabilities for classification = confidence, I think

#print(food)
for index, value in food['categories_en'].iteritems():
    try:
        if "choco" in value:
            print(value)
    except:
        pass

In [None]:
print(food)