# Load data and libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from spellchecker import SpellChecker
from nltk.metrics.distance import edit_distance
import re

In [2]:
df = pd.read_csv("myfitnesspal_1k_sample_14.csv")

In [3]:
df.columns

Index(['Name', 'Meal', 'Brand', 'Calories', 'Carbs', 'Fat', 'Protein',
       'Sodium', 'Amount', 'Units'],
      dtype='object')

In [4]:
# Remove rows that have no 'Meal' value, modifying df in place.
# The index is not reset here, so row labels keep gaps where rows were dropped
# (the full-frame display further down ends at label 10553 while the shape is 10310 rows).
df.dropna(subset='Meal', inplace=True)

In [5]:
list_foods = df['Meal'].to_list()

In [6]:
len(list_foods)

10310

In [7]:
df.shape

(10310, 10)

## Enrich dataset with more foods

In [8]:
food = pd.read_csv("data/food_update_log_entry.csv")

In [9]:
# Keep only the leading part of each description: lower-case it, cut at the first
# separator character and trim surrounding whitespace.
# NOTE(review): inside [...] the '|' is a literal character, not alternation, so the
# pattern also splits on '|' — confirm that this is intended.
tmp_df = (
    food['description']
    .str.lower()
    .str.split(r'[,|)|(|-]', expand=True, n=1)[0]
    .str.strip()
)
# Unique, non-missing names as a plain Python list.
tmp_list = tmp_df.drop_duplicates().dropna().to_list()
len(tmp_list)

273

In [10]:
# new_food = pd.read_csv("foodb_2020_04_07_csv/Food.csv")

In [11]:
# new_food.head()

### Combine and preprocess the 2 lists

In [12]:
# The extra food names go first and the meals taken from df go last.
# A later cell slices the meals back out of the processed list by position,
# so this order must not change.
enriched_food_list = tmp_list + list_foods
len(enriched_food_list)

10583

In [13]:
# Tokenize every food description.
# Entries for which word_tokenize raises TypeError (presumably non-string values)
# are reported and left out, which makes the result shorter than the input list.
word_tokens = []
for position, description in enumerate(enriched_food_list):
    try:
        tokens = word_tokenize(description)
    except TypeError:
        print(f"Error at {position}: {description}")
    else:
        word_tokens.append(tokens)
len(word_tokens)

10583

In [14]:
# Remove English stop words from every tokenized description.
# Written as a comprehension so that no loop variable leaks into the notebook
# namespace; in particular nothing here is named `list`, which would rebind the
# builtin for every cell that runs afterwards.
stop_words = set(stopwords.words('english'))
total_list_no_stopwords = [
    [token for token in tokens if token not in stop_words]
    for tokens in word_tokens
]
len(total_list_no_stopwords)

10583

In [15]:
# Reduce every remaining token to its noun lemma.
# A single lemmatizer instance is shared by all tokens instead of constructing a
# new one per word, and no loop variable named `list` is used, so the builtin
# is not rebound by this cell.
lemmatizer = WordNetLemmatizer()
dataset_processsed = [
    [lemmatizer.lemmatize(token, pos="n") for token in tokens]
    for tokens in total_list_no_stopwords
]
len(dataset_processsed)

10583

In [16]:
# The enrichment names were placed in front of the meals when the two lists were
# combined, so skipping len(tmp_list) entries leaves exactly the processed meals,
# in the same order as the rows of df. Using the length instead of a literal keeps
# the slice correct when the enrichment file changes.
original = dataset_processsed[len(tmp_list):]
# Fail early if tokenization dropped entries and the lists no longer line up.
assert len(original) == len(df), "processed meals are not aligned with the rows of df"
len(original)

10310

In [17]:
# Add the processed meals into the dataset.
# A plain Python list is assigned by position, so `original` has to contain exactly
# one token list per row of df, in row order.
df['processed'] = original

In [18]:
# Store each meal's tokens as one space-separated string.
# The strings are built from `original` rather than from the column itself, so
# re-running this cell gives the same result instead of joining the characters of
# strings that were already joined.
df['processed'] = [' '.join(tokens) for tokens in original]

In [19]:
df.head()

Unnamed: 0,Name,Meal,Brand,Calories,Carbs,Fat,Protein,Sodium,Amount,Units,processed
0,my - mcdonalds espresso pronto flat white,mcdonalds espresso pronto flat white,my,412,29,24,21,258,2.0,tall,mcdonalds espresso pronto flat white
1,quest bar - banana nut muffin natural protein bar,banana nut muffin natural protein bar,quest bar,170,25,5,20,260,60.0,g,banana nut muffin natural protein bar
2,uncle tobys australia - vita brits,vita brits,uncle tobys australia,176,33,1,5,195,3.0,biscuits,vita brit
3,pauls - smarter white milk,smarter white milk,pauls,342,34,12,24,402,600.0,ml,smarter white milk
4,quest bar - cookies and cream,cookies and cream,quest bar,180,22,7,21,310,1.0,bar,cooky cream


# Apply the Word2Vec to embed the tokenized dataset

#### 1. Create word embeddings
        Create a word to 100dim-vector df or dict
#### 2. Sum the word vectors of each sentence (sentence to vector df/dict)
#### 3. Create input embedding
#### 4. Compare input dict to sentence dict

In [66]:
# Create word embeddings from the processed corpus (enrichment names + meals).
# min_count=1 keeps every token, even those that occur once; window=5 sets the
# context window. The vector size is left at the library default (a later cell
# shows 100 dimensions).
word2vec = Word2Vec(dataset_processsed, min_count=1, window = 5)
# Shortcut to the model's word vectors, used below for lookups and similarity queries.
vectors = word2vec.wv

In [21]:
len(dataset_processsed)

10583

In [67]:
vectors.vectors[1].size

100

In [68]:
# word2vec.corpus_total_words
len(vectors.index_to_key)

3360

In [24]:
# wv['melon']

In [27]:
# NOTE(review): in the notebook as shown, sentence_to_vect_dataset is first assigned in
# a later cell ("Create a sentence to vector dict"), so on a fresh kernel run top to
# bottom this line raises NameError — confirm and move it below that cell.
len(sentence_to_vect_dataset.keys())

6616

In [28]:
# word_to_vect_dataset.keys()

In [29]:
# len(sentence_to_vect_datatset['mcdonalds espresso pronto flat white'])

In [30]:
# Words closest to the sum of two word vectors.
# The keyed vectors are bound to `vectors` in this notebook (no `wv` variable is
# defined), and similar_by_vector is the query method that takes a raw vector.
vectors.similar_by_vector(vectors['melon'] + vectors['salad'])

[('salad', 0.999779462814331),
 ('-', 0.999150812625885),
 ('mix', 0.9991172552108765),
 ('oz', 0.9990977048873901),
 ('latte', 0.999068558216095),
 ('beef', 0.9990552663803101),
 ("''", 0.999039888381958),
 ('fruit', 0.9990172386169434),
 ('water', 0.9990145564079285),
 ('cheese', 0.9989919066429138)]

In [69]:
res = vectors.similar_by_word('muffin')
type(res)

list

# Train the Word2Vec model

In [70]:
# Train the model on the full processed corpus for 500 epochs.
# total_examples is the number of token lists in the corpus; the tuple shown as the
# cell output is the value returned by train().
word2vec.train(dataset_processsed, total_examples = len(dataset_processsed), epochs=500)

(14181457, 17898000)

In [72]:
# Create a word to vector dict: every vocabulary token mapped to its trained vector
word_to_vect_dataset = {
    token: word2vec.wv[token] for token in word2vec.wv.index_to_key
}
# word_to_vect_dataset

In [73]:
# Create a sentence to vector dict.
# Key:   the meal's tokens that exist in the vocabulary, joined by single spaces.
# Value: the sum of those tokens' word vectors.
# Meals with the same token sequence share one key, so the dict can be smaller
# than the number of meals.
embedding_dim = word2vec.wv.vector_size  # taken from the model rather than hard-coded
sentence_to_vect_dataset = {}
for tokens in original:
    known_tokens = [token for token in tokens if token in word_to_vect_dataset]
    sentence_vector = np.zeros(embedding_dim)
    for token in known_tokens:
        sentence_vector += word_to_vect_dataset[token]
    sentence_to_vect_dataset[' '.join(known_tokens).strip()] = sentence_vector

# Preprocess and embed the input

In [74]:
input_sentence = 'fruit salad'

In [75]:
word_tokens_input = word_tokenize(input_sentence)
word_tokens_input

['fruit', 'salad']

In [76]:
# Drop English stop words from the query tokens (same filter as for the dataset)
stopwords_input = [
    token for token in word_tokens_input if token not in stop_words
]
stopwords_input

['fruit', 'salad']

In [77]:
# Reduce the query tokens to noun lemmas, mirroring the dataset preprocessing
query_lemmatizer = WordNetLemmatizer()
noun_lemmatized = [
    query_lemmatizer.lemmatize(token, pos="n") for token in stopwords_input
]
noun_lemmatized

['fruit', 'salad']

In [78]:
# Embed the query: sum the dataset vectors of its tokens.
# A token that is missing from the vocabulary is spell-checked and the correction is
# looked up instead; tokens that still cannot be found contribute nothing to the sum.

input_lemmatizer = WordNetLemmatizer()
spell = SpellChecker(distance=2)
misspelled = spell.unknown(noun_lemmatized)

word_tokens_input_corr = []
# Dimension comes from the model so the sum always matches the dataset vectors.
total_vectors_found = np.zeros(word2vec.wv.vector_size)
for word in noun_lemmatized:
    if word in word_to_vect_dataset:
        total_vectors_found += word_to_vect_dataset[word]
        word_tokens_input_corr.append(word)
        continue

    # correction() may return None when it has no candidate; keep the token itself then.
    corr_word = (spell.correction(word) if word in misspelled else None) or word
    word_tokens_input_corr.append(corr_word)

    # The vocabulary holds noun lemmas, so the correction is looked up in lemmatized form.
    corr_vector = word_to_vect_dataset.get(input_lemmatizer.lemmatize(corr_word, pos="n"))
    if corr_vector is not None:
        total_vectors_found += corr_vector
# total_vectors_found
word_tokens_input_corr = [input_lemmatizer.lemmatize(word, pos = "n") for word in word_tokens_input_corr]
word_tokens_input_corr

['fruit', 'salad']

# Look for similarities in the dataset

In [79]:
# Cosine similarity between the query vector and every sentence vector.
# Each value is the 1x1 array returned by cosine_similarity; later cells index it
# with [0][0].
# input_vector = list(word_to_vect_input.values())[0][0]
query_matrix = [total_vectors_found]
similarities = {
    candidate: cosine_similarity(query_matrix, [candidate_vector])
    for candidate, candidate_vector in sentence_to_vect_dataset.items()
}
top_n = 10
ranked_matches = sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
sorted_similarities = ranked_matches[:top_n]

# Number of most similar sentences to print
# for sentence, similarity_score in sorted_similarities[:top_n]:
# print(f"Sentence: {sentence}, Similarity Score: {similarity_score}")

In [80]:
max(similarities.values())[0][0]

1.0000000000000002

In [81]:
max_key = max(similarities, key=lambda k: similarities[k][0][0])
max_key

'fruit salad'

In [82]:
# similarities['protein bar']

In [83]:
sorted_similarities[0][0]

'fruit salad'

In [84]:
df

Unnamed: 0,Name,Meal,Brand,Calories,Carbs,Fat,Protein,Sodium,Amount,Units,processed
0,my - mcdonalds espresso pronto flat white,mcdonalds espresso pronto flat white,my,412,29,24,21,258,2.0,tall,mcdonalds espresso pronto flat white
1,quest bar - banana nut muffin natural protein bar,banana nut muffin natural protein bar,quest bar,170,25,5,20,260,60.0,g,banana nut muffin natural protein bar
2,uncle tobys australia - vita brits,vita brits,uncle tobys australia,176,33,1,5,195,3.0,biscuits,vita brit
3,pauls - smarter white milk,smarter white milk,pauls,342,34,12,24,402,600.0,ml,smarter white milk
4,quest bar - cookies and cream,cookies and cream,quest bar,180,22,7,21,310,1.0,bar,cooky cream
...,...,...,...,...,...,...,...,...,...,...,...
10550,morrisons - frozen hash browns,frozen hash browns,morrisons,113,16,5,1,100,1.0,hash,frozen hash brown
10551,toast - toast,toast,toast,250,53,0,18,0,2.5,slice,toast
10552,sainsburys - liquorice allsorts,liquorice allsorts,sainsburys,895,0,4,0,0,250.0,g,liquorice allsorts
10553,warburtons - fruity tea cakes,fruity tea cakes,warburtons,328,60,4,11,380,2.0,tea,fruity tea cake


In [103]:
sorted_similarities

[('fruit salad', array([[1.]])),
 ('tropical fruit salad', array([[0.93929386]])),
 ('fresh fruit salad', array([[0.88147553]])),
 ('mixed berry fruit salad', array([[0.78877057]])),
 ('fruit chew', array([[0.76055011]])),
 ('fruit teacake', array([[0.75944644]])),
 ('kiwi fruit', array([[0.74754216]])),
 ('fruit gusher', array([[0.74638288]])),
 ('forest fruit', array([[0.73992296]])),
 ('cobb salad', array([[0.7209426]]))]

### Extract nutrition data for the chosen match

In [104]:
sorted_similarities[0][0]

'fruit salad'

In [105]:
# Take the first dataset row whose processed text equals the best match and
# report its nutrition values.
best_match = sorted_similarities[0][0]
matching_rows = df[df['processed'] == best_match]
df_meal = matching_rows.iloc[0]
print(f"{df_meal['Name']}: {df_meal['Calories']}kcal, protein: {df_meal['Protein']}g, carbs: {df_meal['Carbs']}g, fats: {df_meal['Fat']}g")


generic - fruit salad: 180kcal, protein: 4g, carbs: 42g, fats: 0g


In [106]:
df_meal['Name']

'generic - fruit salad'

In [None]:
# Can I get something that looks for an exact combination of words e.g. 'egg sandwich'?
# if not found, look for nearest meaning i.e. egg with a carb (toast, muffin, roll, wrap etc.)?
#  What about typos?

In [None]:
# TODO:
# Find a list of words that don't exist in database, single foods is best
# Split the current dataset in foods only and the rest
# Add new foods to the dataset. Create embeddings
# Separate the original list of foods and reconnect it with the rest of the dataframe
# Add the tokenized column to the dataframe so that nutritional information can be retrieved

# Enriching the dataset with new words

Upload a list of new foods

Separate the original foods and merge them back with the rest of the dataset

Use the newly trained model to embed the input