## Prerequisites

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os
from os.path import join
import json
import numpy as np
import re

# Base directory for all data files used in this notebook.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being the one that contains the mounted `gdrive` folder — confirm.
main_path = 'gdrive/MyDrive/bot/data'
# Raw recipes (read by clear_recipes2).
recipes_path = join(main_path, 'recipes/recipes.json')
# One JSON object per line, {url: vector} (written by make_ingredients_vector_file).
recipes_vectors_path = join(main_path, 'recipes/recipes_vectors.json')
# Recipes with cleaned/lemmatized ingredients (written by clear_recipes2).
recipes_cleared_path = join(main_path, 'recipes/recipes_cleared.json')

In [None]:
!pip install fasttext

In [None]:
import fasttext
# Load the fastText model stored in the data directory; `ft` is used below
# to look up word vectors.
ft_path = os.path.join(main_path, 'fasttext/cc.pl.100.bin')
ft = fasttext.load_model(ft_path)

In [None]:
!pip install morfeusz2

In [None]:
import morfeusz2
# Shared Morfeusz analyser instance; lemmatize() calls morf.analyse() on it.
morf = morfeusz2.Morfeusz()

## Clear, lemmatize and vectorize ingredients

In [None]:
def find_all_subjects(analysis):
    """Keep only the interpretations whose tag starts with ``subst``.

    Every item must expose its tag string at ``item[2][2]``. An item is kept
    when the part of the tag before the first ``:`` is exactly ``'subst'``
    (the noun class in the Morfeusz tagset).

    Returns a new list with the kept items in their original order.
    """
    return [
        interpretation
        for interpretation in analysis
        if interpretation[2][2].partition(':')[0] == 'subst'
    ]

In [None]:
from collections import OrderedDict
def lemmatize(word):
    """Return the noun lemmas of ``word`` as one space-separated string.

    The text is analysed with the shared ``morf`` analyser and narrowed to
    ``subst`` interpretations via ``find_all_subjects``. From those, the lemma
    field (``item[2][1]``) is collected, skipping

    * lemmas that contain a ``:`` and
    * lemmas whose first three characters were already seen, so that
      near-duplicate readings of the same word are emitted only once.

    Returns ``-1`` (not a string) when the analysis contains no ``subst``
    interpretation at all; callers test for that sentinel.
    """
    nouns = find_all_subjects(morf.analyse(word))
    if not nouns:
        return -1

    seen_prefixes = set()
    kept = []
    for interpretation in nouns:
        lemma = interpretation[2][1]
        if ':' in lemma:
            continue
        prefix = lemma[:3]
        if prefix in seen_prefixes:
            continue
        seen_prefixes.add(prefix)
        kept.append(lemma)

    return ' '.join(kept)

In [None]:
def remove_quantity(ingredient):
    """Cut an ingredient description at its first numeric word.

    The text is split on whitespace and everything from the first word that
    parses as a number onwards (the quantity and whatever follows it, e.g. the
    unit) is dropped. Both ``1.5`` and the decimal-comma form ``1,5`` are
    recognised as numbers.

    Args:
        ingredient: Ingredient text such as ``"mleko 0,5 l"``.

    Returns:
        The words before the first numeric word, joined by single spaces.
        If no word is numeric, all words joined by single spaces. May be an
        empty string (empty input, or input that starts with a number).
    """
    words = ingredient.split()
    for i, word in enumerate(words):
        try:
            # float() only understands a decimal point, so "0,5" would
            # otherwise not be treated as a quantity.
            float(word.replace(',', '.'))
        except ValueError:
            continue
        return ' '.join(words[:i])
    return ' '.join(words)

In [None]:
# Substrings that disqualify an ingredient (matched case-sensitively).
keywords_to_remove = [
    "Knorr", 'fix', "Fix",
    'olej', 'oliwa', 'sól', 'woda', 'pieprz',
    'mąka', 'masło', 'cukier', 'ryż', 'cebula',
    'null',
]


def remove_some_ingredients(ingredients):
    """Return the ingredients that contain none of ``keywords_to_remove``.

    The check is a plain, case-sensitive substring test, so a keyword also
    matches inside longer words. Order of the surviving items is preserved.
    """
    kept = []
    for ingredient in ingredients:
        for keyword in keywords_to_remove:
            if keyword in ingredient:
                break
        else:
            # No keyword occurred anywhere in this ingredient.
            kept.append(ingredient)
    return kept

In [None]:
def remove_stopwords(ingredient):
    """Drop stop-word tokens from an ingredient string.

    The string is split on single spaces (so runs of spaces are preserved as
    empty tokens), tokens equal to a stop word are removed, and the rest is
    joined back with single spaces. The only stop word is ``'null'``.
    """
    stop_words = ['null']
    kept = []
    for token in ingredient.split(' '):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
#remove_stopwords("sól null")

In [None]:
def clear_recipes2():
    """Build the cleaned-recipes file from the raw recipes file.

    Reads the JSON mapping ``url -> recipe`` from ``recipes_path``. Recipes
    missing any of ``ingredients``, ``instructions`` or ``title`` are skipped.
    For the others, two ingredient lists are derived:

    * ``ingredients``: the original strings with stop words removed;
    * ``ingredients_cleared``: quantities cut off, unwanted ingredients and
      empty strings dropped, and the remainder lemmatized.

    A recipe is discarded entirely if any of its ingredients could not be
    lemmatized (``lemmatize`` returned ``-1``). The result is written as JSON
    to ``recipes_cleared_path``.
    """
    with open(recipes_path, 'r', encoding="utf-8") as recipe_file:
        all_recipes = json.load(recipe_file)

    kept_recipes = {}
    for url, recipe in all_recipes.items():
        try:
            raw_ingredients = recipe["ingredients"]
            instructions = recipe['instructions']
            title = recipe['title']
        except KeyError:
            continue

        readable = [remove_stopwords(item) for item in raw_ingredients]
        without_quantity = [remove_quantity(item) for item in raw_ingredients]
        candidates = remove_some_ingredients(without_quantity)
        lemmas = [lemmatize(item) for item in candidates if item]

        # -1 marks an ingredient without any noun reading; drop the recipe.
        if -1 in lemmas:
            continue

        kept_recipes[url] = {
            'title': title,
            'instructions': instructions,
            'ingredients': readable,
            'ingredients_cleared': lemmas,
        }

    with open(recipes_cleared_path, 'w', encoding="utf-8") as cleared_file:
        json.dump(kept_recipes, cleared_file, ensure_ascii=False)

In [None]:
# clear_recipes2()

In [None]:
# Cache of word -> vector, filled lazily by get_sentence_vector.
word_vectors_dict = {}


def get_sentence_vector(sentence):
    """Return the mean of the word vectors of ``sentence``.

    The sentence is split on whitespace and every run of non-word characters
    and underscores is stripped from each token. Each token's vector is looked
    up with ``ft.get_word_vector`` and remembered in ``word_vectors_dict``, so
    repeated tokens hit the model only once.

    The result is the element-wise sum of the vectors divided by the number
    of tokens.
    """
    tokens = [re.sub(r'[\W_]+', '', token) for token in sentence.split()]

    vectors = []
    for token in tokens:
        try:
            vector = word_vectors_dict[token]
        except KeyError:
            vector = ft.get_word_vector(token)
            word_vectors_dict[token] = vector
        vectors.append(np.array(vector))

    total = np.add.reduce(vectors)
    return total / len(tokens)

In [None]:
def find_cosine_similarity(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.

    Args:
        A: First vector (any array-like accepted by NumPy).
        B: Second vector, same length as ``A``.

    Returns:
        The similarity as a float. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case instead of
        NaN, so results stay comparable when they are sorted.
    """
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    if norm_product == 0:
        return 0.0
    return np.dot(A, B) / norm_product

In [None]:
def make_ingredients_vector_file():
    """Write one ingredient vector per recipe to ``recipes_vectors_path``.

    Reads the JSON mapping ``url -> recipe`` from ``recipes_cleared_path``,
    joins each recipe's ``ingredients_cleared`` entries into one sentence and
    vectorizes it with ``get_sentence_vector``. Each vector is rounded to four
    decimals and written as its own line of JSON, ``{url: [..]}``.

    Recipes without ``ingredients_cleared``, and recipes whose vector cannot
    be turned into a list of numbers, are printed and skipped.
    """
    # Load the input first: opening the output in 'w' mode truncates it, which
    # should not happen if the input turns out to be unreadable.
    with open(recipes_cleared_path, 'r', encoding='utf-8') as recipe_file:
        recipes = json.load(recipe_file)

    with open(recipes_vectors_path, 'w', encoding='utf-8') as vectors_file:
        for url, recipe in recipes.items():
            try:
                ingredients = recipe["ingredients_cleared"]
            except KeyError:
                print(recipe)
                # Skip this recipe; otherwise `ingredients` would be unbound
                # or still hold the previous recipe's value.
                continue

            ingredients_vector = get_sentence_vector(' '.join(ingredients))
            try:
                rounded = [round(num, 4) for num in ingredients_vector.tolist()]
            except (AttributeError, TypeError):
                # The vector is not an array of numbers (e.g. a scalar result
                # for a recipe with no usable ingredients).
                print(recipe, ingredients_vector)
                continue

            to_write = json.dumps({url: rounded}, ensure_ascii=False)
            vectors_file.write(f"{to_write}\n")

In [None]:
# make_ingredients_vector_file()

## Source

In [None]:
# def get_urls1(ingredients, how_many):
#     vector_in = get_sentence_vector(ingredients)
   
#     url2similarity = {}
#     with open(recipe_vectors_path, 'r', encoding='utf8') as vectors:
#         for line in vectors:
#             line = json.loads(line)

#             # theres only one pair (key-value) in every line
#             url_out, vector_out = list(line.items())[0]
        
#             similarity = find_cosine_similarity(vector_in, vector_out)
            
#             if similarity > 0.6:
#                 url2similarity[url_out] = similarity
#     res = dict(sorted(url2similarity.items(), key = itemgetter(1), reverse = True)[:how_many]) 
#     return res

In [None]:
def get_urls3(ingredients, how_many):
    """Rank all recipe URLs by similarity to the given ingredients.

    Args:
        ingredients: Comma-separated ingredient names, e.g. ``"ser, szynka"``.
        how_many: Accepted for interface compatibility but not applied here;
            the full ranking is returned and callers truncate it themselves.

    Returns:
        A list of every URL found in ``recipes_vectors_path``, ordered from
        most to least similar (cosine similarity between the query vector and
        the stored recipe vector).
    """
    query_terms = []
    for raw in ingredients.split(","):
        raw = raw.strip()
        if not raw:
            # Ignore empty fragments such as the one after a trailing comma.
            continue
        lemma = lemmatize(raw)
        # lemmatize() returns -1 when it finds no noun reading; joining that
        # would raise TypeError, so fall back to the text as typed.
        if isinstance(lemma, str) and lemma:
            query_terms.append(lemma)
        else:
            query_terms.append(raw)

    vector_in = get_sentence_vector(" ".join(query_terms))

    similarities = {}
    with open(recipes_vectors_path, 'r', encoding='utf8') as vectors:
        for line in vectors:
            if not line.strip():
                continue
            # Every line holds exactly one {url: vector} pair.
            (url_out, vector_out), = json.loads(line).items()
            similarities[url_out] = find_cosine_similarity(vector_in, vector_out)

    return sorted(similarities, key=similarities.get, reverse=True)

In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

def find_recipes_func(ingredients, how_many=1):
    """Return the ``how_many`` recipes most similar to the given ingredients.

    Args:
        ingredients: Comma-separated ingredient names.
        how_many: Maximum number of recipes to return.

    Returns:
        A list of recipe dicts from ``recipes_cleared_path``, best match
        first. URLs from the ranking that are absent from that file are
        skipped.
    """
    urls = get_urls3(ingredients, how_many)
    # The file is written as UTF-8 with non-ASCII characters kept as-is, so
    # read it with the same encoding instead of the platform default.
    with open(recipes_cleared_path, 'r', encoding='utf-8') as recipes_file:
        recipes = json.load(recipes_file)
    recipes_out = [recipes[url] for url in urls if url in recipes]
    return recipes_out[:how_many]

In [None]:
from typing import List
import random


def find_best_recipe(ingredients: str) -> str:
    """Pick one of the three best-matching recipes at random and format it.

    Args:
        ingredients: Comma-separated ingredient names, e.g.
            ``"szynka, ser, makaron"``.

    Returns:
        The recipe title, its ingredient list and its instructions as one
        multi-line string.

    Raises:
        IndexError: If no recipe was found to choose from.
    """
    three_tries = find_recipes_func(ingredients, 3)
    if not three_tries:
        # Same exception type random.choice would raise, with a clearer message.
        raise IndexError("no matching recipes to choose from")
    random_of_best = random.choice(three_tries)
    return (
        f"{random_of_best['title']}.\n"
        f"Składniki:\n{', '.join(random_of_best['ingredients'])}.\n"
        f"Instrukcje:\n{random_of_best['instructions']}"
    )

In [None]:
# Example query: ingredients are passed as one comma-separated string.
find_best_recipe("szynka, ser, ogórek, majonez, makaron")

In [None]:
# Example query using inflected/diminutive forms; they are lemmatized before lookup.
find_best_recipe("jajeczka, pomidory")

In [None]:
#find_recipes("cukinia, marchewka, ciecierzyca", 3)

In [None]:
#find_recipes("pomidory, mleko, masło", 3)

In [None]:
#find_recipes("szynka, kurczak, ogórek, sałata lodowa, papryka, ser", 3)

In [None]:
#find_recipes("makaron, kurczak, oliwki, pomidor, parmezan", 3)

In [None]:
#find_recipes("pieczarki, pomidory, cukinia, papryka, marchew, makaron", 3)