# Notebook to convert text into embeddings

Note: we did not use the word2vec embeddings generated by this notebook.

In [None]:
import numpy as np
import pandas as pd
import polars as pl

In [None]:
def to_parquet(arr, recipes, name):
    """Save a 2-D embedding array as ``{name}.parquet``, indexed by recipe id.

    Args:
        arr: 2-D array with one row per recipe, in the same row order as
            ``recipes``.
        recipes: Table exposing a ``RecipeId`` column that supports
            ``.to_numpy()``.
        name: Base name used both for the output file and as the prefix of
            the generated column names (``{name}_0``, ``{name}_1``, ...).
    """
    recipe_ids = recipes['RecipeId'].to_numpy()
    column_names = [f'{name}_{i}' for i in range(arr.shape[1])]
    frame = pd.DataFrame(arr, index=recipe_ids, columns=column_names)
    frame.to_parquet(f'{name}.parquet')

In [None]:
def get_word2vec_embeddings(recipes, dims):
    """Train Word2Vec on ingredient lists and save one mean vector per recipe.

    Each recipe's ``RecipeIngredientParts`` list is used as one training
    sentence. A recipe's embedding is the mean of the vectors of those of its
    ingredients that are present in the trained vocabulary; a recipe with no
    such ingredient gets an all-zero vector. The result is written to
    ``word2vec{dims}.parquet`` through ``to_parquet``.

    Args:
        recipes: polars DataFrame with a ``RecipeId`` column and a list-typed
            ``RecipeIngredientParts`` column.
        dims: Dimensionality of the word vectors.
    """
    from gensim.models import Word2Vec

    # A null ingredient list comes back from polars as None; treat it as an
    # empty recipe instead of letting gensim fail while iterating it.
    ingredient_list = [
        row[0] or []
        for row in recipes.select('RecipeIngredientParts').iter_rows()
    ]

    # Use the longest ingredient list as the context window so that every
    # pair of ingredients within a recipe falls inside one window.
    # `.item()` extracts the scalar from the 1x1 frame (calling float() on a
    # 2-D array is deprecated in recent NumPy), and the window is an integer.
    longest = recipes.select(
        pl.col('RecipeIngredientParts').list.len()
    ).max().item()
    model = Word2Vec(vector_size=dims, window=int(longest or 1))
    model.build_vocab(ingredient_list)
    model.train(
        ingredient_list,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    wv = model.wv

    def get_vec(ings):
        # Only ingredients that are in the trained vocabulary have a vector.
        known = [ing for ing in ings if ing in wv]
        if not known:
            return np.zeros(dims)
        return wv[known].mean(0)

    arr = np.vstack([get_vec(ings) for ings in ingredient_list])
    to_parquet(arr, recipes, f'word2vec{dims}')

In [None]:
def get_name_embeddings(recipes, device='cpu'):
    """Embed recipe names with a sentence-transformer and save them.

    Encodes the ``Name`` column with the ``all-MiniLM-L12-v2`` model and
    writes the result to ``names.parquet`` through ``to_parquet``.

    Args:
        recipes: polars DataFrame with ``RecipeId`` and ``Name`` columns.
        device: Device string handed to the model (e.g. ``'cpu'``, ``'cuda'``).
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ModuleNotFoundError:
        import importlib
        import subprocess
        import sys

        # Install with the interpreter that runs this code so the package
        # lands in the active environment (a bare `pip` may belong to a
        # different one), and so the function is plain Python rather than
        # relying on IPython's `!` shell escape.
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', '-q', 'sentence-transformers']
        )
        importlib.invalidate_caches()
        from sentence_transformers import SentenceTransformer

    # Pass the device to the constructor: older sentence-transformers
    # releases move the model back to their own default device inside
    # `encode()`, which would undo a later `.to(device)` call.
    model = SentenceTransformer('all-MiniLM-L12-v2', device=device)
    arr = model.encode(recipes['Name'].to_list(), batch_size=256)
    to_parquet(arr, recipes, 'names')

In [None]:
def get_instruction_embeddings(recipes, device='cpu'):
    """Embed recipe instructions sentence by sentence and save the per-recipe mean.

    The steps in ``RecipeInstructions`` are joined with spaces, split on
    ``'. '`` into sentences, and each sentence is encoded with the
    ``all-MiniLM-L12-v2`` model. A recipe's embedding is the mean of its
    sentence embeddings. The result is written to ``instructions.parquet``
    through ``to_parquet``.

    Assumes ``RecipeId`` values are unique within ``recipes``; ``to_parquet``
    needs exactly one pooled row per input row.

    Args:
        recipes: polars DataFrame with ``RecipeId`` and a list-of-strings
            ``RecipeInstructions`` column.
        device: Device string handed to the model (e.g. ``'cpu'``, ``'cuda'``).
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ModuleNotFoundError:
        import importlib
        import subprocess
        import sys

        # Install with the interpreter that runs this code so the package
        # lands in the active environment, without relying on IPython's `!`.
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', '-q', 'sentence-transformers']
        )
        importlib.invalidate_caches()
        from sentence_transformers import SentenceTransformer

    # Pass the device to the constructor: older sentence-transformers
    # releases move the model back to their own default device inside
    # `encode()`, which would undo a later `.to(device)` call.
    model = SentenceTransformer('all-MiniLM-L12-v2', device=device)

    sentences = recipes.select(
        pl.col('RecipeInstructions').list.join(' '),
        'RecipeId'
    ).with_columns(
        # A missing instruction list becomes one empty sentence, so the
        # recipe still gets a row and the encoder never receives None.
        pl.col('RecipeInstructions').fill_null('').str.split('. ')
    ).explode(
        'RecipeInstructions'
    )

    embs = model.encode(sentences['RecipeInstructions'].to_list(), batch_size=256)

    # Group sentence row positions by recipe. sort=False keeps the groups in
    # order of first appearance, i.e. the row order of `recipes`; the default
    # (sorted keys) would misalign the pooled rows with the index that
    # `to_parquet` attaches whenever `recipes` is not sorted by RecipeId.
    recipe_ids = sentences['RecipeId'].to_numpy()
    positions = pd.Series(np.arange(len(recipe_ids)))
    arr = np.vstack([
        embs[rows.to_numpy()].mean(0)
        for _, rows in positions.groupby(recipe_ids, sort=False)
    ])
    to_parquet(arr, recipes, 'instructions')

In [None]:
def get_description_embeddings(recipes, device='cpu'):
    """Embed recipe descriptions sentence by sentence and save the per-recipe mean.

    ``Description`` is split on ``'. '`` into sentences and each sentence is
    encoded with the ``all-MiniLM-L12-v2`` model. A recipe's embedding is the
    mean of its sentence embeddings. The result is written to
    ``Description.parquet`` through ``to_parquet``.

    Assumes ``RecipeId`` values are unique within ``recipes``; ``to_parquet``
    needs exactly one pooled row per input row.

    Args:
        recipes: polars DataFrame with ``RecipeId`` and a string
            ``Description`` column.
        device: Device string handed to the model (e.g. ``'cpu'``, ``'cuda'``).
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ModuleNotFoundError:
        import importlib
        import subprocess
        import sys

        # Install with the interpreter that runs this code so the package
        # lands in the active environment, without relying on IPython's `!`.
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', '-q', 'sentence-transformers']
        )
        importlib.invalidate_caches()
        from sentence_transformers import SentenceTransformer

    # Pass the device to the constructor: older sentence-transformers
    # releases move the model back to their own default device inside
    # `encode()`, which would undo a later `.to(device)` call.
    model = SentenceTransformer('all-MiniLM-L12-v2', device=device)

    sentences = recipes.select(
        pl.col('Description'),
        'RecipeId'
    ).with_columns(
        # A missing description becomes one empty sentence, so the recipe
        # still gets a row and the encoder never receives None.
        pl.col('Description').fill_null('').str.split('. ')
    ).explode(
        'Description'
    )

    embs = model.encode(sentences['Description'].to_list(), batch_size=256)

    # Group sentence row positions by recipe. sort=False keeps the groups in
    # order of first appearance, i.e. the row order of `recipes`; the default
    # (sorted keys) would misalign the pooled rows with the index that
    # `to_parquet` attaches whenever `recipes` is not sorted by RecipeId.
    recipe_ids = sentences['RecipeId'].to_numpy()
    positions = pd.Series(np.arange(len(recipe_ids)))
    arr = np.vstack([
        embs[rows.to_numpy()].mean(0)
        for _, rows in positions.groupby(recipe_ids, sort=False)
    ])
    to_parquet(arr, recipes, 'Description')

In [None]:
# Load the recipe table that every embedding function below takes as input.
# NOTE(review): `path` is not defined in any cell shown here - presumably a
# directory prefix ending in a path separator that is set elsewhere; confirm
# before running.
recipes=pl.read_parquet(path+'recipes.parquet')

In [None]:
# 10-dimensional ingredient word2vec embeddings -> word2vec10.parquet
get_word2vec_embeddings(recipes, 10)

In [None]:
# 50-dimensional ingredient word2vec embeddings -> word2vec50.parquet
get_word2vec_embeddings(recipes, 50)

In [None]:
# Sentence-transformer embeddings of recipe names -> names.parquet
get_name_embeddings(recipes, 'cuda')

In [None]:
# Mean sentence embeddings of recipe descriptions -> Description.parquet
get_description_embeddings(recipes, 'cuda')

In [None]:
# Mean sentence embeddings of recipe instructions -> instructions.parquet
get_instruction_embeddings(recipes, 'cuda')