In [1]:
from config import VK_TOKEN

import tqdm
import json
import vk_api
import re
import pathlib

In [2]:
vk_session = vk_api.VkApi(token=VK_TOKEN, app_id=5820017)
vk_session._auth_token()

vk = vk_session.get_api()

In [3]:
tools = vk_api.VkTools(vk_session)

In [4]:
recipes_publics = [-35838585, -43818640, -27740785]

In [5]:
path = pathlib.Path("/home/george/workspace/data/vegan_chef")

In [7]:
for public_id in tqdm.tqdm(recipes_publics):
    all_recipes = tools.get_all("wall.get", 80, values={"owner_id": public_id})
    json.dump(all_recipes, (path/f"publics/{public_id}.json").open("w"))

100%|██████████| 3/3 [01:44<00:00, 32.65s/it]


In [117]:
with open("/home/george/workspace/data/vegan_chef/all_text.txt", "w") as f:
    for public_id in tqdm.tqdm(recipes_publics):
        all_recipes = json.load((path/f"publics/{public_id}.json").open())
        for item in all_recipes['items']:
            matched = re.match("[а-яА-Я ]+", item['text'].lower())
            if matched:
                print(matched.group(), file=f)

100%|██████████| 3/3 [00:03<00:00,  1.04s/it]


In [7]:
import sentencepiece as spm

In [119]:
spm.SentencePieceTrainer.Train(f"--input={path/'all_text.txt'} --model_prefix=m --vocab_size=1000")

True

In [120]:
!mv m.model assets/
!mv m.vocab assets/

In [8]:
sp = spm.SentencePieceProcessor()
sp.Load("assets/m.model")

True

In [9]:
tokens = [a.split("\t")[0] for a in (pathlib.Path("assets")/"m.vocab").read_text().split("\n")]

In [10]:
class PublicsIterator:
    def __init__(self, public_paths):
        self.public_paths = public_paths
    def __iter__(self):
        for public_path in self.public_paths:
            all_recipes = json.load(public_path.open())
            for recipe in all_recipes['items']:
                yield ["<s>"] + [tokens[idx] for idx in sp.EncodeAsIds(recipe['text'].lower())] + ["</s>"]

In [11]:
from gensim.models import Word2Vec

In [12]:
model = Word2Vec(
    PublicsIterator([path/f"publics/{public_id}.json" for public_id in recipes_publics]),
    size=64,
    min_count=2,
    negative=3,
    iter=10,
    workers=-1
)

In [13]:
tokens = tokens[:-1]

In [14]:
w2v_tokens = list(model.wv.vocab)

In [15]:
len(w2v_tokens)

1000

In [16]:
set(tokens).difference(set(w2v_tokens))

set()

In [19]:
matched_w2v_vectors = model.wv.vectors[[w2v_tokens.index(tok) for tok in tokens]]

In [43]:
matched_w2v_vectors[sp.EncodeAsIds("блинчики с вареньем")].mean(0)

array([-8.7163318e-04,  2.3241742e-03, -3.0582023e-03,  1.0804716e-03,
       -3.7886046e-03,  3.1761844e-03, -4.2146255e-04,  7.4962183e-04,
        2.6627318e-03,  3.3055150e-03,  1.2516363e-03,  6.4906658e-04,
        4.1361628e-03, -2.2773072e-03, -2.8003387e-03,  3.0769624e-03,
       -1.6833334e-03, -3.5099522e-04,  1.4955821e-03, -2.9971958e-03,
        1.3834680e-04,  4.2369934e-03, -1.0883913e-03,  5.2044215e-04,
       -1.6529947e-03,  1.3119113e-03, -1.6971199e-03,  1.4646619e-03,
        2.9651696e-04,  1.1172984e-03,  1.1414266e-03,  1.6686590e-03,
        7.7394396e-04, -1.1305993e-03,  1.1879136e-05, -1.9767224e-03,
       -9.4394083e-04, -3.0657961e-03, -3.0995342e-03, -1.5884381e-03,
        5.0404500e-03, -2.4668998e-03,  2.9501761e-03,  1.5034506e-03,
        1.8930326e-03, -3.8315454e-03,  4.9679317e-03, -1.3614419e-03,
        1.4626341e-03, -2.2063553e-03,  1.0474450e-03, -2.7723976e-03,
        1.7444810e-03, -5.9158012e-04, -3.3226744e-03,  1.2464928e-03,
      

In [60]:
vec1 = matched_w2v_vectors[sp.EncodeAsIds("блинчики с вареньем")].mean(0)
vec2 = matched_w2v_vectors[sp.EncodeAsIds("блины с джемом")].mean(0)
vec3 = matched_w2v_vectors[sp.EncodeAsIds("капуста с тофу")].mean(0)

In [58]:
from scipy.spatial.distance import cosine

In [61]:
1 - cosine(vec1, vec2), 1 - cosine(vec1, vec3)

(0.3453354239463806, 0.10821899026632309)

In [62]:
def compare_ingredients(ingr1, ingr2, matched_w2v_vectors, sp):
    vec1 = matched_w2v_vectors[sp.EncodeAsIds(ingr1)].mean(0)
    vec2 = matched_w2v_vectors[sp.EncodeAsIds(ingr2)].mean(0)
    return 1 - cosine(vec1, vec2)

In [27]:
import numpy as np
np.save("assets/matched_w2v_vectors.npy", matched_w2v_vectors)