In [1]:
import time
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.offline
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import os

In [2]:
os.chdir("C:/Users/yuvfr/Documents/final_project_m&c")

In [65]:
# choosing between four combinations of Word2Vec models:
# context window: 2, 3 or 5, size: 50 or 100
# constructing DBs
model_5_100 = model_3X100 = Word2Vec.load("food2vec_ver1.model")
model_3_100 = Word2Vec.load("food2vec_ver3_window3.model")
model_5_50 = model_3X100 = Word2Vec.load("food2vec_ver2_size50.model")
model_3_50 = Word2Vec.load("food2vec_ver4_size50_window3.model")
model_2_100 = Word2Vec.load("food2vec_ver6_size100_window2.model")
model_2_50 = Word2Vec.load("food2vec_ver5_size50_window2.model")

ing_names = pd.read_csv("ingredients_of_corpus.csv", index_col=0, squeeze=True)

# subsetting the techniques for coloring the points (dropping techs that Word2Vec didn't assign them a vector)
techs_df = pd.read_csv("raw_techs.csv", usecols=["tech","stemmed"])
# techs_df = techs_df[techs_df['stemmed'].isin(list(model.wv.vocab))]
# create techs dict in order to move easily from stem to original
techs_dict = pd.Series(techs_df['tech'].values,index = techs_df['stemmed']).to_dict()
df_after_stem = pd.read_csv("df_after_stem.csv", index_col=0)

In [40]:
# creating Series of known cook techs
to_hist = df_after_stem.stemmed.str.replace("]","")
to_hist = to_hist.str.replace("[","")

In [51]:
to_hist = to_hist[to_hist != ""]

In [55]:
to_hist = to_hist.str.split(",")

In [56]:
from itertools import chain
my_object = list(chain.from_iterable(to_hist.tolist()))

In [59]:
to_hist = pd.Series(my_object)

In [63]:
# to_hist.value_counts()
techs_series = pd.Series(["bake", "boil","simmer","roast","grill","fri"])

In [68]:
def sync_ing_embds(model):
    filter_dict = {}
    for ing in ing_names:
        if ing in model.wv.vocab:
            filter_dict[ing] = model.wv[ing]
    return filter_dict

def run_tsne(nemb):
    tsne = TSNE(perplexity=50, n_components=2, init='pca', n_iter=5000, verbose=1)
    return tsne.fit_transform(nemb)

# function to find the most related cook technique for a given ingredient
# return series of the most related tech and the similarity between them
# ing is the ingredient name (string)
# model is the model that loaded from the repo
# techs_series is the series which consists the stemmed techs in the vocabulary
def assign_related_tech(ing,model, techs_series):
    max_sim = -np.inf
    related = None
    for tech in techs_series:
        current_sim = model.similarity(ing,tech)
        if current_sim > max_sim:
            related = tech
            max_sim = current_sim
    return pd.Series([techs_dict[related],max_sim])

def create_df_to_plot(model):
    filter_dict = sync_ing_embds(model)
    ing_of_corpus_df = pd.DataFrame(filter_dict).T
    ing_array = ing_of_corpus_df.values
    
    # projecting the ingredients vectors on R^2
    print(time.ctime())
    ing_array = run_tsne(ing_array)
    print(time.ctime())

    # prepare the DB to be plotted
    db = pd.DataFrame(ing_array)
    db.columns = ["x", "y"]
    db["ing_names"] = ing_of_corpus_df.index
    
    # apply the function above to each ingredient in the db
    db[['most_related_tech','similarity_to_tech']] = db['ing_names'].apply(
        lambda ing: assign_related_tech(ing,model,techs_series))
    return db

In [69]:
result_model_5_100 = create_df_to_plot(model_5_100)
result_model_3_100 = create_df_to_plot(model_3_100)
result_model_5_50 = create_df_to_plot(model_5_50)
result_model_3_50 = create_df_to_plot(model_3_50)
result_model_2_50 = create_df_to_plot(model_2_50)
result_model_2_100 = create_df_to_plot(model_2_100)

Sun Sep  1 16:55:43 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.008s...
[t-SNE] Computed neighbors for 2408 samples in 0.992s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.137970
[t-SNE] KL divergence after 250 iterations with early exaggeration: 72.899567
[t-SNE] KL divergence after 4050 iterations: 1.795817
Sun Sep  1 16:57:36 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Sun Sep  1 16:57:37 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.009s...
[t-SNE] Computed neighbors for 2408 samples in 1.070s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.130521
[t-SNE] KL divergence after 250 iterations with early exaggeration: 73.902489
[t-SNE] KL divergence after 3000 iterations: 1.721310
Sun Sep  1 16:58:54 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Sun Sep  1 16:58:55 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.005s...
[t-SNE] Computed neighbors for 2408 samples in 0.607s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.129220
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.110359
[t-SNE] KL divergence after 5000 iterations: 1.733841
Sun Sep  1 17:00:52 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Sun Sep  1 17:00:53 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.004s...
[t-SNE] Computed neighbors for 2408 samples in 0.605s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.121106
[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.911743
[t-SNE] KL divergence after 3350 iterations: 1.682750
Sun Sep  1 17:02:16 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Sun Sep  1 17:02:17 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.006s...
[t-SNE] Computed neighbors for 2408 samples in 0.603s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.120200
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.248062
[t-SNE] KL divergence after 5000 iterations: 1.635494
Sun Sep  1 17:04:14 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Sun Sep  1 17:04:15 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.009s...
[t-SNE] Computed neighbors for 2408 samples in 1.093s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.126269
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.209396
[t-SNE] KL divergence after 4350 iterations: 1.680436
Sun Sep  1 17:06:03 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



In [70]:
print(result_model_5_50.similarity_to_tech.mean())
print(result_model_5_100.similarity_to_tech.mean())
print(result_model_3_50.similarity_to_tech.mean())
print(result_model_3_100.similarity_to_tech.mean())
result_model_5_50.to_csv("result_model_5_50_perp50.csv")
result_model_5_100.to_csv("result_model_5_100_perp50.csv")
result_model_3_50.to_csv("result_model_3_50_perp50.csv")
result_model_3_100.to_csv("result_model_3_100_perp50.csv")
result_model_2_50.to_csv("result_model_2_50_perp50.csv")
result_model_2_100.to_csv("result_model_2_100_perp50.csv")

0.08566875161759345
0.0716245303621231
0.0912318877333178
0.07994043264763145


In [94]:
# result_model_5_100#
# result_model_3_100#
# result_model_5_50#
# result_model_3_50#
# result_model_2_50#
# result_model_2_100#
path = "result_model_2_100_perp50"
result = pd.read_csv(path+".csv",index_col=0)

In [95]:
result.head()

Unnamed: 0,x,y,ing_names,most_related_tech,similarity_to_tech
0,-0.153272,-13.204207,black-olives,grilling,0.028653
1,7.278295,6.267877,grape-tomatoes,frying,0.028193
2,4.204807,33.46253,garlic,frying,0.253628
3,0.678489,30.039904,pepper,simmering,0.254902
4,0.461573,26.480455,seasoning,grilling,0.217594


In [96]:
# plot data
fig = px.scatter(result, x="x", y="y", hover_name="ing_names", hover_data=["ing_names"], color="most_related_tech")
fig.update_layout(showlegend=False)
fig.show()

In [97]:
plotly.offline.plot(fig, filename=path+".html")

'result_model_2_100_perp50.html'

perplexity
5% rule of thumb == 120
[stack](https://stackoverflow.com/questions/17426292/what-is-the-most-efficient-way-to-create-a-dictionary-of-two-pandas-dataframe-co)
[to_data_science](https://towardsdatascience.com/how-to-tune-hyperparameters-of-tsne-7c0596a18868)
sqrt(N) rule of thumb ~== 50

In [178]:
0.05*2400
2400**0.5

48.98979485566356