In [103]:
import time
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.offline
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import os

In [242]:
# Switch to the project directory so the relative file names used in the
# following cells resolve. Set the FOOD2VEC_PROJECT_DIR environment variable to
# run the notebook on another machine; when it is unset the original author's
# local path is used.
os.chdir(os.environ.get("FOOD2VEC_PROJECT_DIR",
                        "C:/Users/yuvfr/Documents/final_project_m&c"))

In [232]:
# Choosing between four combinations of Word2Vec models:
# context window: 3 or 5, vector size: 50 or 100.
# Variables are named model_<window>_<size>.
model_5_100 = Word2Vec.load("food2vec_ver1.model")
model_3_100 = Word2Vec.load("food2vec_ver3_window3.model")
model_5_50 = Word2Vec.load("food2vec_ver2_size50.model")
model_3_50 = Word2Vec.load("food2vec_ver4_size50_window3.model")
# Legacy alias preserved for backward compatibility only. Despite its name it
# refers to the window-5 / size-50 model; prefer the names above.
model_3X100 = model_5_50

# Ingredient names of the corpus. The frame is collapsed to a Series so that
# iterating over it yields the names (assumes a single data column - TODO confirm).
# `.squeeze("columns")` replaces the `squeeze=True` keyword of read_csv, which is
# deprecated and no longer accepted by pandas 2.x.
ing_names = pd.read_csv("ingredients_of_corpus.csv", index_col=0).squeeze("columns")

# Subsetting the techniques used for colouring the points: drop every technique
# whose stem did not get a vector. A stem has to be known to *all* four models,
# because each model is later asked for its similarity to every remaining stem
# and an unknown word would raise a KeyError.
techs_df = pd.read_csv("raw_techs.csv", usecols=["tech", "stemmed"])
_all_models = (model_5_100, model_3_100, model_5_50, model_3_50)
# `stem in m.wv` is the membership test of the keyed vectors; building an
# explicit boolean array keeps the row filter well defined for an empty table.
_in_every_vocab = np.array(
    [all(stem in m.wv for m in _all_models) for stem in techs_df['stemmed']],
    dtype=bool,
)
techs_df = techs_df.loc[_in_every_vocab]
# Dict mapping each stem back to the original technique name.
techs_dict = pd.Series(techs_df['tech'].values, index=techs_df['stemmed']).to_dict()

In [233]:
def sync_ing_embds(model, names=None):
    """Collect the embedding vector of every ingredient the model knows.

    Parameters
    ----------
    model : object
        Model exposing its keyed vectors as ``model.wv``; the vectors must
        support ``name in model.wv`` and ``model.wv[name]``.
    names : iterable of str, optional
        Ingredient names to look up. Defaults to the notebook-level
        ``ing_names`` when omitted.

    Returns
    -------
    dict
        ``{name: vector}`` for every name that has a vector, in the order the
        names were iterated. Names without a vector are skipped silently.
    """
    if names is None:
        names = ing_names
    vectors = model.wv
    # Membership is tested on the keyed vectors themselves rather than on
    # `wv.vocab`, which no longer exists in gensim >= 4.
    return {ing: vectors[ing] for ing in names if ing in vectors}

def run_tsne(nemb, perplexity=50, random_state=None):
    """Project embedding vectors onto two dimensions with t-SNE.

    Parameters
    ----------
    nemb : array-like
        The vectors to embed, passed straight to ``TSNE.fit_transform``.
    perplexity : float, default 50
        t-SNE perplexity. Exposed as a parameter so that different settings
        (e.g. 50 vs. 120) can be compared without editing this function.
    random_state : int or None, default None
        Seed forwarded to ``TSNE``. Pass an integer to make the layout
        reproducible between runs; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    The 2-D embedding returned by ``TSNE.fit_transform``.
    """
    # NOTE: scikit-learn >= 1.5 deprecates `n_iter` in favour of `max_iter`.
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=5000,
                verbose=1, random_state=random_state)
    return tsne.fit_transform(nemb)

def assign_related_tech(ing, model, techs_series, stem_to_tech=None):
    """Find the cooking technique most related to a given ingredient.

    Parameters
    ----------
    ing : str
        The ingredient name.
    model : object
        Loaded model whose keyed vectors (``model.wv``) provide
        ``similarity(word_a, word_b)``.
    techs_series : iterable of str
        The stemmed techniques that are part of the model vocabulary.
    stem_to_tech : mapping, optional
        Maps a stem back to the original technique name. Defaults to the
        notebook-level ``techs_dict`` when omitted.

    Returns
    -------
    pandas.Series
        Two entries: the original name of the most similar technique and the
        similarity to it. When no technique can be ranked (empty
        ``techs_series`` or only NaN similarities) the entries are
        ``None`` and ``NaN``.
    """
    if stem_to_tech is None:
        stem_to_tech = techs_dict
    max_sim = -np.inf
    related = None
    for tech in techs_series:
        # Use the keyed-vectors method: `model.similarity` is deprecated and
        # removed in gensim 4.0.
        current_sim = model.wv.similarity(ing, tech)
        # Strict comparison: on ties the first technique encountered wins.
        if current_sim > max_sim:
            related = tech
            max_sim = current_sim
    if related is None:
        # Nothing to rank; return explicit missing values instead of failing
        # with an opaque `KeyError: None` in the lookup below.
        return pd.Series([None, np.nan], dtype=object)
    return pd.Series([stem_to_tech[related], max_sim])

def create_df_to_plot(model):
    """Build the table that is plotted for one Word2Vec model.

    The ingredient vectors of ``model`` are projected onto the plane with
    ``run_tsne`` and every ingredient is paired with its most related cooking
    technique (stems are read from the notebook-level ``techs_df``). The wall
    clock time is printed right before and right after the projection.

    Returns a DataFrame with the columns ``x``, ``y``, ``ing_names``,
    ``most_related_tech`` and ``similarity_to_tech``.
    """
    # one row per ingredient, one column per embedding dimension
    embeddings = pd.DataFrame(sync_ing_embds(model)).T

    # projecting the ingredient vectors on R^2
    print(time.ctime())
    coords = run_tsne(embeddings.values)
    print(time.ctime())

    # prepare the DB to be plotted
    db = pd.DataFrame(coords, columns=["x", "y"])
    db["ing_names"] = embeddings.index

    def _best_tech(ing):
        # technique name and similarity for a single ingredient
        return assign_related_tech(ing, model, techs_df['stemmed'])

    db[['most_related_tech', 'similarity_to_tech']] = db['ing_names'].apply(_best_tech)
    return db

In [234]:
# Build one plot table per model (names follow result_model_<window>_<size>).
# Every call runs a full t-SNE projection, so this cell is slow: in the recorded
# run each model took roughly one to two minutes. The calls are kept as separate
# statements so that tables already computed stay bound if a later call fails.
result_model_5_100 = create_df_to_plot(model_5_100)
result_model_3_100 = create_df_to_plot(model_3_100)
result_model_5_50 = create_df_to_plot(model_5_50)
result_model_3_50 = create_df_to_plot(model_3_50)

Fri Aug 30 02:12:52 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.008s...
[t-SNE] Computed neighbors for 2408 samples in 0.998s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.137970
[t-SNE] KL divergence after 250 iterations with early exaggeration: 72.097832
[t-SNE] KL divergence after 2800 iterations: 1.803892
Fri Aug 30 02:14:14 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Fri Aug 30 02:14:15 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.009s...
[t-SNE] Computed neighbors for 2408 samples in 1.140s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.130521
[t-SNE] KL divergence after 250 iterations with early exaggeration: 74.039055
[t-SNE] KL divergence after 4050 iterations: 1.764870
Fri Aug 30 02:16:11 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Fri Aug 30 02:16:13 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.006s...
[t-SNE] Computed neighbors for 2408 samples in 0.683s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.129220
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.127365
[t-SNE] KL divergence after 4600 iterations: 1.683828
Fri Aug 30 02:18:08 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



Fri Aug 30 02:18:10 2019
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2408 samples in 0.005s...
[t-SNE] Computed neighbors for 2408 samples in 0.639s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2408
[t-SNE] Computed conditional probabilities for sample 2000 / 2408
[t-SNE] Computed conditional probabilities for sample 2408 / 2408
[t-SNE] Mean sigma: 0.121106
[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.531044
[t-SNE] KL divergence after 3950 iterations: 1.704406
Fri Aug 30 02:19:57 2019



Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



In [235]:
# Mean similarity between each ingredient and its most related technique,
# one value per line, in the order: 5_50, 5_100, 3_50, 3_100.
print(
    result_model_5_50["similarity_to_tech"].mean(),
    result_model_5_100["similarity_to_tech"].mean(),
    result_model_3_50["similarity_to_tech"].mean(),
    result_model_3_100["similarity_to_tech"].mean(),
    sep="\n",
)
# Uncomment to persist the tables of this run:
# result_model_5_50.to_csv("result_model_5_50_perp50.csv")
# result_model_5_100.to_csv("result_model_5_100_perp50.csv")
# result_model_3_50.to_csv("result_model_3_50_perp50.csv")
# result_model_3_100.to_csv("result_model_3_100_perp50.csv")

0.5387635665428995
0.46274979911942615
0.566360160955955
0.4955720023818289


In [282]:
# Saved result tables that can be loaded here (presumably t-SNE runs with
# perplexity 120, judging by the file names):
#   result_model_3_50_perp120
#   result_model_3_100_perp120
#   result_model_5_50_perp120
#   result_model_5_100_perp120
# `path` is the file name without extension; it is reused further down for the
# exported HTML file.
path = "result_model_3_100_perp120"
result = pd.read_csv("{}.csv".format(path), index_col=0)

In [284]:
# Peek at the first five rows of the loaded table.
result.head(n=5)

Unnamed: 0,x,y,ing_names,most_related_tech,similarity_to_tech
0,1.205715,-1.227246,black-olives,fermenting,0.542836
1,-0.844865,3.243824,grape-tomatoes,julienning,0.564685
2,-13.065539,-4.054787,garlic,mincing,0.564239
3,-13.654481,-2.950895,pepper,mincing,0.325184
4,-10.815898,-2.438437,seasoning,fermenting,0.318115


In [285]:
# Interactive scatter plot of the 2-D projection: one point per ingredient,
# coloured by its most related technique; hovering shows the ingredient name.
fig = px.scatter(
    result,
    x="x",
    y="y",
    color="most_related_tech",
    hover_name="ing_names",
    hover_data=["ing_names"],
)
# Hide the legend.
fig.update_layout(showlegend=False)
fig.show()

In [286]:
# Export the figure as a standalone HTML file named after the loaded table.
plotly.offline.plot(fig, filename="{}.html".format(path))

'result_model_3_100_perp120.html'

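### Choosing the t-SNE perplexity
- 5% rule of thumb: perplexity ≈ 0.05 · N ≈ 120
- [Stack Overflow: most efficient way to create a dictionary of two pandas DataFrame columns](https://stackoverflow.com/questions/17426292/what-is-the-most-efficient-way-to-create-a-dictionary-of-two-pandas-dataframe-co) (reference for building a dict from two columns; not about perplexity)
- [Towards Data Science: How to tune hyperparameters of tSNE](https://towardsdatascience.com/how-to-tune-hyperparameters-of-tsne-7c0596a18868)
- sqrt(N) rule of thumb: perplexity ≈ √N ≈ 50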

In [178]:
# Perplexity rules of thumb evaluated for N = 2400 points.
# Only the value of the final expression is echoed by the notebook.
2400 * 0.05   # 5% of N
2400 ** 0.5   # square root of N

48.98979485566356