# [Chefkoch.de](http://www.chefkoch.de/) Maturaarbeit 2017/18
------

## Ziel:
### Dimensionsreduzierung mit LDA und t-SNE mit besserer Hardware (4.Teil, Part 2)

# Requires a new, clean environment — separate from the one used for notebook 04_01

In [1]:
!pip install markupsafe==2.0.1
!pip install psutil
!pip install lda
!pip install bokeh



In [3]:
import csv

from gensim import corpora, models
import gensim

In [4]:
def get_recipe_names(chef_file='input/test/chefkoch_rezepte_analysis_cleannames.csv'):
    """Return the recipe names stored in the last column of a CSV file.

    The first row is treated as the column header and skipped. Blank rows
    (which ``csv.reader`` yields as empty lists) are ignored explicitly
    instead of being swallowed by a bare ``except``.

    Parameters
    ----------
    chef_file : str, optional
        Path of the CSV file to read. Defaults to the file this notebook
        was originally hard-coded to read.

    Returns
    -------
    list of str
        The last field of every non-empty data row, in file order.
    """
    # newline='' leaves newline handling to the csv module, as its docs recommend
    with open(chef_file, 'r', newline='') as f:
        chefkoch = csv.reader(f)
        next(chefkoch, None)  # skip the header row; tolerate an empty file
        return [row[-1] for row in chefkoch if row]

In [5]:
# Load all recipe names from the CSV into memory
recipe_names = get_recipe_names()
print(len(recipe_names)) # total number of recipe names

316755


In [6]:
# Enable INFO-level logging with timestamps so that progress messages emitted
# through the logging module during the later cells are shown in the output.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
import os
import argparse
import time
import lda
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

In [11]:
# Directory (relative to the working directory) that receives every artefact
# written by this notebook; created if missing, left untouched if it exists.
out_dir = "output_tsne"
os.makedirs(out_dir, exist_ok=True)

In [12]:
# Hyper-parameters shared by the LDA training and the plotting cells
n_topics = 40     # number of LDA topics
n_iter = 500      # number of LDA iterations
n_top_words = 4   # words used to summarise each topic in the plot
threshold = 0     # a recipe is kept only if its largest topic probability exceeds this

##############################################################################
# train an LDA model

# Token counts of the recipe names; min_df=1 keeps every token that occurs at all
cvectorizer = CountVectorizer(min_df=1)
cvz = cvectorizer.fit_transform(recipe_names)

# Seed the sampler so the topics (and the files saved below) are reproducible
# across runs, matching the fixed random_state used for t-SNE later on.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
X_topics = lda_model.fit_transform(cvz)

# Persist the document-topic matrix and the topic-word matrix
np.save(out_dir +'/lda_doc_topic_{}recipe_names{}topics_cloud_2.npy'.format(X_topics.shape[0], X_topics.shape[1]), X_topics)

np.save(out_dir +'/lda_topic_word_{}recipe_names{}topics_cloud_2.npy'.format(X_topics.shape[0], X_topics.shape[1]), lda_model.topic_word_)

print('\n>>> doc_topic & topic word written to disk\n')

2022-12-09 21:36:59,480 : INFO : n_documents: 316755
2022-12-09 21:36:59,480 : INFO : vocab_size: 60479
2022-12-09 21:36:59,480 : INFO : n_words: 571982
2022-12-09 21:36:59,481 : INFO : n_topics: 40
2022-12-09 21:36:59,481 : INFO : n_iter: 500
2022-12-09 21:37:04,320 : INFO : <0> log likelihood: -7597635
2022-12-09 21:37:05,219 : INFO : <10> log likelihood: -6148883
2022-12-09 21:37:06,147 : INFO : <20> log likelihood: -5751467
2022-12-09 21:37:07,078 : INFO : <30> log likelihood: -5593311
2022-12-09 21:37:07,993 : INFO : <40> log likelihood: -5526442
2022-12-09 21:37:08,885 : INFO : <50> log likelihood: -5490208
2022-12-09 21:37:09,802 : INFO : <60> log likelihood: -5463076
2022-12-09 21:37:10,683 : INFO : <70> log likelihood: -5446041
2022-12-09 21:37:11,544 : INFO : <80> log likelihood: -5435027
2022-12-09 21:37:12,476 : INFO : <90> log likelihood: -5419944
2022-12-09 21:37:13,391 : INFO : <100> log likelihood: -5410059
2022-12-09 21:37:14,277 : INFO : <110> log likelihood: -5400389


>>> doc_topic & topic word written to disk



In [13]:
##############################################################################
# threshold and plot

# Boolean mask over recipes: True where the strongest topic probability
# is above `threshold`
_idx = X_topics.max(axis=1) > threshold

# Topic distributions of the recipes that pass the mask
_topics = X_topics[_idx]

# How many recipes are left after thresholding
num_example = _topics.shape[0]

In [14]:
# Display how many recipes passed the threshold filter
num_example

316755

In [15]:
# Report current system memory usage
# (presumably to check the headroom before the t-SNE run below — TODO confirm)
import psutil
psutil.virtual_memory()

svmem(total=17179869184, available=933187584, percent=94.6, used=1593573376, free=15572992, active=919846912, inactive=887349248, wired=673726464)

In [16]:
import joblib

In [17]:
# t-SNE: embed the per-recipe topic distributions (one input dimension per
# LDA topic) into 2D.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version before upgrading.
tsne_model = TSNE(
    n_components=2,
    perplexity=5,
    learning_rate=150,
    angle=.7,
    init='pca',
    n_iter=2500,
    n_iter_without_progress=100,
    random_state=0,
    verbose=2,
)
tsne_lda = tsne_model.fit_transform(_topics[:num_example])

# persist the 2D coordinates (the embedding array, not the fitted estimator)
joblib.dump(tsne_lda, out_dir+'/tsne_lda_cloud_2.pkl')

[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 316755 samples in 0.004s...
[t-SNE] Computed neighbors for 316755 samples in 89.581s...
[t-SNE] Computed conditional probabilities for sample 1000 / 316755
[t-SNE] Computed conditional probabilities for sample 2000 / 316755
[t-SNE] Computed conditional probabilities for sample 3000 / 316755
[t-SNE] Computed conditional probabilities for sample 4000 / 316755
[t-SNE] Computed conditional probabilities for sample 5000 / 316755
[t-SNE] Computed conditional probabilities for sample 6000 / 316755
[t-SNE] Computed conditional probabilities for sample 7000 / 316755
[t-SNE] Computed conditional probabilities for sample 8000 / 316755
[t-SNE] Computed conditional probabilities for sample 9000 / 316755
[t-SNE] Computed conditional probabilities for sample 10000 / 316755
[t-SNE] Computed conditional probabilities for sample 11000 / 316755
[t-SNE] Computed conditional probabilities for sample 12000 / 316755
[t-SNE] Computed conditional probab

[t-SNE] Computed conditional probabilities for sample 149000 / 316755
[t-SNE] Computed conditional probabilities for sample 150000 / 316755
[t-SNE] Computed conditional probabilities for sample 151000 / 316755
[t-SNE] Computed conditional probabilities for sample 152000 / 316755
[t-SNE] Computed conditional probabilities for sample 153000 / 316755
[t-SNE] Computed conditional probabilities for sample 154000 / 316755
[t-SNE] Computed conditional probabilities for sample 155000 / 316755
[t-SNE] Computed conditional probabilities for sample 156000 / 316755
[t-SNE] Computed conditional probabilities for sample 157000 / 316755
[t-SNE] Computed conditional probabilities for sample 158000 / 316755
[t-SNE] Computed conditional probabilities for sample 159000 / 316755
[t-SNE] Computed conditional probabilities for sample 160000 / 316755
[t-SNE] Computed conditional probabilities for sample 161000 / 316755
[t-SNE] Computed conditional probabilities for sample 162000 / 316755
[t-SNE] Computed con

[t-SNE] Computed conditional probabilities for sample 281000 / 316755
[t-SNE] Computed conditional probabilities for sample 282000 / 316755
[t-SNE] Computed conditional probabilities for sample 283000 / 316755
[t-SNE] Computed conditional probabilities for sample 284000 / 316755
[t-SNE] Computed conditional probabilities for sample 285000 / 316755
[t-SNE] Computed conditional probabilities for sample 286000 / 316755
[t-SNE] Computed conditional probabilities for sample 287000 / 316755
[t-SNE] Computed conditional probabilities for sample 288000 / 316755
[t-SNE] Computed conditional probabilities for sample 289000 / 316755
[t-SNE] Computed conditional probabilities for sample 290000 / 316755
[t-SNE] Computed conditional probabilities for sample 291000 / 316755
[t-SNE] Computed conditional probabilities for sample 292000 / 316755
[t-SNE] Computed conditional probabilities for sample 293000 / 316755
[t-SNE] Computed conditional probabilities for sample 294000 / 316755
[t-SNE] Computed con



[t-SNE] Iteration 50: error = 146.1621094, gradient norm = 0.0008558 (50 iterations in 21.653s)
[t-SNE] Iteration 100: error = 142.7243652, gradient norm = 0.0016430 (50 iterations in 19.512s)
[t-SNE] Iteration 150: error = 130.8240356, gradient norm = 0.0014898 (50 iterations in 17.392s)
[t-SNE] Iteration 200: error = 123.8704529, gradient norm = 0.0011295 (50 iterations in 16.435s)
[t-SNE] Iteration 250: error = 119.3483353, gradient norm = 0.0010984 (50 iterations in 16.084s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 119.348335
[t-SNE] Iteration 300: error = 7.4072609, gradient norm = 0.0007141 (50 iterations in 16.233s)
[t-SNE] Iteration 350: error = 7.1757441, gradient norm = 0.0006255 (50 iterations in 16.164s)
[t-SNE] Iteration 400: error = 6.8116813, gradient norm = 0.0005305 (50 iterations in 15.623s)
[t-SNE] Iteration 450: error = 6.4881120, gradient norm = 0.0004514 (50 iterations in 15.575s)
[t-SNE] Iteration 500: error = 6.1902261, gradient norm =

['output_tsne/tsne_lda_cloud_2.pkl']

In [21]:
# find the most probable topic for each recipe name (argmax over the topic
# axis); kept as a plain list because it is used both for indexing the
# colour palette and as a column of the plot's data source
_lda_keys = _topics.argmax(axis=1).tolist()

# show topics and their top words
topic_word = lda_model.topic_word_  # get the topic words
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(cvectorizer, 'get_feature_names_out'):
    vocab = list(cvectorizer.get_feature_names_out())
else:
    vocab = cvectorizer.get_feature_names()
vocab_arr = np.array(vocab)  # built once instead of once per topic
topic_summaries = []
for topic_dist in topic_word:
    # argsort is ascending, so walk backwards to get the n_top_words heaviest words
    topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

# colour palette indexed by topic id; it must contain at least n_topics entries
colormap = np.array([
    "#144c73", "#1f77b4", "#aec7e8", "#d6e2f3", "#c15a00", "#ff7f0e", "#ffbb78", "#ffe2c5",
    "#1c641c","#2ca02c", "#98df8a", "#cdefc6", "#951b1c", "#d62728", "#ff9896", "#ffe3e3",
    "#6e4196", "#9467bd", "#c5b0d5", "#eae2f0", "#5a3730", "#8c564b", "#c49c94","#dfcac5",
    "#d638a6", "#e377c2","#f0b6de", "#f7b6d2", "#fce4ee", "#595959", "#7f7f7f",
    "#c7c7c7","#ededed","#7c7c16", "#bcbd22", "#dbdb8d","#ededc7", "#0f7f8a",
    "#17becf", "#9edae5", "#daf1f5"
  ])

# plot
title = "[Recipe Topics] t-SNE visualization of LDA model trained on {} names, {} topics, thresholding at {} topic probability, {} iter ({} datapoints and top {} words)".format(X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words)

plot_lda = bp.figure(min_width=2000, min_height=1300, title=title,
                       tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                       x_axis_type=None, y_axis_type=None, min_border=1)

# Apply the same threshold mask that produced `_topics`, so each hover label
# belongs to the point it is attached to even when threshold > 0 drops recipes.
kept_names = [name for name, keep in zip(recipe_names, _idx) if keep]
source = bp.ColumnDataSource(data={
    'x': tsne_lda[:, 0],
    'y': tsne_lda[:, 1],
    'color': colormap[_lda_keys][:num_example],
    "content": kept_names[:num_example],
    "topic_key": _lda_keys[:num_example],
})
plot_lda.scatter(x='x', y='y',color='color',source=source)

# Anchor each topic label at the first plotted recipe whose dominant topic is
# that topic. np.unique(..., return_index=True) yields those first positions
# in one pass; topics that are never dominant keep NaN coordinates.
topic_coord = np.full((X_topics.shape[1], 2), np.nan)
plotted_keys = np.asarray(_lda_keys[:len(tsne_lda)], dtype=int)
seen_topics, first_rows = np.unique(plotted_keys, return_index=True)
topic_coord[seen_topics] = tsne_lda[first_rows]

# plot crucial words
for i in range(X_topics.shape[1]):
    if np.isnan(topic_coord[i]).any():
        continue  # no recipe has this topic as its most probable one
    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# write the interactive plot to a standalone HTML file (path is the cell output)
save(plot_lda, out_dir + '/NEW_20_recipe_names_final_tsne_2_lda_viz_{}_{}_{}_{}_{}_{}.html'.format(
    X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words))


     

  save(plot_lda, out_dir + '/NEW_20_recipe_names_final_tsne_2_lda_viz_{}_{}_{}_{}_{}_{}.html'.format(
  save(plot_lda, out_dir + '/NEW_20_recipe_names_final_tsne_2_lda_viz_{}_{}_{}_{}_{}_{}.html'.format(


'/Users/z0h03ws/Downloads/Food-Recipe-CNN/output_tsne/NEW_20_recipe_names_final_tsne_2_lda_viz_316755_40_0_500_316755_4.html'

In [None]:
# from IPython.display import Image
# PATH = "/Users/Muriz/Desktop/download-1.png"
# Image(filename = PATH, width='100%', height=140)

In [None]:
# Image(filename = '/Users/Muriz/Desktop/download.png', width='100%', height=140)