In [None]:
from gensim.models import Word2Vec
import pandas as pd
import os
import numpy as np
import sys
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from sklearn.cluster import AgglomerativeClustering
from bokeh import palettes
from sqlalchemy import create_engine

# Installation (if fast = True below):
# https://pypi.org/project/fitsne/0.1.10/
# I'm unsure if the fftw.org part (below) is needed, 
# or if the pip commands listed on the pypi link 
# are enough to get it working (and get it working on multiple cores).
# Download FFTW and build it following http://fftw.org/install/mac.html, using the commands below:
# $ ./configure --enable-threads && make
# $ sudo make install
# You NEED to clone https://github.com/KlugerLab/FIt-SNE
# Then you NEED to execute
# g++ -std=c++11 -O3  src/sptree.cpp src/tsne.cpp src/nbodyfft.cpp  -o bin/fast_tsne -pthread -lfftw3 -lm
# Then you NEED to add the repository path (which will have your compiled code in ./bin)
# to sys.path, as shown on the next lines

# Switch between the compiled FIt-SNE wrapper (True) and scikit-learn's TSNE (False).
fast = True
if fast:
    # Directory of the cloned and compiled FIt-SNE repository. Set the
    # FITSNE_PATH environment variable to point at your own checkout; the
    # fallback is the path this notebook was originally written with.
    fitsne_path = os.environ.get(
        'FITSNE_PATH',
        '/Users/danielklass/Dropbox/GaTech/cse6242_project/FIt-SNE',
    )
    # Guard against stacking duplicate entries when the cell is re-run.
    if fitsne_path not in sys.path:
        sys.path.append(fitsne_path)
    from fast_tsne import fast_tsne # O(N) via FFT, see all the comments above...
else:
    from sklearn.manifold import TSNE # O(Nlog(N))

In [None]:
# Seed passed to the t-SNE step so the layout is reproducible.
RAND = 4
# Leave two cores free for the rest of the system, but never go below one
# thread: os.cpu_count() may return None, and on machines with two or fewer
# cores the plain subtraction would give zero or a negative thread count.
workers = max(1, (os.cpu_count() or 1) - 2)

In [None]:
# Column / index names used throughout the notebook, grouped by where they come from.
# Present in movies.csv:
MOVIE_ID, TITLE, GENRES = 'movieId', 'title', 'genres'
# Added by this notebook:
VECTOR = 'vector'
X, Y = 'x', 'y'
CLUSTER, COLOR = 'cluster', 'color'

In [None]:
# Saved gensim Word2Vec model to visualize. The file name is reused further
# down to label the files and rows derived from this model.
model_filename = 'w2v_vs_16_sg_1_hs_1_mc_1_it_1_wn_32_ng_2.gensim'
model_path = os.path.join('gensim_models2', model_filename)
model = Word2Vec.load(model_path)

In [None]:
# Read the movie table from ml-20m/movies.csv, using the movieId column as the index.
df_movies = pd.read_csv(
    'ml-20m/movies.csv',
    index_col=MOVIE_ID,
)

In [None]:
def get_movie_vector(i):
    """Return the embedding stored in the loaded model for movie id ``i``.

    The id is converted to ``str`` before the lookup in ``model.wv``.
    When the model has no entry for that key (``KeyError``), ``np.nan``
    is returned instead so the caller can filter such rows out.
    """
    key = str(i)
    try:
        vector = model.wv.get_vector(key)
    except KeyError:
        vector = np.nan
    return vector

In [None]:
# Look up the embedding for every movie id in the index; ids unknown to the
# model come back as NaN from get_movie_vector.
movie_ids = df_movies.index.get_level_values(MOVIE_ID)
df_movies[VECTOR] = movie_ids.map(get_movie_vector)

In [None]:
# Keep only the movies for which the model returned a vector.
# The explicit copy makes the filtered frame independent of the original, so
# the column assignments in later cells do not raise SettingWithCopyWarning.
df_movies = df_movies[pd.notnull(df_movies[VECTOR])].copy()

In [None]:
# Pull the per-movie embeddings out of the frame as a numpy array of arrays.
embedding_column = df_movies[VECTOR]
vectors = embedding_column.to_numpy()

In [None]:
# Stack the individual embeddings row-wise into a single 2-D matrix
# (one row per movie), which is what the clustering and t-SNE steps consume.
vectors = np.vstack(list(vectors))

In [None]:
# Ward-linkage agglomerative clustering of the embedding matrix into 128 groups.
# fit() returns the estimator itself, so `clusters` refers to the fitted `clustering`.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=128)
clusters = clustering.fit(vectors)

In [None]:
# Project the embedding matrix to two dimensions for plotting.
if fast:
    # strongly recommended, fast O(N)
    tsne_result = fast_tsne(vectors, seed=RAND, nthreads=workers)
else:
    # slow O(Nlog(N)) -- the original author notes this path never finished.
    # TSNE is an estimator: the data must go to fit_transform(), not to the
    # constructor (whose first parameter is n_components), otherwise
    # tsne_result is an unfitted estimator and cannot be sliced below.
    tsne_result = TSNE(n_components=2, random_state=RAND).fit_transform(vectors)

In [None]:
# Store the two t-SNE coordinates as the x and y columns of the movie table.
for axis, column in enumerate((X, Y)):
    df_movies[column] = tsne_result[:, axis]

In [None]:
# 20-colour categorical palette used to colour the clusters in the scatter plot.
palette = palettes.Category20[20]

In [None]:
# Sanity check: display the shape of the fitted clustering's children_ array.
clusters.children_.shape

In [None]:
# Sanity check: display the number of rows currently in df_movies.
len(df_movies)

In [None]:
# Attach each movie's cluster label and a display colour for it.
df_movies[CLUSTER] = clusters.labels_
# There are more clusters than palette entries, so indexing the palette with
# the raw label raises IndexError for high labels. Wrap around instead;
# distinct clusters may therefore share a colour.
df_movies[COLOR] = df_movies[CLUSTER].map(lambda label: palette[label % len(palette)])

In [None]:
# Build the interactive scatter plot of the 2-D embedding.
# The vector column holds one numpy array per row and is not referenced by
# any glyph or tooltip, so it is left out of the data source to keep it from
# being serialized into the exported HTML.
source = ColumnDataSource(df_movies.drop(columns=[VECTOR]))
# Hover tooltip: one (label, '@field') pair per displayed column.
TOOLTIPS = [(field, '@{}'.format(field)) for field in [MOVIE_ID, TITLE, X, Y]]
fig = figure(plot_width=800, plot_height=800, tooltips=TOOLTIPS, output_backend='webgl')
# Field names are given through the shared column-name constants.
fig.circle(X, Y, source=source, size=8, color=COLOR)

In [None]:
# Send Bokeh output to an HTML file named after the model, with BokehJS loaded from the CDN.
html_name = 'bokeh_{}.html'.format(model_filename)
output_file(html_name, title=model_filename, mode='cdn')

In [None]:
# Display the scatter plot built above.
show(fig)

In [None]:
# Export the plotted data (everything except the raw vectors) as a
# tab-separated file named after the model.
export_columns = [TITLE, GENRES, X, Y, CLUSTER, COLOR]
tsv_name = 'Movie_Data_{}.tsv'.format(model_filename)
df_movies[export_columns].to_csv(tsv_name, sep='\t')

In [None]:
# This is used to write data to Django's database
# Shape the frame for that table: rename the title column, name the index
# movie_id, and tag every row with the model file that produced it.
df_output = df_movies[[TITLE, GENRES, X, Y, CLUSTER]].rename(
    columns={'title': 'movie_title'},
)
df_output.index = df_output.index.rename('movie_id')
df_output['embedder'] = model_filename

In [None]:
# Append the prepared rows to the movie_edge_movie table of the SQLite
# database, inside a transaction opened with eng.begin().
# NOTE(review): if_exists='append' means re-running this cell adds the same
# rows again -- confirm whether the table guards against duplicates.
eng = create_engine('sqlite:///cse6242_team5/db.sqlite3')
with eng.begin() as con:
    df_output.to_sql(name='movie_edge_movie', con=con, if_exists='append')