In [1]:
from gensim.models import Word2Vec
import pandas as pd
import os
import numpy as np
import sys
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from sklearn.cluster import AgglomerativeClustering
from bokeh import palettes
from sqlalchemy import create_engine
import pickle

# Installation (if fast = True below):
# https://pypi.org/project/fitsne/0.1.10/
# I'm unsure if the fftw.org part (below) is needed, 
# or if the pip commands listed on the pypi link 
# are enough to get it working (and get it working on multiple cores).
# Download and execute:
# http://fftw.org/install/mac.html with commands below
# $./configure --enable-threads && make
# $ sudo make install
# You NEED to clone https://github.com/KlugerLab/FIt-SNE
# Then you NEED to execute
# g++ -std=c++11 -O3  src/sptree.cpp src/tsne.cpp src/nbodyfft.cpp  -o bin/fast_tsne -pthread -lfftw3 -lm
# Then you NEED to add the repository path (which will have your compiled code in ./bin)
# to sys as shown on the next lines

# Switch between the compiled FIt-SNE wrapper (fast) and scikit-learn's TSNE.
fast = True
if fast:
    # Directory of the cloned and compiled FIt-SNE repository. Set the
    # FITSNE_PATH environment variable to point at your own checkout; the
    # fallback is the original author's location.
    fitsne_path = os.environ.get(
        'FITSNE_PATH',
        '/Users/danielklass/Dropbox/GaTech/cse6242_project/FIt-SNE')
    # Guard so that re-running this cell does not stack duplicate entries.
    if fitsne_path not in sys.path:
        sys.path.append(fitsne_path)
    from fast_tsne import fast_tsne # O(N) via FFT, see all the comments above...
else:
    from sklearn.manifold import TSNE # O(Nlog(N))

In [2]:
# Seed passed to the t-SNE implementation.
RAND = 4
# Leave two cores free for the rest of the system, but never ask for fewer
# than one thread: os.cpu_count() may return None, and on machines with two
# cores or fewer the plain subtraction would give 0 or a negative number.
workers = max(1, (os.cpu_count() or 1) - 2)

In [3]:
# Column names used throughout the notebook, grouped by where they come from.

# Columns read from the movies / ratings CSV files.
MOVIE_ID, TITLE, GENRES, RATING = 'movieId', 'title', 'genres', 'rating'

# Column holding each movie's embedding vector.
VECTOR = 'vector'

# Per-movie rating aggregates.
COUNT, MEAN, STDDEV = 'count', 'mean', 'std'

# 2-D t-SNE coordinates.
X, Y = 'x', 'y'

# Cluster label and the colour assigned to it.
CLUSTER, COLOR = 'cluster', 'color'

In [4]:
# Name of the trained Word2Vec model; also reused below to name output files.
model_filename = 'w2v_vs_64_sg_1_hs_1_mc_1_it_4_wn_32_ng_2_all_data_trg_val_tst.gensim'
# Models are stored in the gensim_models2 directory.
model_path = os.path.join('gensim_models2', model_filename)
model = Word2Vec.load(model_path)

In [5]:
# Load the pickled movie metadata; it is later indexed by movie id.
# NOTE(review): pickle.load can execute arbitrary code, so only load
# metadata.pkl when it comes from a trusted source.
with open('metadata.pkl', 'rb') as f:
    dict_metadata = pickle.load(f)

In [6]:
# Peek at the metadata record stored under key 1 to see which fields exist.
dict_metadata[1]

{'Title': 'Toy Story',
 'Year': '1995',
 'Rated': 'G',
 'Released': '22 Nov 1995',
 'Runtime': '81 min',
 'Genre': 'Animation, Adventure, Comedy, Family, Fantasy',
 'Director': 'John Lasseter',
 'Writer': 'John Lasseter (original story by), Pete Docter (original story by), Andrew Stanton (original story by), Joe Ranft (original story by), Joss Whedon (screenplay by), Andrew Stanton (screenplay by), Joel Cohen (screenplay by), Alec Sokolow (screenplay by)',
 'Actors': 'Tom Hanks, Tim Allen, Don Rickles, Jim Varney',
 'Plot': 'A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a

In [7]:
# Load the movie catalogue, indexed by movie id.
df_movies = pd.read_csv('ml-20m/movies.csv', index_col=MOVIE_ID)

In [8]:
# Reduce the raw ratings to one row per movie holding the number of ratings,
# their mean and their standard deviation.
df_rating = (
    pd.read_csv('ml-20m/ratings.csv')
    .groupby([MOVIE_ID])[RATING]
    .agg([COUNT, MEAN, STDDEV])
)

In [9]:
def get_movie_vector(i):
    """Look up the embedding of movie id ``i`` in the loaded Word2Vec model.

    The id is converted to a string before the lookup. Returns the vector
    found in ``model.wv``, or ``np.nan`` when the lookup raises ``KeyError``.
    """
    try:
        embedding = model.wv.get_vector(str(i))
    except KeyError:
        embedding = np.nan
    return embedding

In [10]:
# Attach each movie's embedding as a new column; movies for which the lookup
# fails get NaN instead.
df_movies[VECTOR] = df_movies.index.get_level_values(MOVIE_ID).map(get_movie_vector)

In [11]:
# Number of movies before dropping those without an embedding.
len(df_movies)

27278

In [12]:
# Keep only the movies that received an embedding; copy so that later column
# assignments operate on an independent frame.
has_vector = df_movies[VECTOR].notna()
df_movies = df_movies.loc[has_vector].copy()

In [13]:
# Number of movies left after the filter.
len(df_movies)

23892

In [14]:
# Pull the embedding column out as a NumPy array with one entry per movie.
vectors = df_movies[VECTOR].to_numpy()

In [15]:
# Stack the per-movie vectors row-wise into a single matrix (one row per movie).
vectors = np.vstack(vectors)

In [16]:
# Group the movie embeddings into 20 clusters with Ward-linkage agglomerative
# clustering.
# NOTE(review): 20 matches the size of the 20-colour palette that is indexed
# by cluster label further down; keep the two in sync.
clustering = AgglomerativeClustering(
    n_clusters=20,
    linkage='ward',
)
# The fit result is kept as `clusters`; its labels_ and children_ are read later.
clusters = clustering.fit(vectors)

In [17]:
# Project the embeddings down to two dimensions for plotting.
if fast:
    # strongly recommended, fast O(N)
    tsne_result = fast_tsne(vectors, seed=RAND, nthreads=workers)
else:
    # slow O(Nlog(N)); reported as never finishing on this data set.
    # TSNE(...) only builds the estimator, and its first positional parameter
    # is not the data. fit_transform(vectors) is what actually computes and
    # returns the (n_samples, 2) coordinate array indexed in the next cell.
    tsne_result = TSNE(n_components=2, random_state=RAND).fit_transform(vectors)

In [18]:
# Store the first and second t-SNE output columns as the plot coordinates.
df_movies[X] = tsne_result[:, 0]
df_movies[Y] = tsne_result[:, 1]

In [19]:
# Colour palette that is indexed by cluster label when colouring the points.
palette = palettes.Category20_20

In [20]:
# Inspect the shape of the fitted clustering's children_ array.
clusters.children_.shape

(23891, 2)

In [21]:
# Row count of df_movies, for comparison with the shape shown above.
len(df_movies)

23892

In [22]:
# Record each movie's cluster label ...
df_movies[CLUSTER] = clusters.labels_
# ... and translate the label into the palette colour at that position.
df_movies[COLOR] = df_movies[CLUSTER].map(lambda x: palette[x])

In [23]:
# Scatter plot of the t-SNE coordinates, one circle per movie, coloured by cluster.
source = ColumnDataSource(df_movies)
# Hover tooltips: (label, '@column') pairs for the id, title and coordinates.
TOOLTIPS = [(column, '@' + column) for column in (MOVIE_ID, TITLE, X, Y)]
fig = figure(
    plot_width=800,
    plot_height=800,
    tooltips=TOOLTIPS,
    output_backend='webgl',
)
fig.circle(X, Y, source=source, size=8, color=COLOR)

In [24]:
# Direct Bokeh output to an HTML file named (and titled) after the model.
output_file('bokeh_{}.html'.format(model_filename), title=model_filename, mode='cdn')

In [25]:
# Display the figure using the output configured above.
show(fig)

In [26]:
# Preview the first rows of the per-movie rating aggregates.
df_rating.head()

Unnamed: 0_level_0,count,mean,std
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,49695,3.92124,0.889012
2,22243,3.211977,0.95115
3,12735,3.15104,1.006642
4,2756,2.861393,1.095702
5,12161,3.064592,0.98214


In [27]:
# Attach the rating count/mean/std to each movie by joining on the movie id.
# NOTE(review): merge defaults to an inner join, so movies without any rating
# row are dropped here; confirm that is intended.
# NOTE(review): this rebinds df_movies, so re-running the cell merges the
# rating columns a second time.
df_movies = df_movies.merge(df_rating, on=[MOVIE_ID])

In [28]:
# Export the plotted movie data (without the raw vectors) as a tab-separated
# file named after the model.
df_movies[[TITLE, GENRES, X, Y, CLUSTER, COLOR, MEAN, STDDEV, COUNT]].to_csv(
    'Movie_Data_{}.tsv'.format(model_filename), sep='\t')

In [29]:
# This is used to write data to Django's database
df_output = df_movies[[TITLE, GENRES, X, Y, CLUSTER, MEAN, STDDEV, COUNT]]
df_output = df_output.rename(columns={
    'title': 'movie_title',
})
df_output.index.rename('movie_id', inplace=True)
df_output['embedder'] = model_filename

In [35]:
# Metadata
POSTER_URL = 'poster_url'
RUNTIME = 'runtime'
DIRECTOR = 'director'
ACTORS = 'actors'
METASCORE = 'metascore'
IMDB_RATING = 'imdb_rating'
IMDB_VOTES = 'imdb_votes'

# Marker checked for below to detect a missing metadata value.
NOT_AVAILABLE = 'N/A'


def _metadata_column(field, convert):
    """Build one output column from the metadata dictionary.

    For every movie id in df_output's index, reads dict_metadata[id][field]
    and passes the raw value through `convert`. The result is in index order,
    so it can be assigned straight to a df_output column.
    Raises KeyError if a movie id or `field` is missing from dict_metadata.
    """
    return df_output.index.map(lambda movie_id: convert(dict_metadata[movie_id][field]))


def _int_or_none(text):
    """Return int(text) when text consists only of digits, otherwise None."""
    return int(text) if text.isdigit() else None


def _pipe_separated(text):
    """Turn a ', '-separated list into a '|'-separated one."""
    return text.replace(', ', '|')


df_output[POSTER_URL] = _metadata_column(
    'Poster', lambda url: None if url == NOT_AVAILABLE else url)
# Runtime arrives as e.g. '81 min'; anything that is not purely digits after
# stripping the unit becomes None.
df_output[RUNTIME] = _metadata_column(
    'Runtime', lambda runtime: _int_or_none(runtime.replace(' min', '')))
df_output[DIRECTOR] = _metadata_column('Director', _pipe_separated)
df_output[ACTORS] = _metadata_column('Actors', _pipe_separated)
df_output[METASCORE] = _metadata_column('Metascore', _int_or_none)
df_output[IMDB_RATING] = _metadata_column(
    'imdbRating', lambda rating: None if rating == NOT_AVAILABLE else float(rating))
# Vote counts use thousands separators, which are stripped before conversion.
df_output[IMDB_VOTES] = _metadata_column(
    'imdbVotes', lambda votes: None if votes == NOT_AVAILABLE else int(votes.replace(',', '')))

# Inspect the distinct poster URLs. This runs only after the column exists,
# and missing posters (None) are dropped first because None cannot be ordered
# against str.
sorted(df_output[POSTER_URL].dropna().unique())

In [38]:
# Write the output frame into the `movie_edge_movie` table of the project's
# SQLite database, inside a single `eng.begin()` block.
# NOTE(review): if_exists='append' adds rows to whatever is already in the
# table, so re-running this cell inserts the same movies again; confirm the
# table is emptied or de-duplicated beforehand.
eng = create_engine('sqlite:///cse6242_team5/db.sqlite3')
with eng.begin() as con:
    df_output.to_sql('movie_edge_movie', con, if_exists='append')