In [21]:
from gensim.models import Word2Vec
from sklearn.model_selection import ParameterGrid
import pandas as pd
import h5py
import os
import logging

In [22]:
def show_synonyms(model, search_str, num_synonyms, verbose=True):
    """Collect the nearest neighbours of every movie whose title matches a regex.

    Relies on the notebook-level ``df_movies`` frame (indexed by movie id) and
    the ``TITLE`` column-name constant; both must be defined before the call.

    Parameters
    ----------
    model : object exposing ``model.wv.most_similar(key, topn=...)``
        Queried with each matched movie id converted to ``str``.
    search_str : str
        Regular expression handed to ``Series.str.match`` on the title column,
        so it is matched from the start of each title.
    num_synonyms : int
        Number of neighbours requested per matched movie.
    verbose : bool, default True
        When true, print the matched rows, any skipped ids and the summary
        statistics.

    Returns
    -------
    tuple
        ``(synonym_list, mean, stddev)``. ``synonym_list`` holds
        ``((neighbour_id, similarity), neighbour_title)`` entries; ``mean`` and
        ``stddev`` summarise the similarities and are NaN when no neighbour
        was collected.
    """
    matched_movies = df_movies[df_movies[TITLE].str.match(search_str)]
    synonym_list = []
    skipped_ids = []
    for movie_id in matched_movies.index:
        try:
            neighbours = model.wv.most_similar(str(movie_id), topn=num_synonyms)
        except KeyError:
            # The model has no vector for this id; skip it instead of
            # aborting the lookup for every other matched movie.
            skipped_ids.append(movie_id)
            continue
        # Single .loc[row, column] lookup instead of the chained .loc[row][column].
        synonym_list.extend(
            (neighbour, df_movies.loc[int(neighbour[0]), TITLE])
            for neighbour in neighbours
        )
    # Explicit float dtype keeps mean/std well-defined (NaN) for an empty list.
    cosine_similarity = pd.Series(
        [neighbour[1] for neighbour, _title in synonym_list], dtype=float
    )
    mean = cosine_similarity.mean()
    stddev = cosine_similarity.std()
    if verbose:
        print(matched_movies)
        if skipped_ids:
            print('Skipped (no vector in model): {}'.format(skipped_ids))
        print('Mean: {} \t StdDev: {}'.format(mean, stddev))
    return synonym_list, mean, stddev

In [23]:
# Alternation of title patterns used as the probe set when comparing models.
# Raw strings are used so the regex escape `\(` is not treated as an
# (invalid) Python string escape; the resulting pattern text is unchanged.
MOVIE_TARGETS = '|'.join([
    r'(^Matrix, The)',
    r'(Saving Private Ryan)',
    r'(Lawrence of Arabia)',
    r'(Bridesmaids)',
    r'(^Notebook, The \(2004)',
    r'(^.*Zhivago.*)',
])

In [24]:
# Column names of the movies table: the id column (used as the index) and the title column.
MOVIE_ID, TITLE = 'movieId', 'title'

In [25]:
# Movie table indexed by movie id, so ids can be looked up directly with .loc.
df_movies = pd.read_csv(
    filepath_or_buffer='ml-20m/movies.csv',
    index_col=MOVIE_ID,
)

In [26]:
# Every saved model (*.gensim) found in the current working directory.
files = [name for name in os.listdir() if name.endswith('.gensim')]

In [27]:
# One row per saved model: file name plus the mean / stddev of the similarities
# returned by show_synonyms for the MOVIE_TARGETS probe titles (5 neighbours each).
results = pd.DataFrame(
    [
        [model_file, *show_synonyms(Word2Vec.load(model_file), MOVIE_TARGETS, 5, False)[1:]]
        for model_file in files
    ],
    columns=['file', 'mean', 'stddev'],
)

In [28]:
# Display the models ordered from lowest to highest mean similarity.
results.sort_values(by='mean', ascending=True)

Unnamed: 0,file,mean,stddev
2,w2v_vs_64_sg_0_hs_0_mc_1_it_1_wn_5774_ng_0.gensim,0.463294,0.030951
15,w2v_vs_64_sg_1_hs_0_mc_1_it_1_wn_5774_ng_0.gensim,0.463294,0.030951
22,w2v_vs_64_sg_0_hs_1_mc_1_it_1_wn_5774_ng_2.gensim,0.544947,0.15916
16,w2v_vs_64_sg_0_hs_1_mc_1_it_1_wn_5774_ng_0.gensim,0.583604,0.147688
12,w2v_vs_64_sg_0_hs_0_mc_1_it_1_wn_5774_ng_2.gensim,0.774507,0.154974
7,w2v_vs_128_sg_1_hs_1_mc_1_it_1_wn_16_ng_2.gensim,0.783779,0.042104
9,w2v_vs_64_sg_1_hs_1_mc_1_it_1_wn_16_ng_0.gensim,0.785017,0.073539
18,w2v_vs_64_sg_1_hs_1_mc_1_it_1_wn_32_ng_0.gensim,0.808674,0.064195
14,w2v_vs_128_sg_1_hs_1_mc_1_it_1_wn_32_ng_2.gensim,0.823294,0.043644
13,w2v_vs_128_sg_1_hs_1_mc_1_it_1_wn_5774_ng_2.ge...,0.834297,0.090579


In [29]:
# Notes from manually spot-checking individual models (with the highest similarity seen):
#   w2v_vs_64_sg_0_hs_1_mc_1_it_1_wn_5774_ng_0.gensim  - not horrible, not great; max ~0.7
#   w2v_vs_64_sg_1_hs_1_mc_1_it_1_wn_5774_ng_0.gensim  - on point, quite good; max ~0.9
#   w2v_vs_64_sg_0_hs_1_mc_1_it_1_wn_5774_ng_2.gensim  - not horrible, not great; max ~0.64
#   w2v_vs_64_sg_0_hs_0_mc_1_it_1_wn_5774_ng_0.gensim  - almost junk; max ~0.47
#   w2v_vs_64_sg_1_hs_0_mc_1_it_1_wn_5774_ng_0.gensim  - almost junk; max ~0.47
#   w2v_vs_64_sg_1_hs_0_mc_1_it_1_wn_5774_ng_2.gensim  - super good; max ~0.95
#   w2v_vs_64_sg_1_hs_1_mc_1_it_1_wn_5774_ng_2.gensim  - super good; max ~0.95
#   w2v_vs_64_sg_0_hs_0_mc_1_it_1_wn_5774_ng_2.gensim  - mixed bag; max ~0.86
# Load one saved model and inspect its neighbours for the "Walter Mitty" titles.
model = Word2Vec.load('w2v_vs_16_sg_1_hs_1_mc_1_it_1_wn_32_ng_2.gensim')
show_synonyms(model, search_str='.*Walter Mitty.*', num_synonyms=5)

                                           title                   genres
movieId                                                                  
7826     Secret Life of Walter Mitty, The (1947)  Comedy|Romance|Thriller
106918   Secret Life of Walter Mitty, The (2013)   Adventure|Comedy|Drama
Mean: 0.9720546066761017 	 StdDev: 0.013300776040472128


([(('7840', 0.9700156450271606), 'Gunga Din (1939)'),
  (('7888', 0.9589544534683228),
   'How to Succeed in Business Without Really Trying (1967)'),
  (('8651', 0.9587623476982117), 'Man of La Mancha (1972)'),
  (('8451', 0.9573909640312195), 'Blackboard Jungle (1955)'),
  (('6561', 0.9562827348709106), 'Mouse That Roared, The (1959)'),
  (('102407', 0.9916270971298218), 'Great Gatsby, The (2013)'),
  (('104374', 0.9839649796485901), 'About Time (2013)'),
  (('106487', 0.9818593263626099), 'Hunger Games: Catching Fire, The (2013)'),
  (('103341', 0.9812790155410767), "World's End, The (2013)"),
  (('102903', 0.9804095029830933), 'Now You See Me (2013)')],
 0.9720546066761017,
 0.013300776040472128)