In [1]:
import pandas as pd
from pyspark.mllib.feature import Word2Vec
from pyspark import SparkContext
import h5py
import os
import sys

In [2]:
cpus = int(os.cpu_count() / 2)

In [3]:
# https://stackoverflow.com/questions/40208420/how-to-find-hdf5-file-groups-keys-within-python
with h5py.File('binarized.hdf') as f:
    print(f.keys())

<KeysViewHDF5 ['ratings', 'trg', 'tst']>


In [4]:
sc = SparkContext('local[{cpus}]'.format(cpus=cpus), 'word2vec')

In [24]:
# COLUMNS
LIKED = 'Liked'
MOVIE_ID = 'movieId'
USER_ID = 'userId'
TIMESTAMP = 'Timestamp'
TITLE = 'title'
GENRE = 'genres'

In [20]:
df_movies = pd.read_csv('ml-20m/movies.csv', index_col=MOVIE_ID)

In [6]:
df = pd.read_hdf('binarized.hdf', key='ratings')

In [7]:
df = df.drop([TIMESTAMP], axis=1)

In [8]:
df = df[df[LIKED] == 1]
df[MOVIE_ID] = df.index.get_level_values(MOVIE_ID).astype(str)

In [9]:
len(df)

13062852

In [None]:
# df = df.head(10000) # comment out for production

In [10]:
# PARAMETERS
# The most ratings any user has had
df_gb = df.groupby([USER_ID])
WINDOW_SIZE = df_gb[LIKED].count().max() 

In [11]:
WINDOW_SIZE 

8241

In [12]:
dict_str_groups = {k: list(v[MOVIE_ID]) for k, v in df_gb}

In [13]:
document = sc.parallelize(dict_str_groups.values(), cpus)

In [14]:
word2vec = Word2Vec()
word2vec.setWindowSize(WINDOW_SIZE)
model = word2vec.fit(document)

In [15]:
model

<pyspark.mllib.feature.Word2VecModel at 0x107c96908>

In [17]:
model.save(sc, 'trained_wor2vec_pyspark.sparkmodel')

In [39]:
def show_synonyms(search_str, num_synonyms):
    synonym_list = list()
    movie_index = df_movies[df_movies[TITLE].str.match('.*Matrix.*')].index
    for mi in movie_index:
        synonym_list.extend([(i, df_movies.loc[int(i[0])][TITLE]) for i in 
                             list(model.findSynonyms(str(mi), num_synonyms))])
    return synonym_list

In [40]:
show_synonyms('.*Matrix.*', 5)

[(('1625', 0.996126651763916), 'Game, The (1997)'),
 (('1961', 0.9959763884544373), 'Rain Man (1988)'),
 (('1923', 0.9958289861679077), "There's Something About Mary (1998)"),
 (('1968', 0.995648980140686), 'Breakfast Club, The (1985)'),
 (('1954', 0.9956085085868835), 'Rocky (1976)'),
 (('7153', 0.9908901453018188),
  'Lord of the Rings: The Return of the King, The (2003)'),
 (('6863', 0.990288496017456), 'School of Rock (2003)'),
 (('6502', 0.990177571773529), '28 Days Later (2002)'),
 (('6373', 0.9900848865509033), 'Bruce Almighty (2003)'),
 (('6874', 0.9900653958320618), 'Kill Bill: Vol. 1 (2003)'),
 (('7143', 0.9917793869972229), 'Last Samurai, The (2003)'),
 (('7254', 0.9901487231254578), 'The Butterfly Effect (2004)'),
 (('8360', 0.9885937571525574), 'Shrek 2 (2004)'),
 (('8636', 0.9852023720741272), 'Spider-Man 2 (2004)'),
 (('7147', 0.9850006699562073), 'Big Fish (2003)')]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
6365,"Matrix Reloaded, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX
6934,"Matrix Revolutions, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX
