In [None]:
from gensim.models import Word2Vec
from sklearn.model_selection import ParameterGrid
import pandas as pd
import h5py
import os
import logging

In [None]:
# Route log records to word2vec.log. The level is set on the root logger, so
# DEBUG records emitted by imported libraries are captured as well.
# NOTE: basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(filename='word2vec.log', level=logging.DEBUG)

In [None]:
# Number of training threads: leave two cores free for the rest of the system,
# but never go below one. os.cpu_count() may return None, and on machines with
# two or fewer cores a plain subtraction would give 0 or a negative count.
workers = max(1, (os.cpu_count() or 1) - 2)

In [None]:
# https://stackoverflow.com/questions/40208420/how-to-find-hdf5-file-groups-keys-within-python
# List the top-level keys stored in the HDF5 file. The file is opened explicitly
# read-only: h5py releases before 3.0 default to append mode, which can create
# or modify the file when the intent is only to inspect it.
with h5py.File('binarized.hdf', 'r') as f:
    print(list(f.keys()))

In [None]:
# Column / index-level names shared by the cells below.
# Identifiers used to key ratings and movies.
USER_ID, MOVIE_ID = 'userId', 'movieId'
# Fields read from the rating frames.
LIKED, TIMESTAMP = 'Liked', 'Timestamp'
# Fields of the movie metadata frame (GENRE is not referenced in the visible cells).
TITLE, GENRE = 'title', 'genres'

In [None]:
def transform_df(_df):
    """Return a time-ordered copy of ``_df`` with a string movie-id column.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a ``TIMESTAMP`` column and an index level named ``MOVIE_ID``.
        It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``TIMESTAMP`` (ascending) with a ``MOVIE_ID``
        column holding the index-level values converted to ``str``.
    """
    # sort_values without inplace returns a new frame, so the caller's data is
    # never mutated behind its back. mergesort is stable: rows sharing a
    # timestamp keep their input order, which keeps the output deterministic.
    ordered = _df.sort_values(by=[TIMESTAMP], ascending=True, kind='mergesort')
    # Taken from the already-sorted index so the ids line up with the rows.
    ordered[MOVIE_ID] = ordered.index.get_level_values(MOVIE_ID).astype(str)
    return ordered
    

In [None]:
# Movie metadata indexed by movie id; used later to map ids back to titles.
df_movies = pd.read_csv('ml-20m/movies.csv', index_col=MOVIE_ID)

In [None]:
# Training split: keep only the rows flagged as liked, then order them by time.
df_trg = pd.read_hdf('binarized.hdf', key='trg')
df_trg = df_trg[df_trg[LIKED] == 1]
# Debug aid: enable the next line to work on a 50,000-row sample; keep it
# disabled for full runs.
# df_trg = df_trg.head(50000) # todo comment out for production
df_trg = transform_df(df_trg)
# Validation split: transformed the same way but NOT filtered on LIKED.
df_val = transform_df(pd.read_hdf('binarized.hdf', key='val'))

In [None]:
# Preview the first rows of the transformed training frame.
df_trg.head()

In [None]:
# Build one "sentence" per user: the movie ids of that user's rows, in the
# frame's row order. Group by the scalar key rather than a one-element list:
# with a list, pandas >= 2.0 yields 1-tuples such as (user_id,) as group keys
# when iterating, which would silently change the dictionary keys.
df_trg_gb = df_trg.groupby(USER_ID)
dict_groups_trg = {k: list(v[MOVIE_ID])
                   for k, v in df_trg_gb}
# Largest number of rows any single user has; used as the widest context window.
MAX_WINDOW_SIZE = df_trg_gb[LIKED].count().max()

In [None]:
# Per-user movie-id lists for the validation split. Group by the scalar key
# rather than a one-element list: with a list, pandas >= 2.0 yields 1-tuples
# such as (user_id,) as group keys when iterating, which would silently change
# the dictionary keys.
df_val_gb = df_val.groupby(USER_ID)
dict_groups_val = {k: list(v[MOVIE_ID])
                   for k, v in df_val_gb}

In [None]:
# Names of the tunable hyper-parameters; these are the keys of every dict
# yielded when iterating over param_grid.
VECTOR_SIZE = 'vector_size'
MIN_COUNT = 'min_count'
WINDOW_SIZE = 'window_size'
NEGATIVE_SAMPLING = 'negative_sampling'
ITERATIONS = 'iterations'
SKIP_GRAM = 'skip_gram'
HIERARCHICAL_SOFTMAX = 'hierarchical_softmax'

# Candidate window sizes, de-duplicated in order: if MAX_WINDOW_SIZE happens to
# equal 32 or 16 the same configuration would otherwise be trained twice.
_window_candidates = [MAX_WINDOW_SIZE, 32, 16]
_window_sizes = [w for i, w in enumerate(_window_candidates)
                 if w not in _window_candidates[:i]]

# Axes shared by every grid point.
_shared_axes = {
    VECTOR_SIZE: [16, 24, 32],
    MIN_COUNT: [1, 5, 10],
    # todo, see if iterations makes much of a difference
    ITERATIONS: [1],
    WINDOW_SIZE: _window_sizes,
    SKIP_GRAM: [1],  # 1 = skip-gram, 0 = CBOW
}

# Word2Vec needs at least one training objective. With negative=0 AND hs=0
# there is nothing to optimise, so that combination is left out of the grid:
# negative sampling is paired with both hs settings, while "no negative
# sampling" is only paired with hierarchical softmax switched on.
param_grid = ParameterGrid([
    {**_shared_axes, NEGATIVE_SAMPLING: [2], HIERARCHICAL_SOFTMAX: [1, 0]},
    {**_shared_axes, NEGATIVE_SAMPLING: [0], HIERARCHICAL_SOFTMAX: [1]},
])

In [None]:
# print(list(param_grid))

In [None]:
# Number of hyper-parameter combinations, i.e. how many models the training
# loop will fit.
len(param_grid)

In [None]:
# Grid search: fit one Word2Vec model per hyper-parameter combination and save
# each one to its own file. Parameters and timings are printed and logged.
for params in param_grid:
    print(params)
    start_dttm = pd.Timestamp('now')
    print(start_dttm)
    logging.debug('Params: {params}'.format(params=params))
    logging.debug('Start Train: {ts}'.format(ts=start_dttm))
    
    # Fit under grid parameters
    # Each user's list of movie ids is passed as one training "sentence".
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
    # renamed them to `vector_size` and `epochs` - confirm the installed version.
    model = Word2Vec(dict_groups_trg.values(),
                     workers=workers,
                     max_vocab_size=None,
                     max_final_vocab=None,
                     size=params[VECTOR_SIZE],
                     sg=params[SKIP_GRAM],
                     hs=params[HIERARCHICAL_SOFTMAX],
                     min_count=params[MIN_COUNT],
                     iter=params[ITERATIONS],
                     window=params[WINDOW_SIZE],
                     negative=params[NEGATIVE_SAMPLING],
                     seed=42,
                    )
    # seed=42 alone does not make runs reproducible: according to the gensim
    # docs, PYTHONHASHSEED must also be fixed (and training limited to a single
    # worker thread). So the seed helps, but does not guarantee identical runs.
    stop_dttm = pd.Timestamp('now')
    print(stop_dttm)
    logging.debug('Stop Train: {ts}'.format(ts=stop_dttm))
    logging.debug('Params: {}'.format(params))
    # Wall-clock training time for this combination.
    duration = stop_dttm - start_dttm
    logging.debug('Duration: {}'.format(duration))
    print(duration)
    print('===\n')
    # The output file name encodes every grid parameter so runs never collide.
    outpath = 'w2v_vs_{vs}_sg_{sg}_hs_{hs}_mc_{mc}_it_{it}_wn_{wn}_ng_{ng}.gensim'.format(
        vs=params[VECTOR_SIZE], 
        sg=params[SKIP_GRAM],
        hs=params[HIERARCHICAL_SOFTMAX],
        mc=params[MIN_COUNT],
        # lr=params[LEARNING_RATE],
        it=params[ITERATIONS],
        wn=params[WINDOW_SIZE], 
        ng=params[NEGATIVE_SAMPLING],
    )
    
    # Remove any model left over from a previous run before saving the new one.
    if os.path.isfile(outpath):
        os.remove(outpath)
    model.save(outpath)
    
# Debug aid: re-enable (indented inside the loop) to stop after the first combination.
#     break

In [None]:
def show_synonyms(search_str, num_synonyms):
    """Find the movies most similar to every title matching ``search_str``.

    Parameters
    ----------
    search_str : str
        Regular expression applied with ``Series.str.match``, i.e. anchored at
        the start of the title (hence the leading ``.*`` in typical patterns).
    num_synonyms : int
        Number of nearest neighbours requested per matched movie.

    Returns
    -------
    list
        ``((movie_id_token, similarity), title)`` entries, concatenated over
        all matched movies in index order. Matched movies that are absent from
        the model vocabulary contribute nothing.

    Notes
    -----
    Reads the notebook-level ``df_movies`` and ``model`` objects and prints the
    matched rows of ``df_movies``.
    """
    synonym_list = []
    movie_index = df_movies[df_movies[TITLE].str.match(search_str)]
    print(movie_index)
    for mi in movie_index.index:
        try:
            neighbours = model.wv.most_similar(str(mi), topn=num_synonyms)
        except KeyError:
            # The movie never made it into the vocabulary (e.g. dropped by
            # min_count or absent from the training data). Skip it instead of
            # aborting the lookup for every other matched title.
            print('movieId {} is not in the model vocabulary; skipped'.format(mi))
            continue
        # Vocabulary tokens are stringified movie ids; convert back to int to
        # look up the title with a single .loc call.
        synonym_list.extend(
            (hit, df_movies.loc[int(hit[0]), TITLE]) for hit in neighbours
        )
    return synonym_list

In [None]:
# Load a previously saved model for the qualitative checks below.
# NOTE(review): this file name does not follow the
# 'w2v_vs_.._sg_.._hs_.._mc_.._it_.._wn_.._ng_..' scheme written by the training
# loop, and vs=128 / it=8 / ng=5 are not values in the grid defined in this
# notebook, so the file presumably comes from an earlier run - confirm it
# exists, otherwise this cell fails on a fresh "Restart & Run All".
model = Word2Vec.load('w2v_vs_128_mc_1_it_8_wn_5774_ng_5.gensim')

In [None]:
# Qualitative check: five nearest neighbours of every title containing "Matrix".
show_synonyms('.*Matrix.*', 5)

In [None]:
# Qualitative check: five nearest neighbours of every title containing "Private Ryan".
show_synonyms('.*Private Ryan.*', 5)

In [None]:
# Qualitative check: five nearest neighbours of every title containing "Star Wars: Episode".
show_synonyms('.*Star Wars: Episode.*', 5)