In [None]:
import pandas as pd
import os
from pathlib import Path
from tensorflow import keras as k  # using 2.0.0-rc0
import numpy as np
import random
np.set_printoptions(linewidth=128)
%matplotlib inline

In [None]:
# Dataset root, relative to the notebook's working directory
# (presumably the extracted MovieLens 20M archive - confirm).
path = Path('ml-20m')  # pathlib overloads "/" to join path components

In [None]:
# Column names of the CSV files read below, kept as constants so that the
# index/column lookups in later cells all use the same spelling.
USERID = 'userId'
MOVIEID = 'movieId'
RATING = 'rating'
TITLE = 'title'    # not referenced in the part of the notebook visible here
GENRES = 'genres'  # not referenced in the part of the notebook visible here

In [None]:
# Sanity check: show which entries exist in the dataset directory.
list(path.iterdir())

In [None]:
# Ratings as a Series (despite the df_ prefix) indexed by (userId, movieId).
# usecols restricts parsing to the three columns that are actually needed, so
# any other column in the file is never loaded into memory.
df_ratings = pd.read_csv(
    path / 'ratings.csv',
    usecols=[USERID, MOVIEID, RATING],
    index_col=[USERID, MOVIEID],
)[RATING]

In [None]:
# WARNING: movieId values are NOT the contiguous range 0..len(df_movies)-1,
# so a movieId cannot be used directly as an array position.
df_movies = pd.read_csv(path / 'movies.csv', index_col=MOVIEID)
# Keep the index on its own: its order is what assigns every movie a fixed
# position in the vectors built in later cells.
df_movies_index = df_movies.index

In [None]:
# Lookup table: movieId -> 0-based position of that movie in df_movies_index.
movie_id_decoder = dict(zip(df_movies_index, range(len(df_movies_index))))

In [None]:
# Preview the first rows of the movie table (indexed by movieId).
df_movies.head()

In [None]:
# Preview the first ratings; each one is keyed by a (userId, movieId) pair.
df_ratings.head()

In [None]:
# Distinct user ids that appear in the ratings and distinct movie ids listed
# in the movie table. Index.unique() de-duplicates inside pandas first, so
# only the unique ids are turned into Python objects for the set (instead of
# one object per rating row).
distinct_users = set(df_ratings.index.unique(level=USERID))
distinct_users_count = len(distinct_users)
distinct_movies = set(df_movies.index.unique(level=MOVIEID))
distinct_movies_count = len(distinct_movies)


In [None]:
def form_one_hot_movie(_i):
    """Return the one-hot vector for movie id ``_i``.

    The result is a float array with ``distinct_movies_count`` entries. The
    entry at the position that ``movie_id_decoder`` assigns to ``_i`` is 1.0
    and every other entry is 0.0.

    Raises ``KeyError`` when ``_i`` is not a key of ``movie_id_decoder``.
    """
    _slot = movie_id_decoder[_i]
    _one_hot = np.zeros(distinct_movies_count)
    _one_hot[_slot] = 1.0
    return _one_hot

In [None]:
def audit_dict(_dict, _memory_limit=20000, _keep_fraction=0.9):
    """Keep a cache dict bounded by evicting random entries once it is too big.

    Parameters
    ----------
    _dict : dict
        The cache to check. It is never modified.
    _memory_limit : int, optional
        Largest number of entries tolerated before trimming; tune to suit
        your computer's memory limits.
    _keep_fraction : float, optional
        After trimming, ``int(_memory_limit * _keep_fraction)`` entries remain.

    Returns
    -------
    dict
        ``_dict`` itself when it holds at most ``_memory_limit`` entries,
        otherwise a NEW dict containing a random subset of its items. Callers
        must therefore continue with the returned dict.
    """
    # https://stackoverflow.com/questions/53124979/get-a-random-subset-of-a-dictionary
    if len(_dict) <= _memory_limit:
        return _dict

    print('Hit limit')
    # random.sample() requires a sequence; a dict view is rejected with a
    # TypeError on Python 3.11+, so materialise the items into a list first.
    _survivors = random.sample(list(_dict.items()), int(_memory_limit * _keep_fraction))
    _trimmed = dict(_survivors)
    print('New Dict Len: {}'.format(len(_trimmed)))
    return _trimmed

def form_output(_user_id, _user_data, _dict_cache):
    """Return a user's rating vector aligned to ``df_movies_index``, plus the cache.

    ``_dict_cache`` is first passed through ``audit_dict``, which may hand back
    a different dict. If ``_user_id`` is not in that dict, ``_user_data`` is
    reindexed onto ``df_movies_index``, missing entries are filled with 0.0,
    and the resulting array is stored under ``_user_id``.

    Returns a ``(vector, cache)`` tuple; the caller must keep using the
    returned cache.

    Author's note: something faster is needed here, not sure what.
    """
    _cache = audit_dict(_dict_cache)

    if _user_id not in _cache:
        _aligned = _user_data.reindex(df_movies_index)
        _cache[_user_id] = _aligned.fillna(0.0).to_numpy()

    return _cache[_user_id], _cache

In [None]:
# Try form_output on user 1, starting from an empty cache.
# It returns a pair:
#   1. the user's ratings aligned to the movie index, with 0.0 for every
#      movie the user has not rated;
#   2. the cache dict, which may or may not have been updated.
# The cache is returned instead of being kept as a global, so the caller has
# to pass it along from one call to the next.
form_output(1, df_ratings.loc[1], {})

In [None]:
# Pre-compute the one-hot vector of every movie: {movie_id: one_hot}.
# A movie_id is not an array position, so form_one_hot_movie first translates
# it to its slot; every vector therefore has distinct_movies_count entries.
# NOTE(review): this holds one dense vector per movie, so memory use grows
# with the square of the number of movies.
dict_movie_one_hots = dict(
    zip(df_movies_index, map(form_one_hot_movie, df_movies_index))
)

In [None]:
# Model: one-hot movie vector in -> one predicted rating per movie out.
#
# The training loop supplies x as one-hot float vectors of length
# distinct_movies_count and y as rating vectors of the same length, so the
# network has to accept float vectors and emit one value per movie:
#   * An Embedding layer expects integer ids, not one-hot floats. A bias-free
#     Dense layer applied to a one-hot input selects exactly one row of its
#     weight matrix, i.e. it is the equivalent lookup for this input format.
#   * The output layer has one linear unit per movie. A single sigmoid unit
#     neither matches the shape of the target vectors nor can it produce
#     values outside (0, 1).
EMBEDDING_DIM = 64
batch_size = 128
model = k.models.Sequential()
model.add(k.Input(shape=(distinct_movies_count,)))
model.add(k.layers.Dense(EMBEDDING_DIM, use_bias=False))
model.add(k.layers.Dense(distinct_movies_count))
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train on a shuffled slice of the ratings, one mini-batch at a time.
# Every sample pairs the one-hot vector of a rated movie (input) with the full
# rating vector of the user who rated it (target).
SAMPLE_ROWS = 4096     # number of leading rating rows used for this run
SHUFFLE_SEED = 0       # fixed so that the shuffle order is reproducible
PROGRESS_EVERY = 1000  # print the sample counter every this many samples

dict_cache = dict()
batch_x = list()
batch_y = list()
shuffled_keys = df_ratings.iloc[:SAMPLE_ROWS].sample(frac=1, random_state=SHUFFLE_SEED).index
for ix, (user_id, movie_id) in enumerate(shuffled_keys):
    # Only look the user's ratings up on a cache miss: the df_ratings.loc[...]
    # argument is evaluated before form_output runs, so passing it
    # unconditionally repeats the lookup even when the cached vector is used.
    if user_id in dict_cache:
        output_vector = dict_cache[user_id]
    else:
        output_vector, dict_cache = form_output(user_id, df_ratings.loc[user_id], dict_cache)

    batch_x.append(dict_movie_one_hots[movie_id])
    batch_y.append(output_vector)

    if len(batch_x) == batch_size:
        # batch_size is passed explicitly; without it fit() splits the data
        # into mini-batches of 32.
        model.fit(x=np.vstack(batch_x), y=np.vstack(batch_y), batch_size=batch_size)
        batch_x = list()
        batch_y = list()

    if ix % PROGRESS_EVERY == 0:
        print(ix)

# Train on the remainder when the sample count is not a multiple of batch_size.
if batch_x:
    model.fit(x=np.vstack(batch_x), y=np.vstack(batch_y), batch_size=batch_size)
    batch_x = list()
    batch_y = list()
print('done')