In [None]:
import pandas as pd
import os
from pathlib import Path
from tensorflow import keras as k  # using 2.0.0-rc0
import numpy as np
import random
from scipy.sparse import dok_matrix, random, coo_matrix
np.set_printoptions(linewidth=128)
%matplotlib inline

In [None]:
path = Path('ml-20m')  # Dataset directory; later cells join file names onto it with pathlib's "/" operator

In [None]:
# Column names used when reading the dataset CSV files.
(USERID, MOVIEID, RATING, TITLE, GENRES) = (
    'userId', 'movieId', 'rating', 'title', 'genres',
)

In [None]:
# Show which entries exist in the dataset directory.
list(path.iterdir())

In [None]:
df_ratings = pd.read_csv(path / 'ratings.csv', index_col=[USERID, MOVIEID])[RATING]

In [None]:
df_ratings.groupby([MOVIEID]).count().value_counts()

In [None]:
df_ratings = df_ratings / 5.0  # Encodes the input between [0, 1]

In [None]:
# WARNING: movieId values are NOT a dense 0..len(df_movies)-1 range.
# The row order of this frame's index is therefore kept separately and used
# as the canonical movie ordering.
df_movies = pd.read_csv(path.joinpath('movies.csv'), index_col=MOVIEID)
df_movies_index = df_movies.index

In [None]:
# Maps each movieId to its 0-based position in df_movies_index.
movie_id_decoder = dict(zip(df_movies_index, range(len(df_movies_index))))

In [None]:
# Preview the first rows of the movies table.
df_movies.head()

In [None]:
# Preview the first ratings (values have already been divided by 5.0 at this point).
df_ratings.head()

In [None]:
# Unique ids are extracted inside pandas first, then materialised as Python sets.
distinct_users = set(df_ratings.index.unique(level=USERID))
distinct_users_count = len(distinct_users)
distinct_movies = set(df_movies.index.unique(level=MOVIEID))
distinct_movies_count = len(distinct_movies)


In [None]:
def form_input_output(_user_ratings, __sparse__=True, movie_index=None, n_movies=None):
    """Expand one user's ratings into paired (input, target) rows.

    For a user with ``n`` rated movies the result has ``n * (n - 1)`` rows, one
    per ordered pair ``(i, j)`` of distinct rated movies, grouped by ``i`` and,
    within a group, ordered by ``j`` as the ratings are iterated:

    * row of ``x``: a single non-zero, the rating of movie ``i`` in movie ``i``'s column;
    * row of ``y``: a single non-zero, the rating of movie ``j`` in movie ``j``'s column.

    Parameters
    ----------
    _user_ratings : object with ``.items()`` yielding ``(movie_id, rating)``
        Ratings of one user (e.g. a pandas Series indexed by movie id, or a dict).
    __sparse__ : bool, default True
        If true, return ``scipy.sparse.coo_matrix`` objects; otherwise dense
        ``numpy`` arrays.
    movie_index : mapping, optional
        Maps a movie id to its column. Defaults to the notebook-level
        ``movie_id_decoder``.
    n_movies : int, optional
        Number of columns. Defaults to the notebook-level ``distinct_movies_count``.

    Returns
    -------
    (x, y) : pair of matrices, each of shape ``(n * (n - 1), n_movies)``, float64.

    Raises
    ------
    KeyError
        If a rated movie id is missing from ``movie_index``.
    """
    # Fall back to the notebook-level lookup tables when none are passed in.
    if movie_index is None:
        movie_index = movie_id_decoder
    if n_movies is None:
        n_movies = distinct_movies_count

    _pairs = list(_user_ratings.items())
    _n = len(_pairs)
    _others = max(_n - 1, 0)  # rows per rated movie; clamps the empty-input case
    _n_rows = _n * _others

    _cols = np.array([movie_index[_movie_id] for _movie_id, _ in _pairs], dtype=np.int64)
    _vals = np.array([_rating for _, _rating in _pairs], dtype=np.float64)

    # _src[r] is the position of the "input" movie for row r: 0,0,..,1,1,..
    _positions = np.arange(_n)
    _src = np.repeat(_positions, _others)
    # _dst[r] is the position of the "target" movie for row r: every position
    # except the row's own source, in iteration order. The exclusion is by
    # position, so it stays aligned with the row layout even if ids repeat.
    _off_diagonal = ~np.eye(_n, dtype=bool)
    _dst = np.broadcast_to(_positions, (_n, _n))[_off_diagonal]
    _rows = np.arange(_n_rows)

    if __sparse__:
        _shape = (_n_rows, n_movies)
        _x = coo_matrix((_vals[_src], (_rows, _cols[_src])), shape=_shape)
        _y = coo_matrix((_vals[_dst], (_rows, _cols[_dst])), shape=_shape)
        # Do not store explicit zeros, so a 0 rating leaves no stored entry.
        _x.eliminate_zeros()
        _y.eliminate_zeros()
        return _x, _y

    _x = np.zeros((_n_rows, n_movies))
    _y = np.zeros((_n_rows, n_movies))
    _x[_rows, _cols[_src]] = _vals[_src]
    _y[_rows, _cols[_dst]] = _vals[_dst]
    return _x, _y
    

In [None]:
# Two stacked linear Dense layers: movie-sized input -> 128 units -> movie-sized output,
# trained with mean squared error and Adam.
model = k.models.Sequential([
    k.layers.Dense(128, activation='linear', input_dim=distinct_movies_count),
    k.layers.Dense(distinct_movies_count, activation='linear'),
])
model.compile(loss='mse', optimizer=k.optimizers.Adam())

In [None]:
%%time
ix = 0

# Do we want to accumulate and shuffle batches of users?
data_x = list()
data_y = list()

for user_id in distinct_users:
    print(user_id)
    user_ratings = df_ratings.loc[user_id]
    
    batch_x, batch_y = form_input_output(user_ratings)
    data_x.append(batch_x)
    data_y.append(batch_y)
    
#     model.fit(x=batch_x, y=batch_y)
        
    if ix == 1000:
        break
    ix += 1
    
#     print(user)
#     print(movie)
#     print(row)
#     print(movie_one_hot)
#     print(output_vector)
#     print(dict_cache)
print('done')
#     model.fit(movie_one_hot, output_vector)

In [None]:
# Total number of training rows across all accumulated batches.
# shape[0] is used because len() raises TypeError on scipy sparse matrices.
sum(batch.shape[0] for batch in data_x)