In [None]:
import pandas as pd
import os
from pathlib import Path
from tensorflow import keras as k  # using 2.0.0-rc0
import numpy as np
import random
from scipy.sparse import dok_matrix, random, coo_matrix
from collections import defaultdict, namedtuple, OrderedDict
from torch import nn
from torch.nn.functional import mse_loss
import torch
import pickle
# Pick the compute device once so the model and every tensor are created on the same one.
# Building a torch.device object only names a device, so defining the CUDA handle is
# harmless on machines without a GPU.
device_cpu = torch.device('cpu')
device_cuda = torch.device('cuda:0')
# Prefer the GPU when one is available. The previous expression returned device_cpu
# from both branches of the conditional, so CUDA was never selected.
device = device_cuda if torch.cuda.is_available() else device_cpu

# Widen numpy's printed line width so wide arrays wrap less in cell output.
np.set_printoptions(linewidth=128)
%matplotlib inline

In [None]:
# Dataset directory; file names are joined onto it with pathlib's `/` operator.
path = Path('ml-20m')
# Record bundling the row/column/value lists from which one user's x and y matrices are built.
sparseData = namedtuple(
    'sparseData',
    'rows_x_y columns_x values_x columns_y values_y',
)

In [None]:
# Column-name constants referenced when loading and indexing the CSV data.
USERID, MOVIEID, RATING = 'userId', 'movieId', 'rating'
TITLE, GENRES = 'title', 'genres'

In [None]:
# Show the entries found in the dataset directory.
[entry for entry in path.iterdir()]

In [None]:
# Load the ratings file indexed by (userId, movieId) and keep only the rating column,
# which yields a Series rather than a DataFrame.
df_ratings = pd.read_csv(
    path.joinpath('ratings.csv'),
    index_col=[USERID, MOVIEID],
).loc[:, RATING]

In [None]:
# Count the ratings each movie received, then tally how many movies share each count.
(
    df_ratings
    .groupby(level=MOVIEID)
    .count()
    .value_counts()
)

In [None]:
# Encode the ratings between [0, 1] by dividing by 5.0.
# This cell overwrites df_ratings, so guard on the current maximum: once the values
# are already scaled (max <= 1.0), re-running the cell must not divide them again.
if df_ratings.max() > 1.0:
    df_ratings = df_ratings / 5.0

In [None]:
# movieId values are NOT a contiguous 0..len(df_movies)-1 range, so they cannot be
# used directly as matrix column positions.
df_movies = pd.read_csv(path.joinpath('movies.csv'), index_col=MOVIEID)
# Keep the table's index on its own: its order is what gives every movie a position.
df_movies_index = df_movies.index

In [None]:
# Preview the first five rows of the movies table.
df_movies.head(n=5)

In [None]:
# Preview the first five entries of the ratings Series.
df_ratings.head(n=5)

In [None]:
# Unique user ids are taken from the ratings index, unique movie ids from the movies table.
distinct_users = {user for user in df_ratings.index.get_level_values(USERID)}
distinct_movies = {movie for movie in df_movies.index.get_level_values(MOVIEID)}
# Their sizes; distinct_movies_count is the width of every per-user matrix built later.
distinct_users_count = len(distinct_users)
distinct_movies_count = len(distinct_movies)


In [None]:
# Map raw ids to contiguous 0-based positions. Movie positions follow the order of
# df_movies_index; user positions follow the iteration order of the distinct_users set.
movie_id_decoder = dict(zip(df_movies_index, range(len(df_movies_index))))
user_id_decoder = dict(zip(distinct_users, range(len(distinct_users))))

In [None]:
def series_to_nparray(_s):
    """Split a rating Series into two parallel Python lists.

    Returns ``(positions, ratings)``: ``positions`` holds the ``movie_id_decoder``
    entry for every index label of ``_s`` and ``ratings`` holds the matching values.

    Both are deliberately plain lists, not numpy arrays: the pair-building code
    concatenates slices with ``+``, which merges lists but would add numpy arrays
    element-wise. The positions are also kept apart from the ratings so they stay
    integers; sharing one array with the float ratings would coerce them to floats,
    which cannot be used as indexes.
    """
    _positions = list(map(movie_id_decoder.__getitem__, _s.index))
    _ratings = _s.to_numpy().tolist()
    return (_positions, _ratings)

In [None]:
# Pre-compute, for every user, the (movie positions, ratings) list pair.
dict_user_ratings = {}
for user_id in distinct_users:
    # Selecting on the first index level leaves a Series keyed by movieId only.
    user_data = df_ratings.loc[user_id]
    dict_user_ratings.update({user_id: series_to_nparray(user_data)})

In [None]:
def form_input_output_vectorized(_user_ratings, sparse=True, return_indices=False):
    """Build one user's (input, target) training pairs.

    For a user with ``n`` rated movies this produces ``n * (n - 1)`` rows, one per
    ordered pair of distinct rated movies ``(i, j)``: the input row holds only
    movie ``i``'s rating (in movie ``i``'s column) and the target row holds only
    movie ``j``'s rating (in movie ``j``'s column). Every matrix is
    ``distinct_movies_count`` columns wide.

    Parameters
    ----------
    _user_ratings : pair of sequences
        ``(movie_positions, ratings)`` of equal length, as produced by
        ``series_to_nparray``.
    sparse : bool, default True
        If True, return float32 ``torch`` sparse COO tensors created on ``device``;
        otherwise return dense ``numpy`` arrays.
    return_indices : bool, default False
        If True, the third return value is a ``sparseData`` record holding the raw
        row/column/value lists; otherwise it is ``None`` (the historical behaviour).

    Returns
    -------
    tuple
        ``(matrix_x, matrix_y, sparse_data_or_None)``.

    Raises
    ------
    ValueError
        If the two sequences in ``_user_ratings`` differ in length.
    """
    # Copy into real lists: the slicing below relies on `+` concatenating, which
    # would silently turn into element-wise addition for numpy inputs.
    _movie_cols = list(_user_ratings[0])
    _ratings = list(_user_ratings[1])
    if len(_movie_cols) != len(_ratings):
        raise ValueError(
            'movie positions and ratings must have the same length, got {} and {}'
            .format(len(_movie_cols), len(_ratings)))

    _n_ratings = len(_movie_cols)
    # Each rating is paired with every *other* rating; clamp so 0 ratings gives 0 rows.
    _n_others = max(_n_ratings - 1, 0)
    _n_rows = _n_ratings * _n_others

    # Rows are filled consecutively, so x and y share the row indices 0.._n_rows-1.
    _row_range_xy = list(range(_n_rows))
    _col_range_x = list()
    _col_range_y = list()
    _values_x = list()
    _values_y = list()

    for _ix, (_movie_col, _rating) in enumerate(zip(_movie_cols, _ratings)):
        # Input side: the same movie/rating repeated once per other movie.
        _col_range_x.extend([_movie_col] * _n_others)
        _values_x.extend([_rating] * _n_others)
        # Target side: every rating of this user except the one at position _ix.
        _col_range_y.extend(_movie_cols[:_ix] + _movie_cols[_ix + 1:])
        _values_y.extend(_ratings[:_ix] + _ratings[_ix + 1:])

    if sparse:
        _size = (_n_rows, distinct_movies_count)

        def _to_sparse(_cols, _vals):
            # Explicit dtypes keep the (2, 0) / (0,) shapes valid when there are no pairs.
            _indices = torch.tensor([_row_range_xy, _cols], dtype=torch.long, device=device)
            _values = torch.tensor(_vals, dtype=torch.float32, device=device)
            return torch.sparse_coo_tensor(_indices, _values, _size)

        _sparse_matrix_x = _to_sparse(_col_range_x, _values_x)
        _sparse_matrix_y = _to_sparse(_col_range_y, _values_y)
    else:
        _sparse_matrix_x = np.zeros((_n_rows, distinct_movies_count))
        _sparse_matrix_y = np.zeros((_n_rows, distinct_movies_count))
        _sparse_matrix_x[_row_range_xy, _col_range_x] = _values_x
        _sparse_matrix_y[_row_range_xy, _col_range_y] = _values_y

    # The raw index lists are only materialised on request; by default the third
    # element stays None, exactly as callers have always received it.
    _sparse_data = None
    if return_indices:
        _sparse_data = sparseData(_row_range_xy, _col_range_x,
                                  _values_x, _col_range_y, _values_y)
    return _sparse_matrix_x, _sparse_matrix_y, _sparse_data
        

In [None]:
%%time
# Time the sparse pair construction for a single user (userId 1).
x = form_input_output_vectorized(
    dict_user_ratings[1],
    sparse=True,
)

In [None]:
# Small worked example: restrict user 1 to the first three ratings and print the
# resulting input and target matrices in dense form.
x = form_input_output_vectorized(
    tuple(column[0:3] for column in dict_user_ratings[1])
)
print(x[0].to_dense(), x[1].to_dense(), sep='\n')

In [None]:
# model = k.models.Sequential()
# model.add(k.layers.Dense(128, activation='linear', input_dim=distinct_movies_count))
# model.add(k.layers.Dense(distinct_movies_count, activation='linear'))
# model.compile(loss='mse', optimizer=k.optimizers.Adam())

In [None]:
# PyTorch version of the network (a Keras variant is kept commented out in the
# notebook). Author's note: Keras was faster in an earlier comparison, but that
# was before version 1.0 of PyTorch.
learning_rate = 1e-4
neurons = 128

# Fully connected net with one input and one output unit per movie:
# Linear -> ReLU -> Linear -> ReLU -> Linear. Layers are inserted in forward order.
nn_config = OrderedDict()
nn_config['in'] = nn.Linear(in_features=distinct_movies_count, out_features=neurons)
nn_config['ReLU1'] = nn.ReLU()
nn_config['H1'] = nn.Linear(in_features=neurons, out_features=neurons)
nn_config['ReLU2'] = nn.ReLU()
nn_config['out'] = nn.Linear(in_features=neurons, out_features=distinct_movies_count)

model = nn.Sequential(nn_config).to(device)

# citation: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

In [None]:
# Debug peek at the most recent target batch. `tensorY` is only assigned by the
# training-loop cell, which sits further down in the notebook, so look it up
# defensively: on a fresh kernel this shows nothing instead of raising NameError
# and aborting Restart & Run All.
globals().get('tensorY')

In [None]:
# Debug peek at the most recent model output. `y_pred` is only assigned by the
# training-loop cell, which sits further down in the notebook, so look it up
# defensively: on a fresh kernel this shows nothing instead of raising NameError
# and aborting Restart & Run All.
globals().get('y_pred')

In [None]:
%%time
# Train on one user at a time: all of a user's (input, target) pairs form one batch.
# Cap on the number of users visited. The earlier `counter == 1000` check ran after
# the update step, so it trained 1001 users while reporting only 1000.
MAX_TRAINING_USERS = 1000

sparse_data_dict = dict()

# Plain SGD; performs the same `param -= learning_rate * grad` update that was
# previously written out by hand.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for counter, user_id in enumerate(distinct_users):
    if counter >= MAX_TRAINING_USERS:
        break

    user_ratings = dict_user_ratings[user_id]
    # The tensors are created on `device` by form_input_output_vectorized and the
    # model already lives there, so no further .to(device) calls are needed.
    tensorX, tensorY, sparse_data = form_input_output_vectorized(user_ratings, sparse=True)

    # Works with dense? Why not sparse?
    y_pred = model(tensorX)

    # Torch can't do operations on sparse + dense
    # https://github.com/pytorch/pytorch/issues/2389
    # Necessitating the conversion to dense
    loss = mse_loss(y_pred, tensorY.to_dense())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

#     sparse_data_dict[user_id] = sparse_data # Store and write out to pickle?

    print('Counter: {} \t User: {}'.format(counter, user_id))

In [None]:
# Serialise sparse_data_dict to a binary pickle file in the working directory.
with open('sparse_data.pickle', mode='wb') as pickle_file:
    pickle_file.write(pickle.dumps(sparse_data_dict))



# for user_id in random.shuffle(distinct_users):
#     print(user_id)
#     user_ratings = dict_user_ratings[user_id]
#     
#     batch_x, batch_y = form_input_output_vectorized_sparse(user_ratings, sparse=False)
# #     data_x.append(batch_x)
# #     data_y.append(batch_y)
#     
#     model.fit(x=batch_x, y=batch_y)
#         
#     if ix == 1000:
#         break
#     ix += 1
    
#     print(user)
#     print(movie)
#     print(row)
#     print(movie_one_hot)
#     print(output_vector)
#     print(dict_cache)
# Completion marker: shows in the cell output once everything above has run.
print('done')
#     model.fit(movie_one_hot, output_vector)