In [1]:
import pandas as pd
import os
from pathlib import Path
from tensorflow import keras as k  # using 2.0.0-rc0
import numpy as np
import random
from scipy.sparse import dok_matrix, random, coo_matrix
from collections import defaultdict, namedtuple, OrderedDict
from torch import nn
from torch.nn.functional import mse_loss
import torch
import pickle
device_cpu = torch.device('cpu')
device_cuda = torch.device('cuda:0')
device = device_cpu if not torch.cuda.is_available() else device_cpu

np.set_printoptions(linewidth=128)
%matplotlib inline

In [2]:
path = Path('ml-20m')  # First time using pathlib, pretty neat. "division" is cool
sparseData = namedtuple('sparseData', 
                        ['rows_x_y', 'columns_x', 'values_x', 'columns_y', 'values_y'])

In [3]:
USERID = 'userId'
MOVIEID = 'movieId'
RATING = 'rating'
TITLE = 'title'
GENRES = 'genres'

In [4]:
list(path.iterdir())

[PosixPath('ml-20m/links.csv'),
 PosixPath('ml-20m/tags.csv'),
 PosixPath('ml-20m/genome-tags.csv'),
 PosixPath('ml-20m/ratings.csv'),
 PosixPath('ml-20m/README.txt'),
 PosixPath('ml-20m/genome-scores.csv'),
 PosixPath('ml-20m/movies.csv')]

In [5]:
df_ratings = pd.read_csv(path / 'ratings.csv', index_col=[USERID, MOVIEID])[RATING]

  mask |= (ar1 == a)


In [6]:
df_ratings.groupby([MOVIEID]).count().value_counts()

1        3972
2        2043
3        1355
4        1029
5         826
6         647
7         574
8         462
9         385
10        372
11        294
12        290
13        264
14        234
17        195
15        190
16        173
18        164
19        143
20        143
21        139
22        124
25        114
28        112
29        105
35        101
23         98
26         97
24         96
31         92
         ... 
2920        1
15310       1
857         1
2888        1
4923        1
825         1
777         1
2792        1
4827        1
11196       1
3032        1
1289        1
5291        1
7418        1
1273        1
5355        1
3304        1
1241        1
3288        1
17617       1
40106       1
3048        1
3224        1
1161        1
3208        1
3192        1
13359       1
1065        1
3096        1
8268        1
Name: rating, Length: 3423, dtype: int64

In [7]:
df_ratings = df_ratings / 5.0  # Encodes the input between [0, 1]

In [8]:
# WARNING: MOVIEID is not a 0 through X where X is len(df_movies)
df_movies = pd.read_csv(path / 'movies.csv', index_col=MOVIEID)
# That is why we're extracting the index to give us an ordering
df_movies_index = df_movies.index

In [9]:
df_movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [10]:
df_ratings.head()

userId  movieId
1       2          0.7
        29         0.7
        32         0.7
        47         0.7
        50         0.7
Name: rating, dtype: float64

In [11]:
distinct_users = set(df_ratings.index.get_level_values(USERID))
distinct_users_count = len(distinct_users)
distinct_movies = set(df_movies.index.get_level_values(MOVIEID))
distinct_movies_count = len(distinct_movies)


In [12]:
movie_id_decoder = {val: ix for ix, val in enumerate(df_movies_index)}
user_id_decoder = {val: ix for ix, val in enumerate(distinct_users)}

In [13]:
def series_to_nparray(_s):
    # Needs to be list data structure since sparse vector assembly uses "+" operator
    # Numpy would add whereas Python would merge the lists
    _values = _s.to_numpy().tolist()
    
    # Needs to be list data structure since inner loop (vectorize) utilizes addition
    # to do concatenation
    # We also don't want to put this is the same array as the data since that will
    # coerce our lovely indexes to floats, which can't be used as indexes
    _index = [movie_id_decoder[i] for i in _s.index]
    return (_index, _values)

In [14]:
dict_user_ratings = dict()
for user_id in distinct_users:
    user_data = df_ratings[user_id]
    dict_user_ratings[user_id] = series_to_nparray(user_data)

In [35]:
def form_input_output_vectorized(_user_ratings, _output_type):
    _len_user_ratings = len(_user_ratings[0])
    _len_user_ratings_minus_one = _len_user_ratings - 1
    _sparse_matrix_length = (_len_user_ratings * _len_user_ratings_minus_one)
    
    _zipped_data = zip(_user_ratings[0], _user_ratings[1])

    _row_range_xy = list()
    _col_range_x = list()
    _col_range_y = list()
    _values_x = list()
    _values_y = list()
    _output_shape = [_sparse_matrix_length, distinct_movies_count]
    
    for _ix, (_ix_movie_index, _i_rating) in enumerate(_zipped_data):
        _row_start = _ix * _len_user_ratings_minus_one
        _row_stop  = (_ix + 1) * _len_user_ratings_minus_one
        
        _row_range_xy.extend(list(range(_row_start, _row_stop)))
        
        _col_range_x.extend([_ix_movie_index] * _len_user_ratings_minus_one)
        _values_x.extend([_i_rating] * _len_user_ratings_minus_one)
        
        _col_range_y.extend(_user_ratings[0][0:_ix] + _user_ratings[0][_ix + 1:])
        _values_y.extend(_user_ratings[1][0:_ix] + _user_ratings[1][_ix + 1:])
    
    if _output_type == 'torch_sparse':
        _size_tensor = torch.Size(_output_shape)

        _sparse_x_i = torch.LongTensor([_row_range_xy, _col_range_x]).to(device)
        _sparse_x_vals = torch.FloatTensor(_values_x).to(device)

        _sparse_matrix_x = torch.sparse.FloatTensor(
            _sparse_x_i, _sparse_x_vals, _size_tensor).to(device)

        _sparse_y_i = torch.LongTensor([_row_range_xy, _col_range_y]).to(device)
        _sparse_y_vals = torch.FloatTensor(_values_y).to(device)

        _sparse_matrix_y = torch.sparse.FloatTensor(
            _sparse_y_i, _sparse_y_vals, _size_tensor).to(device)
        
    elif _output_type == 'torch_dense':
        _sparse_matrix_x = torch.zeros(_output_shape)
        _sparse_matrix_x[_row_range_xy, _col_range_x] = torch.FloatTensor(_values_x)
        _sparse_matrix_x.to(device)

        _sparse_matrix_y = torch.zeros(_output_shape)
        _sparse_matrix_y[_row_range_xy, _col_range_y] = torch.FloatTensor(_values_y)
        _sparse_matrix_y.to(device)
            
    elif _output_type == 'numpy_sparse':
        _sparse_matrix_x = coo_matrix((_values_x, (_row_range_xy, _col_range_x)),
                                     shape=_output_shape)
        _sparse_matrix_y = coo_matrix((_values_y, (_row_range_xy, _col_range_y)),
                                     shape=_output_shape)
        
    elif _output_type == 'numpy_dense':
        _sparse_matrix_x = np.zeros(_output_shape)
        _sparse_matrix_y = np.zeros(_output_shape)
        _sparse_matrix_x[_row_range_xy, _col_range_x] = _values_x
        _sparse_matrix_y[_row_range_xy, _col_range_y] = _values_y
    else:
        raise NotImplementedError
    
    _sparse_data = sparseData(_row_range_xy, _col_range_x, 
                              _values_x, _col_range_y, _values_y)
    
    return _sparse_matrix_x, _sparse_matrix_y, _sparse_data
        

In [53]:
def form_input_output_vectorized_generator(_user_ratings, _output_type):
    _len_user_ratings = len(_user_ratings[0])
    _len_user_ratings_minus_one = _len_user_ratings - 1
    _sparse_matrix_length = _len_user_ratings_minus_one
    
    _zipped_data = zip(_user_ratings[0], _user_ratings[1])
    _row_range_xy = list(range(_len_user_ratings_minus_one))
    
    _output_shape = [_sparse_matrix_length, distinct_movies_count]
    
    for _ix, (_ix_movie_index, _i_rating) in enumerate(_zipped_data):
        _col_range_x = [_ix_movie_index] * _len_user_ratings_minus_one
        _values_x = [_i_rating] * _len_user_ratings_minus_one
        _col_range_y = _user_ratings[0][0:_ix] + _user_ratings[0][_ix + 1:]
        _values_y = _user_ratings[1][0:_ix] + _user_ratings[1][_ix + 1:]
    
        if _output_type == 'torch_sparse':
            _size_tensor = torch.Size(_output_shape)

            _sparse_x_i = torch.LongTensor([_row_range_xy, _col_range_x]).to(device)
            _sparse_x_vals = torch.FloatTensor(_values_x).to(device)

            _sparse_matrix_x = torch.sparse.FloatTensor(
                _sparse_x_i, _sparse_x_vals, _size_tensor).to(device)

            _sparse_y_i = torch.LongTensor([_row_range_xy, _col_range_y]).to(device)
            _sparse_y_vals = torch.FloatTensor(_values_y).to(device)

            _sparse_matrix_y = torch.sparse.FloatTensor(
                _sparse_y_i, _sparse_y_vals, _size_tensor).to(device)
            
        elif _output_type == 'torch_dense':
            _sparse_matrix_x = torch.zeros(_output_shape)
            _sparse_matrix_x[_row_range_xy, _col_range_x] = torch.FloatTensor(_values_x)
            _sparse_matrix_x.to(device)
            
            _sparse_matrix_y = torch.zeros(_output_shape)
            _sparse_matrix_y[_row_range_xy, _col_range_y] = torch.FloatTensor(_values_y)
            _sparse_matrix_y.to(device)

        elif _output_type == 'numpy_sparse':
            _sparse_matrix_x = coo_matrix((_values_x, (_row_range_xy, _col_range_x)),
                                         shape=_output_shape)
            _sparse_matrix_y = coo_matrix((_values_y, (_row_range_xy, _col_range_y)),
                                         shape=_output_shape)
            
        elif _output_type == 'numpy_dense':
            _sparse_matrix_x = np.zeros(_output_shape)
            _sparse_matrix_y = np.zeros(_output_shape)
            _sparse_matrix_x[_row_range_xy, _col_range_x] = _values_x
            _sparse_matrix_y[_row_range_xy, _col_range_y] = _values_y
        else:
            raise NotImplementedError
    
        _sparse_data = sparseData(_row_range_xy, _col_range_x, 
                                  _values_x, _col_range_y, _values_y)   

        yield _sparse_matrix_x, _sparse_matrix_y, _sparse_data 
#     return _sparse_matrix_x, _sparse_matrix_y, _sparse_data
# x = form_input_output_vectorized_generator((dict_user_ratings[1][0][0:3], 
#                                   dict_user_ratings[1][1][0:3]),
#                                 'numpy_dense')
# y = x.__next__()
# print(y[0].to_dense())
# print(y[1].to_dense())

# y = x.__next__()
# print(y[0].to_dense())
# print(y[1].to_dense())  

# y = x.__next__()
# print(y[0].to_dense())
# print(y[1].to_dense())

# y = x.__next__()
# print(y[0])
# print('=\n')
# print(y[1])
# print('===\n')
# y = x.__next__()
# print(y[0])
# print('=\n')
# print(y[1])  
# print('===\n')
# y = x.__next__()
# print(y[0])
# print('=\n')
# print(y[1])
# print('===\n')

[[0.  0.7 0.  ... 0.  0.  0. ]
 [0.  0.7 0.  ... 0.  0.  0. ]]
=

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
===

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
=

[[0.  0.7 0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]
===

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
=

[[0.  0.7 0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]
===



In [17]:
# x = form_input_output_vectorized(user_data_length[903][0][0:1000],
#                             user_data_length[903][1][0:1000])

In [18]:
user_lengths = pd.DataFrame(pd.Series({k: len(v[0]) for k, v in dict_user_ratings.items()})).reset_index()

In [19]:
user_lengths = user_lengths.groupby([0])['index'].agg(lambda x: sorted(x))

In [20]:
user_lengths[600:]

0
620     [271, 10119, 12753, 27387, 38854, 49523, 64533...
621     [17988, 23832, 34179, 37825, 67609, 78057, 980...
622     [24027, 32662, 37294, 41357, 55244, 58451, 764...
623     [10086, 13049, 40455, 41816, 43267, 63596, 872...
624     [1644, 14133, 18004, 18280, 24214, 25477, 2866...
625     [15911, 18712, 22500, 28509, 43636, 83923, 923...
626     [16174, 18968, 35601, 35985, 48382, 49204, 521...
627     [32881, 33841, 40617, 43703, 44780, 45563, 514...
628     [5825, 27946, 31246, 37764, 50488, 67979, 7354...
629     [4924, 30130, 41277, 57543, 64787, 65219, 9262...
630     [1678, 16127, 54134, 59781, 60385, 76384, 8787...
631     [4831, 8188, 15108, 21185, 36093, 47816, 48162...
632     [20969, 24981, 29294, 41744, 44770, 51965, 544...
633     [8249, 17319, 21378, 30092, 40281, 45929, 5394...
634     [5482, 9406, 10021, 35066, 49745, 50958, 56270...
635     [37780, 50656, 62133, 69753, 97198, 100766, 11...
636     [9917, 17042, 28060, 32744, 41096, 73348, 8090...
637     [239

In [40]:
user_target = 1

In [47]:
%%time 
# vectorized
out_data_vectorized = dict()
for out_type in ['numpy_dense', 'torch_dense', 'numpy_sparse', 'numpy_dense']:
    out_data_vectorized[out_type] = form_input_output_vectorized(dict_user_ratings[user_target], out_type)

CPU times: user 2.32 s, sys: 3.76 s, total: 6.08 s
Wall time: 6.16 s


In [49]:
%%time
# generator
out_data_generated = dict()
for out_type in ['numpy_dense', 'torch_dense', 'numpy_sparse', 'numpy_dense']:
    x = form_input_output_vectorized_generator(dict_user_ratings[user_target], out_type)
    out_data_generated[out_type] = [i for i in x]

CPU times: user 2.33 s, sys: 3.56 s, total: 5.89 s
Wall time: 5.96 s


In [50]:
out_data_generated['numpy_dense']

[(array([[0. , 0.7, 0. , ..., 0. , 0. , 0. ],
         [0. , 0.7, 0. , ..., 0. , 0. , 0. ],
         [0. , 0.7, 0. , ..., 0. , 0. , 0. ],
         ...,
         [0. , 0.7, 0. , ..., 0. , 0. , 0. ],
         [0. , 0.7, 0. , ..., 0. , 0. , 0. ],
         [0. , 0.7, 0. , ..., 0. , 0. , 0. ]]),
  array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  sparseData(rows_x_y=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,

In [None]:
model = k.models.Sequential()
model.add(k.layers.Dense(128, activation='linear', input_dim=distinct_movies_count))
model.add(k.layers.Dense(distinct_movies_count, activation='relu')) # Must be positive
model.compile(loss='mse', optimizer=k.optimizers.Adam())

In [None]:
%%time
for ix, user_id in enumerate(distinct_users):
    user_ratings = dict_user_ratings[user_id]
    print(user_id)
    if len(user_ratings[0]) > 500:
        # skip big ones
        continue
    
    batch_x, batch_y, _sparse_data = form_input_output_vectorized(user_ratings, 'numpy_dense')
    model.fit(x=batch_x, y=batch_y)
        
    break

In [None]:
# good thing I did my RL assignment in both PyTorch and Keras :)
# Keras was faster, but that was before version 1.0 of PyTorch
learning_rate = 1e-4
neurons = 128
nn_config = OrderedDict([
        ('in', nn.Linear(in_features=distinct_movies_count, out_features=neurons)),
        ('H1', nn.Linear(in_features=neurons, out_features=neurons)),
        ('out', nn.Linear(in_features=neurons, out_features=distinct_movies_count))
    ])
model = nn.Sequential(nn_config).to(device)

# citation: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

In [None]:
%%time

for ix, user_id in enumerate(distinct_users):
    user_ratings = dict_user_ratings[user_id]
    print(user_id)
    if len(user_ratings[0]) > :
        # skip big ones
        continue
        
    tensorX, tensorY, sparse_data = form_input_output_vectorized(user_ratings, 'torch_dense')
    
    # Works with dense? Why not sparse?
#     tensorX = tensorX.to_dense().to(device)
#     tensorY = tensorY.to_dense().to(device)
#     print('Predicting Y')
    y_pred = model(tensorX).to(device)
#     print('Y has been predicted')
#     y_pred = y_pred.to(device)
#     print('Y has been sparsified')
    # Torch can't do operations on sparse + dense
    # https://github.com/pytorch/pytorch/issues/2389
    # Necessitating the conversion to dense
    
    loss = mse_loss(y_pred, tensorY).to(device) # issue is here
    loss.to(device)
    
    model.zero_grad()
    loss.backward()
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
    
    break
    print('Counter: {} \t User: {}'.format(counter, user_id))