In [1]:
!wget 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
!unzip 'ml-1m.zip'

--2018-12-26 14:34:15--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.34.235
Connecting to files.grouplens.org (files.grouplens.org)|128.101.34.235|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: 'ml-1m.zip'


2018-12-26 14:34:29 (448 KB/s) - 'ml-1m.zip' saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [2]:
import pandas as pd
import numpy as np

In [3]:
ratings = pd.read_csv('ml-1m/ratings.dat', 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['userId', 'movieId', 'rating', 'timestamp']).drop(['timestamp'], axis=1)
movies = pd.read_csv('ml-1m/movies.dat', 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['movieId', 'title', 'genres'])

ratings.shape, movies.shape

((1000209, 3), (3883, 3))

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
num_movies = ratings['movieId'].unique().shape[0]
num_users = ratings['userId'].unique().shape[0]
print('Number of movies: ',num_movies)
print('Number of users: ',num_users)

Number of movies:  3706
Number of users:  6040


In [8]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=5, 
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    return train, test

In [9]:
sparse_ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0).values

train, test = train_test_split(sparse_ratings)

train.shape, test.shape

((6040, 3706), (6040, 3706))

In [166]:
from scipy.sparse.linalg import svds

def compute_svds(ratings, k=10):
    ratings = ratings - ratings.mean(axis=1).reshape(-1,1)
    U, sigma, Vt = svds(ratings, k = k)
#     sigma = np.diag(np.sqrt(sigma))
    sigma = np.diag(sigma)
    return U, sigma, Vt
#     print(U.shape, sigma.shape, Vt.shape)
    
def predict(ratings, U, S, V):
    return U.dot(S).dot(V) + ratings.mean(axis=1).reshape(-1,1)

In [188]:
U, S, Vt = compute_svds(train, k=30)

In [189]:
pred = predict(train, U, S, Vt)

In [193]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [194]:
print('SVD RMSE: ' + str(get_rmse(pred, test)))

SVD RMSE: 2.92893919536704


In [28]:
t = np.random.rand(100, 150)
compute_svds(t, 10)

(100, 10) (10, 10) (10, 150)


In [16]:
from numpy.linalg import svd
U, sigma, Vt = svd(train)
U.shape, sigma.shape, Vt.shape

((6040, 6040), (3706,), (3706, 3706))

In [18]:
import numpy, scipy.sparse
from sparsesvd import sparsesvd
mat = numpy.random.rand(200, 100) # create a random matrix
smat = scipy.sparse.csc_matrix(mat) # convert to sparse CSC format
ut, s, vt = sparsesvd(smat, 100)
ut.shape, s.shape, vt.shape

((100, 200), (100,), (100, 100))

In [21]:
U, sigma, Vt = svds(mat, k = 50)

In [25]:
np.diag(sigma)[1]

array([0.        , 3.86354047, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [159]:
rats = np.array([
    [5,4,5,0,0,0,0,0,0],
    [5,5,4,0,0,0,0,0,0],
    [0,5,4,0,0,0,0,0,0],
    [2,2,2,0,0,0,0,0,0],
    [0,0,0,3,4,5,0,0,0],
    [0,0,0,5,4,5,0,0,0],
    [0,0,0,0,0,0,5,3,4],
    [0,0,0,0,0,0,5,5,3],
    [0,0,0,0,0,0,5,0,3],
])
rats_mean_user = np.mean(rats, axis=1).reshape(-1,1)
rats_mean_item = np.mean(rats, axis=0).reshape(1,-1)
rats_mean_user.shape, rats_mean_item.shape

rats.shape

(9, 9)

In [163]:
np.around(rats-rats_mean_user,2)

array([[ 3.44,  2.44,  3.44, -1.56, -1.56, -1.56, -1.56, -1.56, -1.56],
       [ 3.44,  3.44,  2.44, -1.56, -1.56, -1.56, -1.56, -1.56, -1.56],
       [-1.  ,  4.  ,  3.  , -1.  , -1.  , -1.  , -1.  , -1.  , -1.  ],
       [ 1.33,  1.33,  1.33, -0.67, -0.67, -0.67, -0.67, -0.67, -0.67],
       [-1.33, -1.33, -1.33,  1.67,  2.67,  3.67, -1.33, -1.33, -1.33],
       [-1.56, -1.56, -1.56,  3.44,  2.44,  3.44, -1.56, -1.56, -1.56],
       [-1.33, -1.33, -1.33, -1.33, -1.33, -1.33,  3.67,  1.67,  2.67],
       [-1.44, -1.44, -1.44, -1.44, -1.44, -1.44,  3.56,  3.56,  1.56],
       [-0.89, -0.89, -0.89, -0.89, -0.89, -0.89,  4.11, -0.89,  2.11]])

In [146]:
u,s,vt = compute_svds(rats.astype('float'), k=3 )

In [147]:
u

array([[-6.75582434e-17, -1.03367774e-16,  6.34575568e-01],
       [ 1.43312739e-16,  1.17456728e-16,  6.37345189e-01],
       [ 9.16986637e-17,  1.67719523e-16,  4.37155531e-01],
       [-6.55201741e-01,  4.14288825e-16,  8.41607892e-17],
       [-7.55453955e-01,  4.42472030e-16,  5.25338567e-17],
       [ 3.31822815e-16,  6.13046270e-01,  5.30683308e-17],
       [ 4.99198208e-16,  6.46212185e-01,  2.12451936e-16],
       [ 2.53334304e-16,  4.54515217e-01,  6.99539703e-17]])

In [148]:
s

array([[3.27164631, 0.        , 0.        ],
       [0.        , 3.39113405, 0.        ],
       [0.        , 0.        , 3.55009199]])

In [149]:
vt

array([[ 3.53871610e-17,  8.45340034e-17,  5.62661608e-17,
        -5.36533285e-01, -5.27167131e-01, -6.58958914e-01,
         5.06534380e-16,  3.26192757e-16,  3.34921474e-16],
       [ 6.12574399e-18,  8.80372727e-17,  5.42502192e-17,
         3.00459972e-16,  2.98009214e-16,  3.72511517e-16,
         7.45132586e-01,  4.40894979e-01,  5.00388896e-01],
       [ 5.04604183e-01,  6.27684659e-01,  5.92778700e-01,
         4.08748228e-17,  4.33842689e-17,  5.42303361e-17,
         1.33091391e-16,  9.69173872e-17,  8.40655534e-17]])

In [155]:
res = u.dot(s).dot(vt)
# res.round()
# res

In [157]:
np.around(res,decimals=2)

array([[ 1.14,  1.41,  1.34,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 1.14,  1.42,  1.34, -0.  , -0.  , -0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.78,  0.97,  0.92, -0.  , -0.  , -0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  1.15,  1.13,  1.41, -0.  , -0.  , -0.  ],
       [ 0.  , -0.  , -0.  ,  1.33,  1.3 ,  1.63, -0.  , -0.  , -0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.55,  0.92,  1.04],
       [ 0.  ,  0.  ,  0.  , -0.  , -0.  , -0.  ,  1.63,  0.97,  1.1 ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.15,  0.68,  0.77]])