In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [2]:
df_ratings = pd.read_csv("ml-latest-small/ratings.csv")
df_movies = pd.read_csv("ml-latest-small/movies.csv")

In [3]:
from scipy.linalg import svd

In [4]:
import scipy.sparse as sp
from scipy import sparse
from scipy.sparse.linalg import spsolve

In [5]:
user_item_matrix = df_ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')
# Contruct a sparse matrix for our users and items containing number of plays
sparse_ui= sp.csr_matrix(user_item_matrix)

In [6]:
X_train, X_test = train_test_split(sparse_ui, test_size = 0.25, random_state=57)
ind_train, ind_test = train_test_split(user_item_matrix, test_size = 0.25, random_state=57)

In [7]:
X_res = pd.DataFrame(index=ind_train.index, data=[], columns=['actual'])
for i in X_res.index:
    X_res.loc[i]['actual'] = list(set(ind_train.loc[i][ind_train.loc[i].notnull()].index))

In [8]:
ind_train_u = pd.Series(ind_train.index.tolist())
ind_train_i = pd.Series(ind_train.columns.values.tolist())

In [9]:
from scipy import sparse
from scipy.sparse.linalg import spsolve
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [10]:
seed=57
rstate = np.random.RandomState(seed)
rank_size=10
lambda_val = 0.1
num_user = X_train.shape[0]
num_item = X_train.shape[1]
P = sparse.csr_matrix((rstate.normal(size = (num_user, rank_size)))) # Random numbers in a m x rank shape
Q = sparse.csr_matrix((rstate.normal(size = (num_item, rank_size)))) # Normally this would be rank x n but we can transpose at the end. Makes calculation more simple.
QTQ = Q.T.dot(Q) # QTQ
PTP = P.T.dot(P)
P_eye = sparse.eye(num_user)
Q_eye = sparse.eye(num_item)
lambda_eye = lambda_val * sparse.eye(rank_size) # Our regularization term lambda*I.

In [11]:
u=5
pref = X_train[u, :].toarray() # Grab user row from confidence matrix and convert to dense
pref_u = pref[~np.isnan(pref)] # We take only the movies which the user has rated
u_rated_movies_ind = np.argwhere(~np.isnan(pref))[:,1] # Index of the rated movies
Qu = Q[u_rated_movies_ind, :] # We construct the Qu matrix of only rated existant pairs (u, i)
QuTru = Qu.T.dot(pref_u.T) # This is the QuTPu term
Q[u] = spsolve(QTQ +lambda_eye, QuTru)
print(Q[u].toarray())

[[ 0.0044179  -0.01638758 -0.00766912 -0.00914071  0.02136258 -0.00095192
  -0.00905147  0.01610489  0.00381576 -0.00747813]]


In [12]:
def lfm_als(training_set, lambda_val, iterations=10, rank_size=20, seed=57):
    """
    Model by Bell R.M., Koren Y., Volinsky C. The BellKor 2008 solution to the Netflix Prize.
    
    parameters:
    
    training_set - Our matrix of ratings with shape m x n, where m is the number of users and n is the number of items.
    Should be a sparse csr matrix to save space. 
    
    lambda_val - Used for regularization during alternating least squares. Increasing this value may increase bias
    but decrease variance. Default is 0.1. 
    
    iterations - The number of times to alternate between both user feature vector and item feature vector in
    alternating least squares. More iterations will allow better convergence at the cost of increased computation. 
    The authors found 10 iterations was sufficient, but more may be required to converge. 
    
    rank_size - The number of latent features in the user/item feature vectors. The paper recommends varying this 
    between 20-200. Increasing the number of features may overfit but could reduce bias. 
    
    seed - Set the seed for reproducible results
    
    returns:
    
    The feature vectors for users and items. The dot product of these feature vectors should give you the expected 
    "rating" at each point in your original matrix. 
    """
    
    # Get the size of our original ratings matrix, m x n
    num_user = training_set.shape[0]
    num_item = training_set.shape[1]
    
    # initialize our X/Y feature vectors randomly with a set seed
    rstate = np.random.RandomState(seed)
    
    P = sparse.csr_matrix((rstate.normal(size = (num_user, rank_size)))) # предсставление пользователей
    Q = sparse.csr_matrix((rstate.normal(size = (num_item, rank_size)))) # Normally this would be rank x n but we can transpose at the end. Makes calculation more simple.
    QTQ = Q.T.dot(Q) # QTQ
    PTP = P.T.dot(P)
    P_eye = sparse.eye(num_user)
    Q_eye = sparse.eye(num_item)
    lambda_eye = lambda_val * sparse.eye(rank_size) # Our regularization term lambda*I.
    
    # Begin iterations
   
    # Iterate back and forth between solving X given fixed Y and vice versa
    for iter_step in range(iterations):
        # Compute yTy and xTx at beginning of each iteration to save computing time
        QTQ = Q.T.dot(Q) # QTQ
        PTP = P.T.dot(P)
        
        # Being iteration to solve for P based on fixed Q
        for u in range(num_user):
            pref = training_set[u, :].toarray() # Grab user row from confidence matrix and convert to dense
            pref_u = pref[~np.isnan(pref)] # We take only the movies which the user has rated
            u_rated_movies_ind = np.argwhere(~np.isnan(pref))[:,1] # Index of the rated movies
            Qu = Q[u_rated_movies_ind, :] # We construct the Qu matrix of only rated existant pairs (u, i)
            QuTru = Qu.T.dot(pref_u.T) # This is the QuTru term
            Q[u] = spsolve(QTQ +lambda_eye, QuTru) # Solve for Qu = ((QTQ + lambda*I)^-1)QT*Pu | Ax=b
        
        # Begin iteration to solve for Q based on fixed P
        for i in range(num_item):
            pref = X_train[:,i].toarray()
            pref_i = pref[~np.isnan(pref)]
            i_rated_movies_ind = np.argwhere(~np.isnan(pref))[:,0]
            Pi = P[i_rated_movies_ind, :]
            PiTri = Pi.T.dot(pref_i.T)
            P[u] = spsolve(PTP + lambda_eye, PiTri)
            
    # End iterations
    return P, Q.T

In [13]:
%%time
P, Q = lfm_als(X_train, lambda_val = 0.1, iterations = 10, rank_size = 20)

Wall time: 10min 28s


In [21]:
print("P shape: {}".format(P.shape))
print("Q shape: {}".format(Q.shape))

P shape: (457, 20)
Q shape: (20, 9724)


In [14]:
# Let's find similar movies 
movieId = 5615
ind_i = ind_train_i[ind_train_i == movieId].index[0]

# Get the item row
qi = Q[:, ind_i].toarray()[:, 0]

In [15]:
# Calculate the similarity score between choseen movie and other movies
# and select the top 10 most similar.
scores = Q.T.dot(qi)
top_10 = np.argsort(scores)[::-1][:10]

In [16]:
top_10

array([3977, 6043, 6192, 6991, 5879, 4048, 4439, 7068, 1166, 9547],
      dtype=int64)

In [22]:
df_movies[df_movies['movieId']==ind_train_i[6043]]['title'].iloc[0]

'Man Who Cried, The (2000)'

In [20]:
movies = []
movies_genres = []
movies_scores = []
movies_ids = []

# Get and print the actual artists names and scores
for idx in top_10:
    movies_ids.append(ind_train_i[idx])
    movies.append(df_movies[df_movies['movieId']==ind_train_i[idx]]['title'].iloc[0])
    movies_genres.append(df_movies[df_movies['movieId']==ind_train_i[idx]]['genres'].iloc[0])
    movies_scores.append(scores[idx])

similar = pd.DataFrame({'movieId': movies_ids, 'movies': movies, 'score': movies_scores, 'genres': movies_genres})
similar

Unnamed: 0,movieId,movies,score,genres
0,5615,Invincible (2001),29.756241,Drama
1,40723,Wolf Creek (2005),21.278858,Crime|Horror|Thriller
2,45499,X-Men: The Last Stand (2006),19.570919,Action|Sci-Fi|Thriller
3,68073,Pirate Radio (2009),19.309013,Comedy|Drama
4,33437,Unleashed (Danny the Dog) (2005),18.446562,Action|Crime|Drama|Thriller
5,5772,My Dinner with André (1981),18.351032,Drama
6,6564,Lara Croft Tomb Raider: The Cradle of Life (2003),18.269228,Action|Adventure|Comedy|Romance|Thriller
7,70015,Polytechnique (2009),18.228665,Crime|Drama
8,1547,Shiloh (1997),17.807164,Children|Drama
9,173873,Gulliver's Travels (1996),17.672585,Adventure|Children|Fantasy


In [23]:
def predict_top_k(user_id, training_set, P, Q, df_movies, ind_train_i, k=10):
    """
    Recommend items for a given user given a trained model
    Args:
        user_id (int): The id of the user we want to create recommendations for.
        data_sparse (csr_matrix): Our original training data.
        P (csr_matrix): Users embedding
        Q (pandas.DataFrame): Item embedding
        k (int): How many recommendations we want to return:
    Returns:
        recommendations (pandas.DataFrame): DataFrame with k movies and scores
    """
  
    # Get all interactions by the user
    user_interactions = training_set[user_id,:].toarray()

    # We don't want to recommend items the user has consumed. So let's
    # set them all to 0 and the NaNs to 1.
    user_interactions = np.where(~np.isnan(user_interactions), 0, user_interactions)
    user_interactions = np.nan_to_num(user_interactions[0], nan=1)
    
    # This is where we calculate the recommendation by taking the 
    # dot-product of the user vectors with the item vectors.
            
    rec_vector = P[user_id,:].dot(Q).toarray()
    
    # Let's scale our scores between 0 and 1 to make it all easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = user_interactions*rec_vector_scaled
   
    # Get all the movies indices in order of recommendations (descending) and
    # select only the top "k" items.
    item_idx = np.argsort(recommend_vector)[::-1][:k]

    # Loop through our recommended movie indicies and look up the movie title
    movies = []
    movies_scores = []
    movies_ids = []

    # Get and print the actual movie names, IDs and scores
    for idx in item_idx:
        movies_ids.append(ind_train_i[idx])
        movies.append(df_movies[df_movies['movieId']==ind_train_i[idx]]['title'].iloc[0])
        movies_scores.append(recommend_vector[idx])

    similar = pd.DataFrame({'movieId': movies_ids, 'title': movies, 'score': movies_scores})
    
    return similar

In [24]:
# Let's generate and print our recommendations
user_id = 103
recommendations = predict_top_k(user_id, X_train, P, Q, df_movies, ind_train_i, k=10)
print(recommendations)

   movieId                                              title     score
0     5615                                  Invincible (2001)  1.000000
1    93502                                  Ledge, The (2011)  0.994485
2    89072                                  Stake Land (2010)  0.957493
3     4334                                       Yi Yi (2000)  0.947172
4     5570                              Thesis (Tesis) (1996)  0.934872
5    26236  White Sun of the Desert, The (Beloe solntse pu...  0.932483
6    80549                                      Easy A (2010)  0.931932
7     4395  Big Deal on Madonna Street (I Soliti Ignoti) (...  0.926718
8   144222                             Bros Before Hos (2013)  0.924318
9     3790                                      Groove (2000)  0.922260
