# Рекомендации товаров

In [1]:
import numpy as np # понадобится для линейной алгебры
import pandas as pd # CSV I/O + предварительная подготовка выборки
from scipy.sparse import vstack # для объединения разреженных матриц
from scipy import sparse # для разреженных матриц
from scipy.sparse.linalg import spsolve # решение системы линейных уравнений

In [2]:
from subprocess import check_output
from sklearn.model_selection import train_test_split

# Directory that holds the dataset CSV files read by the following cells.
DATA_PATH = "./retailrocket"

# Show the directory contents by running the external `ls` command;
# check_output returns bytes, hence the decode before printing.
print(check_output(["ls", DATA_PATH]).decode("utf8"))

category_tree.csv
events.csv
item_properties_part1.csv
item_properties_part2.csv



In [3]:
# Load the event log, the category tree and both item-property parts
# from the dataset directory.
events = pd.read_csv(f"{DATA_PATH}/events.csv")
category_tree = pd.read_csv(f"{DATA_PATH}/category_tree.csv")
items1 = pd.read_csv(f"{DATA_PATH}/item_properties_part1.csv")
items2 = pd.read_csv(f"{DATA_PATH}/item_properties_part2.csv")

# The item properties come in two files; stack them into one frame
# (the row labels of each part are kept unchanged).
items = pd.concat([items1, items2])

# Создаем user-item матрицу

In [4]:
# Number of distinct visitors in the event log.
# NOTE(review): the user-item matrix is later indexed by the raw visitorid,
# so using this count as a dimension presumably relies on ids being dense
# in 0..n_users-1 — confirm.
n_users = events['visitorid'].unique().shape[0]

In [5]:
# Largest item id in the item-property tables (a maximum id, not a count of
# distinct items).
n_items = items['itemid'].max()

In [6]:
# Show both sizes on one line, separated by a single space.
print(n_users, n_items)

1407580 466866


In [7]:
# Empty user x item matrix in DOK format, which is filled cell by cell in the
# next cell; int8 is enough for the event weights 1..3 stored there.
# NOTE(review): the +1 / +2 padding presumably leaves room for ids that exceed
# n_users / n_items (e.g. items that occur only in the event log) — confirm.
user_to_item_matrix = sparse.dok_matrix((n_users+1, n_items+2), dtype=np.int8)

In [8]:
# Fill the matrix with per-event weights.

action_weights = {'view': 1, 'addtocart': 2, 'transaction': 3}
for row in events.itertuples():
    weight = action_weights.get(row.event)
    if weight is None:
        # Event types without a configured weight are ignored.
        continue
    # itertuples() puts the frame index at position 0, so row[2] / row[4] are
    # the 2nd / 4th data columns (presumably visitorid and itemid — confirm).
    # A later event for the same (user, item) cell overwrites the earlier weight.
    user_to_item_matrix[row[2], row[4]] = weight

In [9]:
# The fill loop is done: switch from DOK to CSR and report the final shape.
user_to_item_matrix = user_to_item_matrix.tocsr()
print(user_to_item_matrix.shape)

(1407581, 466868)


# Разбиваем на обучение и тест

Посмотрим сначала на разреженность матрицы:

In [10]:
# Fraction of matrix cells that hold a non-zero value.
n_rows, n_cols = user_to_item_matrix.shape
filled_cells = len(user_to_item_matrix.nonzero()[0])  # number of non-zero cells
sparsity = float(filled_cells) / (n_rows * n_cols)  # divided by the matrix size
print(sparsity)

3.264344859727226e-06


Разобьем на обучающую и тестовую выборку по пользователям. Ситуацию, когда добавляются новые товары, в этом примере не рассматриваем.

In [11]:
# Hold out 20% of the matrix rows (users) as the test split.
# NOTE(review): no random_state is passed, so the split — and every row index
# used in the following cells — presumably changes between runs; confirm.
X_train, X_test = train_test_split(user_to_item_matrix, test_size=0.20)

In [12]:
# Display the shape of the training split.
X_train.shape

(1126064, 466868)

In [13]:
# Display the shape of the test split.
X_test.shape

(281517, 466868)

# Похожесть пользователей по косинусной мере


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# TODO: this is user to user similarity. check item to item similarity as well
# Pairwise cosine similarity between all rows of X_train; dense_output=False
# requests a sparse result.
cosine_similarity_matrix = cosine_similarity(X_train, X_train, dense_output=False)
# Zero the diagonal so a row is never picked as its own most similar row.
cosine_similarity_matrix.setdiag(0)



In [16]:
# Pick one row of the training split and look up the row most similar to it.
# NOTE(review): both values are row positions in X_train; after the split they
# presumably no longer equal the original visitor ids — confirm.
one_user_id = 42
another_user_id = cosine_similarity_matrix[one_user_id].argmax()
print(another_user_id)

431965


In [17]:
# Stored interactions (column index and weight) of the chosen row.
print(X_train[one_user_id])

  (0, 229382)	1
  (0, 333191)	1
  (0, 353334)	1


In [18]:
# Stored interactions of its most similar row, for comparison.
print(X_train[another_user_id])

  (0, 229382)	1
  (0, 333191)	1


Попробуем найти пользователя, который сделал много действий:

In [19]:
# Walk the training rows in order and show the first one whose weights sum to
# more than 5, then stop.
for i in range(X_train.shape[0]):
    candidate_row = X_train[i]
    if candidate_row.sum() <= 5:
        continue
    print(i)
    print(candidate_row)
    break

30
  (0, 2567)	1
  (0, 12057)	1
  (0, 40702)	1
  (0, 80726)	1
  (0, 97651)	1
  (0, 98178)	1
  (0, 174896)	1
  (0, 215063)	1
  (0, 427641)	1
  (0, 457447)	1


## Рекомендации на основе implicit ALS

In [20]:
def _als_solve_rows(conf, fixed, lambda_val, verbose=False):
    """Solve the regularised least-squares problem for every row of ``conf``.

    parameters:

    conf - CSR matrix whose stored values are ``alpha * r`` (confidence minus
    one). Row ``r`` describes the entity being solved for, the columns index
    the rows of ``fixed``.

    fixed - Dense array of shape (conf.shape[1], rank) holding the factors
    that are kept constant during this half-step.

    lambda_val - Regularisation strength added to the diagonal.

    verbose - When true, print the row index every 100 rows.

    returns:

    Dense array of shape (conf.shape[0], rank) with the solved factors.
    """
    rank_size = fixed.shape[1]
    # fixed^T fixed + lambda*I is shared by every row, so build it once. It is
    # derived from the factors passed in, so it can never be out of date.
    shared = fixed.T.dot(fixed) + lambda_val * np.eye(rank_size)
    solved = np.zeros((conf.shape[0], rank_size))

    for row in range(conf.shape[0]):
        if verbose and row % 100 == 0:
            print(row)
        start, stop = conf.indptr[row], conf.indptr[row + 1]
        conf_minus_one = conf.data[start:stop]
        # Binarised preference; an explicitly stored zero means "no interaction".
        pref = (conf_minus_one != 0).astype(np.float64)
        # Only the factors of interacted columns contribute to the (C - I) term
        # and to the right-hand side, so the row never has to be made dense.
        factors = fixed[conf.indices[start:stop]]
        lhs = shared + (factors.T * conf_minus_one).dot(factors)
        rhs = factors.T.dot((conf_minus_one + 1.0) * pref)  # (C - I) + I = C
        solved[row] = np.linalg.solve(lhs, rhs)
    return solved


def implicit_weighted_ALS(training_set, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0,
                          verbose = True):
    """
    Implicit weighted ALS taken from Hu, Koren, and Volinsky 2008. Designed for alternating least squares and
    implicit feedback based collaborative filtering.

    parameters:

    training_set - Matrix of ratings with shape m x n, where m is the number of users and n is the number of
    items. Should be a sparse csr matrix to save space; any dtype is accepted because the values are converted
    to float64 before being scaled by alpha (small integer dtypes would otherwise overflow).

    lambda_val - Used for regularization during alternating least squares. Increasing this value may increase
    bias but decrease variance. Default is 0.1.

    alpha - The parameter associated with the confidence matrix discussed in the paper, where Cui = 1 + alpha*Rui.
    The paper found a default of 40 most effective. Decreasing this will decrease the variability in confidence
    between various ratings.

    iterations - The number of times to alternate between solving the user factors and the item factors.
    More iterations allow better convergence at the cost of increased computation.

    rank_size - The number of latent features in the user/item feature vectors. The paper recommends varying
    this between 20-200. Increasing the number of features may overfit but could reduce bias.

    seed - Seed of the random initialisation, for reproducible results.

    verbose - When true (the default), print the iteration number and, during the user step, the user index
    every 100 users.

    returns:

    (X, Y_T): sparse user factors of shape m x rank_size and sparse item factors of shape rank_size x n.
    The product X.dot(Y_T) approximates the binarised preference at each point of the original matrix.
    """
    # Confidence minus one. Keeping the "+1" out lets the matrix stay sparse; it
    # is added back per row when the right-hand side is built.
    conf = sparse.csr_matrix(training_set, dtype=np.float64) * alpha
    conf.sum_duplicates()
    conf_by_item = conf.T.tocsr()  # items as rows, for the item half-step
    num_user, num_item = conf.shape

    # Initialise the factors randomly; users are drawn before items so a given
    # seed yields the same starting point on every call.
    rstate = np.random.RandomState(seed)
    X = rstate.normal(size = (num_user, rank_size))
    Y = rstate.normal(size = (num_item, rank_size))

    for iter_step in range(iterations):
        if verbose:
            print(iter_step)
        # Solve for X with Y fixed (equation 4 of the paper) ...
        X = _als_solve_rows(conf, Y, lambda_val, verbose = verbose)
        # ... then for Y with the freshly solved X fixed (equation 5).
        Y = _als_solve_rows(conf_by_item, X, lambda_val)

    # Y is kept as n x rank during the computation and transposed at the end.
    return sparse.csr_matrix(X), sparse.csr_matrix(Y).T

In [None]:
# IPython shell escape: install the `implicit` package with pip.
! pip install implicit

In [21]:
# Factorise the training split with a single ALS iteration, alpha = 15 and
# 20 latent features.
user_vecs, item_vecs = implicit_weighted_ALS(X_train, lambda_val = 0.1, alpha = 15, iterations = 1,
                                            rank_size = 20)

0
0
100
200
300
400
500


KeyboardInterrupt: 

In [25]:
# Print the shape of the training split.
print(X_train.shape)

(1126064, 466868)


In [None]:
def max_n(row_data, row_indices, n):
    """Return the ``n`` largest entries of ``row_data`` with their labels.

    Parameters:
        row_data: 1-D numpy array of values.
        row_indices: 1-D numpy array of the same length, holding the label
            (e.g. column index) that belongs to each value.
        n: how many of the largest values to keep. If ``n`` exceeds the
            number of values, all of them are returned; ``n <= 0`` yields
            empty arrays.

    Returns:
        A tuple ``(top_values, top_indices, i)``: the selected values, their
        labels, and their positions inside ``row_data``. All three are
        ordered by ascending value.
    """
    order = row_data.argsort()
    # order[-n:] would return every entry for n == 0 (because -0 == 0),
    # so non-positive n is mapped to an empty selection explicitly.
    i = order[-n:] if n > 0 else order[:0]
    top_values = row_data[i]
    top_indices = row_indices[i]
    return top_values, top_indices, i

In [None]:
def _top_k_per_row(similarity, k):
    """Return a float CSR copy of ``similarity`` keeping only the ``k`` largest stored entries of each row."""
    weights = sparse.csr_matrix(similarity, dtype=np.float64, copy=True)
    weights.sum_duplicates()
    keep = np.zeros(weights.nnz, dtype=bool)
    if k > 0:
        indptr = weights.indptr
        for row in range(weights.shape[0]):
            start, stop = indptr[row], indptr[row + 1]
            if stop - start <= k:
                keep[start:stop] = True
            else:
                # Positions of the k largest values inside this row's slice.
                keep[start + np.argsort(weights.data[start:stop])[-k:]] = True
    weights.data[~keep] = 0
    weights.eliminate_zeros()
    return weights


def predict_topk(ratings, similarity, kind='user', k=40):
    """Predict ratings from the ``k`` most similar users or items.

    Parameters:
        ratings: user x item rating matrix (sparse or dense).
        similarity: similarity matrix, sparse in any format or dense.
            For ``kind='user'`` row ``i`` holds the similarity of user ``i``
            to every user (row) of ``ratings``; for ``kind='item'`` row ``j``
            holds the similarity of item ``j`` to every item (column).
        kind: ``'user'`` or ``'item'``.
        k: number of nearest neighbours used per prediction.

    Returns:
        A float CSR matrix of predictions. For ``kind='user'`` it has one row
        per row of ``similarity`` and one column per item; each row is the
        similarity-weighted sum of the neighbours' ratings divided by
        ``sum(|similarity|) + 1``. For ``kind='item'`` it has the shape of
        ``ratings``; each column is the similarity-weighted sum of the
        neighbouring item columns divided by ``sum(|similarity|)``, and is
        zero for items without any neighbour.

    Raises:
        ValueError: if ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    if kind not in ('user', 'item'):
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))

    # Predictions are fractional, so work in float64 throughout.
    ratings = sparse.csr_matrix(ratings, dtype=np.float64)
    weights = _top_k_per_row(similarity, k)
    norms = np.asarray(abs(weights).sum(axis=1)).ravel()

    if kind == 'user':
        # The +1 keeps the denominator non-zero for users without neighbours.
        scaled = sparse.diags(1.0 / (norms + 1.0)).dot(weights)
        return scaled.dot(ratings).tocsr()

    # Items without neighbours have an all-zero weight row; dividing by 1
    # instead of 0 leaves their predictions at zero.
    norms[norms == 0] = 1.0
    scaled = sparse.diags(1.0 / norms).dot(weights)
    return ratings.dot(scaled.T).tocsr()

In [None]:
# Convert the similarity matrix to LIL (list-of-lists) format before it is
# passed to predict_topk.
cosine_similarity_matrix_ll=cosine_similarity_matrix.tolil()

In [None]:
# User-based predictions for the training rows, using the 5 most similar users.
pred = predict_topk(X_train, cosine_similarity_matrix_ll, kind='user', k=5)

## Домашнее задание

1. Попробовать построить простые рекомендации на основе user-based подхода (по косинусной мере или по корреляции Пирсона)
1. ** Попробовать запустить ALS на небольшой подвыборке пользователей (например, на 5-10 тысячах)
1. ** Попробовать написать рекомендации на основе матричного разложения, выполненного с помощью SGD