Примеры составлены по мотивам главы 2 книги Т. Сегарана «Программируем коллективный разум»: http://www.symbol.ru/alphabet/613828.html






## Словарь предпочтений «пользователь — фильм»

In [14]:
# Film critics and the scores they gave to a small set of movies:
# critic name -> {movie title -> score}.
# NOTE: the key order inside each inner dict is kept exactly as authored,
# because it can determine the row order of a DataFrame built from this dict.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [15]:
import numpy as np
import pandas as pd

In [93]:
# Build the critic x movie ratings matrix. pd.DataFrame(critics) has one
# column per critic, so the array is transposed to get one row per critic;
# movies a critic did not rate show up as NaN.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and has
# since been removed from pandas.
rates_base = pd.DataFrame(critics).values.T
rates_base

  """Entry point for launching an IPython kernel.


array([[2.5, 3.5, 3. , 3.5, 2.5, 3. ],
       [3. , 3.5, 1.5, 5. , 3.5, 3. ],
       [2.5, 3. , nan, 3.5, nan, 4. ],
       [nan, 3.5, 3. , 4. , 2.5, 4.5],
       [3. , 4. , 2. , 3. , 2. , 3. ],
       [3. , 4. , nan, 5. , 3.5, 3. ],
       [nan, 4.5, nan, 4. , 1. , nan]])

In [57]:
# Replace the NaN placeholders of unrated movies with 0 so that the matrix
# can be fed to ExplicitMF, which treats zero entries as "no rating".
rates = np.nan_to_num(rates_base)
rates

array([[2.5, 3.5, 3. , 3.5, 2.5, 3. ],
       [3. , 3.5, 1.5, 5. , 3.5, 3. ],
       [2.5, 3. , 0. , 3.5, 0. , 4. ],
       [0. , 3.5, 3. , 4. , 2.5, 4.5],
       [3. , 4. , 2. , 3. , 2. , 3. ],
       [3. , 4. , 0. , 5. , 3.5, 3. ],
       [0. , 4.5, 0. , 4. , 1. , 0. ]])

In [96]:
from numpy.linalg import solve


class ExplicitMF:
    """Matrix factorization for an explicit user x item ratings matrix.

    The model approximates the ratings with the product of user and item
    latent-factor matrices and is fitted either with stochastic gradient
    descent ('sgd', which additionally learns global/user/item biases) or
    with alternating least squares ('als'). Zero entries of the ratings
    matrix are treated as "not rated" by the SGD sampler.
    """

    # Optimization methods understood by train()/partial_train()/predict().
    _LEARNING_METHODS = ('sgd', 'als')

    def __init__(self,
                 ratings,
                 n_factors=40,
                 learning='sgd',
                 item_fact_reg=0.0,
                 user_fact_reg=0.0,
                 item_bias_reg=0.0,
                 user_bias_reg=0.0,
                 verbose=False):
        """
        Train a matrix factorization model to predict empty
        entries in a matrix. The terminology assumes a
        ratings matrix which is ~ user x item

        Params
        ======
        ratings : (ndarray)
            User x Item matrix with corresponding ratings

        n_factors : (int)
            Number of latent factors to use in matrix
            factorization model
        learning : (str)
            Method of optimization. Options include
            'sgd' or 'als'.

        item_fact_reg : (float)
            Regularization term for item latent factors

        user_fact_reg : (float)
            Regularization term for user latent factors

        item_bias_reg : (float)
            Regularization term for item biases

        user_bias_reg : (float)
            Regularization term for user biases

        verbose : (bool)
            Whether or not to printout training progress

        Raises
        ======
        ValueError
            If `learning` is neither 'sgd' nor 'als'.
        """
        # Fail early: an unknown method would otherwise yield a model that
        # never trains and whose predict() silently returns None.
        if learning not in self._LEARNING_METHODS:
            raise ValueError(
                f"learning must be one of {self._LEARNING_METHODS}, got {learning!r}")

        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors
        self.item_fact_reg = item_fact_reg
        self.user_fact_reg = user_fact_reg
        self.item_bias_reg = item_bias_reg
        self.user_bias_reg = user_bias_reg
        self.learning = learning
        if self.learning == 'sgd':
            # SGD only visits observed (non-zero) entries of the matrix.
            self.sample_row, self.sample_col = self.ratings.nonzero()
            self.n_samples = len(self.sample_row)
        self._v = verbose

    def als_step(self,
                 latent_vectors,
                 fixed_vecs,
                 ratings,
                 _lambda,
                 type='user'):
        """
        One of the two ALS steps. Solve for the latent vectors
        specified by type.

        Params
        ======
        latent_vectors : (ndarray)
            Factor matrix to solve for; overwritten in place and returned.
            Needs one row per user (type='user') or per item (type='item').
        fixed_vecs : (ndarray)
            Factor matrix of the other side, held constant in this step.
        ratings : (ndarray)
            User x Item ratings matrix.
        _lambda : (float)
            Regularization weight added to the diagonal of the normal
            equations.
        type : (str)
            'user' to update user vectors, 'item' to update item vectors.

        Raises
        ======
        ValueError
            If `type` is neither 'user' nor 'item'.
        """
        if type == 'user':
            targets = ratings
        elif type == 'item':
            # Items are the columns of the user x item matrix.
            targets = ratings.T
        else:
            raise ValueError(f"type must be 'user' or 'item', got {type!r}")

        # Regularized normal equations: (F^T F + lambda * I) x = F^T r.
        gram = fixed_vecs.T.dot(fixed_vecs)
        system = gram + np.eye(gram.shape[0]) * _lambda

        # The system matrix is identical for every row, so all right-hand
        # sides are solved in a single call instead of once per row.
        latent_vectors[:, :] = solve(system, targets.dot(fixed_vecs).T).T
        return latent_vectors

    def train(self, n_iter=10, learning_rate=0.1):
        """
        Train model for n_iter iterations from scratch.

        Latent vectors are re-drawn at random on every call (and, for SGD,
        the biases are reset), so any previous fit is discarded.
        `learning_rate` is only used by the 'sgd' method.
        """
        # Initialize latent vectors (users first, then items, so that a
        # seeded NumPy RNG reproduces the same starting point).
        self.user_vecs = np.random.normal(scale=1./self.n_factors,
                                          size=(self.n_users, self.n_factors))
        self.item_vecs = np.random.normal(scale=1./self.n_factors,
                                          size=(self.n_items, self.n_factors))

        if self.learning == 'sgd':
            self.learning_rate = learning_rate
            self.user_bias = np.zeros(self.n_users)
            self.item_bias = np.zeros(self.n_items)
            # Mean of the observed (non-zero) ratings.
            self.global_bias = np.mean(self.ratings[np.where(self.ratings != 0)])
        self.partial_train(n_iter, 0)

    def partial_train(self, n_iter, iter_done):
        """
        Train model for n_iter iterations. Can be
        called multiple times for further training.

        Requires the state created by train() (latent vectors and, for SGD,
        biases and learning rate). `iter_done` is the number of iterations
        already performed; it only affects the progress messages.
        """
        for step in range(1, n_iter + 1):
            if (step + iter_done) % 10 == 0 and self._v:
                print(f'\tcurrent iteration: {step+iter_done}')
            if self.learning == 'als':
                self.user_vecs = self.als_step(self.user_vecs,
                                               self.item_vecs,
                                               self.ratings,
                                               self.user_fact_reg,
                                               type='user')
                self.item_vecs = self.als_step(self.item_vecs,
                                               self.user_vecs,
                                               self.ratings,
                                               self.item_fact_reg,
                                               type='item')
            elif self.learning == 'sgd':
                # Visit the observed ratings in a fresh random order.
                self.training_indices = np.arange(self.n_samples)
                np.random.shuffle(self.training_indices)
                self.sgd()

    def sgd(self):
        """Run one SGD pass over the samples listed in self.training_indices."""
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            prediction = self.predict(u, i)
            e = (self.ratings[u, i] - prediction)  # error

            # Update biases
            self.user_bias[u] += self.learning_rate * (e - self.user_bias_reg * self.user_bias[u])
            self.item_bias[i] += self.learning_rate * (e - self.item_bias_reg * self.item_bias[i])

            # Update latent factors. The item step is computed first so that
            # it uses the user vector as it was before this sample's update.
            delta_i = self.learning_rate * (e * self.user_vecs[u, :] - self.item_fact_reg * self.item_vecs[i, :])

            self.user_vecs[u, :] += self.learning_rate * \
                                    (e * self.item_vecs[i, :] - self.user_fact_reg * self.user_vecs[u, :])
            self.item_vecs[i, :] += delta_i

    def predict(self, u, i):
        """
        Single user and item prediction.

        For 'als' this is the dot product of the two latent vectors; for
        'sgd' the global, user and item biases are added to it.
        """
        if self.learning == 'als':
            return self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
        prediction = self.global_bias + self.user_bias[u] + self.item_bias[i]
        prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
        return prediction

    def predict_all(self):
        """
        Predict ratings for every user and item.

        Returns an (n_users, n_items) array whose [u, i] entry corresponds
        to predict(u, i), computed with one matrix product instead of a
        Python-level double loop.
        """
        interactions = self.user_vecs.dot(self.item_vecs.T)
        if self.learning == 'als':
            return interactions
        biases = (self.global_bias
                  + self.user_bias[:, np.newaxis]
                  + self.item_bias[np.newaxis, :])
        return biases + interactions

    def calculate_learning_curve(self, iter_array, test, learning_rate=0.1):
        """
        Keep track of MSE as a function of training iterations.

        Params
        ======
        iter_array : (list)
            List of numbers of iterations to train for each step of
            the learning curve. e.g. [1, 5, 10, 20]
            The list is sorted in place.
        test : (2D ndarray)
            Testing dataset (assumed to be user x item).
        learning_rate : (float)
            Learning rate passed to train() for the first step.

        The function creates two new class attributes:

        train_mse : (list)
            Training data MSE values for each value of iter_array
        test_mse : (list)
            Test data MSE values for each value of iter_array

        The MSE is computed by `get_mse(predictions, actual)`, a helper that
        is not defined in this class and must be available in the enclosing
        module.
        """
        iter_array.sort()
        self.train_mse = []
        self.test_mse = []
        iter_diff = 0
        for (i, n_iter) in enumerate(iter_array):
            if self._v:
                print(f'Iteration: {n_iter}')
            if i == 0:
                # First point: fit from scratch.
                self.train(n_iter - iter_diff, learning_rate)
            else:
                # Later points: only run the additional iterations.
                self.partial_train(n_iter - iter_diff, iter_diff)

            predictions = self.predict_all()

            self.train_mse.append(get_mse(predictions, self.ratings))
            self.test_mse.append(get_mse(predictions, test))
            if self._v:
                print(f'MSE train:test: {round(self.train_mse[-1],2)} : {round(self.test_mse[-1],2)}\n')
            iter_diff = n_iter

In [59]:
# Model with 2 latent factors on the zero-filled ratings matrix; the
# optimization method is left at its default ('sgd').
small_test = ExplicitMF(rates, n_factors = 2)


In [61]:
# Fix the NumPy RNG so the random initialization and the SGD sample order
# are reproducible, then train from scratch for 100 iterations (default
# learning rate 0.1).
np.random.seed(0)
small_test.train(100)

In [62]:
# Learned user latent factors: one row per critic, one column per factor.
small_test.user_vecs

array([[ 0.40217248,  0.25797608],
       [-0.16243287,  0.87598517],
       [ 0.16100939, -0.25233535],
       [ 0.08975379,  0.07907003],
       [ 0.86469985,  0.67316213],
       [ 0.01513168,  0.93212874],
       [ 0.47573728, -1.66879674]])

In [63]:
# Retrain without reseeding: train() re-draws the initial vectors at random,
# so the resulting factors differ from the seeded run above.
small_test.train(100)
small_test.user_vecs

array([[ 0.15213102,  0.45500027],
       [ 0.64522574, -0.46699448],
       [ 0.522016  ,  0.69195765],
       [ 0.28731734,  0.59807869],
       [-0.41869311,  0.3449829 ],
       [ 0.45565799, -0.45419218],
       [-1.03686891, -0.9714227 ]])

In [65]:
# Reseeding with the same value reproduces the first seeded run exactly.
np.random.seed(0)
small_test.train(100)
small_test.user_vecs

array([[ 0.40217248,  0.25797608],
       [-0.16243287,  0.87598517],
       [ 0.16100939, -0.25233535],
       [ 0.08975379,  0.07907003],
       [ 0.86469985,  0.67316213],
       [ 0.01513168,  0.93212874],
       [ 0.47573728, -1.66879674]])

In [66]:
# Continue training the current model for 100 more iterations; the second
# argument (iterations already done) only affects progress messages.
small_test.partial_train(100, 100)
small_test.user_vecs

array([[ 0.43723467,  0.37413166],
       [-0.20754576,  0.61306766],
       [ 0.17121251, -0.02789465],
       [ 0.02890269,  0.114141  ],
       [ 0.80175832,  0.51259266],
       [-0.04298589,  0.6792025 ],
       [ 0.5180563 , -2.14187129]])

In [67]:
# Another 100 iterations on top of the previous 200.
# NOTE(review): the recorded output includes an all-NaN matrix — presumably
# SGD diverged at the default learning rate 0.1; confirm by re-running.
small_test.partial_train(100, 200)
small_test.user_vecs



array([[nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan]])

array([[ 0.27924925,  0.37524454],
       [-0.26663675, -0.84067193],
       [ 0.81720107, -0.02148859],
       [ 0.7002485 , -0.19122691],
       [-0.10791849,  0.97323666],
       [-0.34557659, -0.52839578],
       [ 3.78626345,  0.88902859]])

In [53]:
# Print the learned item and user bias vectors (created by train() only in
# 'sgd' mode).
print(small_test.item_bias)
print(small_test.user_bias)

[-0.38012637  0.19552859 -0.50322135 -0.00748267 -0.59051758  0.2819706
  1.23284412]
[-0.22808433  0.60370324 -0.91464046  0.95547614 -0.32068337  0.13322413]


In [74]:
# Retrain from scratch with a ten times smaller learning rate (0.01 instead
# of the default 0.1).
np.random.seed(0)
small_test.train(100, 0.01)
small_test.user_vecs

array([[ 0.30649008,  0.12739635],
       [ 0.19952093,  1.4053112 ],
       [ 0.31643079, -0.41136693],
       [ 0.42886522, -0.16968078],
       [ 0.58307119,  0.33139668],
       [-0.05483718,  0.85933672],
       [ 0.47526805,  0.28593246]])

In [83]:
# Keep training the same model for 10000 further iterations (100 were
# already done by the train() call above).
small_test.partial_train(10000, 100)
small_test.user_vecs

array([[ 0.4735413 ,  0.48497761],
       [-0.35966023,  0.51521259],
       [ 0.26919599,  0.15274652],
       [ 0.16228673,  0.16575106],
       [ 0.87036004,  0.81335548],
       [-0.17200936,  0.62457611],
       [ 1.69899774, -3.44284765]])

In [84]:
# Predicted rating for every (critic, movie) pair, including the pairs that
# had no rating in the input.
small_test.predict_all() 

array([[ 2.71181919,  3.56602721,  2.54366178,  3.48060826,  2.26261535,
         3.4055833 ],
       [ 2.88231654,  3.57973425,  1.51633611,  5.01708247,  3.50785519,
         2.99430229],
       [ 2.4140297 ,  3.06437845,  3.02911036,  3.5117287 ,  2.03031617,
         4.00950421],
       [ 2.78176916,  3.41644378,  3.21422022,  4.05490247,  2.54127941,
         4.27453587],
       [ 2.88243438,  3.97824584,  2.19781266,  3.00682275,  2.12129594,
         2.79925332],
       [ 3.10904168,  3.89834679,  1.64445015,  4.93825338,  3.56028656,
         2.99482832],
       [ 5.30342166,  4.49705987, 19.21100138,  3.99341183,  1.00859768,
        18.79395698]])

In [85]:
# Learned per-user bias terms.
small_test.user_bias

array([-0.35744271,  0.43000948, -0.47329869, -0.02721962, -0.51156278,
        0.50731166,  1.67688811])

In [86]:
# Learned per-item bias terms.
small_test.item_bias

array([-0.55522546, -0.03467747,  0.18818513,  0.98088308, -0.68648891,
        1.35200906])

In [87]:
# Global bias: the mean of the non-zero entries of the ratings matrix.
small_test.global_bias

3.2285714285714286

In [88]:
# Bias-only part of the SGD prediction for the last user and the last item,
# i.e. the prediction without the latent-factor dot product.
small_test.global_bias + small_test.item_bias[-1] +small_test.user_bias[-1]

6.257468596506937

In [89]:
# Latent vector of the last user; its dot product with an item vector is
# the factor part of a prediction.
small_test.user_vecs[-1]

array([ 1.69899774, -3.44284765])

In [90]:
# Latent vector of the last item.
small_test.item_vecs[-1]

array([ 1.33039651, -2.98478143])

In [94]:
# Prediction error against the original matrix that still holds NaN for
# unrated movies: NaN propagates through the subtraction, so only observed
# ratings yield a numeric residual.
small_test.predict_all() - rates_base

array([[ 0.21181919,  0.06602721, -0.45633822, -0.01939174, -0.23738465,
         0.4055833 ],
       [-0.11768346,  0.07973425,  0.01633611,  0.01708247,  0.00785519,
        -0.00569771],
       [-0.0859703 ,  0.06437845,         nan,  0.0117287 ,         nan,
         0.00950421],
       [        nan, -0.08355622,  0.21422022,  0.05490247,  0.04127941,
        -0.22546413],
       [-0.11756562, -0.02175416,  0.19781266,  0.00682275,  0.12129594,
        -0.20074668],
       [ 0.10904168, -0.10165321,         nan, -0.06174662,  0.06028656,
        -0.00517168],
       [        nan, -0.00294013,         nan, -0.00658817,  0.00859768,
                nan]])

In [97]:
# Rebuild the model from the current class definition and repeat the seeded
# training run with learning rate 0.01.
small_test = ExplicitMF(rates, n_factors = 2)
np.random.seed(0)
small_test.train(100, 0.01)
small_test.user_vecs

array([[ 0.30829193,  0.12771461],
       [ 0.19911752,  1.40894392],
       [ 0.32238188, -0.4113416 ],
       [ 0.43407689, -0.1698714 ],
       [ 0.58113064,  0.32870178],
       [-0.05819788,  0.86094874],
       [ 0.47873408,  0.28072167]])

In [98]:
# Continue the rebuilt model for 10000 more iterations.
small_test.partial_train(10000, 100)
small_test.user_vecs

array([[ 0.49943859,  0.53823666],
       [-0.43743264,  0.70516984],
       [ 0.28289139,  0.11430185],
       [ 0.15520764,  0.15590224],
       [ 0.93993089,  0.9189709 ],
       [-0.22474213,  0.82083514],
       [ 1.59927371, -3.16478866]])

In [99]:
# Residuals of the rebuilt model on the observed ratings (NaN where the
# original matrix has no rating).
small_test.predict_all() - rates_base

array([[ 0.22260291,  0.07650373, -0.44802229, -0.00627089, -0.25144248,
         0.41374775],
       [-0.11989419,  0.07138466,  0.02848343,  0.00812727,  0.0238319 ,
        -0.01134542],
       [-0.08645487,  0.06426477,         nan,  0.01202391,         nan,
         0.01022411],
       [        nan, -0.06476354,  0.20587848,  0.08277888, -0.00352276,
        -0.22518199],
       [-0.12203006, -0.02725635,  0.21669661, -0.0025042 ,  0.12690145,
        -0.18974205],
       [ 0.10645137, -0.11037187,         nan, -0.07812818,  0.07952442,
         0.00190345],
       [        nan, -0.00792694,         nan, -0.01487834,  0.01866038,
                nan]])