# 8.대규모 데이터의 처리를 위한 Sparse Matrix 사용
 - 데이터의 양이 커지면 메모리의 한계가 있음
 - 메모리가 괜찮더라도 대부분의 원소가 비어있는 매트릭스 전체를 저장해서 처리하는 것은 비효율적이다

## 8.1 Sparse matrix의 개념과 Python에서의 사용
 - sparse matrix - 설명 생략 , 데이터가 희박할수록 사용하기 좋음
   - 단점 : data를 저장하거나 읽을 때마다 데이터 존재 유무 확인을 통한 처리를 해야하기 때문에 overhead cost가 많이 듦
   - 종류
     1. csc_matrix : compressed sparse column format - efficient column slicing
     2. csr_matrix : compressed sparse row format - efficient row slicing
     3. bsr_matrix : block sparse row format
     4. lil_matrix : list of lists format
     5. dok_matrix : dictionary of keys format
     6. coo_matrix : coordinate format
     7. dia_matrix : diagonal format

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from utility import *
from sklearn.utils import shuffle

In [2]:
# Toy rating table: one row per (user, movie, rating) observation.
ratings = pd.DataFrame(
    {
        'user_id': [1, 2, 4],
        'movie_id': [2, 3, 7],
        'rating': [4, 3, 1],
    }
)

In [3]:
# Convert the rating table to a dense (full) matrix with a pandas pivot:
# rows are user ids, columns are movie ids, missing pairs become 0.
rating_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix = rating_matrix.fillna(0)

full_matrix1 = np.array(rating_matrix)
print(full_matrix1)

[[4. 0. 0.]
 [0. 3. 0.]
 [0. 0. 1.]]


In [4]:
# Build the same ratings as a sparse matrix, then expand it to a dense array.
# The raw ids are used directly as coordinates, so the dense result also
# contains the unused index 0 and every id that never occurs.
data, row_indices, col_indices = (
    np.array(ratings[column]) for column in ('rating', 'user_id', 'movie_id')
)
rating_matrix = csr_matrix(
    (data, (row_indices, col_indices)),
    dtype=int,
)
print(rating_matrix)

full_matrix2 = rating_matrix.toarray()
print(full_matrix2)

  (1, 2)	4
  (2, 3)	3
  (4, 7)	1
[[0 0 0 0 0 0 0 0]
 [0 0 4 0 0 0 0 0]
 [0 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1]]


In [5]:
# Arithmetic on a sparse matrix: scalar multiplication, transpose, and the
# matrix product with its own transpose, printed one after another.
print(
    rating_matrix * 2,
    rating_matrix.T,
    rating_matrix.dot(rating_matrix.T),
    sep='\n',
)

  (1, 2)	8
  (2, 3)	6
  (4, 7)	2
  (2, 1)	4
  (3, 2)	3
  (7, 4)	1
  (1, 1)	16
  (2, 2)	9
  (4, 4)	1


## 8.2 Sparse Matrix를 추천 알고리즘에 적용하기

In [6]:
# Only the third value returned by getData() (the rating table) is used here.
_, _, ratings = getData()

# Turn the index into a regular column and discard the timestamp column.
ratings.reset_index(inplace=True)
ratings.drop(columns='timestamp', inplace=True)

# Shuffle reproducibly, then split the rows 75% / 25% into train and test.
TRAIN_SIZE = 0.75
ratings = shuffle(ratings, random_state=1)
cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

# Re-express ALL ratings (train and test rows alike) as a sparse matrix
# addressed by (user_id, movie_id).
data, row_indices, col_indices = (
    np.array(ratings[column]) for column in ('rating', 'user_id', 'movie_id')
)

ratings = csr_matrix((data, (row_indices, col_indices)), dtype=int)

In [7]:
# MF class
class MF:
    """Biased matrix factorization trained with stochastic gradient descent.

    The rating of user ``i`` for item ``j`` is predicted as
    ``b + b_u[i] + b_d[j] + P[i] . Q[j]``.  Entries of the rating matrix that
    are equal to zero are treated as "not rated" and are skipped.

    Args:
        ratings: 2-D rating matrix addressed as ``ratings[user, item]``.  It
            must provide ``shape``, ``nonzero()`` and item assignment (for
            example a ``scipy.sparse.csr_matrix``).  It is kept by reference
            and ``set_test`` modifies it in place.
        K: Number of latent factors.
        alpha: Learning rate of the SGD updates.
        beta: Regularization weight of the SGD updates.
        iterations: Number of passes over the observed ratings.
        verbose: When true, print the RMSE on every 10th iteration.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose = True):
        self.R = ratings
        self.K = K
        self.num_users, self.num_items = self.R.shape
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose

    def _observed(self):
        """Return ``(rows, cols, values)`` for every non-zero entry of ``R``."""
        rows, cols = self.R.nonzero()
        # One fancy-indexed read instead of one sparse lookup per entry.
        values = self.R[rows, cols]
        if hasattr(values, "toarray"):
            # Some sparse containers return a sparse result here.
            values = values.toarray()
        return rows, cols, np.asarray(values).ravel()

    def _reset_model(self):
        """Initialize factors, biases and the list of training samples."""
        # P is drawn before Q so a seeded run draws numbers in a fixed order.
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))  # (num_users, K)
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))  # (num_items, K)
        self.b_u = np.zeros(self.num_users)  # (num_users,)
        self.b_d = np.zeros(self.num_items)  # (num_items,)
        rows, cols, values = self._observed()
        self.b = np.mean(values)  # global mean of the observed ratings
        self.samples = list(zip(rows, cols, values))

    def rmse(self):
        """Return the RMSE over all non-zero entries of ``R``.

        Also stores the per-entry predictions and errors in
        ``self.predictions`` and ``self.errors`` (NumPy arrays).
        """
        rows, cols, actual = self._observed()
        self.predictions = np.array(
            [self.get_prediction(x, y) for x, y in zip(rows, cols)]
        )
        self.errors = actual - self.predictions
        return np.sqrt(np.mean(self.errors**2))

    def train(self):
        """Fit the model on the non-zero entries of ``R``.

        Returns:
            A list of ``(iteration, train_rmse)`` tuples, one per iteration,
            with iterations numbered from 1.
        """
        self._reset_model()

        training_process = []
        for epoch in range(1, self.iterations + 1):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            training_process.append((epoch, rmse))
            if self.verbose and epoch % 10 == 0:
                print(f'iteration : {epoch} ; Train RMSE = {rmse}')
        return training_process

    def get_prediction(self, i, j):
        """Return the predicted rating of user ``i`` for item ``j``."""
        return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

    def sgd(self):
        """Run one SGD pass over ``self.samples`` in their current order."""
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)

            self.b_u[i] += self.alpha*(e - self.beta*self.b_u[i])
            self.b_d[j] += self.alpha*(e - self.beta*self.b_d[j])

            # Compute both steps from the pre-update factors so that the Q
            # step does not see the P row that was just modified.
            p_i = self.P[i, :]
            q_j = self.Q[j, :]
            step_p = self.alpha*(e*q_j - self.beta*p_i)
            step_q = self.alpha*(e*p_i - self.beta*q_j)
            p_i += step_p  # p_i / q_j are views, so this updates P and Q
            q_j += step_q

    def set_test(self, ratings_test):
        """Register held-out ratings and remove them from the training matrix.

        Args:
            ratings_test: DataFrame whose rows unpack to exactly
                ``(user, item, rating)``.

        Returns:
            The held-out ratings as a list of ``[user, item, rating]`` lists
            (also stored in ``self.test_set``).
        """
        # itertuples keeps each column's own type; a row Series would upcast
        # the ids to float whenever the rating column is float.
        test_set = [
            [int(user), int(item), rating]
            for user, item, rating in ratings_test.itertuples(index=False, name=None)
        ]
        if test_set:
            users = np.array([row[0] for row in test_set], dtype=np.intp)
            items = np.array([row[1] for row in test_set], dtype=np.intp)
            # Zero means "not rated", so these entries drop out of training.
            self.R[users, items] = 0
        self.test_set = test_set
        return test_set

    def test_rmse(self):
        """Return the RMSE over ``self.test_set``.

        Raises:
            ZeroDivisionError: If the test set is empty.
        """
        squared_error = sum(
            pow(rating - self.get_prediction(user, item), 2)
            for user, item, rating in self.test_set
        )
        return np.sqrt(squared_error/len(self.test_set))

    def test(self):
        """Fit the model while tracking the error on the held-out test set.

        ``set_test`` must have been called first.

        Returns:
            A list of ``(iteration, train_rmse, test_rmse)`` tuples, one per
            iteration, with iterations numbered from 1.
        """
        self._reset_model()

        training_process = []
        for epoch in range(1, self.iterations + 1):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            rmse_test = self.test_rmse()
            training_process.append((epoch, rmse, rmse_test))
            if self.verbose and epoch % 10 == 0:
                print(f'iteration : {epoch} ; Train RMSE = {rmse} ; Test RMSE = {rmse_test:.4f}')
        return training_process

    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for one ``(user_id, item_id)`` pair."""
        return self.get_prediction(user_id, item_id)

In [9]:
# Work on a copy: set_test() zeroes the held-out entries of the matrix it is given.
R_temp = ratings.copy()
mf = MF(
    R_temp,
    K=200,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)
test_set = mf.set_test(ratings_test)
result = mf.test()

iteration : 10 ; Train RMSE = 0.9664009442861345 ; Test RMSE = 0.9834
iteration : 20 ; Train RMSE = 0.9419612621130171 ; Test RMSE = 0.9644
iteration : 30 ; Train RMSE = 0.9313176202679793 ; Test RMSE = 0.9566
iteration : 40 ; Train RMSE = 0.9252744867350418 ; Test RMSE = 0.9524
iteration : 50 ; Train RMSE = 0.9213810641088099 ; Test RMSE = 0.9498
iteration : 60 ; Train RMSE = 0.9186405099032052 ; Test RMSE = 0.9480
iteration : 70 ; Train RMSE = 0.9165406101485817 ; Test RMSE = 0.9468
iteration : 80 ; Train RMSE = 0.9147671241622809 ; Test RMSE = 0.9459
iteration : 90 ; Train RMSE = 0.9130804755106557 ; Test RMSE = 0.9451
iteration : 100 ; Train RMSE = 0.9112540523029912 ; Test RMSE = 0.9444
