# Collaborative filtering

<a id='0'></a>

## Table of contents
[1.Matrix factorization](#1)   
[2.TF-IDF](#2)

In [3]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

<a id='1'></a>

## [Matrix factorization](#0)

In [27]:
# Toy user-item rating matrix for collaborative filtering:
# one row per user, one column per item, np.nan where no rating was given.
UserItemMatrix = np.array([
    [5,      np.nan, 4,      np.nan, 1,      np.nan, 3],
    [4,      4,      4,      np.nan, np.nan, np.nan, 1],
    [5,      4,      np.nan, 1,      2,      np.nan, 3],
    [1,      2,      1,      4,      3,      5,      2],
    [np.nan, 1,      np.nan, 3,      5,      5,      np.nan],
    [np.nan, 2,      np.nan, np.nan, 4,      4,      2],
    [5,      np.nan, np.nan, 1,      np.nan, np.nan, 2],
])
UserItemMatrix

array([[ 5., nan,  4., nan,  1., nan,  3.],
       [ 4.,  4.,  4., nan, nan, nan,  1.],
       [ 5.,  4., nan,  1.,  2., nan,  3.],
       [ 1.,  2.,  1.,  4.,  3.,  5.,  2.],
       [nan,  1., nan,  3.,  5.,  5., nan],
       [nan,  2., nan, nan,  4.,  4.,  2.],
       [ 5., nan, nan,  1., nan, nan,  2.]])

In [113]:
# Wrap the raw user-item matrix in a DataFrame (item1..item7 columns)
# and expose the row position as an explicit user_id column.
df = pd.DataFrame(UserItemMatrix, columns=[f'item{n}' for n in range(1, 8)])
df['user_id'] = df.index.tolist()
df.head()

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,user_id
0,5.0,,4.0,,1.0,,3.0,0
1,4.0,4.0,4.0,,,,1.0,1
2,5.0,4.0,,1.0,2.0,,3.0,2
3,1.0,2.0,1.0,4.0,3.0,5.0,2.0,3
4,,1.0,,3.0,5.0,5.0,,4


In [161]:
# R: the rating matrix to reconstruct
# K: latent dimension
# Note: matrix factorization can be done by applying the SGD algorithm to PyTorch tensors.
import torch

def matrix_factorization_using_torch(R, K, steps=200, learning_rate=0.01, set_seed = True, verbose = True, smooth= True):
    """Factorize a partially observed rating matrix as ``R ~= P @ Q.T`` using SGD.

    Only observed entries contribute to the loss. An entry counts as observed
    when it is not NaN and is strictly greater than 0, so missing ratings may
    be encoded either as NaN or as 0.

    Parameters
    ----------
    R : array-like of shape (num_users, num_items)
        Rating matrix to reconstruct. It is not modified.
    K : int
        Latent dimension of the user and item factors.
    steps : int, default 200
        Number of full-batch SGD (momentum 0.9) updates.
    learning_rate : float, default 0.01
        Step size passed to ``torch.optim.SGD``.
    set_seed : bool, default True
        If True, seed the global torch and NumPy generators with 1 so that the
        random initialization (and therefore the result) is reproducible.
    verbose : bool, default True
        If True, print the training RMSE every 10 steps.
    smooth : bool, default True
        If True, round the reconstructed ratings to the nearest integer and
        clip them to the range [1, 5].

    Returns
    -------
    P : numpy.ndarray of shape (num_users, K)
        User-latent factors.
    Q : numpy.ndarray of shape (num_items, K)
        Item-latent factors.
    R_hat : numpy.ndarray of shape (num_users, num_items)
        Reconstructed matrix ``P @ Q.T`` (rounded and clipped when ``smooth``).

    Raises
    ------
    ValueError
        If ``R`` is not 2-D or contains no observed rating.
    """
    ratings = np.asarray(R, dtype=float)
    if ratings.ndim != 2:
        raise ValueError("R must be a 2-D user-item matrix")
    num_users, num_items = ratings.shape

    if set_seed:
        # P and Q are drawn with torch.randn, so torch's generator is the one
        # that has to be seeded. NumPy is still seeded so that callers relying
        # on that side effect see no change.
        np.random.seed(1)
        torch.manual_seed(1)

    # Initialize the user-latent and item-latent matrices.
    # NOTE(review): with unit-normal factors each entry of P @ Q.T starts with
    # variance K, so a large K starts far from the rating scale; consider a
    # scaled initialization if convergence is slow.
    P = torch.randn((num_users, K), requires_grad=True)
    Q = torch.randn((num_items, K), requires_grad=True)

    # Replace NaN by 0 first, then keep only strictly positive entries as the
    # observed ratings that enter the loss.
    filled = np.where(np.isnan(ratings), 0.0, ratings)
    observed_mask = torch.as_tensor(filled > 0)
    if not bool(observed_mask.any()):
        raise ValueError("R contains no observed (non-NaN, > 0) ratings")

    # The target values never change, so extract them once and in the same
    # dtype as the predictions instead of re-indexing a float64 tensor per step.
    target = torch.as_tensor(filled, dtype=P.dtype)[observed_mask]

    # Update P and Q with SGD.
    optimizer = torch.optim.SGD([P, Q], lr=learning_rate, momentum=0.9)

    for step in range(steps):
        pred = P @ Q.T
        loss = torch.mean((target - pred[observed_mask]) ** 2)

        if verbose and step % 10 == 0:
            rmse = torch.sqrt(loss.detach()).item()
            print("iter step: {0}, rmse: {1:4f}".format(step, rmse))

        # zero gradients, perform a backward pass, and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    P_np = P.detach().numpy()
    Q_np = Q.detach().numpy()
    R_hat = P_np @ Q_np.T

    if smooth:
        # Snap predictions onto the 1..5 integer rating scale.
        R_hat = np.clip(np.round(R_hat), 1, 5)

    return P_np, Q_np, R_hat


In [162]:
# Factorize the toy matrix with a 3-dimensional latent space.
P, Q, R_hat = matrix_factorization_using_torch(R=UserItemMatrix, K=3)

iter step: 0, rmse: 3.566551
iter step: 10, rmse: 3.200260
iter step: 20, rmse: 2.818874
iter step: 30, rmse: 2.559651
iter step: 40, rmse: 2.385012
iter step: 50, rmse: 2.256513
iter step: 60, rmse: 2.139624
iter step: 70, rmse: 2.020568
iter step: 80, rmse: 1.869156
iter step: 90, rmse: 1.652264
iter step: 100, rmse: 1.335446
iter step: 110, rmse: 0.931263
iter step: 120, rmse: 0.575246
iter step: 130, rmse: 0.403377
iter step: 140, rmse: 0.346849
iter step: 150, rmse: 0.328058
iter step: 160, rmse: 0.319115
iter step: 170, rmse: 0.312351
iter step: 180, rmse: 0.307335
iter step: 190, rmse: 0.303335


In [163]:
# User matrix
UserItemMatrix

array([[ 5., nan,  4., nan,  1., nan,  3.],
       [ 4.,  4.,  4., nan, nan, nan,  1.],
       [ 5.,  4., nan,  1.,  2., nan,  3.],
       [ 1.,  2.,  1.,  4.,  3.,  5.,  2.],
       [nan,  1., nan,  3.,  5.,  5., nan],
       [nan,  2., nan, nan,  4.,  4.,  2.],
       [ 5., nan, nan,  1., nan, nan,  2.]])

In [164]:
# Prediction
R_hat

array([[5., 5., 4., 1., 1., 4., 3.],
       [4., 4., 4., 1., 1., 1., 1.],
       [5., 4., 3., 1., 2., 4., 3.],
       [1., 2., 1., 3., 4., 5., 2.],
       [1., 1., 1., 4., 5., 5., 2.],
       [1., 2., 3., 2., 4., 4., 2.],
       [5., 1., 1., 1., 1., 1., 2.]], dtype=float32)

In [165]:
# explore User-Item matrix
df_filled = pd.DataFrame(R_hat, 
                  columns=['item1', 'item2', 'item3', 'item4', 'item5', 'item6', 'item7'])
df_filled['user_id'] = list(df_filled.index)
df_filled.head()

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,user_id
0,5.0,5.0,4.0,1.0,1.0,4.0,3.0,0
1,4.0,4.0,4.0,1.0,1.0,1.0,1.0,1
2,5.0,4.0,3.0,1.0,2.0,4.0,3.0,2
3,1.0,2.0,1.0,3.0,4.0,5.0,2.0,3
4,1.0,1.0,1.0,4.0,5.0,5.0,2.0,4


In [166]:
# please compare the result with the raw data: df
df.head()

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,user_id
0,5.0,,4.0,,1.0,,3.0,0
1,4.0,4.0,4.0,,,,1.0,1
2,5.0,4.0,,1.0,2.0,,3.0,2
3,1.0,2.0,1.0,4.0,3.0,5.0,2.0,3
4,,1.0,,3.0,5.0,5.0,,4


## 실제 데이터 활용

* 전처리 방식에 주목해주세요.

In [144]:
# Load the ratings table and the movie metadata table.
rating_data = pd.read_csv('./Data/ratings.csv')
movie_data = pd.read_csv('./Data/movies.csv')

In [148]:
# Drop the unused timestamp column. Reassigning with errors='ignore' (instead
# of inplace=True) keeps this cell re-runnable: a second run no longer raises
# KeyError because the column is already gone.
rating_data = rating_data.drop(columns='timestamp', errors='ignore')
rating_data.head(3)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0


In [149]:
movie_data.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [151]:
# movieid 를 기준으로 두 데이터 프레임을 합침
user_movie_rating = pd.merge(rating_data, movie_data, on='movieId')

In [152]:
user_movie_rating.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,31,2.5,Dangerous Minds (1995),Drama
1,7,31,3.0,Dangerous Minds (1995),Drama
2,31,31,4.0,Dangerous Minds (1995),Drama
3,32,31,4.0,Dangerous Minds (1995),Drama
4,36,31,3.0,Dangerous Minds (1995),Drama


* 사용자 - 영화 평점 점수 데이터 형식으로 바꿔줘야 합니다.

In [168]:
# Reshape to a user x title rating matrix; pivot_table averages the ratings
# that fall into the same (userId, title) cell, and unrated cells become NaN.
user_movie_rating = user_movie_rating.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

In [169]:
user_movie_rating.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [171]:
# Plain NumPy view of the user x title rating matrix for the factorization.
R = user_movie_rating.to_numpy()

In [176]:
# Note: K (the latent dimension) is a hyper-parameter to be tuned.
# smooth=False keeps the raw real-valued predictions.
P, Q, R_hat = matrix_factorization_using_torch(R=R, K=1000, smooth=False)

iter step: 0, rmse: 44.901671
iter step: 10, rmse: 43.226596
iter step: 20, rmse: 40.150266
iter step: 30, rmse: 36.829590
iter step: 40, rmse: 33.666791
iter step: 50, rmse: 30.776875
iter step: 60, rmse: 28.172058
iter step: 70, rmse: 25.831556
iter step: 80, rmse: 23.726817
iter step: 90, rmse: 21.829986
iter step: 100, rmse: 20.116287
iter step: 110, rmse: 18.564316
iter step: 120, rmse: 17.155702
iter step: 130, rmse: 15.874653
iter step: 140, rmse: 14.707520
iter step: 150, rmse: 13.642438
iter step: 160, rmse: 12.669038
iter step: 170, rmse: 11.778209
iter step: 180, rmse: 10.961910
iter step: 190, rmse: 10.213017


<a id='2'></a>

## [TF-IDF](#0)

TF-IDF:
Term Frequency - Inverse Document Frequency (직역: 단어의 빈도와 역 문서 빈도)   
**특정 문서 내에 있는 어떤 단어가 얼마나 중요한지를 나타내는 통계량**입니다.   
basic idea: 특정 문서에 특별히 자주 등장하는 단어가 해당 문서의 키워드라는 전제가 깔려있습니다.   

* TF (Term Frequency): 문서 내 특정 단어의 빈도를 의미합니다.    
* IDF (Inverse Document Frequency): DF (Document Frequency)의 역수입니다. 여기서 DF는 특정 단어가 등장하는 문서의 수입니다.     
(참고로 특정 단어의 DF가 크다는 것은 대부분의 문서에서 발견된다는 뜻입니다. 즉, IDF가 작아지므로 그 단어가 덜 중요하다고 해석할 수 있습니다.) 
* TF-IDF : TF $\times$ IDF 

주로 문서의 유사도를 구하는 작업 또는 특정 문서 내에서 특정 단어의 중요도를 구하는 작업 등에 쓰일 수 있습니다.


In [10]:
# scikit-learn provides TfidfVectorizer, which computes TF-IDF automatically.
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    'attention is all you need',
    'you have my word',
    'I like you',
    'what you should do ',
    'you you you why you',
    'you, you, you, you everywhere'
]

# Learn the vocabulary/IDF weights and build the dense document-term matrix.
tfidfv = TfidfVectorizer()
results = tfidfv.fit_transform(documents).toarray()

# Mapping from each term to its column index in `results`.
vocab = tfidfv.vocabulary_

print(results)
print(vocab)

[[0.48812169 0.48812169 0.         0.         0.         0.48812169
  0.         0.         0.48812169 0.         0.         0.
  0.         0.2166769 ]
 [0.         0.         0.         0.         0.55927514 0.
  0.         0.55927514 0.         0.         0.         0.
  0.55927514 0.24826187]
 [0.         0.         0.         0.         0.         0.
  0.91399636 0.         0.         0.         0.         0.
  0.         0.40572238]
 [0.         0.         0.55927514 0.         0.         0.
  0.         0.         0.         0.55927514 0.55927514 0.
  0.         0.24826187]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.49071837
  0.         0.87131824]
 [0.         0.         0.         0.49071837 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.87131824]]
{'attention': 1, 'is': 5, 'all': 0, 'you': 13, 'need': 8, 'have': 4, 'my': 7, 'word': 12, 'like': 6, 'what': 10, 

In [12]:
# Arrange the TF-IDF scores as a labelled table for easier reading:
# columns are the terms (ordered by their column index), rows are the documents.
vocab_sorted = sorted(vocab.items(), key=lambda item: item[1])
features = [term for term, _ in vocab_sorted]
df = pd.DataFrame(results, columns=features)
df['words'] = documents
df = df.set_index('words')

In [13]:
df.head(6)

Unnamed: 0_level_0,all,attention,do,everywhere,have,is,like,my,need,should,what,why,word,you
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
attention is all you need,0.488122,0.488122,0.0,0.0,0.0,0.488122,0.0,0.0,0.488122,0.0,0.0,0.0,0.0,0.216677
you have my word,0.0,0.0,0.0,0.0,0.559275,0.0,0.0,0.559275,0.0,0.0,0.0,0.0,0.559275,0.248262
I like you,0.0,0.0,0.0,0.0,0.0,0.0,0.913996,0.0,0.0,0.0,0.0,0.0,0.0,0.405722
what you should do,0.0,0.0,0.559275,0.0,0.0,0.0,0.0,0.0,0.0,0.559275,0.559275,0.0,0.0,0.248262
you you you why you,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.490718,0.0,0.871318
"you, you, you, you everywhere",0.0,0.0,0.0,0.490718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.871318
