# 잠재요인 협업 필터링 Example
- 행렬 분해(Matrix Factorization)를 이용한 잠재요인 협업 필터링 예제
- movie 추천

In [None]:
import pandas as pd
import numpy as np

In [14]:
def matrix_factorization(R,K,steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorize a sparse rating matrix into two latent-factor matrices with SGD.

    Only entries of ``R`` that are strictly greater than 0 are treated as
    observed ratings. Zeros and NaN (which never compares greater than 0)
    are skipped during training.

    Args:
        R: 2-D array-like of ratings, shape (num_users, num_items).
        K: Number of latent factors.
        steps: Number of full passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularization strength applied to both factor matrices.

    Returns:
        Tuple ``(P, Q)`` where ``P`` has shape (num_users, K) and ``Q`` has
        shape (num_items, K); ``np.dot(P, Q.T)`` is the predicted rating matrix.
    """
    R = np.asarray(R)
    num_users, num_items = R.shape

    # Fixed seed keeps the random initialization reproducible.
    # NOTE: this reseeds NumPy's global random state as a side effect.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every observed entry in one vectorized
    # pass. np.nonzero returns indices in row-major order, i.e. the same
    # visiting order as a nested "for user / for item" loop.
    with np.errstate(invalid="ignore"):
        observed = R > 0
    rows, cols = np.nonzero(observed)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # Stochastic gradient descent over the observed ratings.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :])
            P[i, :] += learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            # The Q update sees the P row that was just updated above
            # (sequential update); kept as is so results stay unchanged.
            Q[j, :] += learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        if step % 10 == 0:
            print("## iteration step:", step)

    return P, Q

In [5]:
# Load the movie metadata and the user ratings from CSV files
# (relative paths, so they are resolved against the current working directory).
movies=pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv')

In [6]:
# Preview the first rows of the loaded ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# Keep only the columns used for collaborative filtering.
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]

In [8]:
# Preview the ratings table again after the column selection.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [9]:
# Pivot the long ratings table into a user x movie matrix
# (rows: userId, columns: movieId, cells: rating).
rating_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')

In [10]:
# Display the userId x movieId rating matrix.
rating_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,4.0,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,4.0,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Join ratings with movies on movieId so each rating row also carries the title column.
rating_movies = ratings.merge(movies, on='movieId')

In [12]:
# Rebuild the rating matrix with movie titles as columns.
# NOTE: pivot_table aggregates with the mean by default, so ratings of
# different movies that share the same title would be averaged together.
rating_matrix = rating_movies.pivot_table(values='rating', index='userId', columns='title')

In [21]:
# Preview the first rows of the userId x title rating matrix.
rating_matrix.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [15]:
# Factorize the rating matrix into user factors P and item factors Q.
P, Q = matrix_factorization(
    rating_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda=0.01
)
# Multiply the factors back together to obtain a dense matrix of predicted ratings.
pred_matrix = P.dot(Q.T)

## iteration step: 0
## iteration step: 10
## iteration step: 20
## iteration step: 30
## iteration step: 40
## iteration step: 50
## iteration step: 60
## iteration step: 70
## iteration step: 80
## iteration step: 90
## iteration step: 100
## iteration step: 110
## iteration step: 120
## iteration step: 130
## iteration step: 140
## iteration step: 150
## iteration step: 160
## iteration step: 170
## iteration step: 180
## iteration step: 190


In [16]:
# Label the predictions with the same user index and title columns as rating_matrix.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=rating_matrix.index,
    columns=rating_matrix.columns,
)

In [17]:
# Preview the first rows of the predicted rating matrix.
ratings_pred_matrix.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.641676,2.889387,1.099882,0.29516,1.345731,1.77425,1.729358,2.561874,2.762497,2.718208,...,2.289714,0.664554,2.501499,1.309686,2.337709,1.99515,0.530999,2.579704,2.272978,2.022199
2,2.16473,3.624265,1.49434,0.269798,1.933644,2.503397,2.519713,2.729019,3.605812,2.809703,...,3.298091,0.77081,2.924901,1.390264,2.935596,3.271624,0.628967,2.769774,2.934985,2.792867
3,2.142794,3.537876,1.40333,0.337382,1.591148,2.4066,2.453896,3.081602,3.818645,3.11011,...,3.323351,1.038857,3.539153,3.361631,3.097248,1.265515,0.598941,2.450413,3.032541,2.731907
4,2.461128,4.054156,1.746749,0.401319,2.157944,3.086039,3.325203,3.552246,4.937948,4.146138,...,4.249978,1.214484,3.937198,3.857297,3.783061,3.392717,0.748204,3.995616,3.47762,3.176754
5,2.625009,4.017993,1.403513,0.279689,1.807459,2.322589,2.956322,2.929722,4.409274,4.117485,...,3.676541,0.885883,3.23545,1.648348,3.028822,3.557031,0.53892,3.053927,2.928269,2.995926


In [18]:
def get_unseen_movies(rating_matrix, userId):
    """Return the column labels the given user has no positive rating for.

    Args:
        rating_matrix: DataFrame indexed by user id with one column per movie;
            a cell greater than 0 means the user has rated (seen) that movie.
        userId: Index label of the user to look up.

    Returns:
        List of column labels, in column order, whose rating for ``userId``
        is not greater than 0 (this includes missing values).
    """
    # Row of this user as a Series indexed by the movie column labels.
    user_rating = rating_matrix.loc[userId, :]

    # A rating > 0 marks an already-seen movie; NaN compares False, so
    # unrated movies fall on the unseen side of the mask.
    seen_mask = user_rating > 0

    # Select with the boolean mask instead of testing every column against
    # a list of seen titles, which would be quadratic in the column count.
    return user_rating.index[~seen_mask].tolist()

In [19]:
def recomm_movie(pred_df, userid,unseenlist,top=10):
    """Return the highest predicted ratings for one user among the given titles.

    Looks up the row ``userid`` in ``pred_df``, restricts it to the columns in
    ``unseenlist``, sorts the scores in descending order and keeps the first
    ``top`` entries.
    """
    # Predicted scores of this user, limited to the candidate columns.
    candidate_scores = pred_df.loc[userid, unseenlist]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top]

In [20]:
# Collect the movies user 9 has not rated yet, then rank them by predicted
# rating and keep the ten best.
unseen_list = get_unseen_movies(rating_matrix, userId=9)
recomm_movies = recomm_movie(
    ratings_pred_matrix, userid=9, unseenlist=unseen_list, top=10
)

In [22]:
# Wrap the recommended scores in a DataFrame with a single 'pred_score' column.
# It is stored under its own name: assigning it to `recomm_movie` would replace
# the recomm_movie() function with a DataFrame and break any later call to it.
recomm_movies_df=pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=['pred_score'])

In [23]:
# Display the recommended titles with their predicted scores.
recomm_movies

title
Lives of Others, The (Das leben der Anderen) (2006)     5.284725
Bowling for Columbine (2002)                            5.273318
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)    5.217321
Dancer in the Dark (2000)                               5.179148
In the Line of Fire (1993)                              5.174083
Run Lola Run (Lola rennt) (1998)                        5.154215
WALL·E (2008)                                           5.110012
Searching for Bobby Fischer (1993)                      5.039262
Finding Nemo (2003)                                     5.022706
Boys Don't Cry (1999)                                   5.015613
Name: 9, dtype: float64