In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import warnings
warnings.filterwarnings(action='ignore')

# CBF (content based filtering)

## 영화 데이터 불러오기

In [2]:
movie=pd.read_csv("movie_0725.csv")
movie.head()

Unnamed: 0,adult,genres,id,original_language,title
0,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story
1,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji
2,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,Grumpier Old Men
3,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,Waiting to Exhale
4,False,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Father of the Bride Part II


In [3]:
movie.shape

(8875, 5)

## genres 컬럼 장르 정보만 남기기

In [4]:
movie['genres'][1]

"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"

In [5]:
#데이터 타입을 보면 문자열로 되어있음
type(movie['genres'][1])

str

In [6]:
#문자열이 아닌 리스트 형태로 다시 바꿔주는 작업
from ast import literal_eval

movie["genres"]=movie['genres'].apply(literal_eval)

type(movie['genres'][1])

list

In [8]:
movie["genres"][1]

[{'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 10751, 'name': 'Family'}]

In [10]:
#장르값만 남기는 함수
def get_genres(x):
    try:
        genre=[i['name'] for i in x]
        return genre
 
    except:
        genre=[]
        return genre

In [11]:
movie["genres"]=movie['genres'].apply(get_genres)
movie.head()

Unnamed: 0,adult,genres,id,original_language,title
0,False,"[Animation, Comedy, Family]",862,en,Toy Story
1,False,"[Adventure, Fantasy, Family]",8844,en,Jumanji
2,False,"[Romance, Comedy]",15602,en,Grumpier Old Men
3,False,"[Comedy, Drama, Romance]",31357,en,Waiting to Exhale
4,False,[Comedy],11862,en,Father of the Bride Part II


## 장르 공백 제거하기

In [117]:
movie['genres'][21:30]

21                              [Drama, Thriller]
22           [Action, Adventure, Crime, Thriller]
23    [Drama, Fantasy, Science Fiction, Thriller]
24                               [Drama, Romance]
25                                        [Drama]
26                        [Comedy, Drama, Family]
27                               [Drama, Romance]
28          [Fantasy, Science Fiction, Adventure]
29                                 [Drama, Crime]
Name: genres, dtype: object

In [12]:
### 문자 공백 없애기 

movie["genres"]=movie["genres"].apply(lambda x: [str(i).replace(" ","") for i in x])
movie['genres'][21:30]

21                             [Drama, Thriller]
22          [Action, Adventure, Crime, Thriller]
23    [Drama, Fantasy, ScienceFiction, Thriller]
24                              [Drama, Romance]
25                                       [Drama]
26                       [Comedy, Drama, Family]
27                              [Drama, Romance]
28          [Fantasy, ScienceFiction, Adventure]
29                                [Drama, Crime]
Name: genres, dtype: object

## 리스트에서 텍스트만 추출

In [13]:
def get_text(x):
    return ' '.join(x)
# 리스트를 텍스트로 추출

movie['genres']=movie['genres'].apply(get_text)
movie.head()

Unnamed: 0,adult,genres,id,original_language,title
0,False,Animation Comedy Family,862,en,Toy Story
1,False,Adventure Fantasy Family,8844,en,Jumanji
2,False,Romance Comedy,15602,en,Grumpier Old Men
3,False,Comedy Drama Romance,31357,en,Waiting to Exhale
4,False,Comedy,11862,en,Father of the Bride Part II


## 장르 TF-IDF 계산 

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer()

# 장르의 벡터화
tfidf_matrix= tfidf.fit_transform(movie['genres']).toarray()
tfidf_matrix

array([[0.        , 0.        , 0.72387097, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.52947876, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## 코사인 유사도 계산

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

cos_matrix = cosine_similarity(tfidf_matrix,tfidf_matrix)
cos_matrix

array([[1.        , 0.34861102, 0.21362182, ..., 0.35100189, 0.        ,
        0.26805428],
       [0.34861102, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.21362182, 0.        , 1.        , ..., 0.6086059 , 0.        ,
        0.46478216],
       ...,
       [0.35100189, 0.        , 0.6086059 , ..., 1.        , 0.        ,
        0.76368329],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.26805428, 0.        , 0.46478216, ..., 0.76368329, 0.        ,
        1.        ]])

In [16]:
cos_matrix.shape

(8875, 8875)

In [17]:
df_cosine= pd.DataFrame(data=cos_matrix, index=movie['id'], columns=movie['id'])
df_cosine.head()

id,862,8844,15602,31357,11862,949,11860,45325,9091,710,...,399106,368620,401387,314420,390989,159550,402672,97995,391698,265189
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,1.0,0.348611,0.213622,0.189955,0.351002,0.0,0.213622,0.362279,0.0,0.0,...,0.936375,0.0,0.0,0.0,0.136336,0.0,0.0,0.351002,0.0,0.268054
8844,0.348611,1.0,0.0,0.0,0.0,0.0,0.0,0.649309,0.337934,0.337934,...,0.372299,0.0,0.0,0.0,0.0,0.0,0.263262,0.0,0.0,0.0
15602,0.213622,0.0,1.0,0.889212,0.608606,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.236395,0.0,0.336923,0.608606,0.0,0.464782
31357,0.189955,0.0,0.889212,1.0,0.54118,0.153224,0.889212,0.139394,0.0,0.0,...,0.0,0.0,0.0,0.254105,0.210206,0.457495,0.425556,0.54118,0.0,0.708644
11862,0.351002,0.0,0.608606,0.54118,1.0,0.0,0.608606,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.388421,0.0,0.0,1.0,0.0,0.763683


## 평점 데이터 불러오기

In [18]:
rating= pd.read_csv("rating_0725.csv")

In [19]:
#평점데이터를 8:2로 train test 데이터로 나우기

from sklearn.model_selection import train_test_split

train_rate, test_rate =train_test_split(rating, test_size=0.2, random_state=100)

In [20]:
train_rate.head()

Unnamed: 0,userId,rating,id
91215,606,4.5,10315
81219,553,4.0,36557
41536,299,4.5,1103
17858,119,4.0,105
76547,530,3.0,17832


# CBF 평점 공식
### user ID 5가 영화 i에 남길 예상 평점


$$ \hat{r}_{5,i}  =  \frac{\sum_{j\in I_5}{sim(i,j)}\cdot r_{5,j}}{\sum_{j\in I_5}{sim(i,j)}} $$

* $\hat{r}_{5,i}$ 사용자 5가 영화 i에 남길 평점
* $r_{5,j}$ 사용자 5가 영화 j에 남긴 평점
* $sim(i,j)$ 영화 i와 j의 유사도
* $I_5$ 사용자 5가 평점을 남긴 영화 전체 집합 

In [21]:
#train 데이터에서 유저아이디가 5인 데이터 추출
#id는 영화 아이디에 관함
user5_train= train_rate[train_rate['userId'] == 5][['id', 'rating']]
user5_train

Unnamed: 0,id,rating
376,639,4.0
392,8916,4.0
373,510,3.0
423,1430,3.5
396,817,4.0
...,...,...
415,8835,4.5
404,1213,3.5
377,621,5.0
422,4147,4.0


In [22]:
df_cosine

id,862,8844,15602,31357,11862,949,11860,45325,9091,710,...,399106,368620,401387,314420,390989,159550,402672,97995,391698,265189
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,1.000000,0.348611,0.213622,0.189955,0.351002,0.000000,0.213622,0.362279,0.000000,0.000000,...,0.936375,0.000000,0.0,0.000000,0.136336,0.000000,0.000000,0.351002,0.0,0.268054
8844,0.348611,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.649309,0.337934,0.337934,...,0.372299,0.000000,0.0,0.000000,0.000000,0.000000,0.263262,0.000000,0.0,0.000000
15602,0.213622,0.000000,1.000000,0.889212,0.608606,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.236395,0.000000,0.336923,0.608606,0.0,0.464782
31357,0.189955,0.000000,0.889212,1.000000,0.541180,0.153224,0.889212,0.139394,0.000000,0.000000,...,0.000000,0.000000,0.0,0.254105,0.210206,0.457495,0.425556,0.541180,0.0,0.708644
11862,0.351002,0.000000,0.608606,0.541180,1.000000,0.000000,0.608606,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.388421,0.000000,0.000000,1.000000,0.0,0.763683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159550,0.000000,0.000000,0.000000,0.457495,0.000000,0.334921,0.000000,0.304690,0.000000,0.000000,...,0.000000,0.000000,0.0,0.555428,0.000000,1.000000,0.275326,0.000000,0.0,0.645591
402672,0.000000,0.263262,0.336923,0.425556,0.000000,0.092212,0.336923,0.357473,0.317339,0.317339,...,0.000000,0.000000,0.0,0.152923,0.000000,0.275326,1.000000,0.000000,0.0,0.177748
97995,0.351002,0.000000,0.608606,0.541180,1.000000,0.000000,0.608606,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.388421,0.000000,0.000000,1.000000,0.0,0.763683
391698,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.704743,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000


In [24]:
user5_train['id'].values

array([  639,  8916,   510,  1430,   817,   812,   426,    69,   568,
        1640,   118,  8193,  6957,  9522, 11003,   863,  9377,   788,
        8467, 15121,   423,   330,  9032,   392,   170,   462,  2105,
        3049,  9327,   557,   238,  9489,  9603, 11000,   630,  2770,
        8346,   205,  1624,   786,   254,   455,   114,  9614,   597,
         565, 15602,   745,    70,  3981,  1597,  9390,  9372,  2109,
         854, 11224, 11566,  8587,   544,   453,   153,   411,  2898,
        2048,   808,   380,   492,  2108,  1637,  1542, 10020,   242,
         787,  2024,  8835,  1213,   621,  4147,    12])

In [30]:
df_cosine.loc[user5_train['id'].values, :].sum() # 열을 기준으로 더함

id
862       17.035665
8844      11.006463
15602     31.293886
31357     36.093916
11862     25.830575
            ...    
159550    18.070168
402672    16.816537
97995     25.830575
391698     3.335714
265189    31.392318
Length: 8875, dtype: float64

In [25]:
### 공식의 분모 유사도 합 구하기 (user5_train의 79개 영화인 j와 전체 영화 i 에 대해)
sim_sum = df_cosine.loc[user5_train['id'].values, :].sum().values    #유사도 합
sim_sum = sim_sum + 1                                                #분모가 0인경우 발생할 계산오류를 피하기 위해 +1 해줌
sim_sum

array([18.03566517, 12.00646275, 32.29388642, ..., 26.83057516,
        4.33571418, 32.39231754])

In [26]:
#공식의 분자 계산  각각 곱하고 더한 값이 필요하므로 내적 함수사용  
# user5_train['rating'].values : 사용자 u가 영화 j에 남긴 평점
sim_rating = np.matmul(df_cosine.loc[user5_train['id'].values, :].T.values, user5_train['rating'].values)
sim_rating

array([ 67.35395135,  42.83755096, 126.84272225, ..., 103.06395446,
        14.06628734, 124.02959458])

In [31]:
df_cosine.loc[user5_train['id'].values, :].T.values.shape

(8875, 79)

In [32]:
user5_train['rating'].values.shape

(79,)

In [33]:
#최종 평점 예측
pred_rating = pd.DataFrame(np.divide(sim_rating, sim_sum), index=df_cosine.index)
pred_rating.columns=["pred"]
pred_rating

Unnamed: 0_level_0,pred
id,Unnamed: 1_level_1
862,3.734487
8844,3.567874
15602,3.927763
31357,3.906486
11862,3.841288
...,...
159550,3.681214
402672,3.772871
97995,3.841288
391698,3.244284


## 예측한 평점 성능평가

In [34]:
user5_test= test_rate[test_rate['userId'] == 5][['id', 'rating']]
user5_test.head()

Unnamed: 0,id,rating
436,558,4.5
428,310,4.0
374,240,2.5
417,161,3.0
425,672,3.0


In [37]:
user5= pd.merge(user5_test, pred_rating.loc[user5_test['id']], left_on = 'id',right_index=True)
user5.head()
# pred 는 rating을 예측

Unnamed: 0,id,rating,pred
436,558,4.5,3.491039
428,310,4.0,3.739688
374,240,2.5,3.446859
417,161,3.0,3.045749
425,672,3.0,3.567874


In [36]:
### 평점을 잘 예측했는지 성능 평가 함수

def evaluation(pred, rating):
    rmse=np.round(math.sqrt(np.mean((pred-rating)**2)),3)
    mae=np.mean(np.abs(pred-rating)).round(3)
    
    print('RMSE:', rmse)
    print('MAE:', mae)
    
evaluation(user5['rating'],user5['pred'])

RMSE: 0.956
MAE: 0.58


## 유사한 영화를 알려주는 함수 만들기

In [40]:
movie

Unnamed: 0,adult,genres,id,original_language,title
0,False,Animation Comedy Family,862,en,Toy Story
1,False,Adventure Fantasy Family,8844,en,Jumanji
2,False,Romance Comedy,15602,en,Grumpier Old Men
3,False,Comedy Drama Romance,31357,en,Waiting to Exhale
4,False,Comedy,11862,en,Father of the Bride Part II
...,...,...,...,...,...
8870,False,Drama,159550,en,The Last Brickmaker in America
8871,False,Adventure Drama History Romance,402672,hi,Mohenjo Daro
8872,False,Comedy,97995,en,Seven Years Bad Luck
8873,False,Documentary Music,391698,en,The Beatles: Eight Days a Week - The Touring Y...


In [38]:
### 어떤 영화 제목을 입력하면 그와 유사도가 높은 10개 영화와 각 예상 평점 나오도록 하는 함수 만들기
indices = pd.Series(movie.index , index=movie['title'])
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
The Last Brickmaker in America                        8870
Mohenjo Daro                                          8871
Seven Years Bad Luck                                  8872
The Beatles: Eight Days a Week - The Touring Years    8873
Force Majeure                                         8874
Length: 8875, dtype: int64

In [39]:
indices["Toy Story"]

0

In [41]:
def similar_movie_1(title, cos_matrix):
    idx = indices[title]                #제목을 입력하면 해당 인덱스 값(몇번째 행인지)을 가져옴
    
    cos_sim = list(enumerate(cos_matrix[idx]))        #몇번째 행인지와 유사도를 같이 가져옴
#     return cos_sim
    cos_sim.sort(key=lambda x :x[1], reverse=True)   #유사도 값을 기준으로 내림차순 정렬
    
    cos_sim = cos_sim[0:11]                          #상위 11개 작품 (다음 코드에서 입력한 작품 제외할 것)   
    sim_movie_idx=[x[0] for x in cos_sim]            #cos_sim에서 첫번쨰 항목만 즉, 몇행인지 값 추출
    
    if idx in sim_movie_idx:                         #해당 작품 제외(입력한 영화)
        sim_movie_idx.remove(idx)
              
    sim_movie_idx = sim_movie_idx[0:10]              #상위 10개 작품만
#     return sim_movie_idx
    
    title_list=movie['title'].iloc[sim_movie_idx]  #인덱스로 유사한 영화 제목 추출
    title_list=pd.DataFrame(title_list)            #데이터프레임 만듦
#     return title_list
    
    id_list=movie['id'].iloc[sim_movie_idx]        #인덱스로 영화 id 추출
    id_list=pd.DataFrame(id_list)                  #데이터 프레임으로 만듦
#     return id_list

    temp1 = pd.concat([title_list,id_list], axis=1)                              #title_list, id_list 데이터프레임 병합
#     return temp1
    temp2=pd.merge(temp1, pred_rating["pred"],left_on = 'id',right_index=True)   #temp1과 pred_rating 병합

    result=temp2.drop(["id"], axis=1)
    result.rename(columns={"title":"유사한 영화 TOP10", "pred":"예상평점"},inplace=True)
    
    print(result)

    
   

In [42]:
indices["The City of Lost Children"]

28

In [43]:
similar_movie_1("The City of Lost Children",cos_matrix)

                          유사한 영화 TOP10      예상평점
5977                A Trip to the Moon  3.429416
6713       Hellboy II: The Golden Army  3.429416
6721                            Jumper  3.429416
7756                  The Hunger Games  3.429416
3354                    The Lost World  3.580827
5166  Frank Herbert's Children of Dune  3.580827
1259                Kull the Conqueror  3.492034
2089                          Superman  3.492034
2090                       Superman II  3.492034
3059                         Supergirl  3.492034


In [141]:
# similar_movie_1("Sabrina",cos_matrix)   #영화제목은 같으나 영화id가 다른 경우 에러 발생

In [44]:
# 영화아이디 두개
indices["Sabrina"]

title
Sabrina      6
Sabrina    725
dtype: int64

In [45]:
#영화 제목은 같은데 영화 id는 다를경우 위 함수로는 에러가 남. 영화 id가 다르면 다른 영화로 보고 모두 출력하는 함수 만들기

def similar_movie(title, cos_matrix):
    try:
        
        idx = indices[title]                #제목을 입력하면 해당 인덱스 값(몇번째 행인지)을 가져옴
    
        cos_sim = list(enumerate(cos_matrix[idx]))         #몇번째 행인지와 유사도를 같이 가져옴
        cos_sim.sort(key=lambda x :x[1], reverse=True)    #유사도 값을 기준으로 내림차순 정렬

        cos_sim = cos_sim[0:11]                           #상위 11개 작품 (다음 코드에서 입력한 작품 제외할 것)   
        sim_movie_idx=[x[0] for x in cos_sim]            #cos_sim에서 첫번쨰 항목만 즉, 몇행인지 값 추출
        sim_movie_idx.remove(idx)                        #해당 작품 제외
        sim_movie_idx = sim_movie_idx[0:10]              #상위 10개 작품만


        title_list=movie['title'].iloc[sim_movie_idx]  #인덱스로 유사한 영화 제목 추출
        title_list=pd.DataFrame(title_list)           #데이터프레임 만듦

        id_list=movie['id'].iloc[sim_movie_idx]     #인덱스로 영화 id 추출
        id_list=pd.DataFrame(id_list)  #데이터 프레임으로 만듦

        temp1 = pd.concat([title_list,id_list], axis=1)   #title_list, id_list 데이터프레임 병합
        temp2=pd.merge(temp1, pred_rating["pred"],left_on = 'id',right_index=True)    #temp1과 pred_rating 병합

        result=temp2.drop(["id"], axis=1)
        result.rename(columns={"title":"유사한 영화 TOP10", "pred":"예상평점"},inplace=True)

        print(result)


    except:
        idx=[]
        if len(indices[title])>1:

            for i in range(len(indices[title])):
                idx.append(indices[title][i])                          #같은 제목중 코드 하나만 


                cos_sim = list(enumerate(cos_matrix[idx[i]]))         #몇번째 행인지와 유사도를 같이 가져옴
                cos_sim.sort(key=lambda x :x[1], reverse=True)    #유사도 값을 기준으로 내림차순 정렬


                cos_sim = cos_sim[0:11]                         #상위 11개 작품 (다음 코드에서 입력한 작품 제외할 것)   
                sim_movie_idx=[x[0] for x in cos_sim]            #cos_sim에서 첫번쨰 항목만 즉, 몇행인지 값 추출


                if idx[i] in sim_movie_idx:
                    sim_movie_idx.remove(idx[i])                    #해당 작품 제외

                sim_movie_idx = sim_movie_idx[0:10]              #상위 10개 작품만



                title_list = movie['title'].iloc[sim_movie_idx]  #인덱스로 유사한 영화 제목 추출
                title_list=pd.DataFrame(title_list)



                id_list=movie['id'].iloc[sim_movie_idx]     #인덱스로 영화 id 추출
                id_list=pd.DataFrame(id_list)


                temp1 = pd.concat([title_list,id_list], axis=1)

                temp2=pd.merge(temp1, pred_rating["pred"],left_on = 'id',right_index=True)    #temp1과 pred_rating 병합



                result=temp2.drop(["id"], axis=1)
                result.rename(columns={"title":f"영화 id:{idx[i]} 유사한 영화 TOP10", "pred":"예상평점"},inplace=True)
                
                print(result)


In [46]:
similar_movie("Sabrina",cos_matrix)

       영화 id:6 유사한 영화 TOP10      예상평점
2          Grumpier Old Men  3.927763
48         Mighty Aphrodite  3.927763
59            Two If by Sea  3.927763
62             French Twist  3.927763
92                Mr. Wrong  3.927763
105            If Lucy Fell  3.927763
108               Boomerang  3.927763
111  Flirting with Disaster  3.927763
113          Pie in the Sky  3.927763
157                Mallrats  3.927763
       영화 id:725 유사한 영화 TOP10      예상평점
3           Waiting to Exhale  3.906486
10     The American President  3.906486
36                   Clueless  3.906486
52      Home for the Holidays  3.906486
53                The Postman  3.906486
66      Kicking and Screaming  3.906486
84            Beautiful Girls  3.906486
109         Chungking Express  3.906486
121     The Brothers McMullen  3.906486
125  An Awfully Big Adventure  3.906486


In [47]:
similar_movie("Alice in Wonderland",cos_matrix)

                             영화 id:825 유사한 영화 TOP10      예상평점
1629                                   The Rescuers  3.574249
2217         Little Nemo: Adventures In Slumberland  3.574249
3901                           Return to Never Land  3.574249
4167                                  Spirited Away  3.574249
5522                              The Polar Express  3.574249
5769                      Kirikou and the Sorceress  3.574249
6444                      Arthur and the Invisibles  3.574249
7316                       How to Train Your Dragon  3.574249
7439  Legend of the Guardians: The Owls of Ga'Hoole  3.574249
8117                                           Epic  3.574249
                       영화 id:7293 유사한 영화 TOP10      예상평점
1                                      Jumanji  3.567874
55                  The Indian in the Cupboard  3.567874
729                           The Wizard of Oz  3.567874
1508                                 Labyrinth  3.567874
1632                             

# User based CF (collaborative filtering)

In [48]:
#평점데이터와 영화데이터의 id, title 병합하기
df= pd.merge(rating, movie[["id","title"]], on="id")
df.head()

Unnamed: 0,userId,rating,id,title
0,1,2.5,9909,Dangerous Minds
1,7,3.0,9909,Dangerous Minds
2,31,4.0,9909,Dangerous Minds
3,32,4.0,9909,Dangerous Minds
4,36,3.0,9909,Dangerous Minds


In [53]:
#각 유저가 영화에 남긴 평점들로 이루어진 데이터 프레임 만들기
df3=df.copy().pivot_table("rating",index="userId",columns="id")
df3.fillna(0, inplace=True) 
df3

id,2,5,6,11,12,13,14,15,16,18,...,376570,384798,387893,390989,391698,399106,401387,402672,410921,416437
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## user-based cf 유사도 계산

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

cos_matrix_3 = cosine_similarity(df3.values, df3.values)
cos_matrix_3

array([[1.        , 0.        , 0.        , ..., 0.0480577 , 0.        ,
        0.02367533],
       [0.        , 1.        , 0.12682458, ..., 0.02478699, 0.19422398,
        0.09883407],
       [0.        , 0.12682458, 1.        , ..., 0.08253112, 0.12760636,
        0.17510851],
       ...,
       [0.0480577 , 0.02478699, 0.08253112, ..., 1.        , 0.05424404,
        0.05384711],
       [0.        , 0.19422398, 0.12760636, ..., 0.05424404, 1.        ,
        0.1614514 ],
       [0.02367533, 0.09883407, 0.17510851, ..., 0.05384711, 0.1614514 ,
        1.        ]])

In [54]:
cos_matrix_3.shape
# userId을 기준으로

(671, 671)

In [51]:
df3_cosine= pd.DataFrame(data=cos_matrix_3, index=df3.index, columns=df3.index)
df3_cosine.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.074714,0.016818,0.0,0.084825,0.0,0.012843,0.0,...,0.0,0.0,0.01885,0.048426,0.0,0.0,0.0,0.048058,0.0,0.023675
2,0.0,1.0,0.126825,0.12017,0.104498,0.0,0.197844,0.11463,0.114265,0.043568,...,0.448826,0.020123,0.073289,0.161596,0.444087,0.481427,0.005611,0.024787,0.194224,0.098834
3,0.0,0.126825,1.0,0.082879,0.153354,0.061421,0.158332,0.253912,0.136093,0.116052,...,0.149219,0.024364,0.160037,0.115863,0.180214,0.177542,0.0,0.082531,0.127606,0.175109
4,0.074714,0.12017,0.082879,1.0,0.131055,0.079896,0.324338,0.192461,0.030512,0.137612,...,0.106947,0.016478,0.138503,0.244845,0.093511,0.090381,0.073517,0.095145,0.067317,0.215614
5,0.016818,0.104498,0.153354,0.131055,1.0,0.063796,0.096964,0.166451,0.086616,0.03237,...,0.201964,0.0,0.157836,0.18514,0.141644,0.05881,0.044216,0.039065,0.08483,0.221317


In [52]:
df3.head(3)

id,2,5,6,11,12,13,14,15,16,18,...,376570,384798,387893,390989,391698,399106,401387,402672,410921,416437
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# user-based CF 평점 공식
$$ \hat{r}_{5,i}  = \bar{r_5}+ \frac{\sum_{v\in U_i}(r_{v,i}- \bar{r_v})\cdot{sim(5,v)} }{\sum_{v\in U_i}{sim(5,v)}} $$

* $\hat{r}_{5,i}$ 사용자 5가 영화 i에 남길 평점
* $\bar{r_5}$ 사용자 5가 영화에 남긴 평점들의 평균
* $sim(5,v)$ 사용자 5와 v의 유사도
* $U_i$ 영화 i에 대해 평점을 남긴 사용자 전체 집합

In [55]:
#df3에 대해 0과 1로 이루어진 데이터프레임 만들기 
bin_df=df3.copy()
bin_df[bin_df==0.0]=0.0
bin_df[bin_df!=0.0]=1.0
bin_df.head(3)

id,2,5,6,11,12,13,14,15,16,18,...,376570,384798,387893,390989,391698,399106,401387,402672,410921,416437
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
#유저가 평점을 남긴 영화의 개수 (즉, bin_df에서 1의 개수)
bin_sum= bin_df.sum(axis=1)
bin_sum

userId
1       20.0
2       75.0
3       50.0
4      203.0
5      100.0
       ...  
667     67.0
668     18.0
669     36.0
670     29.0
671    111.0
Length: 671, dtype: float64

In [57]:
#유저별 남긴 평점 평균 구하기
#평균 구하기
rating_average= pd.DataFrame(df3.sum(axis=1).divide(bin_sum))
rating_average.columns=['average']
rating_average.head()

Unnamed: 0_level_0,average
userId,Unnamed: 1_level_1
1,2.55
2,3.48
3,3.56
4,4.344828
5,3.91


### user based CF 평점 공식 분자 구하기
$$\sum_{v\in U_i}(r_{v,i}- \bar{r_v})\cdot{sim(5,v)}$$

In [62]:
df3_cosine

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.000000,0.000000,0.074714,0.016818,0.000000,0.084825,0.000000,0.012843,0.000000,...,0.000000,0.000000,0.018850,0.048426,0.000000,0.000000,0.000000,0.048058,0.000000,0.023675
2,0.000000,1.000000,0.126825,0.120170,0.104498,0.000000,0.197844,0.114630,0.114265,0.043568,...,0.448826,0.020123,0.073289,0.161596,0.444087,0.481427,0.005611,0.024787,0.194224,0.098834
3,0.000000,0.126825,1.000000,0.082879,0.153354,0.061421,0.158332,0.253912,0.136093,0.116052,...,0.149219,0.024364,0.160037,0.115863,0.180214,0.177542,0.000000,0.082531,0.127606,0.175109
4,0.074714,0.120170,0.082879,1.000000,0.131055,0.079896,0.324338,0.192461,0.030512,0.137612,...,0.106947,0.016478,0.138503,0.244845,0.093511,0.090381,0.073517,0.095145,0.067317,0.215614
5,0.016818,0.104498,0.153354,0.131055,1.000000,0.063796,0.096964,0.166451,0.086616,0.032370,...,0.201964,0.000000,0.157836,0.185140,0.141644,0.058810,0.044216,0.039065,0.084830,0.221317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.000000,0.481427,0.177542,0.090381,0.058810,0.000000,0.256565,0.095494,0.102470,0.057056,...,0.410318,0.034160,0.077470,0.154941,0.344634,1.000000,0.019051,0.034715,0.193229,0.130597
668,0.000000,0.005611,0.000000,0.073517,0.044216,0.025189,0.026237,0.039900,0.137311,0.083474,...,0.000000,0.000000,0.060893,0.052358,0.000000,0.019051,1.000000,0.000000,0.068219,0.094058
669,0.048058,0.024787,0.082531,0.095145,0.039065,0.041728,0.068092,0.047863,0.044747,0.059797,...,0.033136,0.015104,0.058473,0.102318,0.019535,0.034715,0.000000,1.000000,0.054244,0.053847
670,0.000000,0.194224,0.127606,0.067317,0.084830,0.028388,0.130945,0.197141,0.326163,0.138931,...,0.175331,0.085627,0.098305,0.108001,0.154261,0.193229,0.068219,0.054244,1.000000,0.161451


In [63]:
#유저 아이디 5와 다른 유저들의 유사도   sim(5,v)
sim_user5 = df3_cosine.loc[5, :]
sim_user5

userId
1      0.016818
2      0.104498
3      0.153354
4      0.131055
5      1.000000
         ...   
667    0.058810
668    0.044216
669    0.039065
670    0.084830
671    0.221317
Name: 5, Length: 671, dtype: float64

In [64]:
sim_user5[5] = 0 #유저 5자신 값 0으로 바꿈

In [65]:
#최종 sim(5,v) 
sim_user5

userId
1      0.016818
2      0.104498
3      0.153354
4      0.131055
5      0.000000
         ...   
667    0.058810
668    0.044216
669    0.039065
670    0.084830
671    0.221317
Name: 5, Length: 671, dtype: float64

In [66]:
print(df3.shape)
print(rating_average.shape)
print(bin_df.shape)

(671, 8834)
(671, 1)
(671, 8834)


In [69]:
bin_df.T

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
12,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
402672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
410921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
rating_average['average']

userId
1      2.550000
2      3.480000
3      3.560000
4      4.344828
5      3.910000
         ...   
667    3.641791
668    3.833333
669    3.333333
670    3.775862
671    3.923423
Name: average, Length: 671, dtype: float64

In [67]:
#bin_df 활용 크기 맞추기
bin_df.T.multiply(rating_average['average']).T

id,2,5,6,11,12,13,14,15,16,18,...,376570,384798,387893,390989,391698,399106,401387,402672,410921,416437
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.000000,0.00,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.00,3.480000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,3.56,3.560000,3.560000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.344828,0.00,4.344828,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.000000,3.91,3.910000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.000000,0.00,3.641791,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,3.833333,0.00,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.000000,0.00,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.0,0.0,0.0,0.000000,0.00,0.000000,3.775862,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
#유사한 유저들을 곱해 공식 분자 식 완성
user_simrating = (df3 - bin_df.T.multiply(rating_average['average']).T).T.multiply(sim_user5).T.sum(axis=0)  

In [72]:
user_simrating     #영화별로 값이 나옴

id
2          0.013414
5          0.040713
6         -0.077773
11        11.898586
12         2.022910
            ...    
399106     0.087097
401387    -0.400894
402672     0.057055
410921    -0.121685
416437     0.102452
Length: 8834, dtype: float64

### user based CF 평점 공식 분모 계산
$$\sum_{v\in U_i}{sim(5,v)}$$

In [73]:
#유사도의 합
sim_sum = bin_df.T.multiply(sim_user5).T.sum(axis=0)

In [74]:
sim_sum     #이 값도 영화별로 나옴  최종적으로 구하는 것이 영화별 평점 예측이니까 

id
2          0.023691
5          3.111457
6          0.586714
11        34.680364
12        21.804937
            ...    
399106     0.137344
401387     0.189209
402672     0.064925
410921     0.169107
416437     0.097541
Length: 8834, dtype: float64

### user based CF 평점 공식 최종 계산
$$ \hat{r}_{5,i}  = \bar{r_5}+ \frac{\sum_{v\in U_i}(r_{v,i}- \bar{r_v})\cdot{sim(5,v)} }{\sum_{v\in U_i}{sim(5,v)}} $$

In [75]:
#최종 평점 예측 값
pred_rating = rating_average.loc[5].values + pd.Series(data = np.divide(user_simrating,sim_sum), name='pred')
pred_rating.fillna(0, inplace=True)

pred_rating

id
2         4.476225
5         3.923085
6         3.777443
11        4.253093
12        4.002773
            ...   
399106    4.544152
401387    1.791209
402672    4.788788
410921    3.190421
416437    4.960347
Name: pred, Length: 8834, dtype: float64

## user based CF 성능평가

In [76]:
user5_test= test_rate[test_rate['userId'] == 5][['id', 'rating']]
user5_test.head(3)

Unnamed: 0,id,rating
436,558,4.5
428,310,4.0
374,240,2.5


In [77]:
user_temp =pd.merge(user5_test, pred_rating.loc[user5_test['id']], left_on = 'id',right_index=True)
user_temp

Unnamed: 0,id,rating,pred
436,558,4.5,4.014069
428,310,4.0,3.583973
374,240,2.5,4.59479
417,161,3.0,4.074043
425,672,3.0,3.830929
375,37247,4.0,4.265914
411,8358,3.5,3.867502
414,824,3.5,4.110406
412,1493,4.5,3.445898
388,9552,3.5,3.94692


In [78]:
# 오차 확인
evaluation(user_temp['pred'],user_temp['rating'])

RMSE: 0.744
MAE: 0.58


# 💻 과제   Item-based CF (collaborative filtering)

In [79]:
df= pd.merge(rating, movie[["id","title"]], on="id")
df.head(3)

Unnamed: 0,userId,rating,id,title
0,1,2.5,9909,Dangerous Minds
1,7,3.0,9909,Dangerous Minds
2,31,4.0,9909,Dangerous Minds


### 1. 위의 df를 사용하여 item-based cf에 필요한 평점으로 이루어진 데이터 df2 를 만드세요.  (결측값은 0으로) <font color=blue>(10점)</font>

In [82]:
df2= df.copy().pivot_table("rating",index='id',columns="userId")
# user-based라면 df2= df.copy().pivot_table("rating",index='userId',columns="id")
df2.fillna(0, inplace=True) 
df2.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,5.0,0.0,0.0,5.0,3.5,0.0,0.0,...,0.0,0.0,4.0,3.0,0.0,0.0,3.0,0.0,0.0,4.0
12,0.0,0.0,3.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2. df2의 데이터를 사용하여 코사인 유사도 cos_matrix_2 를 구하여 데이터 프레임 df2_cosine을 만드세요.<font color=blue>(10점)</font><br>(인덱스, 컬럼 모두 df2의 인덱스로)

In [81]:
cos_matrix_2 =  cosine_similarity(df2, df2)
cos_matrix_2.shape

(8834, 8834)

In [83]:
df2_cosine=pd.DataFrame(cos_matrix_2,index=df2.index,columns=df2.index)
df2_cosine.head()

id,2,5,6,11,12,13,14,15,16,18,...,376570,384798,387893,390989,391698,399106,401387,402672,410921,416437
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226779,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.197563,0.099554,0.150947,0.098432,0.079485,0.055027,0.169195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038605,0.0
6,0.0,0.0,1.0,0.087767,0.0,0.122642,0.074557,0.100317,0.0,0.087997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.197563,0.087767,1.0,0.399134,0.547707,0.46527,0.349091,0.169882,0.477609,...,0.06462,0.05744,0.0,0.04308,0.0,0.0,0.0718,0.0,0.040616,0.0
12,0.0,0.099554,0.0,0.399134,1.0,0.442042,0.371946,0.199155,0.153964,0.345131,...,0.095659,0.0,0.0,0.035872,0.0,0.0,0.023915,0.0,0.0,0.0


## 3.item-based CF 평점 공식을 이용하여 예측 평점을 계산하세요
$$ \hat{r}_{5,i}  =  \frac{\sum_{j\in I_5}{sim(i,j)}\cdot r_{5,j}}{\sum_{j\in I_5}{sim(i,j)}} $$

* $\hat{r}_{5,i}$ 사용자 5가 영화 i에 남길 평점
* $r_{5,j}$ 사용자 5가 영화 j에 남긴 평점
* $sim(i,j)$ 영화 i와 j의 유사도
* $I_5$ 사용자 5가 평점을 남긴 영화 전체 집합 

### 3-(1) train_rate에서 유저 5의 데이터를 user5_train으로 만들어주세요.<font color=blue>(5점)</font>

In [84]:
rating= pd.read_csv("rating_0725.csv")
train_rate, test_rate =train_test_split(rating, test_size=0.2, random_state=100)

In [85]:
user5_train= train_rate[train_rate['userId'] == 5][['id', 'rating']]
user5_train

Unnamed: 0,id,rating
376,639,4.0
392,8916,4.0
373,510,3.0
423,1430,3.5
396,817,4.0
...,...,...
415,8835,4.5
404,1213,3.5
377,621,5.0
422,4147,4.0


In [86]:
user5_train.shape

(79, 2)

### 3-(2) 공식의 분모인 유사도 합을 구하세요. <font color=blue>(20점)</font>

In [95]:
df2_cosine.loc[user5_train['id'].values,:].sum().values

array([0.38829044, 9.91533732, 5.65166546, ..., 0.30397893, 3.53056159,
       1.42612583])

In [98]:
sim_sum = df2_cosine.loc[user5_train['id'].values, :].sum().values  #유사도 합
sim_sum = sim_sum + 1                                                #분모가 0인경우 발생할 계산오류를 피하기 위해 +1 해줌
sim_sum

array([ 1.38829044, 10.91533732,  6.65166546, ...,  1.30397893,
        4.53056159,  2.42612583])

In [97]:
df2_cosine.loc[user5_train['id'].values, :].T

id,639,8916,510,1430,817,812,426,69,568,1640,...,1542,10020,242,787,2024,8835,1213,621,4147,12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.115099,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.104121,0.139182,0.144018,0.055009,0.098412,0.149900,0.110360,0.124083,0.193661,0.106660,...,0.092651,0.145473,0.105651,0.165544,0.103852,0.137026,0.094794,0.034580,0.108943,0.099554
6,0.024196,0.110556,0.081410,0.082161,0.088564,0.082134,0.034316,0.000000,0.119932,0.000000,...,0.162500,0.028784,0.115391,0.000000,0.043005,0.000000,0.040207,0.040466,0.084991,0.000000
11,0.373116,0.253065,0.425156,0.244607,0.380904,0.440295,0.297299,0.237438,0.400085,0.241838,...,0.390358,0.367313,0.264689,0.248146,0.292186,0.223710,0.232509,0.345301,0.197002,0.399134
12,0.253276,0.205737,0.336651,0.382858,0.345350,0.337863,0.178229,0.391867,0.334515,0.435632,...,0.378002,0.315614,0.161110,0.348523,0.295800,0.282927,0.224348,0.286538,0.236903,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399106,0.121733,0.000000,0.048760,0.120564,0.081015,0.000000,0.143874,0.178647,0.036569,0.144715,...,0.086711,0.096543,0.096758,0.125905,0.000000,0.154793,0.134859,0.075404,0.000000,0.000000
401387,0.135259,0.074163,0.097520,0.137788,0.027005,0.009183,0.143874,0.000000,0.054854,0.126626,...,0.123873,0.000000,0.000000,0.146889,0.000000,0.066340,0.067430,0.075404,0.023756,0.023915
402672,0.000000,0.166867,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
410921,0.019128,0.104883,0.124123,0.000000,0.095477,0.051946,0.101734,0.000000,0.058181,0.000000,...,0.000000,0.054613,0.136837,0.000000,0.076497,0.000000,0.095360,0.106637,0.000000,0.000000


### 3-(3) 공식의 분자를 계산해서 sim_rating에 할당해주세요.<font color=blue>(20점)</font>

In [99]:
sim_rating =np.matmul(df2_cosine.loc[user5_train['id'].values, :].T.values, user5_train['rating'].values)
sim_rating

array([ 1.28562007, 39.05991653, 22.01407663, ...,  1.26279642,
       13.65901885,  5.43237159])

### 3-(4) 구한 분모, 분자 값을 이용하여 평점 예측 값 pred_rating 데이터 프레임을 만들어 주세요.<font color=blue>(15점)</font> <br>(인덱스는 df2_cosine의 인덱스)

In [100]:
pred_rating = pd.DataFrame(np.divide(sim_rating, sim_sum), index=df2_cosine.index)
pred_rating.columns=["pred"]
pred_rating

Unnamed: 0_level_0,pred
id,Unnamed: 1_level_1
2,0.926045
5,3.578443
6,3.309559
11,3.763187
12,3.811819
...,...
399106,3.393501
401387,3.304334
402672,0.968418
410921,3.014862


### 4. test_rate에서 유저5의 데이터를 user5_test에 할당해주세요.<font color=blue>(5점)</font>

In [101]:
user5_test= test_rate[test_rate['userId'] == 5][['id', 'rating']]
user5_test.head()

Unnamed: 0,id,rating
436,558,4.5
428,310,4.0
374,240,2.5
417,161,3.0
425,672,3.0


### 5. user5_test와 pred_rating 데이터프레임을 병합해 아래의 결과처럼 나오도록 해주세요.<font color=blue>(5점)</font>

In [102]:
item_cf = pd.merge(user5_test,pred_rating.loc[user5_test['id']], left_on = 'id',right_index=True)
item_cf.head()

Unnamed: 0,id,rating,pred
436,558,4.5,3.807301
428,310,4.0,3.784333
374,240,2.5,3.700531
417,161,3.0,3.809754
425,672,3.0,3.794419


### 6. 앞에서 만들었던 평가지표 함수를 사용하여 rmse, mae 결과를 출력하세요.<font color=blue>(5점)</font>

In [103]:
def evaluation(pred, rating):
    rmse=np.round(math.sqrt(np.mean((pred-rating)**2)),3)
    mae=np.mean(np.abs(pred-rating)).round(3)
    
    print('RMSE:', rmse)
    print('MAE:', mae)
    
evaluation(item_cf['rating'],item_cf['pred'])

RMSE: 0.588
MAE: 0.486


### 7. similar movie 함수를 이용하여 Avatar와 2번째로 유사한 영화 제목과 예상 평점을 마크다운으로 작성해주세요.<font color=blue>(5점)</font><br>(유사도는 cos_matirx_2 사용)

In [104]:
similar_movie("Avatar",cos_matrix_2)

                                           유사한 영화 TOP10      예상평점
8128                                The Kings of Summer  3.284065
8587                                           The Loft  3.321532
7497  The Chronicles of Narnia: The Voyage of the Da...  2.842588
7072                                    The Hurt Locker  3.645068
7752                                     Violet & Daisy  3.321532
4925                                        Naked Lunch  3.376687
1714                          The Man Who Knew Too Much  3.597158
4256                                    Far from Heaven  3.589522
8244                    The Hunger Games: Catching Fire  3.435007


Avatar와 2번째로 유사한 영화 제목 : The Man Who Knew Too Much <br/>
Avatar와 2번째로 유사한 예상 평점 : 3.597158 