In [1]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
import pandas as pd
from ast import literal_eval

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Contents Based Filtering
- 사용자가 특정 아이템을 선호하는 경우, 그 아이템과 비슷한 컨텐츠를 가진 다른 아이템을 추천
  - 사용자가 특정 영화에 높은 평점을 부여
    - 그 영화의 장르, 출연배우, 감독, 키워드와 유사한 다른 영화를 추천

## 1. TMDB 5000 Movie Dataset
- TMDB(The Movie Database)의 주요 영화 약 5,000편(4,803편)에 대한 메타 정보를 제공
- https://www.kaggle.com/tmdb/tmdb-movie-metadata

### 1) Load Data

In [48]:
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/tmdb_5000_movies.csv'

# Read the TMDB 5000 movies CSV straight from the remote URL into a DataFrame.
DF = pd.read_csv(url)

# Summarize the columns, their non-null counts and dtypes.
DF.info()

# The non-null counts show that some columns contain missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [49]:
DF.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp...",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century Fox Film Corporatio...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""GB"", ""name"": ""United ...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic is...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Jerry Bruckheimer Films"", ""id"": 130}, {""na...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 80, ""name"": ""Crime""}]",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret...",en,Spectre,A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Danjaq"", ""id"": 10761}, {""name"": ""B24"", ""id"": ...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3166_1"": ""US"", ""name"": ""United States of ...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""name"": ""Crime""}, {""id"": 18, ""name"": ""Drama""}, {""id"": ...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853, ""name"": ""crime fighter""}, {""id"": 949, ""name"": ""te...",en,The Dark Knight Rises,"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""name"": ""Warner Bros."", ""id"": 6194}, {""name"": ""DC E...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 878, ""name"": ""Science Fic...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"": 839, ""name"": ""mars""}, {""id"": 1456, ""name"": ""medal...",en,John Carter,"John Carter is a war-weary, former military captain who's inexplicably transported to the myster...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


### 2) 필요정보 DataFrame 재구성
- 'id': 고유번호
- 'title': 제목
- 'genres': 장르
- 'vote_average': 평균 평점
- 'vote_count': 평점 투표 수
- 'popularity': 인기도
- 'keywords': 주요키워드
- 'overview': 개요

In [50]:
# Keep only the columns needed for content-based filtering.
# .copy() makes DF_MV an independent DataFrame: later cells add and overwrite
# columns on it, and doing that on a bare column selection of DF is chained
# assignment (SettingWithCopyWarning, with no guarantee about which object
# is actually modified).
DF_MV = DF[['id' , 'title' , 'genres' , 'vote_average' ,
            'vote_count' , 'popularity' , 'keywords' , 'overview']].copy()

DF_MV.head()

# 'genres' and 'keywords' look like lists of dictionaries but are plain strings

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...",7.2,11800,150.437577,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp...","In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]",6.9,4500,139.082615,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic is...","Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 80, ""name"": ""Crime""}]",6.3,4466,107.376788,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret...",A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""name"": ""Crime""}, {""id"": 18, ""name"": ""Drama""}, {""id"": ...",7.6,9106,112.31295,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853, ""name"": ""crime fighter""}, {""id"": 949, ""name"": ""te...","Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 878, ""name"": ""Science Fic...",6.1,2124,43.926995,"[{""id"": 818, ""name"": ""based on novel""}, {""id"": 839, ""name"": ""mars""}, {""id"": 1456, ""name"": ""medal...","John Carter is a war-weary, former military captain who's inexplicably transported to the myster..."


### 3) Preprocessing
- List 구조 내에 Dictionary 포함

In [51]:
# Raise the per-column display width to 100 characters so that more of the
# long 'genres' / 'keywords' values is visible in the output.
pd.set_option('max_colwidth' , 100)

# Show the two columns for the first row only.
DF_MV[['genres' , 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


- Python 'ast' Module
  - 문자열을 List 구조로 반환

In [52]:
# Each value in these columns is the textual form of a list of dictionaries;
# literal_eval parses that text into real Python list / dict objects.
for column in ('genres' , 'keywords') :
  DF_MV[column] = DF_MV[column].apply(literal_eval)

DF_MV[['genres' , 'keywords']].head(1)

# The display looks the same as before, but the values are now real lists

Unnamed: 0,genres,keywords
0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {...","[{'id': 1463, 'name': 'culture clash'}, {'id': 2964, 'name': 'future'}, {'id': 3386, 'name': 'sp..."


- List 구조 내 Dictionary 'name' Key 정보 추출

In [13]:
# Reduce every list of dictionaries to the list of its 'name' values.
for column in ('genres' , 'keywords') :
  DF_MV[column] = DF_MV[column].apply(
      lambda entries : [ entry['name'] for entry in entries ])

DF_MV[['genres' , 'keywords']].head(1)

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


## 2. 장르('genres') 컨텐츠 유사도
- 개별 영화 별 장르 리스트에 대한 유사도 측정
  - 어떤 방식으로 유사도를 측정하든 숫자로는 바꿔야한다
    (CountVectorizer())
  - Count 기반 Vectorizer 변환
  - 코사인 유사도 값 객체 생성
  - 장르 유사도가 높고, 평점이 높은 순으로 추천

### 1) CountVectorizer()
- 장르 리스트를 공백문자로 word 단위가 구분되는 문자열로 변환

In [15]:
# Flatten each genre list into one space-separated string, the input format
# that CountVectorizer tokenizes.
DF_MV['genres_literal'] = DF_MV['genres'].map(' '.join)
DF_MV['genres_literal'].head(1)

# Caveat: a multi-word genre such as "Science Fiction" is split into the
# separate words "Science" and "Fiction"

0    Action Adventure Fantasy Science Fiction
Name: genres_literal, dtype: object

In [17]:
DF_MV.head()

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa...","In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ...",Action Adventure Fantasy Science Fiction
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,4500,139.082615,"[ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, ship...","Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t...",Adventure Fantasy Action
2,206647,Spectre,"[Action, Adventure, Crime]",6.3,4466,107.376788,"[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]",A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...,Action Adventure Crime
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",7.6,9106,112.31295,"[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham...","Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c...",Action Crime Drama Thriller
4,49529,John Carter,"[Action, Adventure, Science Fiction]",6.1,2124,43.926995,"[based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edg...","John Carter is a war-weary, former military captain who's inexplicably transported to the myster...",Action Adventure Science Fiction


- CountVectorizer() 적용

In [19]:
# min_df = 1 keeps every term that appears in at least one document, which is
# the effect min_df = 0 was meant to have. An integer min_df below 1 is
# outside the documented range and is rejected by parameter validation in
# recent scikit-learn releases.
# ngram_range = (1, 2) counts single words and adjacent word pairs.
count_vect = CountVectorizer(min_df = 1, ngram_range = (1, 2))
genre_mat = count_vect.fit_transform(DF_MV['genres_literal'])

genre_mat.shape

(4803, 276)

### 2) Cosine_similarity()

In [23]:
# With a single argument, cosine_similarity compares every row of genre_mat
# against every other row of the same matrix (movie x movie similarities).
genre_sim = cosine_similarity(genre_mat)

genre_sim.shape

(4803, 4803)

- 영화 장르 유사도 정보
  - 장르 유사도가 높은 순으로 컨텐츠 기반 필터링 수행
  - 'genre_sim' 행별로 유사도가 높은 인덱스값 추출

In [25]:
genre_sim[:1]
# Most entries are 0; note that the result is a dense NumPy array, not a sparse matrix

array([[1.        , 0.59628479, 0.4472136 , ..., 0.        , 0.        ,
        0.        ]])

### 3) argsort()
- 행렬 재정렬
- 장르 유사도가 높은 순으로 정리된 인덱스 값 획득

In [26]:
# argsort() orders each row's column indices by ascending similarity;
# reversing the columns with ::-1 puts the most similar movies first.
genre_sim_sorted_ind = genre_sim.argsort()[: , ::-1]

- '0'번 레코드
  - 자신을 제외하고 '3494', '813' 순서로 유사도가 높음
  - '2401'번 레코드의 유사도가 가장 낮음

In [27]:
genre_sim_sorted_ind[:1]

array([[   0, 3494,  813, ..., 3038, 3037, 2401]])

## 3. 장르 컨텐츠 필터링 영화 추천

### 1) 장르 유사도 기반 영화추천 함수
- '영화 DataFrame'과 '장르 코사인 유사도 인덱스' 기반
  - 추천기준 '영화제목' 및 '영화건수' 입력
  - 추천영화 정보 반환

In [33]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10) :
  """Return the rows of df whose genres are most similar to a given movie.

  Parameters
  ----------
  df : pandas.DataFrame
      Movie table with a 'title' column. Its row order must match the row
      order of sorted_ind.
  sorted_ind : 2-D integer array
      Row i holds row positions of df ordered from most to least similar
      to row i.
  title_name : str
      Exact title of the reference movie.
  top_n : int, default 10
      Number of positions taken from the front of each matching row.

  Returns
  -------
  pandas.DataFrame
      The selected rows of df in similarity order. The reference movie is
      not filtered out. The result is empty when no title matches.
  """

  # Work with row positions rather than index labels: both sorted_ind and
  # df.iloc are positional, so labels only happen to work when df carries
  # the default 0..n-1 index.
  title_pos = (df['title'] == title_name).to_numpy().nonzero()[0]

  # Take the top_n most similar positions for every matching title.
  similar_indexes = sorted_ind[title_pos, : top_n]

  # The selection is 2-D (one row per matching title): print it, then
  # flatten it to 1-D so it can be used with df.iloc.
  print(similar_indexes)
  similar_indexes = similar_indexes.reshape(-1)

  return df.iloc[similar_indexes]

### 2) 'The Godfather' 입력
- 유사한 영화 10편 추천
  - 평점이 0이거나 관련없어 보이는 영화가 추천되는 문제 발생

In [34]:
# Ask for the 10 movies whose genres are most similar to 'The Godfather'.
similar_movies = find_sim_movie(DF_MV, genre_sim_sorted_ind, 'The Godfather' , 10)

similar_movies[['title' , 'vote_average']]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


## 4. 평점 기반 필터링 추가
- 평점 + 평점에 참여한 사람의 수도 고려

### 1) 'vote_average' 기준 내림차순 정렬
- 평점은 높지만 'vote_count'가 낮은 문제 포함

In [35]:
# Rank purely by average rating to show the weakness of that metric.
DF_MV[['title' , 'vote_average' , 'vote_count']].sort_values('vote_average' ,
                                                             ascending = False)[:10]

# A perfect 10.0 rating can come from a single voter

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


### 2) 가중평점(Weighted Rating)
- 식은 만들기 나름 (고정된 알고리즘은 아니다)
- 가중평점 = $\dfrac{v}{v+m} \cdot R + \dfrac{m}{v+m} \cdot C$
  - v: 개별 영화 평점 투표 횟수('vote_count')
  - m: 평점 부여를 위한 최소 투표 횟수(가중치 조절 역할)
  - R: 개별 영화 평균 평점('vote_average')
  - C: 전체 영화 평균 평점
- C: 전체 영화 'vote_average'의 평균값 적용
- m: 'vote_count'의 60번째 백분위수(quantile 0.6) 적용
  - 전체 영화 중 투표 횟수 상위 40%에 해당하는 경계값
  - 수집된 데이터의 특징에 따라 달라질 수 있다


In [39]:
# C: mean of 'vote_average' over all movies.
C = DF_MV['vote_average'].mean()
# m: 0.6 quantile of 'vote_count', used as the vote-count threshold.
m = DF_MV['vote_count'].quantile(0.6)

print('C: ' , round(C, 3) , '\t' , 'm: ' , round(m, 3))

C:  6.092 	 m:  370.2


### 3) weighted_vote_average()

In [40]:
# Quantile of 'vote_count' that defines the vote threshold m.
percentile = 0.6

# m and C are module-level values read by weighted_vote_average() below:
# m is the 'vote_count' quantile, C is the mean 'vote_average' of all movies.
m = DF_MV['vote_count'].quantile(percentile)
C = DF_MV['vote_average'].mean()

def weighted_vote_average(record) :
  """Compute the weighted rating of a single movie record.

  The movie's own average rating is blended with the global mean rating:
  (v / (v + m)) * R + (m / (m + v)) * C, where v is the record's
  'vote_count' and R its 'vote_average'. m and C are read from the
  module-level variables of the same names.

  Parameters
  ----------
  record : mapping or pandas.Series
      Must provide the keys 'vote_count' and 'vote_average'.

  Returns
  -------
  The weighted rating as a number.
  """
  votes = record['vote_count']
  rating = record['vote_average']

  # Weight of the movie's own rating versus weight of the global mean.
  own_weight = votes / (votes + m)
  prior_weight = m / (m + votes)

  return (own_weight * rating) + (prior_weight * C)

- 'DF_MV'에 'weighted_vote' 열 추가

In [41]:
# Apply the weighted-rating function to every row (axis = 1) and store the
# result in a new 'weighted_vote' column.
DF_MV['weighted_vote'] = DF_MV.apply(weighted_vote_average, axis = 1)

- 'weighted_vote' 상위 10개 확인

In [43]:
# Top 10 movies by weighted rating.
DF_MV[['title' , 'vote_average' , 'weighted_vote' , 'vote_count']].sort_values('weighted_vote',
                                                                               ascending = False)[:10]
# The ranking is now more objective than sorting by 'vote_average' alone

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.13693,12002
1818,Schindler's List,8.3,8.126069,4329
3865,Whiplash,8.3,8.123248,4254
809,Forrest Gump,8.2,8.105954,7927
2294,Spirited Away,8.3,8.105867,3840
2731,The Godfather: Part II,8.3,8.079586,3338


## 5. find_sim_movie() update

In [46]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10) :
  """Recommend the best-rated movies among those most similar to a title.

  This redefines the earlier function of the same name: candidates are
  still chosen by similarity, but the final ranking uses 'weighted_vote'.

  Parameters
  ----------
  df : pandas.DataFrame
      Movie table with 'title' and 'weighted_vote' columns. Its row order
      must match the row order of sorted_ind.
  sorted_ind : 2-D integer array
      Row i holds row positions of df ordered from most to least similar
      to row i.
  title_name : str
      Exact title of the reference movie.
  top_n : int, default 10
      Maximum number of recommendations to return.

  Returns
  -------
  pandas.DataFrame
      Up to top_n rows of df, sorted by 'weighted_vote' in descending
      order, never containing the reference movie itself. The result is
      empty when no title matches.
  """

  # Row positions (not index labels) of the reference movie: sorted_ind and
  # df.iloc are both positional, so this also works when df does not carry
  # the default 0..n-1 index.
  title_pos = (df['title'] == title_name).to_numpy().nonzero()[0]

  # Candidate pool: twice as many similar movies as requested, flattened.
  similar_indexes = sorted_ind[title_pos, : (top_n * 2)].reshape(-1)

  # Exclude the reference movie. A membership test is used instead of `!=`
  # so this also works when the title matches more than one row.
  keep = ~pd.Index(similar_indexes).isin(title_pos)

  # Several matching titles can share candidates; keep each candidate once.
  similar_indexes = pd.unique(similar_indexes[keep])

  # Rank the candidate pool by weighted rating and keep the best top_n.
  return df.iloc[similar_indexes].sort_values('weighted_vote' , ascending = False)[:top_n]

- 추천 결과 확인

In [47]:
# Re-run the recommendation with the weighted-rating version of the function.
similar_movies = find_sim_movie(DF_MV, genre_sim_sorted_ind, 'The Godfather' , 10)

similar_movies[['title' , 'vote_average' , 'weighted_vote']]

# The recommendations are visibly better than the first attempt

Unnamed: 0,title,vote_average,weighted_vote
2731,The Godfather: Part II,8.3,8.079586
1847,GoodFellas,8.2,7.976937
3866,City of God,8.1,7.759693
1663,Once Upon a Time in America,8.2,7.657811
883,Catch Me If You Can,7.7,7.557097
281,American Gangster,7.4,7.141396
4041,This Is England,7.4,6.739664
1149,American Hustle,6.8,6.717525
1243,Mean Streets,7.2,6.626569
2839,Rounders,6.9,6.530427
