### 9.1 컨텐츠 기반 필터링 실습 – TMDB 5000 Movie Dataset

In [2]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

### Preprocessing
# Load the TMDB movie metadata, report its dimensions and preview the first row.
movies =pd.read_csv('./tmdb_5000_movies.csv')
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [3]:
# A single movie carries several genres at once, all packed into one string.
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [4]:
# Keep only the columns used in this exercise.
# Take an explicit copy: later cells assign new values to columns of movies_df,
# and assigning into a bare column selection of `movies` is the pattern that
# triggers pandas' SettingWithCopyWarning.
movies_df = movies[['id','title', 'genres', 'vote_average', 'vote_count',
                 'popularity', 'keywords', 'overview']].copy()

In [5]:
# Show just the genres and keywords columns of the first row, with the
# column display width raised to 100 characters.
pd.set_option('max_colwidth', 100)
movies_df[['genres','keywords']][:1]


Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [6]:
# genres and keywords are both stored with the object dtype.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4803 non-null   int64  
 1   title         4803 non-null   object 
 2   genres        4803 non-null   object 
 3   vote_average  4803 non-null   float64
 4   vote_count    4803 non-null   int64  
 5   popularity    4803 non-null   float64
 6   keywords      4803 non-null   object 
 7   overview      4800 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 300.3+ KB


In [7]:
# literal_eval parses each string into the Python objects it spells out
# (here a list of dicts). This has to happen before the lambda-based
# extraction in the following cell: until then genres and keywords are plain
# str values, and indexing their items by key raises an error.
from ast import literal_eval

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

In [8]:
# The printed value looks the same as before; only the type of each entry changed.
movies_df['genres'].head(1)

0    [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {...
Name: genres, dtype: object

In [9]:
# For both columns, keep only the value stored under the 'name' key of each dict.
movies_df['genres'] = movies_df['genres'].apply(lambda x : [ y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [ y['name'] for y in x])
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


In [12]:
### Count-based feature vectorization of the genre strings

# str.join concatenates the items of a list into one str, inserting the separator between them.
print(type(('*').join(['test', 'test2'])))
print(('*').join(['test', 'test2']))

<class 'str'>
test*test2


In [13]:
# Join each genre list into a single space-separated string.
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer takes one string per document, so the list of genre names
# is flattened into text first.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))

# Build the vectorizer over unigrams and bigrams and fit it on the genre text.
# min_df is 1 instead of the integer 0: recent scikit-learn releases validate
# constructor parameters and reject an integer min_df below 1. Every term in
# the vocabulary occurs in at least one document, so the matrix is unchanged.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

(4803, 276)


In [14]:
# The joined genre string of the first movie.
movies_df['genres_literal'][0]

'Action Adventure Fantasy Science Fiction'

In [16]:
### Cosine similarity between movies, computed from the genre count vectors
from sklearn.metrics.pairwise import cosine_similarity

# Comparing genre_mat with itself gives a square movie-by-movie similarity matrix.
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:2])


(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


In [25]:
# argsort orders each row ascending; reversing the columns gives, per movie,
# the column indices ordered from highest to lowest cosine similarity.
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:2])


[[   0 3494  813 ... 3038 3037 2401]
 [ 262    1  129 ... 3069 3067 2401]]


In [32]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` most similar to the movie titled ``title_name``.

    Args:
        df: DataFrame with a 'title' column. Its row order must match the
            rows of ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` lists row positions of
            ``df`` ordered from most to least similar to row ``i``.
        title_name: Exact title to look up.
        top_n: Number of candidates taken for each matching row.

    Returns:
        The selected rows of ``df`` in similarity order. The frame is empty
        when no row has the requested title.
    """
    # Work with row *positions*, not index labels: sorted_ind is addressed by
    # position and the result is fetched with iloc, so labels are only
    # correct when df happens to carry a default 0..n-1 index.
    title_index = np.flatnonzero((df['title'] == title_name).to_numpy())
    similar_indexes = sorted_ind[title_index, :top_n]

    # The selection is 2-D (one row per matching title); flatten it to 1-D
    # so it can be used directly as an iloc indexer.
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)
    print(similar_indexes)

    return df.iloc[similar_indexes]

In [24]:
# Index labels of movies_df as an ndarray.
movies_df.index.values

array([   0,    1,    2, ..., 4800, 4801, 4802], dtype=int64)

In [34]:
# Look up the 10 movies whose genres are most similar to 'The Godfather'.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather',10)
similar_movies

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]
[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]


Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal
2731,240,The Godfather: Part II,"[Drama, Crime]",8.3,3338,105.792936,"[italo-american, cuba, vororte, melancholy, praise, revenge, mafia, lawyer, blood, corrupt polit...","In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily an...",Drama Crime
1243,203,Mean Streets,"[Drama, Crime]",7.2,345,17.002096,"[epilepsy, protection money, secret love, money, redemption]","A small-time hood must choose from among love, friendship and the chance to rise within the mob.",Drama Crime
3636,36351,Light Sleeper,"[Drama, Crime]",5.7,15,6.063868,"[suicide, drug dealer, redemption, addict, existentialism]",A drug dealer with upscale clientele is having moral problems going about his daily deliveries. ...,Drama Crime
1946,11699,The Bad Lieutenant: Port of Call - New Orleans,"[Drama, Crime]",6.0,326,17.339852,"[police brutality, organized crime, policeman, illegal drugs, murder investigation, corrupt cop]","Terrence McDonagh, a New Orleans Police sergeant, who starts out as a good cop, receiving a meda...",Drama Crime
2640,400,Things to Do in Denver When You're Dead,"[Drama, Crime]",6.7,85,6.932221,"[father son relationship, bounty hunter, boat, way of life, coffin, denver, godmother, paranoia,...",A mafia film in Tarantino style with a star-studded cast. Jimmy’s “The Saint” gangster career ha...,Drama Crime
4065,364083,Mi America,"[Drama, Crime]",0.0,0,0.039007,"[new york state, hate crime]","A hate-crime has been committed in a the small city of Braxton, N.Y. Five migrant laborers have ...",Drama Crime
1847,769,GoodFellas,"[Drama, Crime]",8.2,3128,63.654244,"[prison, based on novel, florida, 1970s, mass murder, irish-american, drug traffic, biography, b...","The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbo...",Drama Crime
4217,9344,Kids,"[Drama, Crime]",6.8,279,13.291991,"[puberty, first time]",A controversial portrayal of teens in New York City which exposes a deeply disturbing world of s...,Drama Crime
883,640,Catch Me If You Can,"[Drama, Crime]",7.7,3795,73.944049,"[con man, biography, fbi agent, overhead camera shot, attempted jailbreak, engagement party, mis...","A true story about Frank Abagnale Jr. who, before his 19th birthday, successfully conned million...",Drama Crime
3866,598,City of God,"[Drama, Crime]",8.1,1814,44.356711,"[male nudity, street gang, brazilian, photographer, 1970s, puberty, ghetto, gang war, coming of ...",Cidade de Deus is a shantytown that started during the 1960s and became one of Rio de Janeiro’s ...,Drama Crime


In [35]:
# Titles and raw average ratings of the recommended movies.
similar_movies[['title', 'vote_average']]

Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [36]:
# A high vote_average alone is not enough: sorting by it surfaces movies
# rated by only a handful of voters, so vote_count must be considered too.
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending=False)[:10]

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


평가 횟수에 대한 가중치가 부여된 평점(Weighted Rating) 계산

가중 평점(Weighted Rating): $WR = \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$

- v: 개별 영화에 평점을 투표한 횟수
- m: 평점을 부여하기 위한 최소 투표 횟수
- R: 개별 영화에 대한 평균 평점
- C: 전체 영화에 대한 평균 평점

In [None]:
# 따라서, 가중 평점을 쓰자
# 평가 횟수에 대한 가중치가 부여된 평점(Weighted Rating) 계산
# 가중 평점(Weighted Rating) = (v/(v+m)) * R + (m/(v+m)) * C

In [42]:
# C and m are obtained as follows: C is the mean vote_average over all movies.
C = movies_df['vote_average'].mean()

# m is the 0.6 quantile (60th percentile) of vote_count.
m = movies_df['vote_count'].quantile(0.6)
print(C, m)

6.092171559442011 370.1999999999998


In [43]:
# m and C describe the whole dataset, so they are computed outside the
# function; v and R belong to one movie and are read inside the function.
percentile = 0.6
m = movies_df['vote_count'].quantile(percentile)
C = movies_df['vote_average'].mean()

def weighted_vote_average(record):
    """Return the weighted rating of one movie row.

    Evaluates (v / (v + m)) * R + (m / (v + m)) * C, where v and R are the
    row's 'vote_count' and 'vote_average', and m and C are read from the
    module-level variables of the same names.
    """
    votes = record['vote_count']
    rating = record['vote_average']
    total = votes + m

    # Share contributed by the movie's own rating, then by the global mean.
    own_share = (votes / total) * rating
    prior_share = (m / total) * C
    return own_share + prior_share

# Add the new column by applying the function to every row (axis=1).
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

In [44]:
# Top 10 movies by weighted_vote, shown next to the raw average and the vote count.
movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values('weighted_vote', ascending=False)[:10]

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.13693,12002
1818,Schindler's List,8.3,8.126069,4329
3865,Whiplash,8.3,8.123248,4254
809,Forrest Gump,8.2,8.105954,7927
2294,Spirited Away,8.3,8.105867,3840
2731,The Godfather: Part II,8.3,8.079586,3338


In [45]:
### 최종 가중평균을 반영한 함수
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Recommend movies similar to ``title_name``, ranked by weighted vote.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns. Its row
            order must match the rows of ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` lists row positions of
            ``df`` ordered from most to least similar to row ``i``.
        title_name: Exact title of the reference movie.
        top_n: Number of similarity candidates taken per matching row, and
            the maximum number of rows returned.

    Returns:
        Up to ``top_n`` rows of ``df``, excluding the reference movie
        itself, sorted by 'weighted_vote' in descending order. The frame is
        empty when no row has the requested title.
    """
    # Work with row *positions*, not index labels: sorted_ind is addressed by
    # position and the result is fetched with iloc, so labels are only
    # correct when df happens to carry a default 0..n-1 index.
    title_index = np.flatnonzero((df['title'] == title_name).to_numpy())
    similar_indexes = sorted_ind[title_index, :top_n]

    # The selection is 2-D (one row per matching title); flatten it to 1-D
    # so it can be used directly as an iloc indexer.
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)
    print(similar_indexes)

    # Drop the reference movie(s). A membership test is used because an
    # elementwise != cannot broadcast when several movies share the title.
    similar_indexes = similar_indexes[~np.isin(similar_indexes, title_index)]

    # Re-rank the candidates by weighted vote and honour top_n instead of a
    # fixed cut-off of 10.
    ranked = df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)
    return ranked[:top_n]

# Recommend for 'The Godfather' again, this time ranked by weighted_vote.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather',10)
similar_movies[['title', 'vote_average', 'weighted_vote', 'vote_count']]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]
[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]


Unnamed: 0,title,vote_average,weighted_vote,vote_count
2731,The Godfather: Part II,8.3,8.079586,3338
1847,GoodFellas,8.2,7.976937,3128
3866,City of God,8.1,7.759693,1814
883,Catch Me If You Can,7.7,7.557097,3795
1243,Mean Streets,7.2,6.626569,345
4217,Kids,6.8,6.396368,279
2640,Things to Do in Denver When You're Dead,6.7,6.205672,85
4065,Mi America,0.0,6.092172,0
3636,Light Sleeper,5.7,6.0769,15
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0,6.049012,326
