## 영화 추천 시스템 (알고리즘)
- 콘텐츠 기반 필터링 : 특정 아이템 선호 -> 비슷한 아이템 추천
- 줄거리 기반 추천

### 데이터 전처리

##### 사용할 column만 남기기

In [None]:
# Silence all warnings so the notebook output stays uncluttered.
# NOTE(review): this is a blanket filter, so it hides every warning category,
# including ones raised by pandas.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# 라이브러리 import
import pandas as pd 

## 문자열 -> 객체 변경 라이브러리
from ast import literal_eval 

## 벡터화 라이브러리 
from sklearn.feature_extraction.text import TfidfVectorizer

## 코사인 유사도 라이브러리
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the TMDB 5000 movies CSV and preview the first rows.
movie = pd.read_csv('tmdb_5000_movies.csv')
movie.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [None]:
# (rows, columns) of the raw dataset.
movie.shape

(4803, 20)

In [None]:
# Keep only the columns the recommender needs.
columns = ['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']

In [None]:
# Take an explicit copy of the selected columns. The following cells overwrite
# and add columns on df; on a bare selection from `movie` those writes are
# chained assignments on a derived frame (SettingWithCopyWarning).
df = movie[columns].copy()

##### overview 결측치 처리 (빈 문자열로 대체)

In [None]:
# Count the rows whose overview is missing.
df['overview'].isnull().sum()

3

In [None]:
# Replace missing overviews with an empty string (the column is vectorized
# as text later in the notebook).
df['overview'] = df['overview'].fillna('')

##### 장르, 키워드 추출 

In [None]:
def _names_from_literal(raw):
    """Parse a stringified list of dicts and return each dict's 'name' value."""
    return [entry['name'] for entry in literal_eval(raw)]


# genres / keywords are stored as strings; turn each cell into a plain list
# of names.
df['genres'] = df['genres'].apply(_names_from_literal)
df['keywords'] = df['keywords'].apply(_names_from_literal)


##### 가중 평점 계산 
- 투표 수와 평점을 모두 반영하는 지표 구하기

IMDb의 가중 평점(Weighted Rating) 공식 이용

$$WR = \frac{V}{V+m} \cdot R + \frac{m}{V+m} \cdot c$$

- R : 개별 영화의 평점
- V : 개별 영화의 평점 투표 수
- m : 순위 안에 들기 위해 필요한 최소 투표 수
- c : 전체 영화에 대한 평균 평점

In [None]:
# m: minimum vote count used by the formula (80th percentile of vote_count).
# c: mean vote_average over all movies.
m = df['vote_count'].quantile(0.8)
c = df['vote_average'].mean()


def weighted_rating(x, m=m, c=c):
    """Return the IMDb-style weighted rating.

    Blends the item's own rating R with the global mean c, weighting R by
    V / (V + m), so titles with few votes are pulled toward the mean.

    Args:
        x: Anything indexable by 'vote_count' and 'vote_average' -- a single
            row (Series) or a whole DataFrame, since only elementwise
            arithmetic is used.
        m: Minimum-votes constant; defaults to the value computed above at
            definition time.
        c: Mean rating over all movies; defaults to the value computed above
            at definition time.

    Returns:
        The weighted rating: a scalar for a row, a Series for a DataFrame.
    """
    V = x['vote_count']
    R = x['vote_average']
    return ((V/(V+m))*R) + ((m/(m+V))*c)  # formula follows IMDb


# The formula is plain column arithmetic, so evaluate it on the whole frame at
# once instead of calling the function row by row with apply(axis=1).
df['weighted_vote'] = weighted_rating(df)

### 줄거리 유사도 측정 
- 코사인 유사도

In [None]:
# Vectorize the overviews with TF-IDF, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])
# One row per overview; presumably one column per vocabulary term.
print(tfidf_matrix.shape)

(4803, 20978)


In [None]:
# Pairwise cosine similarity between all overview vectors: entry [i, j] is the
# similarity between the overviews in rows i and j of tfidf_matrix.
overview_csine = cosine_similarity(tfidf_matrix, tfidf_matrix)

### 추천 함수

In [None]:
# 줄거리 기반 추천 함수 
def overview_sim_movie(title_name, top_n=10, *, data=None, sim=None):
    """Recommend the movies whose overview is most similar to a given title.

    Args:
        title_name: Exact title to look up in the ``title`` column.
        top_n: Number of recommendations to return.
        data: Movie frame with ``title``, ``genres`` and ``weighted_vote``
            columns. Defaults to the notebook-level ``df``.
        sim: Square similarity matrix whose row order matches the row order
            of ``data``. Defaults to the notebook-level ``overview_csine``.

    Returns:
        A new DataFrame with columns ``title``, ``genres``,
        ``weighted_vote`` and ``similarity``, sorted by descending
        similarity and excluding the queried movie. The input frame is not
        modified.

    Raises:
        ValueError: If no movie has the given title.
    """
    frame = df if data is None else data
    matrix = overview_csine if sim is None else sim

    # The similarity matrix is addressed by row position, so work with
    # positions rather than index labels.
    hits = (frame['title'] == title_name).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise ValueError(f"no movie titled {title_name!r} in the dataset")
    # Several movies can share one title; use the first match.
    query_pos = int(hits[0])

    # Score on a copy so repeated calls do not mutate the shared frame.
    scored = frame[['title', 'genres', 'weighted_vote']].copy()
    scored['similarity'] = matrix[query_pos]

    # Drop the queried movie by position, then rank the remaining ones.
    others = [pos for pos in range(len(scored)) if pos != query_pos]
    ranked = scored.iloc[others].sort_values(by='similarity', ascending=False)
    return ranked.head(top_n)

In [None]:
# Example: movies whose overviews are most similar to 'The Dark Knight'.
overview_sim_movie('The Dark Knight')

Unnamed: 0,title,genres,weighted_vote,similarity
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",7.456523,0.301512
428,Batman Returns,"[Action, Fantasy]",6.415139,0.249431
3854,"Batman: The Dark Knight Returns, Part 2","[Action, Animation]",6.642426,0.224516
299,Batman Forever,"[Action, Crime, Fantasy]",5.547916,0.21407
1359,Batman,"[Fantasy, Action]",6.715308,0.182334
119,Batman Begins,"[Action, Crime, Drama]",7.337898,0.162037
1181,JFK,"[Drama, Thriller, History]",6.576366,0.134627
9,Batman v Superman: Dawn of Justice,"[Action, Adventure, Fantasy]",5.747169,0.118637
2507,Slow Burn,"[Mystery, Crime, Drama, Thriller]",6.08244,0.113738
210,Batman & Robin,"[Action, Crime, Fantasy]",4.962731,0.106896
