# 데이터 로드 및 데이터 전처리

In [19]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import os
os.chdir('/Users/younghun/Desktop/gitrepo/data/tmdb_data')

In [20]:
movies = pd.read_csv("tmdb_5000_movies.csv", encoding='utf-8')
print(movies.shape)
pd.options.display.max_colwidth = 100
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp...",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century Fox Film Corporatio...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""GB"", ""name"": ""United ...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [21]:
# literal_eval을 사용해서 파이썬 자료구조가 복합적으로 섞여있는 값들을 하나의 객체 씩 할당되도록!
from ast import literal_eval

movies['genres'] = movies['genres'].apply(literal_eval)
movies['keywords'] = movies['keywords'].apply(literal_eval)
movies['production_companies'] = movies['production_companies'].apply(literal_eval)
movies['production_countries'] = movies['production_countries'].apply(literal_eval)
movies['spoken_languages'] = movies['spoken_languages'].apply(literal_eval)

In [22]:
# Genres 변수에서 필요한 텍스트들만 추출해서 변수 덮어씌우기
movies['genres'] = movies['genres'].apply(lambda x: [y['name'] for y in x])
movies['genres'] = movies['genres'].apply(lambda x: (' ').join(x))

In [26]:
# keywords 변수에서 필요한 텍스트들만 추출해서 변수 덮어씌우기
movies['keywords'] = movies['keywords'].apply(lambda x: [y['name'] for y in x])
movies['keywords'] = movies['keywords'].apply(lambda x: (' ').join(x))

In [39]:
# production_countries에서 나라 이니셜만 추출해 덮어씌우기
movies['production_countries'] = movies['production_countries'].apply(lambda x: [y['iso_3166_1'] for y in x])
movies['production_countries'] = movies['production_countries'].apply(lambda x: (' ').join(x))

In [42]:
# producction_companies에서 회사 이름 추출해 덮어 씌우기
movies['production_companies'] = movies['production_companies'].apply(lambda x: [y['name'] for y in x])
movies['production_companies'] = movies['production_companies'].apply(lambda x: (' ').join(x))

In [46]:
# spoken_languages에서 해당 영화 지원하는 언어 추출해 덮어 씌우기
movies['spoken_languages'] = movies['spoken_languages'].apply(lambda x: [y['name'] for y in x])
movies['spoken_languages'] = movies['spoken_languages'].apply(lambda x: (' ').join(x))

# 데이터 Feature Vectorization

In [57]:
# Overview에 있는 결측치 3개 해당 데이터 행의 title로 대체!
null_idx = movies[movies['overview'].isnull()].index.tolist()
for idx in null_idx:
    movies.loc[idx, 'overview'] = movies.loc[idx, 'title']

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

cnt_vect = CountVectorizer(max_df=0.9,
                          ngram_range=(1,1),
                          stop_words='english')
# 장르 텍스트 벡터화 시키기
genres_vect = cnt_vect.fit_transform(movies['genres'])
# 키워드 텍스트 벡터화 시키기
keyword_vect = cnt_vect.fit_transform(movies['keywords'])
# Overview 텍스트 벡터화 시키기
overview_vect = cnt_vect.fit_transform(movies['overview'])
print("장르 벡터화: ", genres_vect.shape)
print("키워드 벡터화: ", keyword_vect.shape)
print("Overview 벡터화: ", overview_vect.shape)

장르 벡터화:  (4803, 22)
키워드 벡터화:  (4803, 7069)
Overview 벡터화:  (4803, 20982)


# 영화간 유사도 출력
- cosine similarity 사용

## 장르 유사도 이용해 비슷한 영화 추천

In [64]:
from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genres_vect, genres_vect)
print(genre_sim.shape)
print(genre_sim[:3])

genres_sim_idx = genre_sim.argsort()[::-1]

(4803, 4803)
[[1.         0.77459667 0.51639778 ... 0.         0.         0.        ]
 [0.77459667 1.         0.66666667 ... 0.         0.         0.        ]
 [0.51639778 0.66666667 1.         ... 0.         0.         0.        ]]


In [95]:
# 특정 영화를 기준으로 유사도가 높은 영화 추출
def find_sim_movie(df, sim_sorted_idx, title_name, top_n=10):
    target_movie = df[df['title'] == title_name]
    target_movie_idx = target_movie.index.values
    
    # 2차원 array로 반환되어 있음
    target_sim_idx = sim_sorted_idx[target_movie_idx, :top_n]
    similar_movies = df.loc[target_sim_idx.reshape(-1,),
                            ['title','vote_average','vote_count']]
    return similar_movies.iloc[1:]

In [96]:
df = find_sim_movie(movies, genres_sim_idx, 'The Avengers',
              top_n=10)
df

Unnamed: 0,title,vote_average,vote_count
2743,The Butterfly Effect,7.3,2060
2749,Child's Play 2,5.8,308
2750,No Good Deed,5.6,181
2751,The Mist,6.7,1399
2752,Ex Machina,7.6,4737
2755,Earth to Echo,5.7,290
2757,Letters from Iwo Jima,7.2,541
2760,Room,8.1,2757
2764,Light It Up,6.6,7


# 가중 평점 활용해 추가 변수 넣기

In [100]:
# 전체 영화에 대한 평균
C = movies['vote_average'].mean()
# 최소 투표 횟수 설정(투표횟수의 상위 70%째의 횟수를 최소 횟수로 설정)
m = movies['vote_count'].quantile(0.7)

def weighted_vote_avg(record):
    v = record['vote_count']
    R = record['vote_average']
    return ( (v/(v+m)) * R) + ( (m/(m+v)) * C)

In [102]:
movies['weighted_vote'] = movies.apply(weighted_vote_avg, axis=1)

In [106]:
movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   0
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
weighted_vote              0
dtype: int64

In [111]:
movies.loc[1,'title']

"Pirates of the Caribbean: At World's End"

In [114]:
# 함수 재정의
def find_sim_movie(df, sim_sorted_idx, title_name, top_n=10):
    target_movie = df[df['title'] == title_name]
    target_movie_idx = target_movie.index.values
    
    sim_movie_idx = sim_sorted_idx[target_movie_idx, 1:top_n]
    sim_movie_idx = sim_movie_idx.reshape(-1,)
    similar_movies = df.iloc[sim_movie_idx]
    similar_movies = similar_movies[['title','vote_count',
                                    'vote_average','weighted_vote']]
                    
    similar_movies = similar_movies.sort_values(by='weighted_vote',
                                               ascending=False)
    return similar_movies

In [115]:
similar_movies = find_sim_movie(movies, genres_sim_idx,
                               'The Avengers', top_n=10)
similar_movies

Unnamed: 0,title,vote_count,vote_average,weighted_vote
2760,Room,2757,8.1,7.750525
2752,Ex Machina,4737,7.6,7.435267
2743,The Butterfly Effect,2060,7.3,7.034287
2757,Letters from Iwo Jima,541,7.2,6.626338
2751,The Mist,1399,6.7,6.521642
2764,Light It Up,7,6.6,6.098217
2749,Child's Play 2,308,5.8,5.990947
2750,No Good Deed,181,5.6,5.975265
2755,Earth to Echo,290,5.7,5.961598
