## 데이터 출처

https://www.kaggle.com/rounakbanik/the-movies-dataset

## Import Library (step.01)

In [1]:
import pandas as pd
import numpy as np

import warnings; warnings.filterwarnings('ignore')

#문자열 파싱 라이브러리
from ast import literal_eval

#Bow
from sklearn.feature_extraction.text import CountVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

## Data Process (step.02)

In [2]:
# Load the movie metadata table (movies_metadata.csv of the Kaggle dataset linked above).
# The path is relative, so the CSV must sit in the notebook's working directory.
movies = pd.read_csv('movies_metadata.csv')

In [3]:
# Row/column count, followed by the first five rows for a quick look at the raw columns.
print(movies.shape)
movies.head(5)

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# Keep only the columns that are needed downstream.
# The explicit .copy() makes movies_df an independent frame, so the column
# assignments in later cells modify it directly instead of a view-or-copy slice
# of `movies` (the situation pandas reports as SettingWithCopyWarning).
movies_df = movies[['id','genres','imdb_id','original_language','original_title', 'title', 'vote_average','vote_count']].copy()
movies_df.head(3)

Unnamed: 0,id,genres,imdb_id,original_language,original_title,title,vote_average,vote_count
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",tt0114709,en,Toy Story,Toy Story,7.7,5415.0
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",tt0113497,en,Jumanji,Jumanji,6.9,2413.0
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",tt0113228,en,Grumpier Old Men,Grumpier Old Men,6.5,92.0


In [5]:
# Look at the raw genres value of the first movie.
movies_df['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [6]:
# Check the Python type of that raw value before parsing.
type(movies_df['genres'][0])

str

### genres 칼럼 str 형태를 list 형태로 바꿔주기

In [7]:
# Parse each genres string into a Python list of dicts.
# Values that are no longer strings are passed through unchanged, so re-running
# this cell is harmless (literal_eval raises on an already-parsed list).
movies_df['genres'] = movies_df['genres'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)

In [8]:
# Same value after parsing.
movies_df['genres'][0]

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [9]:
# Confirm the type of the parsed value.
type(movies_df['genres'][0])

list

In [10]:
# Reduce every genre entry to its lower-cased name.
# Entries that are already plain strings are kept as they are, so the cell can be
# re-run without failing on values it has already converted.
movies_df['genres'] = movies_df['genres'].apply(lambda x : [y['name'].lower() if isinstance(y, dict) else y for y in x])

In [11]:
# Preview the genre name lists of the first three movies.
movies_df[['genres']][:3]

Unnamed: 0,genres
0,"[animation, comedy, family]"
1,"[adventure, fantasy, family]"
2,"[romance, comedy]"


### CountVectorizer

In [12]:
# 1. Convert each document into a list of tokens.
# 2. Count how often each token occurs in each document.
# 3. Encode each document as a bag-of-words (BoW) count vector.
# min_df=1 keeps every term that occurs in at least one document, which is what
# the previous min_df=0 meant; recent scikit-learn releases reject an integer
# min_df below 1. ngram_range=(1, 2) extracts both unigrams and bigrams.
count_vector = CountVectorizer(min_df=1, ngram_range=(1,2))

In [13]:
# Show the genre lists column (pandas elides the middle rows).
movies_df['genres']

0         [animation, comedy, family]
1        [adventure, fantasy, family]
2                   [romance, comedy]
3            [comedy, drama, romance]
4                            [comedy]
                     ...             
45461                 [drama, family]
45462                         [drama]
45463       [action, drama, thriller]
45464                              []
45465                              []
Name: genres, Length: 45466, dtype: object

In [14]:
# Flatten every genre list into a single space-separated string so it can be fed
# to CountVectorizer as one "document" per movie.
# Note: a genre name that itself contains a space ends up as several tokens
# (the vocabulary printed further down lists 'science' and 'fiction' separately).
movies_df['genres_literal'] = movies_df['genres'].map(' '.join)
movies_df['genres_literal']

0         animation comedy family
1        adventure fantasy family
2                  romance comedy
3            comedy drama romance
4                          comedy
                   ...           
45461                drama family
45462                       drama
45463       action drama thriller
45464                            
45465                            
Name: genres_literal, Length: 45466, dtype: object

In [15]:
# Learn the vocabulary and build the document-term count matrix, then show its shape.
vector_genres = count_vector.fit_transform(movies_df['genres_literal'])
print(vector_genres.shape)

(45466, 449)


In [16]:
# Printing the matrix lists its stored entries as "(row, column)  count".
print(vector_genres)

  (0, 40)	1
  (0, 67)	1
  (0, 151)	1
  (0, 43)	1
  (0, 74)	1
  (1, 151)	1
  (1, 20)	1
  (1, 171)	1
  (1, 28)	1
  (1, 179)	1
  (2, 67)	1
  (2, 351)	1
  (2, 355)	1
  (3, 67)	1
  (3, 351)	1
  (3, 129)	1
  (3, 73)	1
  (3, 143)	1
  (4, 67)	1
  (5, 129)	1
  (5, 0)	1
  (5, 89)	1
  (5, 383)	1
  (5, 4)	1
  (5, 95)	1
  :	:
  (45456, 257)	1
  (45456, 320)	1
  (45456, 336)	1
  (45456, 270)	1
  (45457, 257)	1
  (45457, 320)	1
  (45457, 332)	1
  (45458, 257)	1
  (45459, 371)	1
  (45459, 191)	1
  (45459, 372)	1
  (45460, 351)	1
  (45460, 129)	1
  (45460, 0)	1
  (45460, 130)	1
  (45460, 14)	1
  (45461, 151)	1
  (45461, 129)	1
  (45461, 136)	1
  (45462, 129)	1
  (45463, 129)	1
  (45463, 0)	1
  (45463, 383)	1
  (45463, 145)	1
  (45463, 6)	1


In [17]:
# Unigram-only vectorizer: ngram_range=(1, 1) means only single tokens (n = 1)
# enter the vocabulary.
# min_df is the minimum document frequency a term needs to be kept; 1 keeps every
# term (same effect as the former min_df=0, which recent scikit-learn releases reject).
count_vect2 = CountVectorizer(min_df=1, ngram_range=(1, 1))
vector_genres2 = count_vect2.fit_transform(movies_df['genres_literal'])
print(vector_genres2.shape)

(45466, 46)


In [18]:
# Stored entries of the unigram matrix as "(row, column)  count".
print(vector_genres2)

  (0, 2)	1
  (0, 7)	1
  (0, 13)	1
  (1, 13)	1
  (1, 1)	1
  (1, 14)	1
  (2, 7)	1
  (2, 33)	1
  (3, 7)	1
  (3, 33)	1
  (3, 11)	1
  (4, 7)	1
  (5, 11)	1
  (5, 0)	1
  (5, 9)	1
  (5, 40)	1
  (6, 7)	1
  (6, 33)	1
  (7, 13)	1
  (7, 1)	1
  (7, 11)	1
  (7, 0)	1
  (8, 1)	1
  (8, 0)	1
  (8, 40)	1
  :	:
  (45451, 7)	1
  (45451, 14)	1
  (45452, 10)	1
  (45453, 11)	1
  (45453, 9)	1
  (45453, 40)	1
  (45454, 7)	1
  (45454, 11)	1
  (45456, 40)	1
  (45456, 22)	1
  (45456, 27)	1
  (45457, 22)	1
  (45457, 27)	1
  (45458, 22)	1
  (45459, 34)	1
  (45459, 15)	1
  (45460, 33)	1
  (45460, 11)	1
  (45460, 0)	1
  (45461, 13)	1
  (45461, 11)	1
  (45462, 11)	1
  (45463, 11)	1
  (45463, 0)	1
  (45463, 40)	1


In [19]:
# Convert to a regular 2-D array to view it in dense form (NumPy elides the middle).
print(vector_genres2.toarray())

[[0 0 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
# Shape again: (number of movies, vocabulary size).
print(vector_genres2.shape)

(45466, 46)


In [21]:
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name only when the new one is missing.
if hasattr(count_vect2, 'get_feature_names_out'):
    feature_names = count_vect2.get_feature_names_out()
else:
    feature_names = count_vect2.get_feature_names()
# Convert to plain str so the printed list looks the same with either method.
print([str(name) for name in feature_names])

['action', 'adventure', 'animation', 'aniplex', 'brosta', 'carousel', 'cartel', 'comedy', 'committee', 'crime', 'documentary', 'drama', 'entertainment', 'family', 'fantasy', 'fiction', 'film', 'filmworks', 'foreign', 'gohands', 'group', 'history', 'horror', 'mardock', 'media', 'movie', 'music', 'mystery', 'odyssey', 'production', 'productions', 'pulser', 'rogue', 'romance', 'science', 'scramble', 'sentai', 'state', 'telescene', 'the', 'thriller', 'tv', 'view', 'vision', 'war', 'western']


## 코사인 유사도 (cosine_similarity) 

In [22]:
# Pairwise cosine similarity between all genre count vectors; then, for every
# movie (row), the positions of all movies ranked from most to least similar.
# Caution: the similarity matrix and the ranking are both dense
# (n_movies x n_movies) arrays, so this cell is memory-hungry.
genre_sim_sorted = np.flip(np.argsort(cosine_similarity(vector_genres2, vector_genres2), axis=-1), axis=1)
print(genre_sim_sorted[:1])

[[    0 30729 30861 ... 27112 27113 22732]]


In [23]:
# One row of ranked movie positions per movie.
genre_sim_sorted.shape

(45466, 45466)

In [24]:
# Peek at the ranking matrix (NumPy elides the middle).
genre_sim_sorted

array([[    0, 30729, 30861, ..., 27112, 27113, 22732],
       [ 6101, 12614, 23473, ..., 29300, 29299, 22732],
       [ 4668, 14556,  7378, ..., 27237, 27238, 22732],
       ...,
       [  344, 16766, 21162, ..., 25898, 25899,     0],
       [45465, 15184, 15160, ..., 30306, 30305,     0],
       [45465, 15184, 15160, ..., 30306, 30305,     0]], dtype=int64)

### 장르 코사인 유사도 기반 추천

In [132]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Recommend movies whose genres are most similar to a given title.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie table with at least 'title' and 'weighted_vote' columns. Its rows
        must be in the same order as the rows of ``sorted_ind``.
    sorted_ind : numpy.ndarray
        2-D array; row i holds row positions of ``df`` ordered from most to
        least similar to row i.
    title_name : str
        Title to look up. When several rows share it, the row with the highest
        'weighted_vote' is used as the reference movie.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` sorted by 'weighted_vote' in descending
        order; an empty frame with the same columns when no row has the title.
    """
    # Work with row positions rather than index labels: sorted_ind and df.iloc
    # are both positional, and labels only coincide with positions when df
    # carries a default 0..n-1 index.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())
    if title_positions.size == 0:
        return df.iloc[0:0]

    # Resolve duplicate titles by taking the best-rated row.
    votes = df['weighted_vote'].iloc[title_positions].reset_index(drop=True)
    title_pos = title_positions[votes.sort_values(ascending=False).index[0]]

    # Take twice as many genre-similar candidates as requested, so that the
    # re-ranking by weighted rating below has something to choose from.
    candidates = sorted_ind[title_pos, :top_n * 2]

    # Drop the reference movie itself.
    candidates = candidates[candidates != title_pos]

    # Re-rank the candidates by weighted rating and keep the top_n best.
    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]

## 가중평점

In [100]:
# C: mean of vote_average over all movies.
# m: 80th percentile of vote_count.
# Both are read as module-level globals by weighted_rating.
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.8)

In [101]:
# Show both constants rounded to three decimals.
print(round(C,3) , round(m,3))

5.618 50.0


In [102]:
def weighted_rating(df):
    """Blend a movie's own average vote with the global mean vote.

    ``df`` must provide 'vote_count' and 'vote_average'. The result weights the
    movie's own average by ``v / (v + m)`` and the global mean ``C`` by
    ``m / (v + m)``, where ``m`` and ``C`` are module-level globals.
    """
    votes = df['vote_count']
    average = df['vote_average']
    denom = votes + m

    own_part = (votes / denom) * average
    prior_part = (m / denom) * C
    return own_part + prior_part

In [103]:
# weighted_rating only performs element-wise arithmetic on the two vote columns,
# so it can be evaluated on the whole frame in one vectorized call instead of
# being invoked once per row through apply(axis=1).
movies_df['weighted_vote'] = weighted_rating(movies_df)

In [104]:
# The ten movies with the highest weighted rating.
movies_df[['weighted_vote','title','vote_average','vote_count','genres']].sort_values('weighted_vote',ascending=False)[:10]

Unnamed: 0,weighted_vote,title,vote_average,vote_count,genres
10309,8.855148,Dilwale Dulhania Le Jayenge,9.1,661.0,"[comedy, drama, romance]"
314,8.482863,The Shawshank Redemption,8.5,8358.0,"[drama, crime]"
834,8.476278,The Godfather,8.5,6024.0,"[drama, crime]"
40251,8.366584,Your Name.,8.5,1030.0,"[romance, animation, drama]"
12481,8.289115,The Dark Knight,8.3,12269.0,"[drama, action, crime, thriller]"
2843,8.286216,Fight Club,8.3,9678.0,[drama]
292,8.284623,Pulp Fiction,8.3,8670.0,"[thriller, crime]"
522,8.270109,Schindler's List,8.3,4436.0,"[drama, history, war]"
23673,8.269704,Whiplash,8.3,4376.0,[drama]
5481,8.266628,Spirited Away,8.3,3968.0,"[fantasy, adventure, animation, family]"


In [131]:
# Ten genre-based recommendations for 'Frozen', ranked by weighted rating.
recomm_movies = find_sim_movie(movies_df,genre_sim_sorted,'Frozen',10)
recomm_movies

Unnamed: 0,id,genres,imdb_id,original_language,original_title,title,vote_average,vote_count,genres_literal,weighted_vote
40015,313297,"[animation, adventure, family]",tt4302938,en,Kubo and the Two Strings,Kubo and the Two Strings,7.7,982.0,animation adventure family,7.599138
1798,10674,"[animation, family, adventure]",tt0120762,en,Mulan,Mulan,7.6,2089.0,animation family adventure,7.553675
4391,12144,"[animation, adventure, family]",tt0095489,en,The Land Before Time,The Land Before Time,7.0,660.0,animation adventure family,6.902691
28665,326359,"[adventure, animation, family]",tt4007502,en,Frozen Fever,Frozen Fever,6.9,630.0,adventure animation family,6.805751
40969,136799,"[adventure, animation, family]",tt1679335,en,Trolls,Trolls,6.7,1054.0,adventure animation family,6.651006
9115,9732,"[animation, family, adventure]",tt0120131,en,The Lion King 2: Simba's Pride,The Lion King 2: Simba's Pride,6.7,1034.0,animation family adventure,6.650102
30526,223706,"[animation, adventure, family]",tt3183630,pt,O Menino e o Mundo,Boy & the World,7.0,69.0,animation adventure family,6.419415
23520,140870,"[animation, adventure, family]",tt2368672,xx,Minuscule - La vallée des fourmis perdues,Minuscule: Valley of the Lost Ants,6.6,128.0,animation adventure family,6.324216
36436,286940,"[family, animation, adventure]",tt3246908,en,The Boxcar Children,The Boxcar Children,7.6,7.0,family animation adventure,5.861585
40875,89825,"[animation, adventure, family]",tt1734113,ja,おまえうまそうだな,You Are Umasou,7.0,10.0,animation adventure family,5.848506


In [130]:
# All movies titled 'Frozen', best weighted rating first; find_sim_movie takes
# the first of these as its reference movie.
movies_df[movies_df['title']=='Frozen'].sort_values('weighted_vote',ascending=False)

Unnamed: 0,id,genres,imdb_id,original_language,original_title,title,vote_average,vote_count,genres_literal,weighted_vote
22110,109445,"[animation, adventure, family]",tt2294629,en,Frozen,Frozen,7.3,5440.0,animation adventure family,7.284683
14926,44363,[thriller],tt1323045,en,Frozen,Frozen,5.9,586.0,thriller,5.877846
16276,170986,[drama],tt1071798,en,Frozen,Frozen,8.5,2.0,drama,5.729045
