## 추천 시스템
- 콘텐츠 기반 필터링 (Content Based Filtering)
- 협업 필터링 (Collaborative Filtering)
- 결합 (Hybrid Filtering)

![사진1](https://velog.velcdn.com/images%2Fsdubee10%2Fpost%2F1377969d-e8b9-4276-8122-09d70110af41%2Fimage.png)

In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the TMDB 5000 movies metadata. The path is relative to the notebook's
# working directory, so the kernel must be started from the notebook folder.
movies= pd.read_csv('../datasets/tmdb_5000_movies.csv')
# Sanity-check the row/column count before previewing the first rows.
print(movies.shape)
movies.head(3)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [6]:
from ast  import literal_eval

# 'genres' and 'keywords' are read from the CSV as strings that look like
# Python list-of-dict literals; literal_eval turns each string into real objects.
# NOTE: this overwrites the columns in place, so the cell can only run once per load.
movies['genres'] = movies['genres'].map(literal_eval)
movies['keywords'] = movies['keywords'].map(literal_eval)

In [7]:
# Each parsed entry is a dict; keep only its 'name' value so every cell
# becomes a plain list of strings.
movies['genres'] = movies['genres'].map(lambda entries: [entry['name'] for entry in entries])
movies['keywords'] = movies['keywords'].map(lambda entries: [entry['name'] for entry in entries])
# Preview the first row to confirm the transformation.
movies[['genres', 'keywords']].head(1)

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer expects one string per document, so join each movie's genre
# list into a single space-separated string.
movies['genres_literal'] = movies['genres'].apply(' '.join)

# min_df=1 keeps every term that occurs in at least one document, i.e. nothing
# is filtered out. The previous integer min_df=0 had the same effect on older
# scikit-learn releases but is rejected by parameter validation in newer ones
# (an int min_df must be >= 1).
# ngram_range=(1, 2) counts single tokens and adjacent token pairs.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies['genres_literal'])
print(genre_mat.shape)



(4803, 276)


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of the genre count
# matrix; passing the same matrix twice compares each movie with every movie.
genre_sim =cosine_similarity(genre_mat,genre_mat)
# Expect a square (n_movies, n_movies) matrix.
print(genre_sim.shape)
# Show only the first two rows; the full matrix is far too large to display.
print(genre_sim[:2])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


In [12]:
# argsort orders each row ascending by similarity; reversing the columns gives,
# for every movie, the row positions of all movies from most to least similar.
genre_sim_sorted_ind = np.argsort(genre_sim, axis=1)[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0 3494  813 ... 3038 3037 2401]]


In [13]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` ranked most similar to the movie ``title_name``.

    Args:
        df: DataFrame with a 'title' column. Row position i of ``df`` must
            correspond to row i of ``sorted_ind``.
        sorted_ind: 2-D integer array; row i holds row positions of ``df``
            ordered from most to least similar to row i.
        title_name: Title to look up (exact ``==`` match on the 'title' column).
        top_n: Number of leading entries to take from each matching row.

    Returns:
        DataFrame with the selected rows in ranking order. The queried movie
        itself is not filtered out. The result is empty when no title matches;
        when several rows share the title, the ``top_n`` picks of each match
        are concatenated.
    """
    # Use row *positions* rather than index labels: both sorted_ind and
    # df.iloc are positional, so labels are only correct for a default
    # RangeIndex and break on a filtered or re-indexed frame.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())
    similar_indexes = sorted_ind[title_positions, :top_n]

    # Show the raw ranked positions (2-D, one row per matching title).
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)

    return df.iloc[similar_indexes]

In [14]:
# Look up the 10 entries ranked most similar to 'The Godfather' by genre.
similar_movies= find_sim_movie(movies, genre_sim_sorted_ind, 'The Godfather',10)
# Show the rating next to each title; the ranking above ignores vote_average.
similar_movies[['title', 'vote_average']]


[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [15]:
# C: mean of vote_average over all movies.
C= movies['vote_average'].mean()
# m: 60th percentile of vote_count.
# Both are read as globals by weighted_vote_average.
m=movies['vote_count'].quantile(0.6)

In [16]:
# Display the two constants, rounded to 3 decimals.
print('C',round(C,3),'m',round(m,3))

C 6.092 m 370.2


In [17]:
def weighted_vote_average(record):
    """Blend one record's rating with the global constant ``C`` by vote count.

    Computes ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the record's
    'vote_count' and ``R`` its 'vote_average'. ``m`` and ``C`` are read from
    the module-level globals of the same name and must be defined before the
    function is called.

    Args:
        record: Mapping-like object (e.g. a DataFrame row) providing
            'vote_count' and 'vote_average'.

    Returns:
        The weighted rating for the record.
    """
    votes = record['vote_count']
    rating = record['vote_average']

    # Both weights share the same denominator and sum to 1.
    total = votes + m
    vote_share = votes / total
    prior_share = m / total

    return vote_share * rating + prior_share * C

In [19]:
# Apply weighted_vote_average to every row (axis=1) and store the result in a
# new 'weighted_vote' column.
movies['weighted_vote']=movies.apply(weighted_vote_average,axis=1)

In [20]:
def find_sim_movie_ver2(df, sorted_ind, title_name, top_n=10):
    """Return the ``top_n`` similar movies to ``title_name``, best-rated first.

    Takes the ``top_n * 2`` most similar candidates from ``sorted_ind``, drops
    the queried movie itself, then orders the remainder by 'weighted_vote'
    (descending) and keeps the first ``top_n``.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns. Row position i
            of ``df`` must correspond to row i of ``sorted_ind``.
        sorted_ind: 2-D integer array; row i holds row positions of ``df``
            ordered from most to least similar to row i.
        title_name: Title to look up (exact ``==`` match on the 'title' column).
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame of at most ``top_n`` rows sorted by 'weighted_vote'
        descending; empty when no title matches.
    """
    # Use row *positions* rather than index labels: sorted_ind and df.iloc are
    # positional, so labels are only correct for a default RangeIndex.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Over-fetch 2x candidates so enough remain after removing the query itself.
    candidates = sorted_ind[title_positions, :(top_n * 2)].reshape(-1)

    # np.isin handles any number of matching titles; a plain `!=` against the
    # title positions fails to broadcast when several rows share the title.
    candidates = candidates[~np.isin(candidates, title_positions)]

    # With several matching titles the same candidate can appear more than
    # once; keep the first occurrence of each, preserving order.
    candidates = pd.unique(candidates)

    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]

In [21]:
# Same query as before, but candidates are re-ranked by weighted_vote and the
# queried movie is excluded.
similar_movies = find_sim_movie_ver2(movies, genre_sim_sorted_ind,'The Godfather',10)

In [22]:
# Show the columns that explain the ranking: the score, the genres and the vote count.
similar_movies[['title','weighted_vote','genres','vote_count']]

Unnamed: 0,title,weighted_vote,genres,vote_count
2731,The Godfather: Part II,8.079586,"[Drama, Crime]",3338
1847,GoodFellas,7.976937,"[Drama, Crime]",3128
3866,City of God,7.759693,"[Drama, Crime]",1814
1663,Once Upon a Time in America,7.657811,"[Drama, Crime]",1069
883,Catch Me If You Can,7.557097,"[Drama, Crime]",3795
281,American Gangster,7.141396,"[Drama, Crime]",1502
4041,This Is England,6.739664,"[Drama, Crime]",363
1149,American Hustle,6.717525,"[Drama, Crime]",2807
1243,Mean Streets,6.626569,"[Drama, Crime]",345
2839,Rounders,6.530427,"[Drama, Crime]",439
