## Word2Vec 알고리즘

In [3]:
# !pip install gensim

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim # Word2Vec의 알고리즘을 호출해주는 패키지

In [2]:
import warnings

# Suppress every warning raised from this point on.
warnings.filterwarnings('ignore')

In [3]:
# Directory holding the movie CSV files; adjust it to your own environment.
path = "./data/movies/"

In [4]:
# Load the ratings table from the data directory.
movie = pd.read_csv(f'{path}ratings.csv', low_memory=False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [5]:
# Order the ratings chronologically (oldest first) and renumber the index.
movie = movie.sort_values('timestamp').reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [6]:
# Load the movie metadata; it is used to look up a title for each movieId.
meta = pd.read_csv(f'{path}movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [7]:
# Inspect the metadata column names.
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:
# Rename the metadata key so both frames share the same join column name.
meta = meta.rename(columns={'id': 'movieId'})

# Cast both keys to str so the join compares values of the same type.
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

# NOTE(review): this joins the ratings `movieId` directly against the metadata
# `id`. If the two files use different id schemes (e.g. MovieLens ids vs. TMDB
# ids), the attached titles will be wrong -- confirm, and map through the
# dataset's links table first if one is available.
#
# The lookup is de-duplicated on the key before the left join: a repeated id in
# the metadata would otherwise multiply the matching rating rows.
movie = pd.merge(
    movie,
    meta[['movieId', 'original_title']].drop_duplicates(subset='movieId'),
    how='left',
    on='movieId',
)

In [11]:
# Preview the ratings together with the joined title column.
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,383,21,3.0,789652009,The Endless Summer
1,409,21,5.0,828212412,The Endless Summer
2,409,25,4.0,828212412,Jarhead
3,409,30,5.0,828212412,彼女の想いで
4,409,35,4.0,828212412,The Simpsons Movie


In [9]:
# Discard ratings whose title is missing, then rebuild a contiguous index.
movie = movie.dropna(subset=['original_title']).reset_index(drop=True)

In [14]:
# grouped.mean() applies exactly one aggregation function;
# grouped.agg(['mean', 'max', ...]) accepts a list, so several can be requested.
# A list (rather than a set literal) is passed so that the column order stays
# well defined if more aggregations are added later.
agg = movie.groupby(['userId'])['original_title'].agg(['unique'])
# Series-returning alternative:
#   movie.groupby(['userId'])['original_title'].unique()
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [15]:
# Distinct titles present in the ratings table.
movie['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

### Word2Vec 적용
해리포터 시리즈에는 「마법사의 돌」, 「비밀의 방」, 「아즈카반의 죄수」가 있는데,

사용자들이 「비밀의 방」 앞뒤로 다른 해리포터 시리즈를 많이 시청했다면, Word2Vec 입장에서는 「비밀의 방」과 유사한 영화로 해리포터 시리즈가 많이 나올 것이다.

In [16]:
# Word2Vec cannot train on int tokens, so every entry is cast to str.
# Each user's title list becomes one training "sentence".
sentence = [list(map(str, titles)) for titles in agg['unique'].values]

In [21]:
# Peek at the first user's title list.
sentence[:1]

[['Jay and Silent Bob Strike Back',
  'Vivement dimanche!',
  'Rocky III',
  'American Pie',
  'My Tutor',
  'Greed']]

In [24]:
# Train the Word2Vec model on the per-user title lists.
from gensim.models import Word2Vec

# Parameter notes:
#   vector_size (a.k.a. size) = size of the learned weight vectors
#   window                    = how many neighbouring items are considered
#   workers                   = CPU resources used for training
#   epochs (a.k.a. iter)      = number of passes over the data
#   sg                        = 0 for CBOW, 1 for skip-gram
# Training takes quite a while.
embedding_model = Word2Vec(
    sentences=sentence,
    vector_size=20,
    window=5,
    min_count=1,
    workers=4,
    epochs=200,
    sg=1,
)

In [25]:
# Nearest titles to 'Spider-Man 2' in the embedding space (topn defaults to 10).
# Author's notes: training does not always succeed, and a content-based model
# is hard to build well when the metadata is poorly curated.
embedding_model.wv.most_similar(positive=['Spider-Man 2'])

[('Snow Cake', 0.8593770861625671),
 ('Domicile Conjugal', 0.7443365454673767),
 ('Snow White and the Seven Dwarfs', 0.7443079352378845),
 ('Forrest Gump', 0.7282701730728149),
 ('Star Trek: Nemesis', 0.7201566696166992),
 ('Me, Myself & Irene', 0.7186054587364197),
 ('Fail-Safe', 0.7147039175033569),
 ('Licence to Kill', 0.7143845558166504),
 ('Die Ehe der Maria Braun', 0.7136154770851135),
 ('Rumor Has It...', 0.7128531336784363)]