## Word2Vec 알고리즘


In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim 

In [5]:
import warnings
warnings.filterwarnings(action='ignore')

In [6]:
# 경로의 경우 각자의 환경에 맞게 설정해주면 됩니다. 
path = '../input/t-academy-recommendation2/movies/'

In [7]:
movie = pd.read_csv(path + 'ratings.csv', low_memory=False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [8]:
movie = movie.sort_values(by='timestamp', ascending=True).reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [9]:
# 영화의 Metadata를 불러와서 movieID에 맞는 TITLE을 구해줍니다. 
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [10]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [11]:
meta = meta.rename(columns={'id':'movieId'})
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId')

In [12]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)

In [13]:
agg = movie.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [14]:
movie['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

Word2vec 적용

In [15]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence))) # 300과 같은 int 형태의 데이터가 있기 떄문

In [16]:
sentence[:2]

[['Jay and Silent Bob Strike Back',
  'Vivement dimanche!',
  'Rocky III',
  'American Pie',
  'My Tutor',
  'Greed'],
 ['Terminator 3: Rise of the Machines',
  'The Conversation',
  'The Hours',
  '48 Hrs.',
  'Back to the Future Part II',
  'Silent Hill',
  'Crustacés et coquillages',
  'Lost in Translation',
  'Night on Earth',
  "Dave Chappelle's Block Party",
  "Ocean's Eleven",
  'Sissi',
  'Live and Let Die',
  'A Clockwork Orange',
  'Солярис',
  'Sommer vorm Balkon',
  'La science des rêves',
  'Trois couleurs : Rouge',
  'Grbavica',
  'Czlowiek z zelaza',
  'Le Mépris',
  'Batman Returns',
  'Romeo + Juliet',
  'Monsoon Wedding',
  'Stand by Me',
  'Lucky Number Slevin',
  'Cat on a Hot Tin Roof',
  'The Dark',
  'The Devil Wears Prada',
  'Lili Marleen',
  'Star Trek IV: The Voyage Home',
  'A Nightmare on Elm Street',
  'Notting Hill',
  'Once Were Warriors',
  'Reservoir Dogs',
  '2001: A Space Odyssey',
  'Rebecca',
  'Psycho',
  'The Poseidon Adventure',
  'Batman Begins

In [17]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           # size는 weight의 크기, window는 양 옆의 단어를 몇 개까지 볼
                           min_count=1, workers=4, iter=200, sg=1) 
                            # sg = 0 - CBOW, sg =1 - Skip-gram

In [18]:
embedding_model.wv.most_similar(positive=['Spider-Man 2'], topn=10)
# tf-idf와는 다르게 성능이 그다지 좋지 않아보인다.
# 따라서 상황에 따라 추천시스템 여러 개를 쓰는 것이 좋아보인다.
# 메타 데이터에 따라 성능의 차이가 날 수 있다.

[('Snow Cake', 0.8224702477455139),
 ('Harry Potter and the Prisoner of Azkaban', 0.7357112169265747),
 ('Heavenly Creatures', 0.7288028001785278),
 ('Forrest Gump', 0.7280134558677673),
 ('Licence to Kill', 0.7241933345794678),
 ('Domicile Conjugal', 0.7176676988601685),
 ('Sunrise: A Song of Two Humans', 0.7125868797302246),
 ('Mr. Brooks', 0.7109860181808472),
 ('Fail-Safe', 0.7029184103012085),
 ('Krull', 0.6963222622871399)]

## Doc2Vec 적용

![](https://drive.google.com/uc?export=view&id=1g2ausKfoaAT0jMwSatRUG3fiGWfDuysV
)

In [19]:
from gensim.models import doc2vec

In [20]:
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)

In [29]:
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,pre_overview
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,led woody andy toys live happily room andy bir...
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,when siblings judy peter discover enchanted bo...


In [21]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

HBox(children=(FloatProgress(value=0.0, max=44512.0), HTML(value='')))




In [31]:
overview[:2]

['led woody andy toys live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plots buzz but circumstances separate buzz woody owner duo eventually learns put aside differences',
 'when siblings judy peter discover enchanted board game opens door magical world unwittingly invite alan adult trapped inside game 26 years living room alan hope freedom finish game proves risky three find running giant rhinoceroses evil monkeys terrifying creatures']

In [22]:
meta['pre_overview'] = overview

In [23]:
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [24]:
from collections import namedtuple
# namedtuple 참고 자료 : https://velog.io/@jewon119/Python-%EC%8B%AC%ED%99%94-NamedTuple
agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [35]:
agg[['original_title', 'pre_overview']].values[:2]

array([['Toy Story',
        'led woody andy toys live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plots buzz but circumstances separate buzz woody owner duo eventually learns put aside differences'],
       ['Jumanji',
        'when siblings judy peter discover enchanted board game opens door magical world unwittingly invite alan adult trapped inside game 26 years living room alan hope freedom finish game proves risky three find running giant rhinoceroses evil monkeys terrifying creatures']],
      dtype=object)

In [32]:
tagged_train_docs[:2]

[TaggedDocument(words='led woody andy toys live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plots buzz but circumstances separate buzz woody owner duo eventually learns put aside differences', tags=['Toy Story']),
 TaggedDocument(words='when siblings judy peter discover enchanted board game opens door magical world unwittingly invite alan adult trapped inside game 26 years living room alan hope freedom finish game proves risky three find running giant rhinoceroses evil monkeys terrifying creatures', tags=['Jumanji'])]

In [25]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [26]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)

[('世界名作童話 白鳥の湖', 0.4052150249481201),
 ('The Tangerine Bear: Home in Time for Christmas!', 0.3577999174594879),
 ('Monster High: Why Do Ghouls Fall in Love?', 0.3559913635253906),
 ('Company Man', 0.3553657531738281),
 ('Flaming Star', 0.35182690620422363),
 ('The Blue Gardenia', 0.35163426399230957),
 ('Idol of the Crowds', 0.34330350160598755),
 ("O Costa d'África", 0.34228602051734924),
 ("Je suis né d'une cigogne", 0.3412458896636963),
 ('Blind Vaysha', 0.33950215578079224),
 ('Decoding Annie Parker', 0.3366684317588806),
 ('Sweeney!', 0.3337143659591675),
 ('Detective Byomkesh Bakshy!', 0.33310264348983765),
 ('The Guest House', 0.331345796585083),
 ('Kukačka v temném lese', 0.32903191447257996),
 ('Bio-Dome', 0.3283519744873047),
 ('The Unfolding', 0.32750314474105835),
 ('American Samurai', 0.32531821727752686),
 ('Pammal K. Sambandam', 0.3247126042842865),
 ('Steel Dawn', 0.31877344846725464)]

In [27]:
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)

[('Little Hiawatha', 0.4055997133255005),
 ('La vida loca', 0.39578911662101746),
 ('Angriff der Lederhosenzombies', 0.3769531846046448),
 ("I'm King Kong!: The Exploits of Merian C. Cooper", 0.36994028091430664),
 ('On Your Mark ジブリ実験剧场', 0.35459762811660767),
 ("Quai d'Orsay", 0.3538854718208313),
 ('Bhoot', 0.34964945912361145),
 ('Scooby-Doo! WrestleMania Mystery', 0.341795951128006),
 ('Rakudai Wa Shitakeredo', 0.341130793094635),
 ('Vrijdag', 0.3404476046562195),
 ('Strigoi', 0.34019142389297485),
 ('The Wild Party', 0.3349694013595581),
 ('Perfect Match', 0.3344232439994812),
 ('Oggi a me... domani a te!', 0.3333624303340912),
 ('Viagem ao Princípio do Mundo', 0.3305419087409973),
 ('Dominick and Eugene', 0.32962095737457275),
 ('The Reaping', 0.32786089181900024),
 ('Le Rossignol', 0.3203725218772888),
 ('Mindenki', 0.31815657019615173),
 ('Oklahoma!', 0.3164016604423523)]

In [28]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 761.0237629413605
