In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Word2Vec 알고리즘


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim  # Word2Vec 알고리즘 호출

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

In [7]:
# 경로의 경우 각자의 환경에 맞게 설정해주면 됩니다. 
path = '/content/drive/MyDrive/추천시스템/archive/movies/'

In [8]:
movie = pd.read_csv(path + 'ratings.csv', low_memory=False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [9]:
# 시간 순으로 정렬
movie = movie.sort_values(by='timestamp', ascending=True).reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [10]:
# 영화의 Metadata를 불러와서 movieID에 맞는 TITLE을 구해줍니다. 
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [11]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [16]:
meta = meta.rename(columns={'id':'movieId'}) # 컬럼명 변경
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId') # 조인

In [17]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)

In [18]:
agg = movie.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [19]:
movie['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

Word2vec 적용

In [20]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    # map -> list내의 모든 원소에 대해서 string
    sentence.append(list(map(str, user_sentence)))

In [21]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
# 
embedding_model = Word2Vec(sentence, size=20, window = 5, # size : 웨이트의 크기, window : 양 옆의 주변 단어 개수
                           min_count=1, workers=4, iter=200, sg=1) # workers : cpu자원, sg : 0 - CBOW / sg : 1 - skipgram
                                                                   # iter = epoch

In [22]:
embedding_model.wv.most_similar(positive=['Spider-Man 2'], topn=10)

[('Snow Cake', 0.8102991580963135),
 ('Domicile Conjugal', 0.7816393375396729),
 ('Cruel Intentions 3', 0.7412868738174438),
 ('The Godfather', 0.7384905815124512),
 ('Krull', 0.7372698187828064),
 ('Blow', 0.7355198264122009),
 ('Sunrise: A Song of Two Humans', 0.7259267568588257),
 ('Face/Off', 0.7253625392913818),
 ('I, Robot', 0.725335419178009),
 ('Some Like It Hot', 0.719811201095581)]

## Doc2Vec 적용

![](https://drive.google.com/uc?export=view&id=1g2ausKfoaAT0jMwSatRUG3fiGWfDuysV
)

In [None]:
from gensim.models import doc2vec

In [None]:
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)

In [None]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

HBox(children=(FloatProgress(value=0.0, max=44512.0), HTML(value='')))




In [None]:
meta['pre_overview'] = overview

In [None]:
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [None]:
from collections import namedtuple

agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [None]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [None]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 868.6329123973846


In [None]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)

[('It Stains the Sands Red', 0.7269709706306458),
 ('Unstrung Heroes', 0.7174409031867981),
 ('Due Amici', 0.7136842012405396),
 ('La moutarde me monte au nez', 0.7097397446632385),
 ('Children in the Surf at Coney Island', 0.6954304575920105),
 ('Live Forever as You Are Now with Alan Resnick', 0.6951810717582703),
 ('Letzte Worte', 0.694780170917511),
 ('Skazka o Poteryannom Vremeni', 0.6917924880981445),
 ('Killing Zoe', 0.6796911954879761),
 ('Testről és lélekről', 0.6681605577468872),
 ("Independents' Day", 0.6628682017326355),
 ('Meet Me in Venice', 0.662304699420929),
 ('Trois vies et une seule mort', 0.6607695817947388),
 ('エクスマキナ', 0.6600291132926941),
 ('El vendedor de humo', 0.6584940552711487),
 ('The Aristocats', 0.6533187031745911),
 ('Wszyscy jesteśmy Chrystusami', 0.6474179029464722),
 ('Begegnung mit Fritz Lang', 0.6451306343078613),
 ('Return to the Batcave: The Misadventures of Adam and Burt',
  0.6444643139839172),
 ('Milk Money', 0.6425882577896118)]

In [None]:
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)

[('Hatchet II', 0.7217295169830322),
 ('The Mad Miss Manton', 0.6965864896774292),
 ('Soloalbum', 0.6962880492210388),
 ('Handsome Harry', 0.6917732954025269),
 ('Winnie the Pooh and the Honey Tree', 0.6841850876808167),
 ('The Princess and the Goblin', 0.6834225654602051),
 ('Winnie the Pooh and a Day for Eeyore', 0.6833527088165283),
 ("I Don't Know How She Does It", 0.6819157600402832),
 ('Just Go with It', 0.67984938621521),
 ('$ Dollars', 0.6774436235427856),
 ('Train', 0.674703061580658),
 ('Bulldog Drummond Escapes', 0.6741313934326172),
 ('1990: I guerrieri del Bronx', 0.6741003394126892),
 ('Zamilované Maso', 0.6739451289176941),
 ('Riot in Cell Block 11', 0.6702640056610107),
 ('La Neuvaine', 0.6693864464759827),
 ('The Green Years', 0.6691372990608215),
 ('The Overbrook Brothers', 0.668933093547821),
 ('Se sei vivo spara', 0.6686355471611023),
 ('Hævnen', 0.6685347557067871)]