In [1]:
def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]   # header 제외
    return data
train_data = read_data('./dataset/ratings_train.txt')
test_data = read_data('./dataset/ratings_test.txt')

In [2]:
# row, column의 수가 제대로 읽혔는지 확인
print(len(train_data))      # nrows: 150000
print(len(train_data[0]))   # ncols: 3
print(len(test_data))       # nrows: 50000
print(len(test_data[0]))     # ncols: 3

150000
3
50000
3


In [3]:
from konlpy.tag import Twitter
pos_tagger = Twitter()
def tokenize(doc):
    # norm, stem은 optional
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
test_docs = [(tokenize(row[1]), row[2]) for row in test_data]
# 잘 들어갔는지 확인
from pprint import pprint
pprint(train_docs[0])


(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증/Noun',
  '나다/Verb',
  '목소리/Noun'],
 '0')


In [4]:
tokens = [t for d in train_docs for t in d[0]]
print(len(tokens))

2194536


In [5]:
import nltk
text = nltk.Text(tokens, name='NMSC')
print(text)

<Text: NMSC>


In [6]:
print(len(text.tokens))                 # returns number of tokens
# => 2194536
print(len(set(text.tokens)))            # returns number of unique tokens
# => 48765
pprint(text.vocab().most_common(10)) 

2194536
48765
[('./Punctuation', 68630),
 ('영화/Noun', 51365),
 ('하다/Verb', 50281),
 ('이/Josa', 39123),
 ('보다/Verb', 34764),
 ('의/Josa', 30480),
 ('../Punctuation', 29055),
 ('에/Josa', 27108),
 ('가/Josa', 26696),
 ('을/Josa', 23481)]


In [7]:
nltk.download('stopwords')
text.collocations()

[nltk_data] Downloading package stopwords to /home/oj-
[nltk_data]     pjt-03/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
이/Determiner 것/Noun; 적/Suffix 인/Josa; 이/Determiner 거/Noun; 안/Noun
되다/Verb; 것/Noun 은/Josa; 10/Number 점/Noun; 배우/Noun 들/Suffix; 수/Noun
있다/Adjective; 이/Noun 게/Josa; 내/Noun 가/Josa; 최고/Noun 의/Josa; 네/Suffix
요/Josa; 이/Noun 영화/Noun; 끝/Noun 까지/Josa; 들/Suffix 이/Josa; 봐/Noun
도/Josa; 때문/Noun 에/Josa; 적/Suffix 으로/Josa; 사람/Noun 들/Suffix; 영화/Noun
를/Josa


In [8]:
import multiprocessing
cores = multiprocessing.cpu_count()
cores

4

In [9]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [None]:
doc_vectorizer = Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=8,        # distance between the predicted word and context words
    size=300,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=20,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=cores,   # multi cpu
    hs = 1,          # hierarchical softmax / default 0
    negative = 10,   # negative sampling / default 5
)

In [None]:
selected_words = [f[0] for f in text.vocab().most_common(2000)]
def term_exists(doc):
    return {'exists({})'.format(word): (word in set(doc)) for word in selected_words}
# 시간 단축을 위한 꼼수로 training corpus의 일부만 사용할 수 있음
train_docs = train_docs[:150000]
train_xy = [(term_exists(d), c) for d, c in train_docs]
test_xy = [(term_exists(d), c) for d, c in test_docs]

In [15]:
classifier = nltk.NaiveBayesClassifier.train(train_xy)
print(nltk.classify.accuracy(classifier, test_xy))
# => 0.80418
classifier.show_most_informative_features(10)

0.79246
Most Informative Features
         exists(수준/Noun) = True                0 : 1      =     25.6 : 1.0
          exists(굿/Noun) = True                1 : 0      =     21.0 : 1.0
   exists(아깝다/Adjective) = True                0 : 1      =     19.5 : 1.0
         exists(최악/Noun) = True                0 : 1      =     19.3 : 1.0
         exists(실망/Noun) = True                0 : 1      =     17.4 : 1.0
  exists(재미없다/Adjective) = True                0 : 1      =     16.7 : 1.0
        exists(쓰레기/Noun) = True                0 : 1      =     15.9 : 1.0
exists(ㅉㅉ/KoreanParticle) = True                0 : 1      =     15.7 : 1.0
       exists(♥/Foreign) = True                1 : 0      =     14.3 : 1.0
         exists(졸작/Noun) = True                0 : 1      =     12.3 : 1.0


In [4]:
from collections import namedtuple
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
# 여기서는 15만개 training documents 전부 사용함
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
#tagged_test_docs = [TaggedDocument(d, [c]) for d, c in test_docs]

In [6]:
tagged_train_docs[0]

TaggedDocument(words=['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증/Noun', '나다/Verb', '목소리/Noun'], tags=['0'])

In [8]:
import multiprocessing
cores = multiprocessing.cpu_count()
cores

4

In [9]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
from gensim.models import doc2vec
# 사전 구축
doc_vectorizer = doc2vec.Doc2Vec(dm=0,
                                                    vector_size=300, 
                                                    alpha=0.025, 
                                                    window=8, 
                                                    min_alpha=0.025, 
                                                    seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)
total_examples=doc_vectorizer.corpus_count
# Train document vectors!
print(str(doc_vectorizer))

2018-06-23 15:21:54,100 : INFO : collecting all words and their counts
2018-06-23 15:21:54,101 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-06-23 15:21:54,160 : INFO : PROGRESS: at example #10000, processed 149198 words (2575653/s), 12986 word types, 2 tags
2018-06-23 15:21:54,213 : INFO : PROGRESS: at example #20000, processed 294391 words (2729577/s), 18460 word types, 2 tags
2018-06-23 15:21:54,269 : INFO : PROGRESS: at example #30000, processed 442219 words (2711021/s), 22487 word types, 2 tags
2018-06-23 15:21:54,324 : INFO : PROGRESS: at example #40000, processed 591431 words (2731949/s), 26000 word types, 2 tags
2018-06-23 15:21:54,379 : INFO : PROGRESS: at example #50000, processed 737345 words (2660786/s), 28953 word types, 2 tags
2018-06-23 15:21:54,435 : INFO : PROGRESS: at example #60000, processed 883112 words (2654152/s), 31531 word types, 2 tags
2018-06-23 15:21:54,491 : INFO : PROGRESS: at example #70000, processed 1027398 words (

Doc2Vec(dbow,d300,n5,mc5,s0.001,t3)


In [11]:
doc_vectorizer.corpus_count, doc_vectorizer.iter

  """Entry point for launching an IPython kernel.


(150000, 5)

In [12]:
import time

In [13]:
for epoch in range(30):
    print(epoch)
    doc_vectorizer.train(tagged_train_docs, 
                         total_examples=doc_vectorizer.corpus_count, 
                         epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay


  """
2018-06-23 15:22:33,228 : INFO : training model with 3 workers on 14809 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=8


0


2018-06-23 15:22:34,256 : INFO : EPOCH 1 - PROGRESS: at 27.96% examples, 520002 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:35,272 : INFO : EPOCH 1 - PROGRESS: at 57.60% examples, 532391 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:36,289 : INFO : EPOCH 1 - PROGRESS: at 87.31% examples, 536415 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:22:36,696 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:22:36,711 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:22:36,715 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:22:36,715 : INFO : EPOCH - 1 : training on 2194536 raw words (1872147 effective words) took 3.5s, 538704 effective words/s
2018-06-23 15:22:37,723 : INFO : EPOCH 2 - PROGRESS: at 28.42% examples, 535013 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:38,728 : INFO : EPOCH 2 - PROGRESS: at 57.14% examples, 534596 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:39,760 :

1


2018-06-23 15:22:51,591 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 541346 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:22:52,614 : INFO : EPOCH 1 - PROGRESS: at 58.53% examples, 541344 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:53,620 : INFO : EPOCH 1 - PROGRESS: at 87.76% examples, 541597 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:54,004 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:22:54,020 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:22:54,029 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:22:54,030 : INFO : EPOCH - 1 : training on 2194536 raw words (1872176 effective words) took 3.4s, 543205 effective words/s
2018-06-23 15:22:55,068 : INFO : EPOCH 2 - PROGRESS: at 29.30% examples, 535838 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:56,089 : INFO : EPOCH 2 - PROGRESS: at 58.98% examples, 538772 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:22:57,091 :

2


2018-06-23 15:23:08,859 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 543191 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:23:09,880 : INFO : EPOCH 1 - PROGRESS: at 58.51% examples, 542534 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:10,891 : INFO : EPOCH 1 - PROGRESS: at 88.23% examples, 544503 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:11,267 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:23:11,276 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:23:11,286 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:23:11,287 : INFO : EPOCH - 1 : training on 2194536 raw words (1871833 effective words) took 3.4s, 545561 effective words/s
2018-06-23 15:23:12,296 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 543458 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:13,297 : INFO : EPOCH 2 - PROGRESS: at 57.60% examples, 539857 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:14,311 :

3


2018-06-23 15:23:26,117 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 539389 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:27,139 : INFO : EPOCH 1 - PROGRESS: at 58.53% examples, 540349 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:28,146 : INFO : EPOCH 1 - PROGRESS: at 88.19% examples, 543757 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:28,522 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:23:28,530 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:23:28,541 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:23:28,541 : INFO : EPOCH - 1 : training on 2194536 raw words (1872131 effective words) took 3.4s, 545032 effective words/s
2018-06-23 15:23:29,558 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 540071 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:30,576 : INFO : EPOCH 2 - PROGRESS: at 58.53% examples, 541853 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:23:31,587 :

4


2018-06-23 15:23:43,341 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 541429 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:44,362 : INFO : EPOCH 1 - PROGRESS: at 58.53% examples, 541851 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:45,366 : INFO : EPOCH 1 - PROGRESS: at 87.76% examples, 542294 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:45,757 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:23:45,782 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:23:45,786 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:23:45,786 : INFO : EPOCH - 1 : training on 2194536 raw words (1871619 effective words) took 3.5s, 542230 effective words/s
2018-06-23 15:23:46,812 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 540468 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:47,814 : INFO : EPOCH 2 - PROGRESS: at 58.05% examples, 542019 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:23:48,822 :

5


2018-06-23 15:24:00,541 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 541356 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:01,556 : INFO : EPOCH 1 - PROGRESS: at 58.05% examples, 539240 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:02,568 : INFO : EPOCH 1 - PROGRESS: at 87.31% examples, 539145 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:02,962 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:24:02,980 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:24:02,984 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:24:02,985 : INFO : EPOCH - 1 : training on 2194536 raw words (1871800 effective words) took 3.5s, 542382 effective words/s
2018-06-23 15:24:03,998 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 542334 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:05,016 : INFO : EPOCH 2 - PROGRESS: at 58.05% examples, 538476 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:06,022 :

6


2018-06-23 15:24:17,780 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 539275 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:18,787 : INFO : EPOCH 1 - PROGRESS: at 58.05% examples, 540043 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:19,817 : INFO : EPOCH 1 - PROGRESS: at 87.76% examples, 539331 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:20,206 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:24:20,214 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:24:20,220 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:24:20,221 : INFO : EPOCH - 1 : training on 2194536 raw words (1871472 effective words) took 3.5s, 542200 effective words/s
2018-06-23 15:24:21,242 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 537162 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:24:22,253 : INFO : EPOCH 2 - PROGRESS: at 58.53% examples, 542285 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:23,260 :

7


2018-06-23 15:24:35,043 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 535285 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:36,070 : INFO : EPOCH 1 - PROGRESS: at 58.51% examples, 537317 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:37,087 : INFO : EPOCH 1 - PROGRESS: at 88.23% examples, 539879 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:37,472 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:24:37,475 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:24:37,481 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:24:37,481 : INFO : EPOCH - 1 : training on 2194536 raw words (1872145 effective words) took 3.5s, 541614 effective words/s
2018-06-23 15:24:38,499 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 539106 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:39,500 : INFO : EPOCH 2 - PROGRESS: at 58.05% examples, 541878 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:40,514 :

8


2018-06-23 15:24:52,276 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 543290 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:53,292 : INFO : EPOCH 1 - PROGRESS: at 58.08% examples, 539884 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:54,292 : INFO : EPOCH 1 - PROGRESS: at 87.76% examples, 544564 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:24:54,690 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:24:54,698 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:24:54,709 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:24:54,709 : INFO : EPOCH - 1 : training on 2194536 raw words (1871659 effective words) took 3.4s, 544724 effective words/s
2018-06-23 15:24:55,720 : INFO : EPOCH 2 - PROGRESS: at 28.42% examples, 534722 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:56,725 : INFO : EPOCH 2 - PROGRESS: at 57.60% examples, 538529 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:24:57,729 :

9


2018-06-23 15:25:09,493 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 543337 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:10,500 : INFO : EPOCH 1 - PROGRESS: at 58.53% examples, 546679 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:11,511 : INFO : EPOCH 1 - PROGRESS: at 87.78% examples, 544316 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:25:11,904 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:25:11,910 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:25:11,920 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:25:11,920 : INFO : EPOCH - 1 : training on 2194536 raw words (1871715 effective words) took 3.4s, 545743 effective words/s
2018-06-23 15:25:12,944 : INFO : EPOCH 2 - PROGRESS: at 29.30% examples, 544117 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:13,949 : INFO : EPOCH 2 - PROGRESS: at 58.53% examples, 543462 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:14,953 :

10


2018-06-23 15:25:26,720 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 540000 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:25:27,735 : INFO : EPOCH 1 - PROGRESS: at 58.53% examples, 543048 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:28,755 : INFO : EPOCH 1 - PROGRESS: at 88.19% examples, 543257 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:25:29,128 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:25:29,136 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:25:29,138 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:25:29,139 : INFO : EPOCH - 1 : training on 2194536 raw words (1872560 effective words) took 3.4s, 546341 effective words/s
2018-06-23 15:25:30,154 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 540838 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:25:31,160 : INFO : EPOCH 2 - PROGRESS: at 58.53% examples, 545448 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:25:32,164 :

11


2018-06-23 15:25:43,899 : INFO : EPOCH 1 - PROGRESS: at 29.30% examples, 544162 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:44,909 : INFO : EPOCH 1 - PROGRESS: at 58.53% examples, 541959 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:45,946 : INFO : EPOCH 1 - PROGRESS: at 88.23% examples, 539346 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:46,312 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:25:46,319 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:25:46,329 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:25:46,330 : INFO : EPOCH - 1 : training on 2194536 raw words (1871773 effective words) took 3.4s, 542928 effective words/s
2018-06-23 15:25:47,338 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 544076 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:48,345 : INFO : EPOCH 2 - PROGRESS: at 58.05% examples, 542622 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:25:49,356 :

12


2018-06-23 15:26:01,128 : INFO : EPOCH 1 - PROGRESS: at 28.86% examples, 542758 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:26:02,129 : INFO : EPOCH 1 - PROGRESS: at 58.05% examples, 543742 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:03,135 : INFO : EPOCH 1 - PROGRESS: at 86.88% examples, 540586 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:03,561 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:26:03,579 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:26:03,582 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:26:03,582 : INFO : EPOCH - 1 : training on 2194536 raw words (1872353 effective words) took 3.5s, 541473 effective words/s
2018-06-23 15:26:04,595 : INFO : EPOCH 2 - PROGRESS: at 28.86% examples, 543424 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:05,620 : INFO : EPOCH 2 - PROGRESS: at 58.53% examples, 541666 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:06,644 :

13


2018-06-23 15:26:18,427 : INFO : EPOCH 1 - PROGRESS: at 29.75% examples, 560934 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:19,447 : INFO : EPOCH 1 - PROGRESS: at 60.34% examples, 560123 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:20,454 : INFO : EPOCH 1 - PROGRESS: at 90.97% examples, 562417 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:20,737 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:26:20,740 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:26:20,742 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:26:20,743 : INFO : EPOCH - 1 : training on 2194536 raw words (1871409 effective words) took 3.3s, 564090 effective words/s
2018-06-23 15:26:21,755 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 567530 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:26:22,755 : INFO : EPOCH 2 - PROGRESS: at 60.34% examples, 564619 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:23,756 :

14


2018-06-23 15:26:35,076 : INFO : EPOCH 1 - PROGRESS: at 29.75% examples, 561661 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:36,101 : INFO : EPOCH 1 - PROGRESS: at 60.37% examples, 559116 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:26:37,118 : INFO : EPOCH 1 - PROGRESS: at 90.98% examples, 560050 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:37,370 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:26:37,384 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:26:37,394 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:26:37,395 : INFO : EPOCH - 1 : training on 2194536 raw words (1871760 effective words) took 3.3s, 563907 effective words/s
2018-06-23 15:26:38,411 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 565112 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:39,411 : INFO : EPOCH 2 - PROGRESS: at 60.34% examples, 563602 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:40,423 :

15


2018-06-23 15:26:51,696 : INFO : EPOCH 1 - PROGRESS: at 30.18% examples, 560661 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:52,707 : INFO : EPOCH 1 - PROGRESS: at 60.82% examples, 562785 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:53,721 : INFO : EPOCH 1 - PROGRESS: at 90.97% examples, 560308 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:26:53,975 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:26:53,995 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:26:54,002 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:26:54,002 : INFO : EPOCH - 1 : training on 2194536 raw words (1872743 effective words) took 3.3s, 563376 effective words/s
2018-06-23 15:26:55,046 : INFO : EPOCH 2 - PROGRESS: at 30.65% examples, 557834 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:26:56,050 : INFO : EPOCH 2 - PROGRESS: at 61.28% examples, 563299 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:26:57,057 :

16


2018-06-23 15:27:08,316 : INFO : EPOCH 1 - PROGRESS: at 29.75% examples, 561768 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:09,322 : INFO : EPOCH 1 - PROGRESS: at 59.89% examples, 560513 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:27:10,341 : INFO : EPOCH 1 - PROGRESS: at 90.52% examples, 560650 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:27:10,618 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:27:10,630 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:27:10,632 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:27:10,633 : INFO : EPOCH - 1 : training on 2194536 raw words (1872841 effective words) took 3.3s, 564559 effective words/s
2018-06-23 15:27:11,657 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 560114 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:12,666 : INFO : EPOCH 2 - PROGRESS: at 60.81% examples, 563015 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:13,677 :

17


2018-06-23 15:27:24,956 : INFO : EPOCH 1 - PROGRESS: at 29.75% examples, 551462 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:25,972 : INFO : EPOCH 1 - PROGRESS: at 58.53% examples, 539887 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:27:26,982 : INFO : EPOCH 1 - PROGRESS: at 88.23% examples, 542870 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:27,360 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:27:27,368 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:27:27,371 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:27:27,371 : INFO : EPOCH - 1 : training on 2194536 raw words (1871693 effective words) took 3.4s, 545056 effective words/s
2018-06-23 15:27:28,383 : INFO : EPOCH 2 - PROGRESS: at 28.40% examples, 534073 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:27:29,390 : INFO : EPOCH 2 - PROGRESS: at 57.14% examples, 533345 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:30,399 :

18


2018-06-23 15:27:41,785 : INFO : EPOCH 1 - PROGRESS: at 29.75% examples, 556426 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:42,808 : INFO : EPOCH 1 - PROGRESS: at 60.34% examples, 557289 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:43,820 : INFO : EPOCH 1 - PROGRESS: at 90.98% examples, 559775 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:27:44,093 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:27:44,100 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:27:44,105 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:27:44,105 : INFO : EPOCH - 1 : training on 2194536 raw words (1872371 effective words) took 3.3s, 562245 effective words/s
2018-06-23 15:27:45,114 : INFO : EPOCH 2 - PROGRESS: at 29.75% examples, 560758 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:27:46,135 : INFO : EPOCH 2 - PROGRESS: at 60.34% examples, 560089 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:47,148 :

19


2018-06-23 15:27:58,412 : INFO : EPOCH 1 - PROGRESS: at 29.75% examples, 556935 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:27:59,436 : INFO : EPOCH 1 - PROGRESS: at 60.34% examples, 557161 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:00,447 : INFO : EPOCH 1 - PROGRESS: at 90.98% examples, 559740 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:28:00,711 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:28:00,724 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:28:00,731 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:28:00,732 : INFO : EPOCH - 1 : training on 2194536 raw words (1872179 effective words) took 3.3s, 562331 effective words/s
2018-06-23 15:28:01,749 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 563885 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:02,750 : INFO : EPOCH 2 - PROGRESS: at 59.90% examples, 558922 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:28:03,755 :

20


2018-06-23 15:28:15,045 : INFO : EPOCH 1 - PROGRESS: at 30.18% examples, 563565 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:16,073 : INFO : EPOCH 1 - PROGRESS: at 60.81% examples, 559070 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:17,094 : INFO : EPOCH 1 - PROGRESS: at 91.44% examples, 559302 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:17,328 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:28:17,350 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:28:17,354 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:28:17,354 : INFO : EPOCH - 1 : training on 2194536 raw words (1872251 effective words) took 3.3s, 563585 effective words/s
2018-06-23 15:28:18,379 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 561410 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:19,389 : INFO : EPOCH 2 - PROGRESS: at 60.81% examples, 563140 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:28:20,411 :

21


2018-06-23 15:28:31,693 : INFO : EPOCH 1 - PROGRESS: at 30.18% examples, 563512 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:32,704 : INFO : EPOCH 1 - PROGRESS: at 59.89% examples, 555694 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:28:33,714 : INFO : EPOCH 1 - PROGRESS: at 90.07% examples, 556136 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:34,031 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:28:34,032 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:28:34,044 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:28:34,045 : INFO : EPOCH - 1 : training on 2194536 raw words (1872249 effective words) took 3.4s, 556501 effective words/s
2018-06-23 15:28:35,078 : INFO : EPOCH 2 - PROGRESS: at 30.63% examples, 563873 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:28:36,083 : INFO : EPOCH 2 - PROGRESS: at 60.81% examples, 561751 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:37,084 :

22


2018-06-23 15:28:48,881 : INFO : EPOCH 1 - PROGRESS: at 28.42% examples, 529438 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:49,886 : INFO : EPOCH 1 - PROGRESS: at 58.99% examples, 548413 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:50,886 : INFO : EPOCH 1 - PROGRESS: at 89.13% examples, 553282 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:51,222 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:28:51,238 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:28:51,242 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:28:51,243 : INFO : EPOCH - 1 : training on 2194536 raw words (1872605 effective words) took 3.4s, 554638 effective words/s
2018-06-23 15:28:52,281 : INFO : EPOCH 2 - PROGRESS: at 30.63% examples, 562285 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:28:53,313 : INFO : EPOCH 2 - PROGRESS: at 61.73% examples, 561685 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:28:54,349 :

23


2018-06-23 15:29:05,514 : INFO : EPOCH 1 - PROGRESS: at 29.73% examples, 561231 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:06,530 : INFO : EPOCH 1 - PROGRESS: at 60.34% examples, 561248 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:07,532 : INFO : EPOCH 1 - PROGRESS: at 90.52% examples, 561512 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:07,815 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:29:07,827 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:29:07,835 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:29:07,835 : INFO : EPOCH - 1 : training on 2194536 raw words (1872324 effective words) took 3.3s, 563484 effective words/s
2018-06-23 15:29:08,844 : INFO : EPOCH 2 - PROGRESS: at 29.73% examples, 561756 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:09,851 : INFO : EPOCH 2 - PROGRESS: at 60.36% examples, 564264 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:10,856 :

24


2018-06-23 15:29:22,118 : INFO : EPOCH 1 - PROGRESS: at 30.18% examples, 562253 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:23,125 : INFO : EPOCH 1 - PROGRESS: at 60.34% examples, 560591 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:24,131 : INFO : EPOCH 1 - PROGRESS: at 90.53% examples, 560217 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:24,408 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:29:24,411 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:29:24,421 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:29:24,422 : INFO : EPOCH - 1 : training on 2194536 raw words (1872672 effective words) took 3.3s, 564264 effective words/s
2018-06-23 15:29:25,451 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 559551 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:29:26,466 : INFO : EPOCH 2 - PROGRESS: at 60.81% examples, 560875 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:27,474 :

25


2018-06-23 15:29:38,749 : INFO : EPOCH 1 - PROGRESS: at 30.18% examples, 564912 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:39,760 : INFO : EPOCH 1 - PROGRESS: at 60.34% examples, 560325 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:29:40,773 : INFO : EPOCH 1 - PROGRESS: at 90.97% examples, 561425 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:41,059 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:29:41,060 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:29:41,066 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:29:41,067 : INFO : EPOCH - 1 : training on 2194536 raw words (1871689 effective words) took 3.3s, 562498 effective words/s
2018-06-23 15:29:42,109 : INFO : EPOCH 2 - PROGRESS: at 30.63% examples, 558835 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:43,122 : INFO : EPOCH 2 - PROGRESS: at 61.28% examples, 561201 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:44,137 :

26


2018-06-23 15:29:55,354 : INFO : EPOCH 1 - PROGRESS: at 29.30% examples, 552432 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:56,358 : INFO : EPOCH 1 - PROGRESS: at 59.45% examples, 556256 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:29:57,360 : INFO : EPOCH 1 - PROGRESS: at 90.07% examples, 561012 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:57,668 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:29:57,673 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:29:57,680 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:29:57,680 : INFO : EPOCH - 1 : training on 2194536 raw words (1872715 effective words) took 3.3s, 562662 effective words/s
2018-06-23 15:29:58,691 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 568793 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:29:59,693 : INFO : EPOCH 2 - PROGRESS: at 60.34% examples, 564865 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:00,703 :

27


2018-06-23 15:30:11,988 : INFO : EPOCH 1 - PROGRESS: at 30.18% examples, 562490 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:12,999 : INFO : EPOCH 1 - PROGRESS: at 60.34% examples, 559253 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:14,005 : INFO : EPOCH 1 - PROGRESS: at 90.52% examples, 559305 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:14,290 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:30:14,305 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:30:14,305 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:30:14,306 : INFO : EPOCH - 1 : training on 2194536 raw words (1872464 effective words) took 3.3s, 561856 effective words/s
2018-06-23 15:30:15,316 : INFO : EPOCH 2 - PROGRESS: at 29.75% examples, 559832 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:16,321 : INFO : EPOCH 2 - PROGRESS: at 60.34% examples, 563818 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:30:17,348 :

28


2018-06-23 15:30:28,631 : INFO : EPOCH 1 - PROGRESS: at 30.18% examples, 560540 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:30:29,642 : INFO : EPOCH 1 - PROGRESS: at 60.81% examples, 562803 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:30,662 : INFO : EPOCH 1 - PROGRESS: at 90.97% examples, 559039 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:30:30,931 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:30:30,934 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:30:30,938 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:30:30,939 : INFO : EPOCH - 1 : training on 2194536 raw words (1872291 effective words) took 3.3s, 563008 effective words/s
2018-06-23 15:30:31,962 : INFO : EPOCH 2 - PROGRESS: at 29.30% examples, 544469 words/s, in_qsize 6, out_qsize 0
2018-06-23 15:30:32,968 : INFO : EPOCH 2 - PROGRESS: at 59.89% examples, 555663 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:33,976 :

29


2018-06-23 15:30:45,274 : INFO : EPOCH 1 - PROGRESS: at 29.75% examples, 560571 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:46,304 : INFO : EPOCH 1 - PROGRESS: at 60.36% examples, 557408 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:47,309 : INFO : EPOCH 1 - PROGRESS: at 90.97% examples, 561184 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:47,578 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-23 15:30:47,584 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-23 15:30:47,594 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-23 15:30:47,594 : INFO : EPOCH - 1 : training on 2194536 raw words (1872759 effective words) took 3.3s, 563545 effective words/s
2018-06-23 15:30:48,619 : INFO : EPOCH 2 - PROGRESS: at 30.18% examples, 559792 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:49,624 : INFO : EPOCH 2 - PROGRESS: at 60.81% examples, 563697 words/s, in_qsize 5, out_qsize 0
2018-06-23 15:30:50,634 :

In [14]:
model_name = 'Doc2vec_dm_naver.model'
doc_vectorizer.save(model_name)

2018-06-23 16:17:43,011 : INFO : saving Doc2Vec object under Doc2vec_dm_naver.model, separately None
2018-06-23 16:17:43,352 : INFO : saved Doc2vec_dm_naver.model


In [15]:
doc_dbow_naver = doc2vec.Doc2Vec.load(model_name)


2018-06-23 16:18:00,620 : INFO : loading Doc2Vec object from Doc2vec_dm_naver.model
2018-06-23 16:18:00,880 : INFO : loading vocabulary recursively from Doc2vec_dm_naver.model.vocabulary.* with mmap=None
2018-06-23 16:18:00,881 : INFO : loading trainables recursively from Doc2vec_dm_naver.model.trainables.* with mmap=None
2018-06-23 16:18:00,881 : INFO : loading wv recursively from Doc2vec_dm_naver.model.wv.* with mmap=None
2018-06-23 16:18:00,882 : INFO : loading docvecs recursively from Doc2vec_dm_naver.model.docvecs.* with mmap=None
2018-06-23 16:18:00,883 : INFO : loaded Doc2vec_dm_naver.model


In [19]:
pprint(doc_dbow_naver.most_similar('남자/Noun'))


[('에반스/Noun', 0.24304839968681335),
 ('비중/Noun', 0.22372911870479584),
 ('업자/Noun', 0.21670472621917725),
 ('의드/Noun', 0.20631510019302368),
 ('극장판/Noun', 0.20278209447860718),
 ('세바퀴/Noun', 0.19723764061927795),
 ('빛/Noun', 0.19628781080245972),
 ('고혹/Noun', 0.1916559338569641),
 ('조이/Noun', 0.18985727429389954),
 ('게/Noun', 0.18824177980422974)]


  """Entry point for launching an IPython kernel.


In [21]:
texts=['이게 영화야? '.split(' '),
       '애니는 일본이 갑인듯'.split(' '),
       '롭 코헨의 몰락의 OO점 '.split(' '),
       '감동적이다.... '.split(' '),
       '제임스 헷필드 50먹고 더 파워풀해졌어ㅋㅋㅋㅋ'.split(' ')
       ]       
print(texts)
for txt in texts:
    print(doc_vectorizer.docvecs.most_similar(positive=[doc_vectorizer.infer_vector(txt)],topn=2))


[['이게', '영화야?', ''], ['애니는', '일본이', '갑인듯'], ['롭', '코헨의', '몰락의', 'OO점', ''], ['감동적이다....', ''], ['제임스', '헷필드', '50먹고', '더', '파워풀해졌어ㅋㅋㅋㅋ']]
[('1', -0.09687576442956924), ('0', -0.0998920425772667)]
[('0', 0.0011044591665267944), ('1', -0.0052378810942173)]
[('1', -0.015255899168550968), ('0', -0.048225872218608856)]
[('1', -0.00586763396859169), ('0', -0.01486484706401825)]
[('0', -0.01680712029337883), ('1', -0.08324410766363144)]
