In [None]:
%%bash
apt-get update
apt-get install g++ openjdk-8-jdk python-dev python3-dev
pip3 install JPype1
pip3 install konlpy

In [4]:
%env JAVA_HOME "/usr/lib/jvm/java-8-openjdk-amd64"

env: JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"


In [None]:
%%bash
bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
pip3 install /tmp/mecab-python-0.996

In [1]:
import konlpy
from konlpy.tag import Kkma, Komoran, Hannanum, Okt
from konlpy.utils import pprint
from konlpy.tag import Mecab
from tqdm import tqdm
import re
import pickle
import csv
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./df_contents.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,review
0,0,우리들의 블루스,이 화려한 캐스팅 라인업을 보고 안 볼수가 있나.. ㄷㄷㄷ 게다가 노희경 극본..ㄷ...
1,1,우리들의 블루스,"침묵의 순간들이 더욱 처연하게 빛나는, 노희경 극본의 또 다른 경지."
2,2,우리들의 블루스,세상에..노희경 작가와 이 배우들이라면 어떤 스토리가 됐든 꼭 보고싶다..
3,3,우리들의 블루스,하 진짜 좋아하는 배우 전부 다 나오네…\r\n.\r\n한수의 위험한 생각이 뭔지 ...
4,4,우리들의 블루스,이 작품 뒤에 결혼할것 같은 두 찐커플이 출연


In [5]:
def clean_text(text):
    text = text.replace(".", "").strip()
    text = text.replace("·", " ").strip()
    pattern = '[^ ㄱ-ㅣ가-힣|0-9]+'
    text = re.sub(pattern=pattern, repl='', string=text)
    return text

In [6]:
def get_nouns(tokenizer, sentence):
    tagged = tokenizer.pos(sentence)
    nouns = [s for s, t in tagged if t in ['NNG', 'NNP'] and len(s) >1]
    return nouns

def tokenize(df):
    tokenizer = Mecab()
    processed_data = []
    for sent in tqdm(df['review']):
        sentence = clean_text(str(sent).replace("\n", "").strip())
        processed_data.append(get_nouns(tokenizer, sentence))
    return processed_data

In [7]:
preprocessed_data = tokenize(df)
preprocessed_data[:5]

100%|██████████████████████████████████| 25676/25676 [00:14<00:00, 1780.80it/s]


[['캐스팅', '라인업', '노희경', '극본', '몰입', '차승원'],
 ['침묵', '순간', '노희경', '극본', '경지'],
 ['세상', '노희경', '작가', '배우', '스토리'],
 ['배우', '한수', '위험', '생각', '고통'],
 ['작품', '결혼', '커플', '출연']]

In [8]:
from gensim.models.ldamodel import LdaModel
from gensim.models.callbacks import CoherenceMetric
from gensim import corpora
from gensim.models.callbacks import PerplexityMetric

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
dictionary = corpora.Dictionary(preprocessed_data)

2022-04-15 18:18:11,785 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-04-15 18:18:12,440 : INFO : adding document #10000 to Dictionary(10735 unique tokens: ['극본', '노희경', '라인업', '몰입', '차승원']...)
2022-04-15 18:18:12,967 : INFO : adding document #20000 to Dictionary(14872 unique tokens: ['극본', '노희경', '라인업', '몰입', '차승원']...)
2022-04-15 18:18:13,215 : INFO : built Dictionary(16060 unique tokens: ['극본', '노희경', '라인업', '몰입', '차승원']...) from 25676 documents (total 275668 corpus positions)
2022-04-15 18:18:13,217 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(16060 unique tokens: ['극본', '노희경', '라인업', '몰입', '차승원']...) from 25676 documents (total 275668 corpus positions)", 'datetime': '2022-04-15T18:18:13.217831', 'gensim': '4.1.2', 'python': '3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-8.1-6.3.9600-SP0', 'event': 'created'}


In [11]:
dictionary.filter_extremes(no_below=2, no_above=0.5)

2022-04-15 18:18:50,710 : INFO : discarding 7032 tokens: [('강세', 1), ('진에', 1), ('우빈', 1), ('파라다이스', 1), ('후감', 1), ('김규태', 1), ('프랜즈', 1), ('다저', 1), ('코빅', 1), ('돈때', 1)]...
2022-04-15 18:18:50,713 : INFO : keeping 9028 tokens which were in no less than 2 and no more than 12838 (=50.0%) documents
2022-04-15 18:18:50,744 : INFO : resulting dictionary: Dictionary(9028 unique tokens: ['극본', '노희경', '라인업', '몰입', '차승원']...)


In [13]:
corpus = [dictionary.doc2bow(text) for text in preprocessed_data]

In [14]:
num_topics = 4
chunksize = 2000
passes = 20
iterations = 400
eval_every = None

temp = dictionary[0]
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=1
)

2022-04-15 18:19:34,991 : INFO : using autotuned alpha, starting with [0.25, 0.25, 0.25, 0.25]
2022-04-15 18:19:34,999 : INFO : using serial LDA version on this node
2022-04-15 18:19:35,014 : INFO : running online (multi-pass) LDA training, 4 topics, 20 passes over the supplied corpus of 25676 documents, updating model once every 2000 documents, evaluating perplexity every 2000 documents, iterating 400x with a convergence threshold of 0.001000
2022-04-15 18:19:37,884 : INFO : -9.821 per-word bound, 904.4 perplexity estimate based on a held-out corpus of 2000 documents with 20937 words
2022-04-15 18:19:37,886 : INFO : PROGRESS: pass 0, at document #2000/25676
2022-04-15 18:19:40,284 : INFO : optimized alpha [0.20886348, 0.20059188, 0.19753723, 0.17163032]
2022-04-15 18:19:40,291 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:19:40,299 : INFO : topic #0 (0.209): 0.029*"사랑" + 0.022*"배우" + 0.015*"드라마" + 0.014*"연기" + 0.012*"작가" + 0.012*"청춘" + 0.01

2022-04-15 18:19:57,999 : INFO : topic #1 (0.193): 0.027*"사람" + 0.022*"학교" + 0.020*"생각" + 0.013*"드라마" + 0.010*"시간" + 0.010*"사회" + 0.009*"이야기" + 0.009*"학생" + 0.008*"폭력" + 0.007*"필요"
2022-04-15 18:19:58,003 : INFO : topic #2 (0.266): 0.060*"드라마" + 0.021*"연기" + 0.016*"소희" + 0.016*"연출" + 0.015*"원작" + 0.015*"웹툰" + 0.015*"배우" + 0.014*"장면" + 0.012*"장르" + 0.009*"정도"
2022-04-15 18:19:58,006 : INFO : topic #3 (0.188): 0.019*"신파" + 0.016*"캐릭터" + 0.015*"게임" + 0.015*"작품" + 0.011*"액션" + 0.010*"영화" + 0.010*"한국" + 0.010*"드라마" + 0.009*"정도" + 0.008*"장면"
2022-04-15 18:19:58,009 : INFO : topic diff=0.456954, rho=0.377964
2022-04-15 18:19:59,566 : INFO : -8.153 per-word bound, 284.6 perplexity estimate based on a held-out corpus of 2000 documents with 22102 words
2022-04-15 18:19:59,567 : INFO : PROGRESS: pass 0, at document #16000/25676
2022-04-15 18:20:00,947 : INFO : optimized alpha [0.2287178, 0.19310127, 0.28374416, 0.19464396]
2022-04-15 18:20:00,954 : INFO : merging changes from 2000 documents into 

2022-04-15 18:20:17,059 : INFO : topic #0 (0.238): 0.016*"배우" + 0.015*"캐릭터" + 0.013*"아저씨" + 0.013*"작품" + 0.012*"지옥" + 0.011*"연기" + 0.010*"생각" + 0.010*"드라마" + 0.009*"사랑" + 0.009*"이야기"
2022-04-15 18:20:17,062 : INFO : topic #1 (0.241): 0.068*"사람" + 0.025*"사랑" + 0.023*"생각" + 0.021*"어른" + 0.020*"행복" + 0.015*"드라마" + 0.013*"세상" + 0.013*"이야기" + 0.013*"마음" + 0.010*"현실"
2022-04-15 18:20:17,065 : INFO : topic #2 (0.348): 0.117*"드라마" + 0.019*"대사" + 0.018*"연기" + 0.017*"지안" + 0.015*"배우" + 0.015*"인생" + 0.009*"연출" + 0.009*"작품" + 0.009*"결말" + 0.009*"생각"
2022-04-15 18:20:17,068 : INFO : topic #3 (0.165): 0.017*"작품" + 0.015*"감독" + 0.011*"영화" + 0.010*"캐릭터" + 0.010*"연상호" + 0.008*"드라마" + 0.008*"한국" + 0.007*"인간" + 0.007*"신파" + 0.006*"연민"
2022-04-15 18:20:17,071 : INFO : topic diff=0.407928, rho=0.259605
2022-04-15 18:20:19,016 : INFO : -8.070 per-word bound, 268.8 perplexity estimate based on a held-out corpus of 2000 documents with 30688 words
2022-04-15 18:20:19,017 : INFO : PROGRESS: pass 1, at document 

2022-04-15 18:20:31,715 : INFO : PROGRESS: pass 1, at document #16000/25676
2022-04-15 18:20:32,570 : INFO : optimized alpha [0.25991154, 0.24470142, 0.45624977, 0.20453407]
2022-04-15 18:20:32,577 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:20:32,584 : INFO : topic #0 (0.260): 0.019*"지옥" + 0.018*"캐릭터" + 0.016*"배우" + 0.012*"작품" + 0.012*"시즌" + 0.011*"연기" + 0.011*"생각" + 0.008*"이야기" + 0.008*"종교" + 0.007*"장면"
2022-04-15 18:20:32,586 : INFO : topic #1 (0.245): 0.047*"사람" + 0.021*"생각" + 0.015*"세상" + 0.014*"인간" + 0.011*"사랑" + 0.011*"이야기" + 0.011*"학교" + 0.009*"사회" + 0.009*"행복" + 0.009*"드라마"
2022-04-15 18:20:32,589 : INFO : topic #2 (0.456): 0.065*"드라마" + 0.027*"연기" + 0.022*"소희" + 0.021*"배우" + 0.015*"연출" + 0.013*"대사" + 0.013*"한소희" + 0.012*"스토리" + 0.012*"액션" + 0.012*"장면"
2022-04-15 18:20:32,592 : INFO : topic #3 (0.205): 0.022*"게임" + 0.016*"작품" + 0.014*"신파" + 0.013*"영화" + 0.011*"감독" + 0.010*"연상호" + 0.010*"한국" + 0.010*"캐릭터" + 0.010*"오징어" + 0.008*"인간"

2022-04-15 18:20:44,203 : INFO : topic diff=0.331968, rho=0.251275
2022-04-15 18:20:45,383 : INFO : -7.899 per-word bound, 238.8 perplexity estimate based on a held-out corpus of 2000 documents with 30688 words
2022-04-15 18:20:45,384 : INFO : PROGRESS: pass 2, at document #4000/25676
2022-04-15 18:20:46,235 : INFO : optimized alpha [0.27253523, 0.31628844, 0.5548747, 0.19436151]
2022-04-15 18:20:46,243 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:20:46,250 : INFO : topic #0 (0.273): 0.017*"캐릭터" + 0.014*"배우" + 0.012*"시즌" + 0.012*"작품" + 0.010*"지옥" + 0.010*"생각" + 0.010*"이야기" + 0.009*"연기" + 0.008*"작가" + 0.007*"인물"
2022-04-15 18:20:46,252 : INFO : topic #1 (0.316): 0.066*"사람" + 0.022*"생각" + 0.022*"사랑" + 0.017*"어른" + 0.016*"행복" + 0.014*"이야기" + 0.013*"세상" + 0.012*"마음" + 0.011*"인간" + 0.009*"현실"
2022-04-15 18:20:46,255 : INFO : topic #2 (0.555): 0.103*"드라마" + 0.023*"연기" + 0.019*"배우" + 0.018*"대사" + 0.011*"인생" + 0.011*"캐릭터" + 0.011*"연출" + 0.010*"작품" 

2022-04-15 18:20:59,310 : INFO : topic #3 (0.235): 0.024*"게임" + 0.015*"작품" + 0.015*"신파" + 0.013*"영화" + 0.011*"연상호" + 0.011*"한국" + 0.010*"감독" + 0.010*"오징어" + 0.009*"사회" + 0.008*"캐릭터"
2022-04-15 18:20:59,311 : INFO : topic diff=0.298504, rho=0.251275
2022-04-15 18:21:00,436 : INFO : -7.522 per-word bound, 183.8 perplexity estimate based on a held-out corpus of 2000 documents with 19928 words
2022-04-15 18:21:00,437 : INFO : PROGRESS: pass 2, at document #18000/25676
2022-04-15 18:21:01,205 : INFO : optimized alpha [0.31343707, 0.32400796, 0.6519677, 0.24441315]
2022-04-15 18:21:01,211 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:21:01,219 : INFO : topic #0 (0.313): 0.033*"지옥" + 0.021*"시즌" + 0.015*"종교" + 0.014*"캐릭터" + 0.012*"작품" + 0.011*"배우" + 0.010*"생각" + 0.009*"이야기" + 0.008*"연기" + 0.007*"인물"
2022-04-15 18:21:01,222 : INFO : topic #1 (0.324): 0.052*"사람" + 0.026*"인간" + 0.021*"생각" + 0.021*"세상" + 0.011*"사랑" + 0.011*"이야기" + 0.009*"현실" + 0.008*"존재

2022-04-15 18:21:11,975 : INFO : topic #2 (0.766): 0.099*"드라마" + 0.024*"연기" + 0.021*"배우" + 0.019*"대사" + 0.013*"캐릭터" + 0.013*"작품" + 0.012*"생각" + 0.011*"연출" + 0.010*"인생" + 0.010*"장면"
2022-04-15 18:21:11,980 : INFO : topic #3 (0.228): 0.046*"게임" + 0.016*"오징어" + 0.014*"작품" + 0.012*"영화" + 0.011*"한국" + 0.009*"감독" + 0.009*"사회" + 0.008*"신파" + 0.008*"캐릭터" + 0.007*"범죄"
2022-04-15 18:21:11,983 : INFO : topic diff=0.426902, rho=0.243700
2022-04-15 18:21:13,046 : INFO : -7.479 per-word bound, 178.4 perplexity estimate based on a held-out corpus of 2000 documents with 22561 words
2022-04-15 18:21:13,048 : INFO : PROGRESS: pass 3, at document #6000/25676
2022-04-15 18:21:13,947 : INFO : optimized alpha [0.31935227, 0.3995345, 0.7518459, 0.24968491]
2022-04-15 18:21:13,954 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:21:13,961 : INFO : topic #0 (0.319): 0.018*"시즌" + 0.015*"캐릭터" + 0.011*"작품" + 0.010*"지옥" + 0.009*"배우" + 0.009*"생각" + 0.009*"이야기" + 0.008*"인물" 

2022-04-15 18:21:26,057 : INFO : topic #1 (0.406): 0.055*"사람" + 0.027*"인간" + 0.021*"세상" + 0.021*"생각" + 0.013*"사랑" + 0.011*"이야기" + 0.008*"존재" + 0.008*"현실" + 0.008*"행복" + 0.008*"마음"
2022-04-15 18:21:26,060 : INFO : topic #2 (0.854): 0.061*"드라마" + 0.031*"연기" + 0.023*"배우" + 0.017*"연출" + 0.015*"소희" + 0.015*"장면" + 0.014*"대사" + 0.014*"캐릭터" + 0.014*"스토리" + 0.014*"작품"
2022-04-15 18:21:26,064 : INFO : topic #3 (0.288): 0.022*"게임" + 0.020*"연상호" + 0.016*"작품" + 0.015*"신파" + 0.014*"감독" + 0.013*"영화" + 0.011*"사회" + 0.011*"오징어" + 0.010*"한국" + 0.009*"자극"
2022-04-15 18:21:26,068 : INFO : topic diff=0.234563, rho=0.243700
2022-04-15 18:21:27,140 : INFO : -7.423 per-word bound, 171.6 perplexity estimate based on a held-out corpus of 2000 documents with 18928 words
2022-04-15 18:21:27,142 : INFO : PROGRESS: pass 3, at document #20000/25676
2022-04-15 18:21:27,948 : INFO : optimized alpha [0.3768687, 0.4126754, 0.8839161, 0.2871046]
2022-04-15 18:21:27,955 : INFO : merging changes from 2000 documents into a 

2022-04-15 18:21:37,579 : INFO : topic #0 (0.363): 0.020*"시즌" + 0.013*"캐릭터" + 0.012*"지옥" + 0.009*"작품" + 0.008*"이야기" + 0.008*"인물" + 0.007*"이병헌" + 0.007*"생각" + 0.007*"다음" + 0.007*"배우"
2022-04-15 18:21:37,582 : INFO : topic #1 (0.489): 0.075*"사람" + 0.022*"생각" + 0.021*"사랑" + 0.016*"어른" + 0.015*"행복" + 0.014*"인간" + 0.014*"세상" + 0.013*"마음" + 0.013*"이야기" + 0.011*"아저씨"
2022-04-15 18:21:37,585 : INFO : topic #2 (0.931): 0.088*"드라마" + 0.025*"연기" + 0.021*"배우" + 0.020*"대사" + 0.016*"캐릭터" + 0.015*"작품" + 0.014*"생각" + 0.012*"연출" + 0.011*"마지막" + 0.011*"장면"
2022-04-15 18:21:37,588 : INFO : topic #3 (0.294): 0.066*"게임" + 0.018*"오징어" + 0.014*"작품" + 0.013*"한국" + 0.011*"영화" + 0.010*"사회" + 0.009*"장르" + 0.009*"신파" + 0.007*"자극" + 0.007*"감독"
2022-04-15 18:21:37,590 : INFO : topic diff=0.247618, rho=0.236770
2022-04-15 18:21:38,497 : INFO : -7.584 per-word bound, 191.8 perplexity estimate based on a held-out corpus of 2000 documents with 23710 words
2022-04-15 18:21:38,499 : INFO : PROGRESS: pass 4, at document #

2022-04-15 18:21:48,316 : INFO : optimized alpha [0.42473438, 0.49063653, 1.0687221, 0.3369945]
2022-04-15 18:21:48,323 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:21:48,329 : INFO : topic #0 (0.425): 0.037*"지옥" + 0.027*"시즌" + 0.016*"종교" + 0.011*"감독" + 0.010*"이병헌" + 0.010*"캐릭터" + 0.009*"이야기" + 0.008*"작품" + 0.007*"유아인" + 0.007*"메시지"
2022-04-15 18:21:48,331 : INFO : topic #1 (0.491): 0.057*"사람" + 0.029*"인간" + 0.022*"세상" + 0.021*"사랑" + 0.021*"생각" + 0.010*"마음" + 0.010*"이야기" + 0.009*"행복" + 0.009*"시간" + 0.008*"현실"
2022-04-15 18:21:48,334 : INFO : topic #2 (1.069): 0.068*"드라마" + 0.028*"연기" + 0.023*"배우" + 0.020*"대사" + 0.017*"캐릭터" + 0.016*"연출" + 0.014*"작품" + 0.014*"생각" + 0.014*"장면" + 0.012*"스토리"
2022-04-15 18:21:48,336 : INFO : topic #3 (0.337): 0.022*"연상호" + 0.021*"게임" + 0.015*"신파" + 0.015*"작품" + 0.015*"감독" + 0.013*"사회" + 0.012*"영화" + 0.011*"오징어" + 0.009*"한국" + 0.009*"자극"
2022-04-15 18:21:48,337 : INFO : topic diff=0.209056, rho=0.236770
2022-04-1

2022-04-15 18:21:58,061 : INFO : -7.560 per-word bound, 188.7 perplexity estimate based on a held-out corpus of 2000 documents with 23710 words
2022-04-15 18:21:58,062 : INFO : PROGRESS: pass 5, at document #8000/25676
2022-04-15 18:21:58,655 : INFO : optimized alpha [0.4203076, 0.57936394, 1.0818496, 0.32498083]
2022-04-15 18:21:58,660 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:21:58,667 : INFO : topic #0 (0.420): 0.013*"이병헌" + 0.013*"시즌" + 0.011*"역사" + 0.009*"캐릭터" + 0.009*"조선" + 0.008*"김혜자" + 0.008*"애신" + 0.008*"지옥" + 0.008*"김은숙" + 0.008*"인물"
2022-04-15 18:21:58,669 : INFO : topic #1 (0.579): 0.064*"사람" + 0.022*"사랑" + 0.021*"생각" + 0.017*"행복" + 0.014*"세상" + 0.014*"마음" + 0.012*"인생" + 0.012*"어른" + 0.011*"인간" + 0.011*"이야기"
2022-04-15 18:21:58,670 : INFO : topic #2 (1.082): 0.096*"드라마" + 0.027*"연기" + 0.024*"배우" + 0.021*"대사" + 0.016*"작품" + 0.016*"캐릭터" + 0.015*"생각" + 0.013*"마지막" + 0.013*"연출" + 0.012*"장면"
2022-04-15 18:21:58,672 : INFO : topic 

2022-04-15 18:22:09,300 : INFO : topic #3 (0.389): 0.022*"연상호" + 0.021*"게임" + 0.015*"신파" + 0.015*"작품" + 0.014*"사회" + 0.013*"감독" + 0.012*"영화" + 0.011*"오징어" + 0.009*"한국" + 0.009*"자극"
2022-04-15 18:22:09,305 : INFO : topic diff=0.198359, rho=0.230400
2022-04-15 18:22:10,424 : INFO : -7.136 per-word bound, 140.7 perplexity estimate based on a held-out corpus of 2000 documents with 17506 words
2022-04-15 18:22:10,427 : INFO : PROGRESS: pass 5, at document #22000/25676
2022-04-15 18:22:11,069 : INFO : optimized alpha [0.46880868, 0.5748606, 1.292837, 0.37554985]
2022-04-15 18:22:11,075 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:22:11,083 : INFO : topic #0 (0.469): 0.032*"지옥" + 0.026*"시즌" + 0.017*"이병헌" + 0.014*"종교" + 0.012*"감독" + 0.008*"캐릭터" + 0.008*"이야기" + 0.008*"폭력" + 0.007*"인물" + 0.007*"유아인"
2022-04-15 18:22:11,086 : INFO : topic #1 (0.575): 0.063*"사람" + 0.028*"사랑" + 0.025*"인간" + 0.020*"생각" + 0.020*"세상" + 0.012*"마음" + 0.012*"행복" + 0.010*"시간" 

2022-04-15 18:22:20,685 : INFO : topic #2 (1.208): 0.094*"드라마" + 0.027*"연기" + 0.025*"배우" + 0.020*"대사" + 0.017*"작품" + 0.017*"캐릭터" + 0.016*"생각" + 0.013*"연출" + 0.013*"마지막" + 0.012*"장면"
2022-04-15 18:22:20,688 : INFO : topic #3 (0.370): 0.061*"게임" + 0.017*"오징어" + 0.013*"작품" + 0.013*"한국" + 0.011*"사회" + 0.010*"장르" + 0.010*"영화" + 0.009*"신파" + 0.007*"자극" + 0.007*"넷플릭스"
2022-04-15 18:22:20,691 : INFO : topic diff=0.271110, rho=0.224518
2022-04-15 18:22:21,727 : INFO : -7.526 per-word bound, 184.3 perplexity estimate based on a held-out corpus of 2000 documents with 16219 words
2022-04-15 18:22:21,728 : INFO : PROGRESS: pass 6, at document #10000/25676
2022-04-15 18:22:22,305 : INFO : optimized alpha [0.45262837, 0.65057284, 1.2728716, 0.36264327]
2022-04-15 18:22:22,311 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:22:22,320 : INFO : topic #0 (0.453): 0.012*"김혜자" + 0.012*"시즌" + 0.012*"이병헌" + 0.010*"역사" + 0.009*"혜자" + 0.008*"캐릭터" + 0.008*"조선" + 0.007*

2022-04-15 18:22:31,968 : INFO : topic #1 (0.633): 0.064*"사람" + 0.029*"사랑" + 0.025*"인간" + 0.020*"생각" + 0.020*"세상" + 0.013*"마음" + 0.012*"행복" + 0.010*"시간" + 0.010*"이야기" + 0.008*"인생"
2022-04-15 18:22:31,972 : INFO : topic #2 (1.423): 0.082*"드라마" + 0.025*"대사" + 0.023*"연기" + 0.022*"배우" + 0.020*"캐릭터" + 0.016*"작품" + 0.015*"생각" + 0.015*"연출" + 0.013*"장면" + 0.011*"마지막"
2022-04-15 18:22:31,976 : INFO : topic #3 (0.423): 0.020*"연상호" + 0.020*"게임" + 0.015*"사회" + 0.015*"작품" + 0.014*"신파" + 0.013*"감독" + 0.012*"영화" + 0.010*"오징어" + 0.010*"한국" + 0.009*"넷플릭스"
2022-04-15 18:22:31,980 : INFO : topic diff=0.174518, rho=0.224518
2022-04-15 18:22:32,829 : INFO : -7.119 per-word bound, 139.0 perplexity estimate based on a held-out corpus of 2000 documents with 20074 words
2022-04-15 18:22:32,831 : INFO : PROGRESS: pass 6, at document #24000/25676
2022-04-15 18:22:33,373 : INFO : optimized alpha [0.47857246, 0.69020283, 1.363622, 0.40227604]
2022-04-15 18:22:33,379 : INFO : merging changes from 2000 documents int

2022-04-15 18:22:42,174 : INFO : topic #0 (0.478): 0.012*"시즌" + 0.012*"김혜자" + 0.012*"이병헌" + 0.010*"역사" + 0.010*"혜자" + 0.008*"조선" + 0.008*"지옥" + 0.007*"애신" + 0.007*"인물" + 0.007*"김은숙"
2022-04-15 18:22:42,177 : INFO : topic #1 (0.704): 0.062*"사람" + 0.024*"사랑" + 0.020*"생각" + 0.017*"행복" + 0.015*"마음" + 0.014*"인생" + 0.014*"세상" + 0.013*"하루" + 0.010*"순간" + 0.010*"인간"
2022-04-15 18:22:42,181 : INFO : topic #2 (1.377): 0.091*"드라마" + 0.026*"연기" + 0.024*"배우" + 0.020*"캐릭터" + 0.018*"대사" + 0.016*"생각" + 0.015*"작품" + 0.014*"마지막" + 0.012*"연출" + 0.011*"장면"
2022-04-15 18:22:42,185 : INFO : topic #3 (0.402): 0.056*"게임" + 0.015*"오징어" + 0.013*"한국" + 0.013*"작품" + 0.011*"사회" + 0.010*"장르" + 0.010*"영화" + 0.009*"신파" + 0.007*"자극" + 0.007*"넷플릭스"
2022-04-15 18:22:42,188 : INFO : topic diff=0.174049, rho=0.219064
2022-04-15 18:22:43,109 : INFO : -7.732 per-word bound, 212.6 perplexity estimate based on a held-out corpus of 2000 documents with 25795 words
2022-04-15 18:22:43,111 : INFO : PROGRESS: pass 7, at document #

2022-04-15 18:22:53,331 : INFO : optimized alpha [0.5008555, 0.737818, 1.4594748, 0.4403963]
2022-04-15 18:22:53,337 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:22:53,346 : INFO : topic #0 (0.501): 0.031*"지옥" + 0.024*"시즌" + 0.016*"이병헌" + 0.013*"종교" + 0.010*"폭력" + 0.009*"감독" + 0.007*"이야기" + 0.007*"메시지" + 0.007*"유아인" + 0.007*"인물"
2022-04-15 18:22:53,349 : INFO : topic #1 (0.738): 0.072*"사람" + 0.025*"사랑" + 0.020*"인간" + 0.019*"생각" + 0.018*"아저씨" + 0.018*"어른" + 0.017*"세상" + 0.015*"행복" + 0.014*"마음" + 0.013*"지안"
2022-04-15 18:22:53,353 : INFO : topic #2 (1.459): 0.099*"드라마" + 0.023*"대사" + 0.022*"연기" + 0.020*"배우" + 0.019*"캐릭터" + 0.018*"작품" + 0.015*"생각" + 0.013*"장면" + 0.013*"연출" + 0.011*"마지막"
2022-04-15 18:22:53,356 : INFO : topic #3 (0.440): 0.018*"게임" + 0.018*"연상호" + 0.016*"사회" + 0.015*"작품" + 0.013*"신파" + 0.011*"감독" + 0.011*"영화" + 0.010*"한국" + 0.009*"넷플릭스" + 0.009*"오징어"
2022-04-15 18:22:53,360 : INFO : topic diff=0.209437, rho=0.219064
2022-04-15 

2022-04-15 18:23:03,757 : INFO : -7.720 per-word bound, 210.8 perplexity estimate based on a held-out corpus of 2000 documents with 25795 words
2022-04-15 18:23:03,759 : INFO : PROGRESS: pass 8, at document #12000/25676
2022-04-15 18:23:04,451 : INFO : optimized alpha [0.5023738, 0.7168353, 1.4817619, 0.4685453]
2022-04-15 18:23:04,458 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:23:04,466 : INFO : topic #0 (0.502): 0.014*"시즌" + 0.011*"폭력" + 0.010*"이병헌" + 0.010*"김혜자" + 0.008*"역사" + 0.008*"인물" + 0.007*"혜자" + 0.007*"지옥" + 0.007*"조선" + 0.007*"시대"
2022-04-15 18:23:04,468 : INFO : topic #1 (0.717): 0.062*"사람" + 0.023*"사랑" + 0.020*"생각" + 0.015*"행복" + 0.014*"마음" + 0.013*"세상" + 0.013*"인생" + 0.012*"하루" + 0.011*"어른" + 0.011*"시간"
2022-04-15 18:23:04,471 : INFO : topic #2 (1.482): 0.078*"드라마" + 0.026*"연기" + 0.024*"배우" + 0.022*"캐릭터" + 0.016*"대사" + 0.016*"장면" + 0.016*"생각" + 0.015*"작품" + 0.015*"연출" + 0.011*"마지막"
2022-04-15 18:23:04,473 : INFO : topic #3 (

2022-04-15 18:23:12,843 : INFO : topic #3 (0.473): 0.018*"게임" + 0.018*"연상호" + 0.016*"사회" + 0.015*"작품" + 0.013*"신파" + 0.011*"영화" + 0.011*"감독" + 0.010*"넷플릭스" + 0.010*"한국" + 0.009*"오징어"
2022-04-15 18:23:12,845 : INFO : topic diff=0.202436, rho=0.213990
2022-04-15 18:23:13,638 : INFO : -6.778 per-word bound, 109.7 perplexity estimate based on a held-out corpus of 1676 documents with 12349 words
2022-04-15 18:23:13,640 : INFO : PROGRESS: pass 8, at document #25676/25676
2022-04-15 18:23:14,054 : INFO : optimized alpha [0.49786693, 0.8327288, 1.4993703, 0.45152703]
2022-04-15 18:23:14,059 : INFO : merging changes from 1676 documents into a model of 25676 documents
2022-04-15 18:23:14,066 : INFO : topic #0 (0.498): 0.030*"지옥" + 0.022*"시즌" + 0.015*"이병헌" + 0.012*"종교" + 0.011*"폭력" + 0.007*"감독" + 0.007*"인물" + 0.007*"형제" + 0.007*"메시지" + 0.006*"다음"
2022-04-15 18:23:14,068 : INFO : topic #1 (0.833): 0.078*"사람" + 0.023*"어른" + 0.023*"사랑" + 0.022*"아저씨" + 0.019*"생각" + 0.018*"행복" + 0.018*"인간" + 0.017*"지안

2022-04-15 18:23:24,441 : INFO : topic #2 (1.545): 0.078*"드라마" + 0.026*"연기" + 0.024*"배우" + 0.022*"캐릭터" + 0.016*"대사" + 0.016*"생각" + 0.016*"장면" + 0.015*"작품" + 0.015*"연출" + 0.012*"마지막"
2022-04-15 18:23:24,444 : INFO : topic #3 (0.498): 0.036*"게임" + 0.015*"신파" + 0.014*"학교" + 0.013*"사회" + 0.013*"한국" + 0.013*"작품" + 0.012*"장르" + 0.012*"오징어" + 0.010*"자극" + 0.009*"넷플릭스"
2022-04-15 18:23:24,447 : INFO : topic diff=0.246681, rho=0.209253
2022-04-15 18:23:25,309 : INFO : -7.603 per-word bound, 194.4 perplexity estimate based on a held-out corpus of 2000 documents with 17436 words
2022-04-15 18:23:25,311 : INFO : PROGRESS: pass 9, at document #14000/25676
2022-04-15 18:23:25,838 : INFO : optimized alpha [0.51431507, 0.72189623, 1.6009727, 0.5210344]
2022-04-15 18:23:25,845 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:23:25,852 : INFO : topic #0 (0.514): 0.016*"시즌" + 0.013*"폭력" + 0.009*"이병헌" + 0.008*"김혜자" + 0.008*"지옥" + 0.007*"역사" + 0.007*"인물" + 0.007*"혜

2022-04-15 18:23:39,162 : INFO : topic #1 (0.860): 0.078*"사람" + 0.023*"사랑" + 0.023*"어른" + 0.022*"아저씨" + 0.019*"생각" + 0.018*"인간" + 0.018*"행복" + 0.017*"지안" + 0.016*"세상" + 0.015*"마음"
2022-04-15 18:23:39,165 : INFO : topic #2 (1.553): 0.110*"드라마" + 0.022*"연기" + 0.022*"대사" + 0.020*"배우" + 0.019*"작품" + 0.019*"캐릭터" + 0.016*"생각" + 0.014*"장면" + 0.013*"연출" + 0.011*"마지막"
2022-04-15 18:23:39,169 : INFO : topic #3 (0.477): 0.017*"게임" + 0.017*"사회" + 0.016*"연상호" + 0.015*"작품" + 0.012*"신파" + 0.010*"영화" + 0.010*"넷플릭스" + 0.010*"한국" + 0.010*"감독" + 0.009*"장르"
2022-04-15 18:23:39,172 : INFO : topic diff=0.144519, rho=0.209253
2022-04-15 18:23:41,026 : INFO : -7.684 per-word bound, 205.7 perplexity estimate based on a held-out corpus of 2000 documents with 20937 words
2022-04-15 18:23:41,028 : INFO : PROGRESS: pass 10, at document #2000/25676
2022-04-15 18:23:41,841 : INFO : optimized alpha [0.50833994, 0.8676889, 1.5919701, 0.46344832]
2022-04-15 18:23:41,847 : INFO : merging changes from 2000 documents into

2022-04-15 18:23:56,677 : INFO : topic #0 (0.523): 0.016*"시즌" + 0.014*"폭력" + 0.009*"이병헌" + 0.008*"김혜자" + 0.008*"지옥" + 0.008*"역사" + 0.007*"인물" + 0.007*"나라" + 0.007*"혜자" + 0.007*"시대"
2022-04-15 18:23:56,680 : INFO : topic #1 (0.743): 0.062*"사람" + 0.023*"사랑" + 0.020*"생각" + 0.015*"행복" + 0.014*"마음" + 0.013*"세상" + 0.013*"인생" + 0.011*"하루" + 0.011*"어른" + 0.011*"시간"
2022-04-15 18:23:56,683 : INFO : topic #2 (1.649): 0.070*"드라마" + 0.026*"연기" + 0.024*"배우" + 0.022*"캐릭터" + 0.016*"장면" + 0.016*"생각" + 0.016*"연출" + 0.015*"작품" + 0.015*"대사" + 0.012*"원작"
2022-04-15 18:23:56,687 : INFO : topic #3 (0.546): 0.030*"게임" + 0.016*"신파" + 0.015*"학교" + 0.013*"사회" + 0.013*"장르" + 0.013*"작품" + 0.012*"한국" + 0.011*"넷플릭스" + 0.010*"오징어" + 0.010*"자극"
2022-04-15 18:23:56,690 : INFO : topic diff=0.157481, rho=0.204817
2022-04-15 18:23:58,048 : INFO : -7.626 per-word bound, 197.6 perplexity estimate based on a held-out corpus of 2000 documents with 22102 words
2022-04-15 18:23:58,050 : INFO : PROGRESS: pass 10, at document #1

2022-04-15 18:24:09,461 : INFO : PROGRESS: pass 11, at document #2000/25676
2022-04-15 18:24:09,967 : INFO : optimized alpha [0.51589006, 0.8870748, 1.6325804, 0.48417884]
2022-04-15 18:24:09,973 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:24:09,979 : INFO : topic #0 (0.516): 0.023*"지옥" + 0.018*"시즌" + 0.011*"이병헌" + 0.010*"폭력" + 0.010*"종교" + 0.009*"선자" + 0.008*"시대" + 0.006*"다음" + 0.006*"이름" + 0.006*"인물"
2022-04-15 18:24:09,982 : INFO : topic #1 (0.887): 0.069*"사람" + 0.031*"사랑" + 0.019*"행복" + 0.019*"어른" + 0.019*"생각" + 0.018*"아저씨" + 0.016*"마음" + 0.015*"인간" + 0.015*"세상" + 0.014*"인생"
2022-04-15 18:24:09,986 : INFO : topic #2 (1.633): 0.099*"드라마" + 0.023*"연기" + 0.021*"배우" + 0.020*"대사" + 0.017*"캐릭터" + 0.017*"작품" + 0.015*"생각" + 0.013*"장면" + 0.012*"연출" + 0.011*"마지막"
2022-04-15 18:24:09,990 : INFO : topic #3 (0.484): 0.017*"게임" + 0.015*"사회" + 0.015*"작품" + 0.014*"연상호" + 0.011*"신파" + 0.011*"한국" + 0.010*"영화" + 0.010*"넷플릭스" + 0.009*"장르" + 0.009*"감독"
202

2022-04-15 18:24:19,287 : INFO : topic diff=0.152920, rho=0.200651
2022-04-15 18:24:20,278 : INFO : -7.621 per-word bound, 196.9 perplexity estimate based on a held-out corpus of 2000 documents with 22102 words
2022-04-15 18:24:20,280 : INFO : PROGRESS: pass 11, at document #16000/25676
2022-04-15 18:24:20,807 : INFO : optimized alpha [0.53515303, 0.74976, 1.6939294, 0.58122045]
2022-04-15 18:24:20,814 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:24:20,824 : INFO : topic #0 (0.535): 0.026*"지옥" + 0.019*"시즌" + 0.015*"폭력" + 0.010*"종교" + 0.007*"이병헌" + 0.007*"묘사" + 0.007*"김혜자" + 0.007*"이름" + 0.006*"나라" + 0.006*"시대"
2022-04-15 18:24:20,827 : INFO : topic #1 (0.750): 0.062*"사람" + 0.021*"사랑" + 0.020*"생각" + 0.019*"인간" + 0.017*"세상" + 0.013*"마음" + 0.012*"행복" + 0.011*"인생" + 0.010*"이야기" + 0.009*"시간"
2022-04-15 18:24:20,831 : INFO : topic #2 (1.694): 0.063*"드라마" + 0.028*"연기" + 0.025*"배우" + 0.020*"캐릭터" + 0.016*"연출" + 0.016*"장면" + 0.016*"생각" + 0.016*"작품" +

2022-04-15 18:24:29,648 : INFO : topic #3 (0.501): 0.017*"게임" + 0.015*"사회" + 0.015*"작품" + 0.014*"연상호" + 0.011*"신파" + 0.011*"한국" + 0.010*"넷플릭스" + 0.010*"영화" + 0.009*"장르" + 0.009*"문제"
2022-04-15 18:24:29,650 : INFO : topic diff=0.207456, rho=0.196730
2022-04-15 18:24:30,519 : INFO : -7.679 per-word bound, 204.9 perplexity estimate based on a held-out corpus of 2000 documents with 30688 words
2022-04-15 18:24:30,520 : INFO : PROGRESS: pass 12, at document #4000/25676
2022-04-15 18:24:31,060 : INFO : optimized alpha [0.51983494, 0.8652032, 1.6372613, 0.5299204]
2022-04-15 18:24:31,067 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:24:31,075 : INFO : topic #0 (0.520): 0.020*"시즌" + 0.019*"지옥" + 0.011*"폭력" + 0.010*"이병헌" + 0.008*"시대" + 0.007*"종교" + 0.007*"에피소드" + 0.007*"메시지" + 0.006*"선자" + 0.006*"다음"
2022-04-15 18:24:31,077 : INFO : topic #1 (0.865): 0.071*"사람" + 0.029*"사랑" + 0.020*"생각" + 0.017*"어른" + 0.017*"행복" + 0.016*"아저씨" + 0.015*"마음" + 0.015*"세상

2022-04-15 18:24:40,043 : INFO : topic #2 (1.721): 0.063*"드라마" + 0.028*"연기" + 0.025*"배우" + 0.020*"캐릭터" + 0.016*"연출" + 0.016*"장면" + 0.016*"생각" + 0.016*"작품" + 0.015*"대사" + 0.014*"소희"
2022-04-15 18:24:40,047 : INFO : topic #3 (0.597): 0.026*"게임" + 0.015*"사회" + 0.014*"신파" + 0.014*"작품" + 0.013*"장르" + 0.012*"넷플릭스" + 0.012*"학교" + 0.011*"한국" + 0.011*"연상호" + 0.010*"영화"
2022-04-15 18:24:40,050 : INFO : topic diff=0.187977, rho=0.196730
2022-04-15 18:24:40,981 : INFO : -7.479 per-word bound, 178.4 perplexity estimate based on a held-out corpus of 2000 documents with 19928 words
2022-04-15 18:24:40,982 : INFO : PROGRESS: pass 12, at document #18000/25676
2022-04-15 18:24:41,564 : INFO : optimized alpha [0.55801225, 0.76452315, 1.6752793, 0.614336]
2022-04-15 18:24:41,572 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:24:41,580 : INFO : topic #0 (0.558): 0.040*"지옥" + 0.027*"시즌" + 0.018*"종교" + 0.016*"폭력" + 0.008*"유아인" + 0.008*"메시지" + 0.007*"박정민" + 0.007*"묘

2022-04-15 18:24:49,985 : INFO : topic #1 (0.876): 0.071*"사람" + 0.029*"사랑" + 0.019*"생각" + 0.017*"어른" + 0.017*"행복" + 0.016*"마음" + 0.016*"아저씨" + 0.015*"세상" + 0.015*"인간" + 0.013*"인생"
2022-04-15 18:24:49,988 : INFO : topic #2 (1.660): 0.092*"드라마" + 0.025*"연기" + 0.023*"배우" + 0.020*"캐릭터" + 0.019*"대사" + 0.017*"작품" + 0.016*"생각" + 0.013*"연출" + 0.013*"장면" + 0.011*"이야기"
2022-04-15 18:24:49,992 : INFO : topic #3 (0.544): 0.039*"게임" + 0.014*"사회" + 0.014*"오징어" + 0.014*"작품" + 0.011*"한국" + 0.011*"장르" + 0.010*"넷플릭스" + 0.010*"영화" + 0.009*"신파" + 0.008*"자극"
2022-04-15 18:24:49,996 : INFO : topic diff=0.280992, rho=0.193030
2022-04-15 18:24:51,093 : INFO : -7.425 per-word bound, 171.8 perplexity estimate based on a held-out corpus of 2000 documents with 22561 words
2022-04-15 18:24:51,094 : INFO : PROGRESS: pass 13, at document #6000/25676
2022-04-15 18:24:51,623 : INFO : optimized alpha [0.5223315, 0.8355304, 1.6220465, 0.5843644]
2022-04-15 18:24:51,629 : INFO : merging changes from 2000 documents into a

2022-04-15 18:25:02,135 : INFO : topic #0 (0.561): 0.040*"지옥" + 0.028*"시즌" + 0.018*"종교" + 0.016*"폭력" + 0.008*"유아인" + 0.008*"메시지" + 0.008*"박정민" + 0.007*"묘사" + 0.006*"다음" + 0.006*"공포"
2022-04-15 18:25:02,142 : INFO : topic #1 (0.774): 0.062*"사람" + 0.027*"인간" + 0.021*"세상" + 0.020*"생각" + 0.019*"사랑" + 0.011*"마음" + 0.010*"행복" + 0.010*"인생" + 0.010*"이야기" + 0.009*"자신"
2022-04-15 18:25:02,146 : INFO : topic #2 (1.696): 0.060*"드라마" + 0.030*"연기" + 0.025*"배우" + 0.019*"캐릭터" + 0.017*"연출" + 0.017*"장면" + 0.017*"생각" + 0.017*"작품" + 0.014*"대사" + 0.013*"스토리"
2022-04-15 18:25:02,149 : INFO : topic #3 (0.627): 0.023*"게임" + 0.017*"연상호" + 0.015*"사회" + 0.015*"작품" + 0.014*"신파" + 0.013*"넷플릭스" + 0.011*"장르" + 0.011*"영화" + 0.011*"오징어" + 0.010*"한국"
2022-04-15 18:25:02,156 : INFO : topic diff=0.160178, rho=0.193030
2022-04-15 18:25:03,222 : INFO : -7.329 per-word bound, 160.8 perplexity estimate based on a held-out corpus of 2000 documents with 18928 words
2022-04-15 18:25:03,224 : INFO : PROGRESS: pass 13, at documen

2022-04-15 18:25:14,361 : INFO : PROGRESS: pass 14, at document #6000/25676
2022-04-15 18:25:15,042 : INFO : optimized alpha [0.52412534, 0.8440006, 1.6395118, 0.5958233]
2022-04-15 18:25:15,048 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:25:15,055 : INFO : topic #0 (0.524): 0.026*"시즌" + 0.018*"지옥" + 0.014*"폭력" + 0.010*"이병헌" + 0.008*"시대" + 0.007*"다음" + 0.007*"메시지" + 0.006*"종교" + 0.006*"일본" + 0.006*"에피소드"
2022-04-15 18:25:15,058 : INFO : topic #1 (0.844): 0.077*"사람" + 0.026*"사랑" + 0.020*"생각" + 0.016*"어른" + 0.016*"행복" + 0.016*"마음" + 0.016*"인간" + 0.015*"세상" + 0.014*"아저씨" + 0.013*"인생"
2022-04-15 18:25:15,062 : INFO : topic #2 (1.640): 0.084*"드라마" + 0.025*"연기" + 0.022*"배우" + 0.021*"캐릭터" + 0.019*"대사" + 0.017*"생각" + 0.017*"작품" + 0.013*"연출" + 0.013*"장면" + 0.011*"마지막"
2022-04-15 18:25:15,066 : INFO : topic #3 (0.596): 0.057*"게임" + 0.016*"오징어" + 0.014*"작품" + 0.013*"한국" + 0.012*"사회" + 0.012*"장르" + 0.010*"넷플릭스" + 0.010*"영화" + 0.009*"신파" + 0.008*"자극"
2

2022-04-15 18:25:26,425 : INFO : topic diff=0.155958, rho=0.189531
2022-04-15 18:25:27,318 : INFO : -7.326 per-word bound, 160.4 perplexity estimate based on a held-out corpus of 2000 documents with 18928 words
2022-04-15 18:25:27,320 : INFO : PROGRESS: pass 14, at document #20000/25676
2022-04-15 18:25:27,870 : INFO : optimized alpha [0.57491064, 0.7872037, 1.7452351, 0.6309669]
2022-04-15 18:25:27,877 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:25:27,885 : INFO : topic #0 (0.575): 0.041*"지옥" + 0.031*"시즌" + 0.018*"종교" + 0.015*"폭력" + 0.012*"이병헌" + 0.009*"유아인" + 0.009*"박정민" + 0.008*"메시지" + 0.007*"다음" + 0.006*"공포"
2022-04-15 18:25:27,888 : INFO : topic #1 (0.787): 0.062*"사람" + 0.028*"인간" + 0.026*"사랑" + 0.021*"세상" + 0.020*"생각" + 0.014*"마음" + 0.011*"행복" + 0.010*"인생" + 0.009*"시간" + 0.009*"이야기"
2022-04-15 18:25:27,891 : INFO : topic #2 (1.745): 0.066*"드라마" + 0.027*"연기" + 0.024*"배우" + 0.020*"캐릭터" + 0.018*"대사" + 0.017*"생각" + 0.017*"연출" + 0.016*"작품

2022-04-15 18:25:38,793 : INFO : topic #3 (0.605): 0.056*"게임" + 0.016*"오징어" + 0.014*"작품" + 0.013*"한국" + 0.012*"사회" + 0.012*"장르" + 0.010*"넷플릭스" + 0.009*"영화" + 0.009*"신파" + 0.008*"자극"
2022-04-15 18:25:38,796 : INFO : topic diff=0.170491, rho=0.186216
2022-04-15 18:25:39,911 : INFO : -7.452 per-word bound, 175.1 perplexity estimate based on a held-out corpus of 2000 documents with 23710 words
2022-04-15 18:25:39,913 : INFO : PROGRESS: pass 15, at document #8000/25676
2022-04-15 18:25:40,513 : INFO : optimized alpha [0.54067874, 0.85314584, 1.6273271, 0.57111037]
2022-04-15 18:25:40,519 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:25:40,528 : INFO : topic #0 (0.541): 0.017*"시즌" + 0.015*"이병헌" + 0.012*"역사" + 0.012*"지옥" + 0.010*"폭력" + 0.010*"조선" + 0.009*"시대" + 0.009*"김혜자" + 0.008*"애신" + 0.008*"나라"
2022-04-15 18:25:40,532 : INFO : topic #1 (0.853): 0.067*"사람" + 0.026*"사랑" + 0.020*"생각" + 0.017*"행복" + 0.017*"마음" + 0.017*"인생" + 0.015*"세상" + 0.013*"어른"

2022-04-15 18:25:51,934 : INFO : topic #2 (1.756): 0.066*"드라마" + 0.027*"연기" + 0.024*"배우" + 0.021*"캐릭터" + 0.018*"대사" + 0.017*"생각" + 0.017*"연출" + 0.016*"작품" + 0.016*"장면" + 0.012*"마지막"
2022-04-15 18:25:51,937 : INFO : topic #3 (0.640): 0.022*"게임" + 0.019*"연상호" + 0.015*"작품" + 0.015*"사회" + 0.014*"신파" + 0.013*"넷플릭스" + 0.011*"장르" + 0.011*"영화" + 0.010*"한국" + 0.010*"오징어"
2022-04-15 18:25:51,940 : INFO : topic diff=0.139570, rho=0.186216
2022-04-15 18:25:52,936 : INFO : -7.090 per-word bound, 136.2 perplexity estimate based on a held-out corpus of 2000 documents with 17506 words
2022-04-15 18:25:52,937 : INFO : PROGRESS: pass 15, at document #22000/25676
2022-04-15 18:25:53,458 : INFO : optimized alpha [0.57438445, 0.80930895, 1.8149655, 0.6175131]
2022-04-15 18:25:53,467 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:25:53,476 : INFO : topic #0 (0.574): 0.036*"지옥" + 0.029*"시즌" + 0.019*"이병헌" + 0.016*"종교" + 0.015*"폭력" + 0.008*"박정민" + 0.008*"유아인" + 0.007

2022-04-15 18:26:03,291 : INFO : topic #1 (0.858): 0.067*"사람" + 0.027*"사랑" + 0.020*"생각" + 0.017*"행복" + 0.017*"인생" + 0.017*"마음" + 0.015*"세상" + 0.013*"어른" + 0.013*"인간" + 0.012*"아저씨"
2022-04-15 18:26:03,294 : INFO : topic #2 (1.638): 0.091*"드라마" + 0.026*"연기" + 0.025*"배우" + 0.020*"대사" + 0.020*"캐릭터" + 0.018*"작품" + 0.018*"생각" + 0.014*"장면" + 0.014*"연출" + 0.013*"마지막"
2022-04-15 18:26:03,299 : INFO : topic #3 (0.579): 0.053*"게임" + 0.015*"오징어" + 0.014*"작품" + 0.014*"한국" + 0.012*"장르" + 0.012*"사회" + 0.010*"넷플릭스" + 0.009*"영화" + 0.009*"신파" + 0.008*"자극"
2022-04-15 18:26:03,303 : INFO : topic diff=0.205698, rho=0.183069
2022-04-15 18:26:04,314 : INFO : -7.466 per-word bound, 176.8 perplexity estimate based on a held-out corpus of 2000 documents with 16219 words
2022-04-15 18:26:04,316 : INFO : PROGRESS: pass 16, at document #10000/25676
2022-04-15 18:26:04,791 : INFO : optimized alpha [0.53872526, 0.86070216, 1.6959062, 0.566615]
2022-04-15 18:26:04,796 : INFO : merging changes from 2000 documents into

2022-04-15 18:26:17,115 : INFO : topic #0 (0.574): 0.036*"지옥" + 0.029*"시즌" + 0.018*"이병헌" + 0.016*"종교" + 0.015*"폭력" + 0.009*"박정민" + 0.008*"유아인" + 0.007*"메시지" + 0.006*"다음" + 0.006*"시대"
2022-04-15 18:26:17,117 : INFO : topic #1 (0.813): 0.065*"사람" + 0.032*"사랑" + 0.024*"인간" + 0.019*"세상" + 0.019*"생각" + 0.015*"마음" + 0.012*"행복" + 0.012*"인생" + 0.010*"시간" + 0.009*"이야기"
2022-04-15 18:26:17,120 : INFO : topic #2 (1.822): 0.080*"드라마" + 0.023*"연기" + 0.023*"대사" + 0.023*"배우" + 0.022*"캐릭터" + 0.017*"작품" + 0.016*"생각" + 0.015*"연출" + 0.014*"장면" + 0.012*"감독"
2022-04-15 18:26:17,122 : INFO : topic #3 (0.625): 0.021*"게임" + 0.017*"연상호" + 0.015*"사회" + 0.015*"작품" + 0.013*"신파" + 0.012*"넷플릭스" + 0.011*"장르" + 0.011*"한국" + 0.010*"영화" + 0.010*"자극"
2022-04-15 18:26:17,125 : INFO : topic diff=0.128205, rho=0.183069
2022-04-15 18:26:18,204 : INFO : -7.071 per-word bound, 134.5 perplexity estimate based on a held-out corpus of 2000 documents with 20074 words
2022-04-15 18:26:18,206 : INFO : PROGRESS: pass 16, at document

2022-04-15 18:26:30,753 : INFO : optimized alpha [0.538944, 0.8643265, 1.7030474, 0.5734164]
2022-04-15 18:26:30,761 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:26:30,769 : INFO : topic #0 (0.539): 0.015*"시즌" + 0.013*"이병헌" + 0.012*"김혜자" + 0.011*"지옥" + 0.011*"역사" + 0.010*"폭력" + 0.010*"혜자" + 0.009*"시대" + 0.009*"나라" + 0.009*"조선"
2022-04-15 18:26:30,772 : INFO : topic #1 (0.864): 0.064*"사람" + 0.027*"사랑" + 0.019*"생각" + 0.019*"인생" + 0.017*"행복" + 0.017*"마음" + 0.015*"세상" + 0.012*"하루" + 0.011*"어른" + 0.011*"인간"
2022-04-15 18:26:30,776 : INFO : topic #2 (1.703): 0.089*"드라마" + 0.025*"연기" + 0.025*"배우" + 0.021*"캐릭터" + 0.018*"대사" + 0.017*"생각" + 0.016*"작품" + 0.013*"마지막" + 0.013*"장면" + 0.012*"연출"
2022-04-15 18:26:30,780 : INFO : topic #3 (0.573): 0.049*"게임" + 0.014*"오징어" + 0.014*"작품" + 0.013*"한국" + 0.012*"사회" + 0.012*"장르" + 0.010*"넷플릭스" + 0.009*"영화" + 0.008*"자극" + 0.008*"신파"
2022-04-15 18:26:30,784 : INFO : topic diff=0.129194, rho=0.180076
2022-04-15 18:2

2022-04-15 18:26:42,021 : INFO : -7.068 per-word bound, 134.2 perplexity estimate based on a held-out corpus of 2000 documents with 20074 words
2022-04-15 18:26:42,024 : INFO : PROGRESS: pass 17, at document #24000/25676
2022-04-15 18:26:42,670 : INFO : optimized alpha [0.5502798, 0.8726455, 1.7471293, 0.5996752]
2022-04-15 18:26:42,676 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:26:42,684 : INFO : topic #0 (0.550): 0.034*"지옥" + 0.027*"시즌" + 0.017*"이병헌" + 0.016*"폭력" + 0.014*"종교" + 0.008*"박정민" + 0.007*"유아인" + 0.007*"메시지" + 0.006*"시대" + 0.006*"이름"
2022-04-15 18:26:42,687 : INFO : topic #1 (0.873): 0.072*"사람" + 0.028*"사랑" + 0.020*"인간" + 0.019*"생각" + 0.017*"아저씨" + 0.017*"어른" + 0.017*"세상" + 0.016*"마음" + 0.016*"인생" + 0.015*"행복"
2022-04-15 18:26:42,691 : INFO : topic #2 (1.747): 0.094*"드라마" + 0.023*"연기" + 0.022*"대사" + 0.021*"배우" + 0.021*"캐릭터" + 0.018*"작품" + 0.017*"생각" + 0.014*"장면" + 0.014*"연출" + 0.011*"감독"
2022-04-15 18:26:42,694 : INFO : topic #

2022-04-15 18:26:52,386 : INFO : topic #3 (0.579): 0.049*"게임" + 0.014*"오징어" + 0.014*"작품" + 0.013*"한국" + 0.012*"사회" + 0.012*"장르" + 0.010*"넷플릭스" + 0.009*"영화" + 0.009*"자극" + 0.008*"신파"
2022-04-15 18:26:52,389 : INFO : topic diff=0.126292, rho=0.177226
2022-04-15 18:26:53,333 : INFO : -7.654 per-word bound, 201.4 perplexity estimate based on a held-out corpus of 2000 documents with 25795 words
2022-04-15 18:26:53,334 : INFO : PROGRESS: pass 18, at document #12000/25676
2022-04-15 18:26:53,893 : INFO : optimized alpha [0.5408505, 0.835457, 1.7259998, 0.6127367]
2022-04-15 18:26:53,900 : INFO : merging changes from 2000 documents into a model of 25676 documents
2022-04-15 18:26:53,908 : INFO : topic #0 (0.541): 0.017*"시즌" + 0.015*"폭력" + 0.011*"이병헌" + 0.010*"지옥" + 0.010*"김혜자" + 0.009*"역사" + 0.009*"나라" + 0.008*"시대" + 0.008*"혜자" + 0.007*"조선"
2022-04-15 18:26:53,911 : INFO : topic #1 (0.835): 0.065*"사람" + 0.026*"사랑" + 0.019*"생각" + 0.017*"인생" + 0.016*"마음" + 0.016*"행복" + 0.014*"세상" + 0.012*"어른" + 

2022-04-15 18:27:02,494 : INFO : topic #2 (1.752): 0.094*"드라마" + 0.023*"연기" + 0.022*"대사" + 0.021*"배우" + 0.021*"캐릭터" + 0.018*"작품" + 0.017*"생각" + 0.015*"장면" + 0.014*"연출" + 0.011*"감독"
2022-04-15 18:27:02,498 : INFO : topic #3 (0.605): 0.019*"게임" + 0.016*"사회" + 0.016*"작품" + 0.015*"연상호" + 0.012*"신파" + 0.012*"넷플릭스" + 0.011*"한국" + 0.011*"장르" + 0.010*"영화" + 0.010*"문제"
2022-04-15 18:27:02,501 : INFO : topic diff=0.157533, rho=0.177226
2022-04-15 18:27:03,197 : INFO : -6.784 per-word bound, 110.2 perplexity estimate based on a held-out corpus of 1676 documents with 12349 words
2022-04-15 18:27:03,200 : INFO : PROGRESS: pass 18, at document #25676/25676
2022-04-15 18:27:03,589 : INFO : optimized alpha [0.53095996, 0.929087, 1.7077514, 0.5795697]
2022-04-15 18:27:03,595 : INFO : merging changes from 1676 documents into a model of 25676 documents
2022-04-15 18:27:03,601 : INFO : topic #0 (0.531): 0.033*"지옥" + 0.025*"시즌" + 0.017*"폭력" + 0.016*"이병헌" + 0.013*"종교" + 0.008*"박정민" + 0.007*"유아인" + 0.007*"이름

2022-04-15 18:27:12,063 : INFO : topic #1 (0.838): 0.065*"사람" + 0.026*"사랑" + 0.019*"생각" + 0.018*"인생" + 0.016*"마음" + 0.016*"행복" + 0.014*"세상" + 0.012*"어른" + 0.011*"하루" + 0.011*"인간"
2022-04-15 18:27:12,065 : INFO : topic #2 (1.730): 0.079*"드라마" + 0.025*"연기" + 0.024*"배우" + 0.023*"캐릭터" + 0.017*"생각" + 0.017*"대사" + 0.016*"장면" + 0.016*"작품" + 0.015*"연출" + 0.012*"마지막"
2022-04-15 18:27:12,068 : INFO : topic #3 (0.618): 0.035*"게임" + 0.014*"신파" + 0.013*"작품" + 0.013*"한국" + 0.013*"사회" + 0.013*"장르" + 0.012*"학교" + 0.011*"오징어" + 0.011*"자극" + 0.010*"넷플릭스"
2022-04-15 18:27:12,070 : INFO : topic diff=0.188114, rho=0.174507
2022-04-15 18:27:12,862 : INFO : -7.569 per-word bound, 189.9 perplexity estimate based on a held-out corpus of 2000 documents with 17436 words
2022-04-15 18:27:12,863 : INFO : PROGRESS: pass 19, at document #14000/25676
2022-04-15 18:27:13,453 : INFO : optimized alpha [0.53866345, 0.81304055, 1.7799608, 0.6393836]
2022-04-15 18:27:13,460 : INFO : merging changes from 2000 documents into

2022-04-15 18:27:23,159 : INFO : topic #0 (0.531): 0.033*"지옥" + 0.025*"시즌" + 0.017*"폭력" + 0.016*"이병헌" + 0.013*"종교" + 0.008*"박정민" + 0.007*"이름" + 0.007*"유아인" + 0.007*"메시지" + 0.007*"화살촉"
2022-04-15 18:27:23,161 : INFO : topic #1 (0.929): 0.076*"사람" + 0.025*"사랑" + 0.021*"어른" + 0.021*"아저씨" + 0.018*"인생" + 0.018*"생각" + 0.018*"인간" + 0.017*"행복" + 0.016*"마음" + 0.016*"세상"
2022-04-15 18:27:23,164 : INFO : topic #2 (1.711): 0.105*"드라마" + 0.023*"연기" + 0.021*"대사" + 0.021*"배우" + 0.020*"캐릭터" + 0.019*"작품" + 0.017*"생각" + 0.014*"장면" + 0.013*"연출" + 0.012*"이야기"
2022-04-15 18:27:23,167 : INFO : topic #3 (0.584): 0.019*"게임" + 0.017*"사회" + 0.016*"작품" + 0.014*"연상호" + 0.012*"신파" + 0.011*"넷플릭스" + 0.011*"한국" + 0.010*"장르" + 0.010*"문제" + 0.010*"영화"
2022-04-15 18:27:23,170 : INFO : topic diff=0.117206, rho=0.174507
2022-04-15 18:27:23,192 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=9028, num_topics=4, decay=0.5, chunksize=2000) in 468.18s', 'datetime': '2022-04-15T18:27:23.192801', 'gensim':

In [15]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2022-04-15 18:27:39,900 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2022-04-15 18:27:39,913 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2022-04-15 18:27:39,929 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2022-04-15 18:27:39,947 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2022-04-15 18:27:39,962 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2022-04-15 18:27:39,990 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2022-04-15 18:27:40,018 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2022-04-15 18:27:40,040 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2022-04-15 18:27:40,061 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2022-04-15 18:27:40,080 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2022-04-15 18:27:40,108 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2022-04-15 18:27:40

Average topic coherence: -3.2291.
[([(0.10471779, '드라마'),
   (0.023051577, '연기'),
   (0.021149863, '대사'),
   (0.020926178, '배우'),
   (0.020246599, '캐릭터'),
   (0.018738464, '작품'),
   (0.01679082, '생각'),
   (0.014471733, '장면'),
   (0.013483906, '연출'),
   (0.011537948, '이야기'),
   (0.010793281, '마지막'),
   (0.010442243, '감독'),
   (0.010110605, '느낌'),
   (0.010037513, '정도'),
   (0.010003222, '현실'),
   (0.009311659, '스토리'),
   (0.0075154672, '기대'),
   (0.0070537743, '부분'),
   (0.0069722445, '작가'),
   (0.0068899547, '매력')],
  -2.189812055929553),
 ([(0.018609596, '게임'),
   (0.016501972, '사회'),
   (0.015892196, '작품'),
   (0.014407201, '연상호'),
   (0.011692213, '신파'),
   (0.011429536, '넷플릭스'),
   (0.010738567, '한국'),
   (0.01024241, '장르'),
   (0.009940374, '문제'),
   (0.009762504, '영화'),
   (0.009415802, '인간'),
   (0.00936915, '자극'),
   (0.008335626, '오징어'),
   (0.00828265, '감독'),
   (0.007233473, '학교'),
   (0.006935305, '세계'),
   (0.00657659, '주제'),
   (0.0057260673, '상황'),
   (0.005010325, '비판')

In [16]:
import pickle
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [17]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
