In [None]:
import pandas as pd

# Read the Naver webtoon CSV (cp949-encoded), keeping only the title, genre,
# story and platform columns.
# NOTE(review): presumably the currently-serialized titles, judging by the later
# merge with the "FINISH" file — confirm against the data source.
nwb = pd.read_csv('data/NAVER.csv', encoding='cp949', usecols=['title', 'genre', 'story', 'platform'])
# Print (rows, columns), then show the first three rows as the cell output.
print(nwb.shape)
nwb.head(3)

In [None]:
# Read the second Naver CSV (cp949-encoded) with the same four columns.
# NOTE(review): the file name suggests these are completed series — confirm.
nwb_end = pd.read_csv('data/Naver_FINISH.csv', encoding='cp949', usecols=['title', 'genre', 'story', 'platform'])
# Print (rows, columns), then show the first three rows as the cell output.
print(nwb_end.shape)
nwb_end.head(3)

In [None]:
# Merge ongoing and completed webtoons.
# ignore_index=True renumbers the rows 0..n-1 after stacking; every missing
# value in the combined frame is then replaced with the string '완결'.
nwb = pd.concat([nwb, nwb_end], ignore_index=True).fillna('완결')
print(nwb.shape)
nwb.head(3)

In [None]:
# Read the Lezhin webtoon CSV (cp949-encoded) with the same four columns as the
# Naver frames so the tables can be concatenated.
lz = pd.read_csv('data/LZ.csv', encoding='cp949', usecols=['title', 'genre', 'story', 'platform'])
# Print (rows, columns), then show the first three rows as the cell output.
print(lz.shape)
lz.head(3)

In [None]:
# Merge Naver and Lezhin webtoons into one frame with a fresh 0..n-1 index.
wb = pd.concat([nwb, lz], ignore_index=True)
print(wb.shape)
wb.tail(3)

## 전처리
- 장르, 줄거리에 대하여 특수문자, 숫자 등을 없앤 순수 한글만 남김
- 제목도 하려 했으나, 토큰화 할 때 오류가 생겨 하지 않음

In [None]:
import re

def hangul(text):
    """Replace every non-Hangul character in ``text`` with a single space.

    Characters that are kept: consonant jamo (ㄱ-ㅎ), vowel jamo (ㅏ-ㅣ) and
    precomposed syllables (가-힣). Everything else (Latin letters, digits,
    punctuation, whitespace, ...) is replaced one-for-one by ' ', so the
    result has the same length as the input.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with all non-Hangul characters turned into spaces.
    """
    # No '|' between the ranges: inside a character class a pipe is a literal
    # character, not alternation, so including it would let '|' pass through.
    return re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', text)

In [None]:
# Build a cleaned copy of the merged frame: `assign` returns a new DataFrame,
# so `wb` itself is left untouched. Genre and story are reduced to Hangul only.
wb_copy = wb.assign(
    genre=wb['genre'].apply(hangul),
    story=wb['story'].apply(hangul),
)

wb_copy.head(3)

In [None]:
# Compare one story (index label 11) before and after the Hangul-only cleaning.
print(wb['story'][11])
print(wb_copy['story'][11])

In [None]:
# Load the Korean stopword list.
# read() is used instead of readline() so the whole file is consumed, not just
# its first line; split() then handles both a one-word-per-line layout and a
# single whitespace-separated line.
with open('data/stopwords_korean.txt', 'r', encoding='utf8') as f:
    stopwords = f.read().split()

# Extra words appended to the loaded list for this corpus.
stopwords += ['이야기', '시작', '날', '보다', '이다']
print(stopwords[-10:])

## 토큰화
- KoNLPy 설치 (교재 538 ~ 540 페이지)
- Mecab 설치(윈도우) (https://cleancode-ws.tistory.com/97)
- stemming : 갇힌 > 갇히다 등 단어의 원형으로 되돌리는 것
    - konlpy에서는 Okt만 stemming(`stem=True`)을 지원함

In [None]:
from konlpy.tag import Okt
# Single shared analyzer instance; okt_tokenizer calls okt.pos on it.
okt = Okt() # Okt : Open Korean Text

def okt_tokenizer(text):
    """Tokenize Korean text with Okt and return the filtered tokens.

    The text is POS-tagged with normalization and stemming enabled
    (``norm=True, stem=True``). A token is kept only when all of these hold:

    - its tag is not ``Josa``, ``Eomi`` or ``Punctuation``;
    - it is not in the module-level ``stopwords`` list;
    - it is longer than one character.

    Returns the surviving tokens as a list, in their original order.
    """
    excluded_tags = ['Josa', 'Eomi', 'Punctuation']
    return [
        token
        for token, tag in okt.pos(text, norm=True, stem=True)
        if tag not in excluded_tags
        and token not in stopwords
        and len(token) > 1
    ]

In [None]:
# Sanity check: tokenize the cleaned story at index label 11 and print the tokens.
print(okt_tokenizer(wb_copy['story'][11]))

In [None]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis.gensim_models
import pyLDAvis

# Ignore warnings
# NOTE: the "ignore" filter is installed without any category or module
# restriction, so it applies to every warning.
import warnings
warnings.filterwarnings("ignore")

In [None]:
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
# Register a Korean-capable font for matplotlib text.
# The path is Windows-specific, so the font is only applied when the file is
# actually present; on other machines the default font is kept instead of the
# cell failing on a missing file.
from pathlib import Path

font_path = "C:/Windows/Fonts/NGULIM.TTF"
if Path(font_path).is_file():
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    font = None
    print(f"Font file not found: {font_path} - keeping matplotlib's default font.")

In [None]:
# Convert the cleaned story column to a plain Python list
Data_list = wb_copy.story.values.tolist()

# Tokenize each story in order and collect the resulting token lists
data_word = [okt_tokenizer(story_text) for story_text in Data_list]

In [None]:
# Preview the token lists of the first three documents, one per line.
for words in data_word[:3]:
    print(words)

In [None]:
# Create Dictionary
# Built from the tokenized documents; gensim assigns an integer id to each token.
id2word = corpora.Dictionary(data_word)

# Create Corpus
# `texts` is just another name for the tokenized documents (same list object).
texts = data_word

# Term Document Frequency
# doc2bow turns each token list into a bag-of-words list of (token_id, count) pairs.
corpus = [id2word.doc2bow(text) for text in texts]

# View
# Print the bag-of-words representation of the first document only.
print(corpus[:1])

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every value in ``range(start, limit, step)``
    and scored with a c_v CoherenceModel. The topic count being trained is
    printed as a simple progress indicator.

    Parameters:
    ----------
    dictionary : Gensim dictionary
        Used both as the LDA model's id2word mapping and for the coherence
        computation.
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try (default 2)
    step : Increment between successive topic counts (default 3)

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print(num_topics, end=' ')
        # Use the `dictionary` argument rather than a module-level variable so
        # the model and the coherence score always share the same vocabulary.
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=100,  # fixed seed so repeated runs give the same models
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
# Train and score one LDA model per topic count in range(3, 21, 1), i.e. 3..20.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=3, limit=21, step=1)

In [None]:
# Show graph
# These values must match the start/limit/step passed to
# compute_coherence_values so that x lines up with coherence_values.
limit = 21; start = 3; step = 1
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# One-element tuple (note the trailing comma): a bare string in parentheses is
# still a string, and legend() would consume it character by character,
# labelling the line just "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Print the coherence scores
# Pairs each topic count in x with its coherence value, rounded to 4 decimals.
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
from pprint import pprint

# Select the model and print the topics
# Index 8 of model_list is the model with 11 topics for the sweep
# range(3, 21, 1) used when model_list was built (3 + 8 * 1 = 11).
# NOTE(review): the index is hard-coded, presumably chosen from the coherence
# plot — re-check it whenever the sweep range or the data changes.
optimal_model = model_list[8]
# Unformatted topic listing; not referenced again in the cells visible here.
model_topics = optimal_model.show_topics(formatted=False)
# Pretty-print the topics with their 10 highest-weighted words each.
pprint(optimal_model.print_topics(num_words=10))

In [None]:
# Render the interactive pyLDAvis view of the selected model inline.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the model, the bag-of-words corpus and the dictionary.
vis = pyLDAvis.gensim_models.prepare(optimal_model, corpus, id2word)
# Last expression of the cell: displays the visualization.
vis