### **Library**

In [None]:
# Install KoNLPy into the current runtime.
!pip install konlpy

# NOTE(review): Mecab is imported before the install script below has run;
# presumably harmless because the class is only imported, not instantiated — confirm.
from konlpy.tag import Mecab
# Clone the Colab helper repository and run its mecab-ko install script.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# NOTE: %cd changes the working directory for the rest of the session, so later
# relative paths (e.g. the CSV written by save_processed_data) resolve inside
# this cloned directory.
%cd Mecab-ko-for-Google-Colab/
!bash install_mecab-ko_on_colab190912.sh

In [None]:
import csv
import pickle
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from konlpy.tag import Mecab, Okt
from pandas import DataFrame
from tqdm import tqdm

warnings.filterwarnings('ignore')




## **Functions**


In [None]:
def clean_text(text: str) -> str:
    """Reduce a piece of text to Hangul, digits and spaces.

    Middle dots ("·") become spaces so the words they separate stay apart;
    every other character that is not a space, a Hangul syllable/jamo or an
    ASCII digit is deleted. Leading and trailing whitespace is removed from
    the final result.

    Args:
        text: Raw text to clean. Must be a ``str``.

    Returns:
        The cleaned text (possibly the empty string).
    """
    text = text.replace("·", " ")
    # Inside a character class `|` is a literal pipe, not alternation, so the
    # previous pattern '[^ ㄱ-ㅣ가-힣|0-9]+' let pipes through. Periods need no
    # separate replace: they are outside the kept set and are removed here.
    text = re.sub(pattern='[^ ㄱ-ㅣ가-힣0-9]+', repl='', string=text)
    # Strip last: deleting characters at the edges can expose new edge spaces.
    return text.strip()

In [None]:
# Remote general-purpose Korean stopword list.
STOPWORDS_URL = "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt"

# Domain stopwords for the e-bike corpus. Duplicates from the original list are
# collapsed and the stray leading space in ' 제공' is removed so it can match.
EBIKE_STOPWORDS = frozenset([
    '자전거', '생각', '정도', '아주', '사은', '모두', '정말', '타고', '조금', '바로',
    '보고', '일단', '매우', '느낌', '하나', '다시', '진짜', '한번', '대비', '자체',
    '약간', '사람', '듭니', '전기자전거', '설명', '마음', '이상', '구매', '제품', '문제',
    '구입', '사장', '처음', '사용', '그냥', '부분', '전기', '여기', '때문', '오늘',
    '다른', '살짝', '삼천리', '팬텀', '볼트', '다음', '해주시', '가능', '사진', '장착',
    '라이', '경우', '이용', '우리', '확인', '일반', '감사', '선택', '카페', '추천',
    '시작', '우도', '안녕', '이번', '필요', '저희', '추가', '하루', '가지', '최고',
    '화질', '모습', '고민', '재생', '진행', '최대', '마지막', '작업', '동안', '기분',
    '참고', '카카오', '부탁', '커피', '행복', '사고', '이마', '전문', '기준', '제공',
    '포스팅', '스타', '걱정', '포함', '관련', '네이버', '기존', '이야기', '소리', '나라',
    '적용', '이거', '기억', '지원', '블로그',
])

# Merged stopword sets keyed by URL, so the download happens once per session
# instead of once per sentence.
_STOPWORD_CACHE = {}


def _load_stopwords(url=STOPWORDS_URL):
    """Return the stopwords downloaded from `url` merged with EBIKE_STOPWORDS (cached)."""
    if url not in _STOPWORD_CACHE:
        # header=None keeps the first line as data instead of consuming it as a
        # column name; dtype=str stops pandas from coercing any cell.
        table = pd.read_csv(url, header=None, dtype=str)
        # Flatten every cell to a plain string. `.values.tolist()` on the frame
        # yields one list per row, and a str token never equals a list, so the
        # downloaded words were silently ignored before.
        downloaded = {
            cell.strip() for cell in table.to_numpy().ravel() if isinstance(cell, str)
        }
        _STOPWORD_CACHE[url] = frozenset(downloaded) | EBIKE_STOPWORDS
    return _STOPWORD_CACHE[url]


def get_nouns(tokenizer, sentence, stopwords=None):
    """Extract the nouns of `sentence`, dropping short tokens and stopwords.

    Args:
        tokenizer: Object exposing ``nouns(sentence)`` that returns an iterable
            of noun strings (e.g. a konlpy tagger).
        sentence: Text to extract nouns from.
        stopwords: Optional collection of words to exclude. When omitted, the
            downloaded general list merged with EBIKE_STOPWORDS is used; when
            given, it replaces that default entirely.

    Returns:
        List of nouns longer than one character that are not stopwords, in the
        order the tokenizer produced them.
    """
    if stopwords is None:
        stopwords = _load_stopwords()
    return [
        noun
        for noun in tokenizer.nouns(sentence)
        if len(noun) > 1 and noun not in stopwords
    ]

In [None]:
def tokenize(df):
    """Turn every entry of the '내용' column into a list of nouns.

    Each value is converted to ``str``, has its newlines removed and is
    stripped, then passed through ``clean_text`` and ``get_nouns`` using a
    single shared ``Okt`` tagger.

    Args:
        df: Object indexable by '내용' that yields the texts to process.

    Returns:
        A list with one noun list per input row, in input order.
    """
    noun_tagger = Okt()

    def _nouns_for(raw_row):
        # Preprocess one row: flatten it to a single line before cleaning.
        single_line = str(raw_row).replace("\n", "").strip()
        return get_nouns(noun_tagger, clean_text(single_line))

    return [_nouns_for(raw_row) for raw_row in tqdm(df['내용'])]

In [None]:
def save_processed_data(processed_data, path="tokenized_data_jinyard.csv"):
    """Write tokenized documents to a CSV file, one document per row.

    Args:
        processed_data: Iterable of token sequences; each sequence becomes one
            CSV row. No header row is written.
        path: Destination file. Defaults to the file name this function
            previously hard-coded, so existing calls behave the same.
    """
    # newline="" lets the csv module control line endings itself.
    with open(path, 'w', newline="", encoding='utf-8') as f:
        csv.writer(f).writerows(processed_data)

## **Running Code**

In [None]:
# Run the preprocessing pipeline: read the raw CSV, extract nouns per row, save them.
# NOTE(review): the input path is an absolute Colab path, and the output file
# name is the default inside save_processed_data — confirm both before re-running elsewhere.
if __name__ == '__main__':
    df = pd.read_csv("/content/elec_bicycle_included.csv",encoding='utf-8')
    processed_data = tokenize(df)
    save_processed_data(processed_data)

In [None]:
# Load the tokenized words that were previously saved to CSV.
# NOTE(review): this reads 'tokenized_data_social_mecab.csv', not the
# 'tokenized_data_jinyard.csv' written by save_processed_data — confirm which file is intended.


import csv

with open('/content/tokenized_data_social_mecab.csv', encoding='utf-8', newline='') as f:
  # Skip the first line. NOTE(review): save_processed_data writes no header row,
  # so if this file came from it, the first document is dropped here — confirm.
  next(f)
  reader = csv.reader(f)
  # One list of token strings per CSV row.
  data = list(reader)

print(type(data))

<class 'list'>


## **LDA modeling**


In [None]:
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.callbacks import CoherenceMetric
from gensim import corpora
from gensim.models.callbacks import PerplexityMetric

import logging
# Configure root logging at INFO level with a "time : level : message" format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Map each token in `data` to an integer id.
dictionary = corpora.Dictionary(data)
# Per gensim's filter_extremes docs: keep tokens found in at least 2 documents
# and in no more than 50% of all documents.
dictionary.filter_extremes(no_below=2, no_above=0.5)
# Convert every document to bag-of-words form using the filtered dictionary.
corpus = [dictionary.doc2bow(text) for text in data]

In [None]:
# Sweep the number of training passes (epochs) and record how coherence and
# perplexity change.
# NOTE(review): LdaModel is created without random_state, so results vary
# between runs — consider fixing a seed.

coherences=[]
perplexities=[]
passes=[]
warnings.filterwarnings('ignore')

# Constant for the whole sweep, so set once instead of on every iteration.
# nwords is not used in this cell; kept in case another cell reads it.
ntopics, nwords = 4, 100

for i in range(7):
    # Pass schedule: 1, 5, 10, 15, 20, 25, 30.
    p = 1 if i == 0 else i * 5
    lda4 = LdaModel(corpus, id2word=dictionary, num_topics=ntopics, iterations=400, passes=p)
    print('epoch',p)
    # TODO: what is the difference between training on tfidf and on corpus?

    cm = CoherenceModel(model=lda4, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print("Coherence",coherence)
    coherences.append(coherence)
    # Score the corpus once and reuse the value, so the printed and the stored
    # number are the same and the corpus is not evaluated twice per model.
    perplexity = lda4.log_perplexity(corpus)
    print('Perplexity: ', perplexity,'\n\n')
    perplexities.append(perplexity)

In [None]:
# Visualize the results of the passes sweep.

# Rebuild the x-axis values instead of appending to a global list: appending
# made this cell fail when re-run (the list kept growing past len(coherences))
# or when `passes` had been rebound by a later cell.
passes = [1 if i == 0 else 5 * i for i in range(7)]

plt.plot(passes, coherences)
plt.xlabel('passes')
plt.ylabel('coherence (u_mass)')
plt.show()


In [None]:
# Plot the log_perplexity value recorded for each pass count.
fig, ax = plt.subplots()
ax.plot(passes, perplexities)
plt.show()

In [None]:
# Sweep the number of topics (passes fixed at 20) and record coherence and
# perplexity for each setting.
# NOTE(review): LdaModel is created without random_state, so results vary
# between runs — consider fixing a seed.
coherencesT=[]
perplexitiesT=[]
warnings.filterwarnings('ignore')

# nwords is not used in this cell; kept in case another cell reads it.
nwords = 100

for i in range(7):
    # Topic counts 5, 10, ..., 35. The formula already yields 5 for i == 0, so
    # no special case is needed.
    ntopics = 5 * (i + 1)
    lda4 = LdaModel(corpus, id2word=dictionary, num_topics=ntopics, iterations=400, passes=20)
    print('ntopics', ntopics)

    cm = CoherenceModel(model=lda4, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print("Coherence",coherence)
    coherencesT.append(coherence)
    # Score the corpus once and reuse the value, so the printed and the stored
    # number are the same and the corpus is not evaluated twice per model.
    perplexity = lda4.log_perplexity(corpus)
    print('Perplexity: ', perplexity,'\n\n')
    perplexitiesT.append(perplexity)

In [None]:
# X-axis values: the topic counts 5, 10, ..., 35.
ntopicT = list(range(5, 36, 5))

fig, ax = plt.subplots()
ax.plot(ntopicT, coherencesT)
plt.show()


In [None]:
# Plot the log_perplexity value recorded for each topic count.
fig, ax = plt.subplots()
ax.plot(ntopicT, perplexitiesT)
plt.show()

In [None]:
# Final model

# Training hyper-parameters for the final LDA model.
num_topics = 5
chunksize = 2000
# NOTE(review): `passes` is also the name of the list used as the x-axis in the
# passes-sweep plots; after this cell it is an int, so those plot cells cannot
# be re-run without rebuilding the list.
passes = 20
iterations = 400
# Presumably disables perplexity evaluation during training (gensim tutorial idiom) — confirm.
eval_every = None

# Index the dictionary once before reading id2token. Presumably this forces
# gensim to populate the lazily-built id2token mapping (gensim tutorial idiom)
# — confirm. `temp` itself is not used afterwards.
temp = dictionary[0]
id2word = dictionary.id2token

# NOTE(review): no random_state is passed, so the trained topics can differ between runs.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


In [None]:
# Coherence report for the final model.
top_topics = model.top_topics(corpus) #, num_words=20)

# The average is the second element of every top_topics entry, summed and
# divided by the configured number of topics.
coherence_scores = [entry[1] for entry in top_topics]
avg_topic_coherence = sum(coherence_scores) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

## **LDA model Visualization**


In [None]:
pip install pyldavis

In [None]:
import pickle
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline


In [None]:
# Prepare the pyLDAvis data for the final model.
# sort_topics=False presumably keeps pyLDAvis topic numbering in the model's own order — confirm.
lda_visualization = gensimvis.prepare(model, corpus, dictionary, sort_topics=False)
# Save the visualization as a standalone HTML file.
# NOTE(review): the file name says "topic_4" but the final model is trained with num_topics = 5 — confirm.
pyLDAvis.save_html(lda_visualization, 'topic_4_name.html')