In [226]:
import pandas as pd
from pandas import DataFrame  as df
import numpy as np
from collections import Counter
from konlpy.tag import Okt
okt = Okt()
%matplotlib inline
import matplotlib.pyplot as plt
import re 
from datetime import datetime
import tomotopy as tp


data = pd.read_excel('reviews_youngtong.xlsx')

# 메뉴 검색

In [128]:
user_input = "짜장"


arr = []

for i in range(len(data['메뉴'])):
    if type(data['메뉴'][i]) == str and user_input in data['메뉴'][i]:
        arr.append(data.loc[i])
        

searched_reviews = pd.DataFrame(arr, columns = ['지점명','유저','메뉴','리뷰','총점','맛','양','배달','시간'])

# 데이터 전처리

In [129]:
# 데이터 프레임의 '리뷰' 열의 값들을 str 형식으로 바꾸기
searched_reviews.리뷰 = searched_reviews.리뷰.astype(str)

# 중복데이터 삭제
searched_reviews.drop_duplicates(subset=['리뷰'],inplace=True)

# 한글이 아니면 빈 문자열로 바꾸기
searched_reviews['리뷰'] = searched_reviews['리뷰'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',regex=True)

# 빈 문자열 NAN 으로 바꾸기
searched_reviews = searched_reviews.replace({'': np.nan})
searched_reviews = searched_reviews.replace(r'^\s*$', np.nan, regex=True)

# NAN 이 있는 행 삭제
searched_reviews.dropna(how='any', inplace=True)

# 인덱스 차곡차곡
searched_reviews = searched_reviews.reset_index (drop = True)

# 데이터 프레임에 null 값이 있는지 확인
print(searched_reviews.isnull().values.any())

# 리뷰 데이터를 리스트로 변환
Data_list = searched_reviews.리뷰.values.tolist()
# 정규화 처리
Data_list = list(map(okt.normalize,Data_list))

False


# 불용어 제거 및 토큰화

In [130]:
# 명사,형용사 추출
data_word = []

# 한글자 짜리는 모두 없애야 함
stopword = ['같다','이다','있다','여기','항상','완전','정말','너무','보고','오늘','역시','이번','다음','아주']

for i, document in enumerate(Data_list):
    clean_words = []
    for word in okt.pos(document, stem=True): #어간 추출
        if word[1] in ['Noun','Adjective']: 
            if len(word[0]) >= 2 and word[0] not in stopword:
                clean_words.append(word[0])    
    data_word.append(clean_words)
data_word

[['가깝다',
  '중화요리',
  '많다',
  '굳이',
  '주문',
  '죄송하다',
  '아이',
  '맛있다',
  '짜장면',
  '짜장',
  '고기',
  '건더기',
  '많다',
  '고기',
  '한참',
  '탕수육',
  '세트',
  '탕슉',
  '양념',
  '종류',
  '주심',
  '맵다',
  '칠리',
  '취저',
  '냄새',
  '하나',
  '튀김',
  '산뜻',
  '눅지',
  '만두',
  '냉장고',
  '만두',
  '이집',
  '바로',
  '만두',
  '맛집',
  '배달',
  '쿠폰',
  '메모',
  '많다',
  '장님',
  '부디',
  '번창',
  '죄송하다',
  '주문'],
 ['맛있다'],
 [],
 ['간짜장', '간도', '면도', '좋다', '맛있다', '만두', '서비스', '좋다'],
 ['양도', '많다', '맛있다'],
 ['맛있다', '배달', '빨르다'],
 ['탕수육', '섭취', '어려움', '정도', '딱딱하다', '심지어', '부분', '많다', '기분', '굉장하다', '좋다'],
 ['간짜장', '맛있다', '탕수육', '맛있다', '짬뽕', '별로'],
 ['맛있다'],
 ['배달', '빠르다', '맛있다', '가게', '저렴하다', '맛있다'],
 ['중국집', '맛있다'],
 ['김치볶음밥',
  '맛있다',
  '후라이팬',
  '살짝',
  '아쉽다',
  '짜장',
  '인간',
  '매우',
  '별로',
  '짜장',
  '소스',
  '밋밋하다',
  '급식',
  '짜장',
  '소스',
  '짜파게티',
  '반도',
  '너무하다',
  '고추',
  '짜장',
  '쟁반짜장',
  '맛있다',
  '짜장',
  '안되다'],
 ['짠맛', '싱겁다', '적당하다', '부담', '맘껏', '사진'],
 ['빠르다',
  '배달',
  '양도',
  '많다',
  '탕수육',
  '바삭',
  '맛있다',
  '짬뽕',
  '다

In [132]:
# 모델 학습 테스트

model = tp.LDAModel(k=16, alpha=0.1, eta=0.01, min_cf=20,tw=tp.TermWeight.PMI, rm_top=1)
for i, line in enumerate(data_word):
    if not line:
        line.append(" ")
    model.add_doc(line) 
    if i % 10 == 0: print('Document #{} has been loaded'.format(i))
 
model.train(0)
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print('Vocab size:', model.num_vocabs)
 
 
for i in range(1500):
    print('Iteration {}\tLL per word: {}'.format(i, model.ll_per_word))
    model.train(1)
 
for i in range(model.k):
    res = model.get_topic_words(i, top_n=10)
    print('Topic #{}'.format(i), end='\t')
    
    print(', '.join(w+':'+ str(p) for w, p in res))

Document #0 has been loaded
Document #10 has been loaded
Document #20 has been loaded
Document #30 has been loaded
Document #40 has been loaded
Document #50 has been loaded
Document #60 has been loaded
Document #70 has been loaded
Document #80 has been loaded
Document #90 has been loaded
Document #100 has been loaded
Document #110 has been loaded
Document #120 has been loaded
Document #130 has been loaded
Document #140 has been loaded
Document #150 has been loaded
Document #160 has been loaded
Document #170 has been loaded
Document #180 has been loaded
Document #190 has been loaded
Document #200 has been loaded
Document #210 has been loaded
Document #220 has been loaded
Document #230 has been loaded
Document #240 has been loaded
Document #250 has been loaded
Document #260 has been loaded
Document #270 has been loaded
Document #280 has been loaded
Document #290 has been loaded
Document #300 has been loaded
Document #310 has been loaded
Document #320 has been loaded
Document #330 has bee

Iteration 29	LL per word: -4.615125456437818
Iteration 30	LL per word: -4.6048942096563
Iteration 31	LL per word: -4.603310035340025
Iteration 32	LL per word: -4.595819223163998
Iteration 33	LL per word: -4.590728640060081
Iteration 34	LL per word: -4.5933737537925845
Iteration 35	LL per word: -4.590936642599044
Iteration 36	LL per word: -4.583329091251174
Iteration 37	LL per word: -4.587477589758737
Iteration 38	LL per word: -4.582438714803489
Iteration 39	LL per word: -4.5861292973345815
Iteration 40	LL per word: -4.576697061388775
Iteration 41	LL per word: -4.570658814014008
Iteration 42	LL per word: -4.569561967893212
Iteration 43	LL per word: -4.569704590011269
Iteration 44	LL per word: -4.570624519100903
Iteration 45	LL per word: -4.568297378098209
Iteration 46	LL per word: -4.565657614176159
Iteration 47	LL per word: -4.556830603624024
Iteration 48	LL per word: -4.5634269032854995
Iteration 49	LL per word: -4.562228852581254
Iteration 50	LL per word: -4.556623879391471
Iteration

Iteration 231	LL per word: -4.4285029714830415
Iteration 232	LL per word: -4.429955689979619
Iteration 233	LL per word: -4.427723124750859
Iteration 234	LL per word: -4.432450592249393
Iteration 235	LL per word: -4.431652846255693
Iteration 236	LL per word: -4.431862745039271
Iteration 237	LL per word: -4.426809077642345
Iteration 238	LL per word: -4.429850198322293
Iteration 239	LL per word: -4.42981645959632
Iteration 240	LL per word: -4.433459825953801
Iteration 241	LL per word: -4.4304911853970665
Iteration 242	LL per word: -4.430579048479878
Iteration 243	LL per word: -4.428916827112497
Iteration 244	LL per word: -4.4306908092209465
Iteration 245	LL per word: -4.4238934339807425
Iteration 246	LL per word: -4.4287644298359865
Iteration 247	LL per word: -4.426573457476201
Iteration 248	LL per word: -4.427169861172625
Iteration 249	LL per word: -4.435201673113109
Iteration 250	LL per word: -4.4301437034177065
Iteration 251	LL per word: -4.425606499708717
Iteration 252	LL per word: -4

Iteration 412	LL per word: -4.399195244309267
Iteration 413	LL per word: -4.406483991238928
Iteration 414	LL per word: -4.405615727083871
Iteration 415	LL per word: -4.409212713390936
Iteration 416	LL per word: -4.403528730436355
Iteration 417	LL per word: -4.4082022641415906
Iteration 418	LL per word: -4.4101084761515414
Iteration 419	LL per word: -4.404184800815834
Iteration 420	LL per word: -4.403118160705535
Iteration 421	LL per word: -4.404923244314476
Iteration 422	LL per word: -4.405402998691633
Iteration 423	LL per word: -4.403028853064129
Iteration 424	LL per word: -4.407900953649331
Iteration 425	LL per word: -4.408020566526314
Iteration 426	LL per word: -4.409244670116237
Iteration 427	LL per word: -4.411386736463152
Iteration 428	LL per word: -4.405613778659566
Iteration 429	LL per word: -4.405850310738155
Iteration 430	LL per word: -4.406772982379621
Iteration 431	LL per word: -4.410949565946187
Iteration 432	LL per word: -4.410315760105345
Iteration 433	LL per word: -4.40

Iteration 596	LL per word: -4.403463051585839
Iteration 597	LL per word: -4.405107577513904
Iteration 598	LL per word: -4.405535768184078
Iteration 599	LL per word: -4.403733060335244
Iteration 600	LL per word: -4.403077400229042
Iteration 601	LL per word: -4.400785862828771
Iteration 602	LL per word: -4.40278692120822
Iteration 603	LL per word: -4.3999120580345465
Iteration 604	LL per word: -4.402476348393387
Iteration 605	LL per word: -4.4063153167929725
Iteration 606	LL per word: -4.407003440887147
Iteration 607	LL per word: -4.403030108211316
Iteration 608	LL per word: -4.402876800654546
Iteration 609	LL per word: -4.402018135368037
Iteration 610	LL per word: -4.401949043838651
Iteration 611	LL per word: -4.408528645573917
Iteration 612	LL per word: -4.410508821301004
Iteration 613	LL per word: -4.411412084038332
Iteration 614	LL per word: -4.405143354183008
Iteration 615	LL per word: -4.406401537886576
Iteration 616	LL per word: -4.403586017560011
Iteration 617	LL per word: -4.402

Iteration 792	LL per word: -4.392446337827281
Iteration 793	LL per word: -4.396873573690836
Iteration 794	LL per word: -4.395834083117534
Iteration 795	LL per word: -4.396902635830892
Iteration 796	LL per word: -4.393106610297189
Iteration 797	LL per word: -4.396020741223245
Iteration 798	LL per word: -4.3975457618295035
Iteration 799	LL per word: -4.3945531496540395
Iteration 800	LL per word: -4.390965293640711
Iteration 801	LL per word: -4.393297949620206
Iteration 802	LL per word: -4.397399116768394
Iteration 803	LL per word: -4.401145349824266
Iteration 804	LL per word: -4.397377413016272
Iteration 805	LL per word: -4.404572081047195
Iteration 806	LL per word: -4.404582444263858
Iteration 807	LL per word: -4.402814706139133
Iteration 808	LL per word: -4.4044149237603545
Iteration 809	LL per word: -4.400410485748164
Iteration 810	LL per word: -4.398892773674987
Iteration 811	LL per word: -4.395829628205005
Iteration 812	LL per word: -4.397341709080071
Iteration 813	LL per word: -4.4

Iteration 970	LL per word: -4.392201470407719
Iteration 971	LL per word: -4.393475275749652
Iteration 972	LL per word: -4.39342711088224
Iteration 973	LL per word: -4.390555917783775
Iteration 974	LL per word: -4.393377068090026
Iteration 975	LL per word: -4.3905420706526535
Iteration 976	LL per word: -4.387211088380315
Iteration 977	LL per word: -4.390211632998033
Iteration 978	LL per word: -4.386660556872712
Iteration 979	LL per word: -4.379642778322368
Iteration 980	LL per word: -4.382899236804279
Iteration 981	LL per word: -4.3878316211077815
Iteration 982	LL per word: -4.385337577656372
Iteration 983	LL per word: -4.390158026734821
Iteration 984	LL per word: -4.390366043387401
Iteration 985	LL per word: -4.38903942308203
Iteration 986	LL per word: -4.387331846732495
Iteration 987	LL per word: -4.3891019002218385
Iteration 988	LL per word: -4.387112872610046
Iteration 989	LL per word: -4.391884385898499
Iteration 990	LL per word: -4.389894051773637
Iteration 991	LL per word: -4.391

Iteration 1158	LL per word: -4.393526133272895
Iteration 1159	LL per word: -4.392316905187879
Iteration 1160	LL per word: -4.387400785538103
Iteration 1161	LL per word: -4.389803032338456
Iteration 1162	LL per word: -4.386578785058991
Iteration 1163	LL per word: -4.388111751336449
Iteration 1164	LL per word: -4.381900829167261
Iteration 1165	LL per word: -4.386308615292287
Iteration 1166	LL per word: -4.376800956933129
Iteration 1167	LL per word: -4.385825130599879
Iteration 1168	LL per word: -4.385676795315466
Iteration 1169	LL per word: -4.382160627309482
Iteration 1170	LL per word: -4.385181379896575
Iteration 1171	LL per word: -4.383382958879304
Iteration 1172	LL per word: -4.379486589955646
Iteration 1173	LL per word: -4.379905160120508
Iteration 1174	LL per word: -4.379095824310227
Iteration 1175	LL per word: -4.379439097455529
Iteration 1176	LL per word: -4.3829773684525035
Iteration 1177	LL per word: -4.38680550081327
Iteration 1178	LL per word: -4.381480296514769
Iteration 117

Iteration 1346	LL per word: -4.381802889668635
Iteration 1347	LL per word: -4.38091294361996
Iteration 1348	LL per word: -4.381743434836129
Iteration 1349	LL per word: -4.386707699463038
Iteration 1350	LL per word: -4.386058401592961
Iteration 1351	LL per word: -4.385193285718492
Iteration 1352	LL per word: -4.380961300601947
Iteration 1353	LL per word: -4.3884319524757744
Iteration 1354	LL per word: -4.388506896330313
Iteration 1355	LL per word: -4.393241297347547
Iteration 1356	LL per word: -4.390521734548605
Iteration 1357	LL per word: -4.386739231763728
Iteration 1358	LL per word: -4.390772445576892
Iteration 1359	LL per word: -4.391020944568967
Iteration 1360	LL per word: -4.389093615406653
Iteration 1361	LL per word: -4.3800329298551635
Iteration 1362	LL per word: -4.386176689719357
Iteration 1363	LL per word: -4.385736338257492
Iteration 1364	LL per word: -4.386124002782011
Iteration 1365	LL per word: -4.379764411269576
Iteration 1366	LL per word: -4.3818110829647665
Iteration 1

# 일관성 계산 

In [133]:
def compute_coherence_values(data_word ,limit, start=4, step=2):

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = tp.LDAModel(k=num_topics, alpha=0.1, eta=0.01, min_cf=20,tw=tp.TermWeight.PMI, rm_top=1)
        for i, line in enumerate(data_word):
            if not line:
                line.append(" ")
            model.add_doc(line)
        model.train(1500) # 학습 정도
        model_list.append(model)
        coherence_model = coherence.Coherence(model, coherence='c_v', window_size=0, targets=None, top_n=10, eps=1e-12, gamma=1.0)
        coherence_values.append(coherence_model.get_score())
    return model_list, coherence_values

In [134]:
# 일관성 계산 테스트
from tomotopy import coherence
coherence_model = coherence.Coherence(model, coherence='c_v', window_size=0, targets=None, top_n=10, eps=1e-12, gamma=1.0)
coherence_model.get_score()

0.5761952466797085

In [135]:
# 토픽 수를 14~ 25 로 해서 각각의 모델과 일관성 점수를 계산한 리스트 얻기 
from tomotopy import coherence
model_list, coherence_values = compute_coherence_values(data_word=data_word, start=14, limit=25, step=2)

In [136]:
# 최적의 모델 얻기
limit=25; start=14; step=2;
x = range(start, limit, step)
topic_num = 0
count = 0
max_coherence = 0
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", cv)
    coherence = cv
    if coherence >= max_coherence:
        max_coherence = coherence
        topic_num = m
        model_list_num = count   
    count = count+1

# 최적의 모델
optimal_model = model_list[model_list_num]

Num Topics = 14  has Coherence Value of 0.5666960178741387
Num Topics = 16  has Coherence Value of 0.5601019229739904
Num Topics = 18  has Coherence Value of 0.5712031256407499
Num Topics = 20  has Coherence Value of 0.5666951548308135
Num Topics = 22  has Coherence Value of 0.5558717583052135
Num Topics = 24  has Coherence Value of 0.5478932381297151


In [137]:
for i in range(optimal_model.k):
    res = optimal_model.get_topic_words(i, top_n=10)
    print('Topic #{}'.format(i), end='\t')
    #print(', '.join(w for w, p in res))
    print(', '.join(w+':'+ str(p) for w, p in res))

Topic #0	리뷰:0.06581462174654007, 만두:0.041614335030317307, 이벤트:0.03986998274922371, 짬뽕:0.025652924552559853, 짜장면:0.024610761553049088, 주문:0.023129422217607498, 만족:0.022747252136468887, 서비스:0.022747192531824112, 배달:0.02232983335852623, 괜찮다:0.02169489488005638
Topic #1	굳다:0.30070242285728455, 사진:0.11010444909334183, 깜빡:0.0921492651104927, 번창:0.05905796214938164, 배고프다:0.05330663546919823, 상하이:0.05044722929596901, 많다:0.0483270138502121, 양도:0.04676007106900215, 급하다:0.04505987465381622, 진짜:0.04087330773472786
Topic #2	기분:0.1528807133436203, 사람:0.15064360201358795, 그냥:0.12660038471221924, 먹기:0.11761032044887543, 쿠폰:0.10961844027042389, 그렇다:0.10685716569423676, 나쁘다:0.09823422133922577, 금방:0.07945482432842255, 힘들다:0.029438627883791924, 양파:0.01138984877616167
Topic #3	탕수육:0.030428702011704445, 짜장:0.029197368770837784, 고기:0.02756735123693943, 없다:0.027452688664197922, 간짜장:0.026914866641163826, 짬뽕:0.026034507900476456, 많다:0.02393130585551262, 그냥:0.023883922025561333, 아쉽다:0.023196009919047356, 불어:0.0

# 토픽 레이블링

In [198]:
# 주제 단어 후보군 뽑기
extractor = tp.label.PMIExtractor(min_cf=15, min_df=5,min_len = 2, max_len=20, max_cand=10000)
cands = extractor.extract(optimal_model)

labels_per_topic = []

labeler = tp.label.FoRelevance(optimal_model, cands, min_df=5, smoothing=0.01, mu=0.25)
for k in range(optimal_model.k):
    print("== Topic #{} ==".format(k))
    print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    labels_per_topic.append(labeler.get_topic_labels(k, top_n=2)[0][0]+' + '+labeler.get_topic_labels(k, top_n=2)[1][0])
    for word, prob in optimal_model.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()


== Topic #0 ==
Labels: 리뷰 이벤트, 짬뽕 국물, 세트 메뉴, 서비스 주신, 주신 만두
리뷰	0.06581462174654007
만두	0.041614335030317307
이벤트	0.03986998274922371
짬뽕	0.025652924552559853
짜장면	0.024610761553049088
주문	0.023129422217607498
만족	0.022747252136468887
서비스	0.022747192531824112
배달	0.02232983335852623
괜찮다	0.02169489488005638

== Topic #1 ==
Labels: 양도 많다, 맛있다 짜장면, 예상 시간, 맛있다 만두, 만두 맛있다
굳다	0.30070242285728455
사진	0.11010444909334183
깜빡	0.0921492651104927
번창	0.05905796214938164
배고프다	0.05330663546919823
상하이	0.05044722929596901
많다	0.0483270138502121
양도	0.04676007106900215
급하다	0.04505987465381622
진짜	0.04087330773472786

== Topic #2 ==
Labels: 소스 종류, 배달 음식, 짜장 소스, 떡볶이 맛있다, 음식 맛있다
기분	0.1528807133436203
사람	0.15064360201358795
그냥	0.12660038471221924
먹기	0.11761032044887543
쿠폰	0.10961844027042389
그렇다	0.10685716569423676
나쁘다	0.09823422133922577
금방	0.07945482432842255
힘들다	0.029438627883791924
양파	0.01138984877616167

== Topic #3 ==
Labels: 짜장 소스, 탕수육 고기, 세트 메뉴, 짬뽕 국물, 탕수육 소스
탕수육	0.030428702011704445
짜장	0.029197368770837784
고기	0

In [199]:
labels_per_topic

['리뷰 이벤트 + 짬뽕 국물',
 '양도 많다 + 맛있다 짜장면',
 '소스 종류 + 배달 음식',
 '짜장 소스 + 탕수육 고기',
 '양도 많다 + 양도 많다 배달',
 '배달 시간 + 시간 시간',
 '요청 사항 + 리뷰 이벤트',
 '짜장 떡볶이 + 떡볶이 맛있다',
 '최고 최고 + 탕수육 소스',
 '진짜 맛있다 + 차돌 짬뽕',
 '서비스 주신 만두 + 주신 만두',
 '맛있다 짬뽕 + 짬뽕 국물',
 '짜장 소스 + 탕수육 고기',
 '깔끔하다 맛있다 + 맛있다 자주',
 '짬뽕 국물 + 탕수육 소스',
 '최고 최고 + 양도 적당하다',
 '탕수육 맛있다 + 맛있다 짜장면',
 '빠르다 배달 + 기분 좋다']

# 리뷰별 토픽 할당

In [200]:
def format_topics_sentences(ldamodel=optimal_model):
    new_doc = ldamodel.docs
    sent_topics_df = pd.DataFrame()
    
    for i in range(len(data_word)):
        topics = new_doc[i].get_topics()
        for j, (topic_num, prop_topic) in enumerate(topics):
            if j == 0:
                wp = optimal_model.get_topic_words(topic_num, top_n=10)
                topic_keywords = ", ".join([word for word, prop in wp])
                topic_label = labels_per_topic[topics[j][0]]
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4),topic_label, topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution','Topic_Label', 'Topic_Keywords']
    sent_topics_df = pd.concat([sent_topics_df, searched_reviews['지점명'],searched_reviews['유저'],searched_reviews['메뉴'],searched_reviews['리뷰'],searched_reviews['총점'],searched_reviews['맛'],searched_reviews['양'],searched_reviews['배달'],searched_reviews['시간']], axis=1)
    
    return(sent_topics_df) 

In [201]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model)
df_topic_review = df_topic_sents_keywords.reset_index()
df_topic_review.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib','Topic_Label' ,'Keywords', '지점명','유저', '메뉴','리뷰','총점','맛','양','배달','시간' ]
df_topic_review

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Topic_Label,Keywords,지점명,유저,메뉴,리뷰,총점,맛,양,배달,시간
0,0,3.0,0.4812,짜장 소스 + 탕수육 고기,"탕수육, 짜장, 고기, 없다, 간짜장, 짬뽕, 많다, 그냥, 아쉽다, 불어",상하이-인덕원점,gu**님,세트1（등심탕수육＋짜장1＋만두）/1(곱빼기 추가(곱빼기 추가)),집 가까운 중화요리집 많지만 굳이 비오는 날에 주문해서 죄송한데 아이들이 넘 맛있어...,5,5,5,5,2시간 전
1,1,14.0,0.2374,짬뽕 국물 + 탕수육 소스,"탕수육, 소스, 짬뽕, 좋다, 짜장면, 짜장, 바삭, 조금, 가지, 진짜",상하이-인덕원점,p2**님,세트1（등심탕수육＋짜장1＋만두）/1,항상맛있게 잘먹었습니다,5,5,5,5,15시간 전
2,2,15.0,0.9728,최고 최고 + 양도 적당하다,", 시간, 보통, 짜장, 불어, 좋다, 전화, 예상, 짬뽕, 아이",상하이-인덕원점,ch**님,"삼선짬뽕/1,세트1（등심탕수육＋짜장1＋만두）/1",감사합니다 잘먹었습니다,5,5,5,5,어제
3,3,14.0,0.9932,짬뽕 국물 + 탕수육 소스,"탕수육, 소스, 짬뽕, 좋다, 짜장면, 짜장, 바삭, 조금, 가지, 진짜",상하이-인덕원점,so**님,"짜장/1(곱빼기 추가(곱빼기 추가)),간짜장/2(곱빼기 추가(곱빼기 추가))",간짜장 딱 간도 맞고 면도 좋고 맛있게 잘먹었습니다 군만두 서비스도 좋아요,5,5,5,5,4일 전
4,4,4.0,0.9798,양도 많다 + 양도 많다 배달,"배달, 양도, 많다, 빠르다, 좋다, 빨르다, 배송, 괜찮다, 진짜, 시간",상하이-인덕원점,py**님,"짜장/1(곱빼기 추가(곱빼기 추가)),짬뽕/1(곱빼기 추가(곱빼기 추가))",양도 많고 맛있어요,5,5,5,5,5일 전
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4652,4652,2.0,0.9892,소스 종류 + 배달 음식,"기분, 사람, 그냥, 먹기, 쿠폰, 그렇다, 나쁘다, 금방, 힘들다, 양파",매운국물방앗간떡볶이-본점,손님,"치킨마요 컵밥세트 （떡볶이＋치킨마요컵밥＋순대＋쿨피스） /1(떡볶이 선택(짜장),매운...",떡이랑 오뎅이 너무안집혀져서 좀 먹기 힘들었는데 금방 배도차고맛있었어요 쿨피스 살얼...,4,5,5,3,2020년 3월 6일
4653,4653,4.0,0.9816,양도 많다 + 양도 많다 배달,"배달, 양도, 많다, 빠르다, 좋다, 빨르다, 배송, 괜찮다, 진짜, 시간",매운국물방앗간떡볶이-본점,u2**님,"둘이서세트（떡볶이1~2人＋모듬튀김＋고로케2개）/1(떡볶이 선택(짜장),매운맛 선택(...",빠르고 양많고 맛있고 박자가 다 아주 좋아요,5,5,5,5,2020년 1월 2일
4654,4654,9.0,0.9914,진짜 맛있다 + 차돌 짬뽕,"짜장, 굿굿, 간짜장, 단골, 중국음식, 차돌, 진짜, 짬뽕, 최고, 떡볶이",매운국물방앗간떡볶이-본점,u2**님,"둘이서세트（떡볶이1~2人＋모듬튀김＋고로케2개）/1(떡볶이 선택(짜장),매운맛 선택(...",지금까지 먹어본 짜장떡볶이 중에 제일 맛있어요,5,5,5,5,2019년 8월 31일
4655,4655,7.0,0.9900,짜장 떡볶이 + 떡볶이 맛있다,"떡볶이, 단계, 짜장, 맵다, 꽃빵, 연유, 돈까스, 이벤트, 리뷰, 처음",매운국물방앗간떡볶이-본점,손님,"돈튀김 떡볶이/1(떡볶이 선택(짜장),매운맛 선택(1단계 맛있게매콤한맛),떡 ,오뎅...",완전 짜장으로 만든 떡볶이인줄 알았는데 짜장맛이 강하지 않고 잘 어울려요 단...,5,5,5,5,2019년 8월 12일


In [222]:
# 토픽별 리뷰수
element_count = {}
for row in df_topic_review.iloc:
    element_count.setdefault(row['Dominant_Topic'],0)
    element_count[row['Dominant_Topic']]+=1

# 토픽별 총점 합계
topic_total = {}
for row in df_topic_review.iloc:
    topic_total.setdefault(row['Dominant_Topic'],0)
    topic_total[row['Dominant_Topic']]+=row['총점']

# 토픽별 평균 총점    
for topic in topic_total:
    topic_total[topic] = topic_total[topic] / element_count[topic]
topic_total


# 평균 별점 순으로 정렬
sdict= sorted(topic_total.items(), key= lambda x : x[1],reverse = True)
sdict


[(8.0, 4.877358490566038),
 (9.0, 4.858695652173913),
 (13.0, 4.857142857142857),
 (1.0, 4.848484848484849),
 (7.0, 4.8108108108108105),
 (16.0, 4.8),
 (10.0, 4.780952380952381),
 (15.0, 4.7560975609756095),
 (4.0, 4.746543778801843),
 (17.0, 4.711428571428572),
 (14.0, 4.6421319796954315),
 (6.0, 4.362068965517241),
 (0.0, 4.23574144486692),
 (11.0, 4.153846153846154),
 (3.0, 3.9190600522193213),
 (2.0, 3.8333333333333335),
 (12.0, 3.825870646766169),
 (5.0, 3.6785714285714284)]

In [225]:
result = pd.DataFrame()
for i in range(3):
    topic = int(sdict[i][0])
    score = round(sdict[i][1],4)
    res = optimal_model.get_topic_words(topic, top_n=10)
    label = labels_per_topic[topic]
    keywords = ', '.join(w for w, p in res)
    result = result.append(pd.Series([topic, label,keywords, score]), ignore_index=True)
result.columns = ['Topic_num', 'Topic_name','Keywords','Score']


Unnamed: 0,Topic_num,Topic_name,Keywords,Score
0,8.0,최고 최고 + 탕수육 소스,"최고, 중국집, 중식, 제일, 동네, 주문, 맛집, 북경, 상하이, 단골",4.8774
1,9.0,진짜 맛있다 + 차돌 짬뽕,"짜장, 굿굿, 간짜장, 단골, 중국음식, 차돌, 진짜, 짬뽕, 최고, 떡볶이",4.8587
2,13.0,깔끔하다 맛있다 + 맛있다 자주,"깔끔하다, 자주, 포장, 번창, 주문, 이용, 쟁반짜장, 중식, 좋다, 다시",4.8571
