# 리뷰데이터 추가

In [79]:
import pandas as pd
from pandas import DataFrame  as df
import numpy as np
from collections import Counter
from konlpy.tag import Okt
okt = Okt()
%matplotlib inline
import matplotlib.pyplot as plt
import re 
from datetime import datetime
import tomotopy as tp


# Load the review spreadsheet; later cells read its '메뉴' and '지점명' columns.
data = pd.read_excel('reviews_youngtong.xlsx')

# 메뉴 검색(지역 주민이 고려하는 것)

In [91]:
user_input = input("메뉴를 검색하세요 : ")

# Collect every review row whose menu text contains the search term.
# Non-string menu cells are skipped.
arr = [
    data.loc[idx]
    for idx, menu in data['메뉴'].items()
    if isinstance(menu, str) and user_input in menu
]

searched_reviews = pd.DataFrame(arr, columns = ['지점명','유저','메뉴','리뷰','총점','맛','양','배달','시간'])

# Cast the review column to str so the string operations below apply to every row.
searched_reviews['리뷰'] = searched_reviews['리뷰'].astype(str)

# Drop rows whose review text duplicates an earlier row.
searched_reviews.drop_duplicates(subset=['리뷰'], inplace=True)

# Replace every non-Hangul character with a space.
searched_reviews['리뷰'] = searched_reviews['리뷰'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',regex=True)

# Turn empty and whitespace-only cells into NaN (the pattern also matches '').
searched_reviews = searched_reviews.replace(r'^\s*$', np.nan, regex=True)

# Drop every row that has a NaN in any column, then renumber the index from 0.
searched_reviews = searched_reviews.dropna(how='any').reset_index(drop=True)

# Normalise each review text with Okt.
Data_list = [okt.normalize(review) for review in searched_reviews['리뷰']]

# Words excluded from the topic model. Single-character tokens are dropped
# separately by the length check below.
stopword = ['같다','이다','있다','여기','항상','완전','정말','너무','보고','오늘','역시','이번','다음','아주']
stopword_set = set(stopword)

# Keep stemmed nouns and adjectives of length >= 2 that are not stopwords;
# one token list per review, in the same order as searched_reviews.
data_word = [
    [
        token
        for token, tag in okt.pos(document, stem=True)
        if tag in ('Noun', 'Adjective') and len(token) >= 2 and token not in stopword_set
    ]
    for document in Data_list
]
    
    
def compute_coherence_values(data_word, limit, start=4, step=2):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    Args:
        data_word: Tokenised documents, one list of str tokens per document.
            The lists are not modified.
        limit: Exclusive upper bound of the topic-count range.
        start: First topic count to try.
        step: Increment between consecutive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple; both lists follow the
        order of ``range(start, limit, step)``.
    """
    # Imported locally so this function does not depend on a module-level
    # ``coherence`` name, which the notebook later rebinds to a float.
    from tomotopy.coherence import Coherence

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = tp.LDAModel(k=num_topics, alpha=0.1, eta=0.01, min_cf=20, tw=tp.TermWeight.PMI, rm_top=1)
        for line in data_word:
            # A token-less document is fed as a single blank token, without
            # touching the caller's list. Presumably this keeps model.docs
            # aligned with data_word, which format_topics_sentences indexes
            # by position.
            model.add_doc(line if line else [" "])
        model.train(1500)  # number of training iterations
        model_list.append(model)
        coherence_model = Coherence(model, coherence='c_v', window_size=0, targets=None, top_n=10, eps=1e-12, gamma=1.0)
        coherence_values.append(coherence_model.get_score())
    return model_list, coherence_values


# Train and score one model per candidate topic count: 14, 16, ..., 24
# (``limit`` is exclusive).
from tomotopy import coherence
start = 14
limit = 25
step = 2
model_list, coherence_values = compute_coherence_values(data_word=data_word, start=start, limit=limit, step=step)


# Pick the model with the highest coherence; on a tie the later (larger)
# topic count wins because the comparison is ``>=``.
x = range(start, limit, step)
topic_num = 0
max_coherence = float('-inf')
print()
print("=====================일관성 계산 결과======================")

for position, (m, cv) in enumerate(zip(x, coherence_values)):
    print("Num Topics =", m, " has Coherence Value of", cv)
    # The score is compared directly; it is not stored under the name
    # ``coherence`` so the imported module keeps that name.
    if cv >= max_coherence:
        max_coherence = cv
        topic_num = m
        model_list_num = position

# The best-scoring model.
optimal_model = model_list[model_list_num]


# Extract candidate label phrases for the topics.
extractor = tp.label.PMIExtractor(min_cf=15, min_df=5,min_len = 2, max_len=20, max_cand=10000)
cands = extractor.extract(optimal_model)

labels_per_topic = []

labeler = tp.label.FoRelevance(optimal_model, cands, min_df=5, smoothing=0.01, mu=0.25)
print()
print("=====================추출된 토픽======================")
for k in range(optimal_model.k):
    print("== Topic #{} ==".format(k))
    top_labels = [label for label, score in labeler.get_topic_labels(k, top_n=5)]
    print("Labels:", ', '.join(top_labels))
    # The topic's display name joins its two best labels. Slicing instead of
    # indexing [0] and [1] avoids an IndexError when the labeler returns
    # fewer than two labels for a topic.
    labels_per_topic.append(' + '.join(top_labels[:2]))
    for word, prob in optimal_model.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()
    
    
def format_topics_sentences(ldamodel=optimal_model):
    """Build a table pairing each review with the first topic reported for it.

    Reads the module-level ``data_word`` (for the document count),
    ``labels_per_topic`` and ``searched_reviews``.

    Args:
        ldamodel: Trained topic model whose ``docs`` were added in the same
            order as ``data_word``. The default is whatever ``optimal_model``
            was bound to when this ``def`` was executed.

    Returns:
        A DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Label`` and ``Topic_Keywords``, followed by the review columns
        of ``searched_reviews``, joined side by side on the index.
    """
    docs = ldamodel.docs
    rows = []

    for i in range(len(data_word)):
        topics = docs[i].get_topics()
        if not topics:
            continue
        # Only the first entry is used; it is treated as the dominant topic
        # (presumably get_topics() returns topics in descending probability).
        topic_num, prop_topic = topics[0]
        # Keywords come from the model passed in, not the global optimal_model.
        wp = ldamodel.get_topic_words(topic_num, top_n=10)
        topic_keywords = ", ".join(word for word, prop in wp)
        rows.append([int(topic_num), round(prop_topic, 4), labels_per_topic[topic_num], topic_keywords])

    # Rows are collected in a list and the frame is built once, because
    # DataFrame.append no longer exists in pandas 2.x. This also works when
    # there are no rows.
    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution','Topic_Label', 'Topic_Keywords'])
    review_columns = ['지점명', '유저', '메뉴', '리뷰', '총점', '맛', '양', '배달', '시간']
    sent_topics_df = pd.concat([sent_topics_df, searched_reviews[review_columns]], axis=1)

    return sent_topics_df


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model)
df_topic_review = df_topic_sents_keywords.reset_index()
df_topic_review.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib','Topic_Label' ,'Keywords', '지점명','유저', '메뉴','리뷰','총점','맛','양','배달','시간' ]

# Review count and mean overall rating per dominant topic.
# sort=False keeps topics in order of first appearance, so equal averages
# keep that order in the stable sort below.
ratings_by_topic = df_topic_review.groupby('Dominant_Topic', sort=False)['총점']
element_count = ratings_by_topic.size().to_dict()
topic_total = ratings_by_topic.mean().to_dict()


# Order topics by mean rating, highest first.
sdict= sorted(topic_total.items(), key= lambda x : x[1],reverse = True)


# Report the top three topics (fewer if fewer topics were assigned).
top_rows = []
for topic_key, mean_score in sdict[:3]:
    topic = int(topic_key)
    res = optimal_model.get_topic_words(topic, top_n=10)
    keywords = ', '.join(w for w, p in res)
    top_rows.append([topic, labels_per_topic[topic], keywords, round(mean_score, 4)])
# Built in one step because DataFrame.append no longer exists in pandas 2.x.
result = pd.DataFrame(top_rows, columns=['Topic_num', 'Topic_name','Keywords','Score'])


print()
print("=====================결과======================")
print()
print("영통 주민들이 << {} >>를 시킬때 고려하는 것".format(user_input))
result

메뉴를 검색하세요 : 짜장

Num Topics = 14  has Coherence Value of 0.5350377301020282
Num Topics = 16  has Coherence Value of 0.5435796621255576
Num Topics = 18  has Coherence Value of 0.532650276935763
Num Topics = 20  has Coherence Value of 0.530063804090023
Num Topics = 22  has Coherence Value of 0.5325206689536572
Num Topics = 24  has Coherence Value of 0.5236045934880772

== Topic #0 ==
Labels: 짬뽕 국물, 차돌 짬뽕, 진짜 맛있다, 짜장 짬뽕, 짜장 소스
짬뽕	0.03819144517183304
탕수육	0.03505537286400795
짜장	0.03410447761416435
많다	0.03008469007909298
짜장면	0.028170477598905563
주문	0.027027051895856857
없다	0.025977004319429398
간짜장	0.025405146181583405
좋다	0.02459040842950344
처음	0.022782929241657257

== Topic #1 ==
Labels: 탕수육 고기, 맛있다 탕수육, 진짜 맛있다, 짬뽕 국물, 맛있다 짜장면
탕수육	0.19666631519794464
고기	0.15484119951725006
별로	0.13634610176086426
튀김	0.12397678196430206
찹쌀	0.11088305711746216
진짜	0.07213860005140305
짜장면	0.062130752950906754
짬뽕	0.05222358554601669
느낌	0.04851856827735901
괜찮다	0.017821934074163437

== Topic #2 ==
Labels: 포장 깔끔하다, 맛있다

Unnamed: 0,Topic_num,Topic_name,Keywords,Score
0,14.0,차돌 짬뽕 + 짬뽕 맛있다,"최고, 짬뽕, 탕슉, 차돌, 자장면, 탕수육, 짜장, 괜찮다, 짜장면, 찹쌀",4.8966
1,15.0,포장 깔끔하다 + 짬뽕 국물,"깔끔하다, 국물, 포장, 짬뽕, 좋다, 짜장, 짜장면, 조금, 소스, 없다",4.875
2,10.0,양도 많다 맛있다 + 탕수육 소스,", 양도, 깔끔하다, 포장, 살짝, 정도, 빠르다, 시간, 배달, 중국집",4.8611


# 음식점 검색

## 음식점 리스트

In [81]:
# Count the reviews per store; keys keep the order of first appearance.
element_count = Counter(data['지점명'])

# List of distinct store names.
list(element_count)

['후라이드참잘하는집-영통흥덕점',
 '피자헛-영통로점',
 '김준현의피자헤븐-수원영통점',
 '호식이두마리치킨-영통1호점',
 'BBQ-용인서천점',
 '롯데리아-수원영통점',
 '미스터피자-영통점',
 'PB델리-수원경희대점',
 '아웃백-동탄반송점',
 '파파존스피자-영통점',
 '쫄면주는삼겹본능by놀부-수원망포점',
 '에그드랍-영통역점',
 '공차-수원영통점',
 '원할머니보쌈-수원광교배달점',
 '놀부보쌈족발-수원영통점',
 '부어치킨-광교호수공원점',
 '놀부부대찌개-수원영통점',
 '맘스터치-수원경희대점',
 '세븐일레븐-영통매영로점',
 '호치킨-용인흥덕점',
 '돈까스퐁당떡볶이공수간-수원망포점',
 '두마리찜닭두찜-용인한보라점',
 '족보잇는국밥&밀면-수원영통점',
 '마약보쌈족발-서천점',
 '맥도날드-경희대국제캠퍼스점',
 '마싰는끼니-기흥서천점',
 '카페인중독-수원영통점',
 '반올림피자샵-영통점',
 'KFC-영통씨네마점',
 '제육대가-영통점',
 '프랑킨숯불양념구이치킨-망포점',
 '쏘자토스트-영통점',
 '초밥쟁이',
 '우쿠야-영통점',
 '생생직판장',
 '롯데리아-용인보라점',
 '정성이가득찬집밥-영통점',
 '뱃살도둑샐러드&과일-영통점',
 '용인백암순대국왕뼈감자탕',
 '혜경궁',
 '석천칡냉면-영통점',
 '덮밥90도씨-수원영통점',
 '중앙왕족발보쌈',
 '보배반점-수원영통점',
 '야미가',
 '떡순튀-망포영통점',
 '해를품은짬뽕-용인점',
 '배터지는생동까스-신구갈점',
 '대만족동탄',
 '또야지',
 '싸움의고수-수원영통점',
 '리얼베트남쌀국수&월남쌈-본점',
 '던킨-영통홈플러스점',
 '미챠이-경희대점',
 'BBQ-수원망포점',
 '요녀석파스타&필라프-영통점',
 '중평떡볶이24시',
 '커피브라더-영통점',
 '청년다방-용인기흥역점',
 '카페페리스휠',
 '푸라닭-수원영통1호점',
 '이디야커피-수원경희대점',
 '진구네간장게장',
 '백암왕순대왕뼈감자탕',
 '단향',
 '쭈대가-영통점',
 '

In [93]:
user_input = input("음식점을 검색하세요 : ")

# Collect every review row whose store name contains the search term.
# Non-string store cells are skipped. ``arr`` is reused by the
# low-rating cell further down.
arr = [
    data.loc[idx]
    for idx, store in data['지점명'].items()
    if isinstance(store, str) and user_input in store
]

searched_reviews = pd.DataFrame(arr, columns = ['지점명','유저','메뉴','리뷰','총점','맛','양','배달','시간'])

# Cast the review column to str so the string operations below apply to every row.
searched_reviews['리뷰'] = searched_reviews['리뷰'].astype(str)

# Drop rows whose review text duplicates an earlier row.
searched_reviews.drop_duplicates(subset=['리뷰'], inplace=True)

# Replace every non-Hangul character with a space.
searched_reviews['리뷰'] = searched_reviews['리뷰'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',regex=True)

# Turn empty and whitespace-only cells into NaN (the pattern also matches '').
searched_reviews = searched_reviews.replace(r'^\s*$', np.nan, regex=True)

# Drop every row that has a NaN in any column, then renumber the index from 0.
searched_reviews = searched_reviews.dropna(how='any').reset_index(drop=True)

# Normalise each review text with Okt.
Data_list = [okt.normalize(review) for review in searched_reviews['리뷰']]

# Words excluded from the topic model. Single-character tokens are dropped
# separately by the length check below.
stopword = ['같다','이다','있다','여기','항상','완전','정말','너무','보고','오늘','역시','이번','다음','아주']
stopword_set = set(stopword)

# Keep stemmed nouns and adjectives of length >= 2 that are not stopwords;
# one token list per review, in the same order as searched_reviews.
data_word = [
    [
        token
        for token, tag in okt.pos(document, stem=True)
        if tag in ('Noun', 'Adjective') and len(token) >= 2 and token not in stopword_set
    ]
    for document in Data_list
]
    
    
def compute_coherence_values(data_word, limit, start=4, step=2):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    Args:
        data_word: Tokenised documents, one list of str tokens per document.
            The lists are not modified.
        limit: Exclusive upper bound of the topic-count range.
        start: First topic count to try.
        step: Increment between consecutive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple; both lists follow the
        order of ``range(start, limit, step)``.
    """
    # Imported locally so this function does not depend on a module-level
    # ``coherence`` name, which the notebook later rebinds to a float.
    from tomotopy.coherence import Coherence

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = tp.LDAModel(k=num_topics, alpha=0.1, eta=0.01, min_cf=20, tw=tp.TermWeight.PMI, rm_top=1)
        for line in data_word:
            # A token-less document is fed as a single blank token, without
            # touching the caller's list. Presumably this keeps model.docs
            # aligned with data_word, which format_topics_sentences indexes
            # by position.
            model.add_doc(line if line else [" "])
        model.train(1500)  # number of training iterations
        model_list.append(model)
        coherence_model = Coherence(model, coherence='c_v', window_size=0, targets=None, top_n=10, eps=1e-12, gamma=1.0)
        coherence_values.append(coherence_model.get_score())
    return model_list, coherence_values


# Train and score one model per candidate topic count: 14, 16, ..., 24
# (``limit`` is exclusive).
from tomotopy import coherence
start = 14
limit = 25
step = 2
model_list, coherence_values = compute_coherence_values(data_word=data_word, start=start, limit=limit, step=step)


# Pick the model with the highest coherence; on a tie the later (larger)
# topic count wins because the comparison is ``>=``.
x = range(start, limit, step)
topic_num = 0
max_coherence = float('-inf')
print()
print("=====================일관성 계산 결과======================")

for position, (m, cv) in enumerate(zip(x, coherence_values)):
    print("Num Topics =", m, " has Coherence Value of", cv)
    # The score is compared directly; it is not stored under the name
    # ``coherence`` so the imported module keeps that name.
    if cv >= max_coherence:
        max_coherence = cv
        topic_num = m
        model_list_num = position

# The best-scoring model.
optimal_model = model_list[model_list_num]


# Extract candidate label phrases for the topics.
extractor = tp.label.PMIExtractor(min_cf=15, min_df=5,min_len = 2, max_len=20, max_cand=10000)
cands = extractor.extract(optimal_model)

labels_per_topic = []

labeler = tp.label.FoRelevance(optimal_model, cands, min_df=5, smoothing=0.01, mu=0.25)
print()
print("=====================추출된 토픽======================")
for k in range(optimal_model.k):
    print("== Topic #{} ==".format(k))
    top_labels = [label for label, score in labeler.get_topic_labels(k, top_n=5)]
    print("Labels:", ', '.join(top_labels))
    # The topic's display name joins its two best labels. Slicing instead of
    # indexing [0] and [1] avoids an IndexError when the labeler returns
    # fewer than two labels for a topic.
    labels_per_topic.append(' + '.join(top_labels[:2]))
    for word, prob in optimal_model.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()
    
    
def format_topics_sentences(ldamodel=optimal_model):
    """Build a table pairing each review with the first topic reported for it.

    Reads the module-level ``data_word`` (for the document count),
    ``labels_per_topic`` and ``searched_reviews``.

    Args:
        ldamodel: Trained topic model whose ``docs`` were added in the same
            order as ``data_word``. The default is whatever ``optimal_model``
            was bound to when this ``def`` was executed.

    Returns:
        A DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Label`` and ``Topic_Keywords``, followed by the review columns
        of ``searched_reviews``, joined side by side on the index.
    """
    docs = ldamodel.docs
    rows = []

    for i in range(len(data_word)):
        topics = docs[i].get_topics()
        if not topics:
            continue
        # Only the first entry is used; it is treated as the dominant topic
        # (presumably get_topics() returns topics in descending probability).
        topic_num, prop_topic = topics[0]
        # Keywords come from the model passed in, not the global optimal_model.
        wp = ldamodel.get_topic_words(topic_num, top_n=10)
        topic_keywords = ", ".join(word for word, prop in wp)
        rows.append([int(topic_num), round(prop_topic, 4), labels_per_topic[topic_num], topic_keywords])

    # Rows are collected in a list and the frame is built once, because
    # DataFrame.append no longer exists in pandas 2.x. This also works when
    # there are no rows.
    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution','Topic_Label', 'Topic_Keywords'])
    review_columns = ['지점명', '유저', '메뉴', '리뷰', '총점', '맛', '양', '배달', '시간']
    sent_topics_df = pd.concat([sent_topics_df, searched_reviews[review_columns]], axis=1)

    return sent_topics_df


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model)
df_topic_review = df_topic_sents_keywords.reset_index()
df_topic_review.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib','Topic_Label' ,'Keywords', '지점명','유저', '메뉴','리뷰','총점','맛','양','배달','시간' ]

# Review count and mean overall rating per dominant topic.
# sort=False keeps topics in order of first appearance, so equal averages
# keep that order in the stable sort below.
ratings_by_topic = df_topic_review.groupby('Dominant_Topic', sort=False)['총점']
element_count = ratings_by_topic.size().to_dict()
topic_total = ratings_by_topic.mean().to_dict()


# Order topics by mean rating, highest first.
sdict= sorted(topic_total.items(), key= lambda x : x[1],reverse = True)


# Report the top three topics (fewer if fewer topics were assigned).
top_rows = []
for topic_key, mean_score in sdict[:3]:
    topic = int(topic_key)
    res = optimal_model.get_topic_words(topic, top_n=10)
    keywords = ', '.join(w for w, p in res)
    top_rows.append([topic, labels_per_topic[topic], keywords, round(mean_score, 4)])
# Built in one step because DataFrame.append no longer exists in pandas 2.x.
result = pd.DataFrame(top_rows, columns=['Topic_num', 'Topic_name','Keywords','Score'])


print()
print("=====================결과======================")
print()
print("소비자들이 평가한 << {} >> 의 특징".format(user_input))
result

음식점을 검색하세요 : 후라이드참잘하는집

Num Topics = 14  has Coherence Value of 0.5902505061056997
Num Topics = 16  has Coherence Value of 0.565648172062356
Num Topics = 18  has Coherence Value of 0.5634347373205755
Num Topics = 20  has Coherence Value of 0.5697247929126024
Num Topics = 22  has Coherence Value of 0.5521489466159519
Num Topics = 24  has Coherence Value of 0.5603062455697606

== Topic #0 ==
Labels: 예상 시간, 배달 예상 시간, 배달 예상, 양념 소스, 배달 시간
시간	0.08710204064846039
예상	0.06332171708345413
바삭	0.05766037851572037
배달	0.05551692098379135
후라이드	0.0504692867398262
간장	0.036472074687480927
매콤	0.03457072377204895
양념	0.03378007560968399
좋다	0.03322535753250122
치킨	0.029206683859229088

== Topic #1 ==
Labels: 맛있다 배달, 서비스 좋다, 빠르다 배달, 배달 빠르다, 예상 시간
배달	0.18901149928569794
빠르다	0.17753535509109497
좋다	0.06590453535318375
언제	0.060195744037628174
빨르다	0.056069012731313705
양은	0.03943304717540741
친절하다	0.0393812321126461
서비스	0.03911357372999191
요청	0.03779496252536774
배송	0.029695063829421997

== Topic #2 ==
Labels: 맵다 양념,

Unnamed: 0,Topic_num,Topic_name,Keywords,Score
0,8.0,양도 많다 + 많다 맛있다,"많다, 양도, 사진, 좋다, 부드럽다, 없다, 마리, 진짜, 추천, 서비스",4.9865
1,13.0,굳다 굳다 + 맛있다 후라이드 맛있다,"굳다, 가성, 서비스, 없다, 양념, 배달, 빠르다, 많다, 바삭바삭하다, 치킨",4.9231
2,1.0,맛있다 배달 + 서비스 좋다,"배달, 빠르다, 좋다, 언제, 빨르다, 양은, 친절하다, 서비스, 요청, 배송",4.9116


## 음식점의 아쉬운 점

In [124]:
# ``arr`` still holds the rows matched by the store search cell above.
searched_reviews = pd.DataFrame(arr, columns = ['지점명','유저','메뉴','리뷰','총점','맛','양','배달','시간'])
# Keep only reviews with an overall rating of 2 or lower. The explicit copy
# makes the column assignments below act on an independent frame rather
# than on a filtered view of the original.
searched_reviews = searched_reviews[searched_reviews['총점'] <= 2].copy()

# Cast the review column to str so the string operations below apply to every row.
searched_reviews['리뷰'] = searched_reviews['리뷰'].astype(str)

# Drop rows whose review text duplicates an earlier row.
searched_reviews.drop_duplicates(subset=['리뷰'], inplace=True)

# Replace every non-Hangul character with a space.
searched_reviews['리뷰'] = searched_reviews['리뷰'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',regex=True)

# Turn empty and whitespace-only cells into NaN (the pattern also matches '').
searched_reviews = searched_reviews.replace(r'^\s*$', np.nan, regex=True)

# Drop every row that has a NaN in any column, then renumber the index from 0.
searched_reviews = searched_reviews.dropna(how='any').reset_index(drop=True)

# Normalise each review text with Okt.
Data_list = [okt.normalize(review) for review in searched_reviews['리뷰']]

# Words excluded from the topic model. Single-character tokens are dropped
# separately by the length check below.
stopword = ['같다','이다','있다','여기','항상','완전','정말','너무','보고','오늘','역시','이번','다음','아주','신랑','살후']
stopword_set = set(stopword)

# Keep stemmed nouns and adjectives of length >= 2 that are not stopwords;
# one token list per review, in the same order as searched_reviews.
data_word = [
    [
        token
        for token, tag in okt.pos(document, stem=True)
        if tag in ('Noun', 'Adjective') and len(token) >= 2 and token not in stopword_set
    ]
    for document in Data_list
]
    
    
def compute_coherence_values(data_word, limit, start=4, step=2):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    This variant uses ``min_cf=1`` and 500 training iterations.

    Args:
        data_word: Tokenised documents, one list of str tokens per document.
            The lists are not modified.
        limit: Exclusive upper bound of the topic-count range.
        start: First topic count to try.
        step: Increment between consecutive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple; both lists follow the
        order of ``range(start, limit, step)``.
    """
    # Imported locally so this function does not depend on a module-level
    # ``coherence`` name, which the notebook later rebinds to a float.
    from tomotopy.coherence import Coherence

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = tp.LDAModel(k=num_topics, alpha=0.1, eta=0.01, min_cf=1, tw=tp.TermWeight.PMI, rm_top=1)
        for line in data_word:
            # A token-less document is fed as a single blank token, without
            # touching the caller's list. Presumably this keeps model.docs
            # aligned with data_word, which format_topics_sentences indexes
            # by position.
            model.add_doc(line if line else [" "])
        model.train(500)  # number of training iterations
        model_list.append(model)
        coherence_model = Coherence(model, coherence='c_v', window_size=0, targets=None, top_n=10, eps=1e-12, gamma=1.0)
        coherence_values.append(coherence_model.get_score())
    return model_list, coherence_values


# Train and score one model per candidate topic count: 4, 6, ..., 20
# (``limit`` is exclusive).
from tomotopy import coherence
start = 4
limit = 21
step = 2
model_list, coherence_values = compute_coherence_values(data_word=data_word, start=start, limit=limit, step=step)


# Pick the model with the highest coherence; on a tie the later (larger)
# topic count wins because the comparison is ``>=``.
x = range(start, limit, step)
topic_num = 0
max_coherence = float('-inf')
print()
print("=====================일관성 계산 결과======================")

for position, (m, cv) in enumerate(zip(x, coherence_values)):
    print("Num Topics =", m, " has Coherence Value of", cv)
    # The score is compared directly; it is not stored under the name
    # ``coherence`` so the imported module keeps that name.
    if cv >= max_coherence:
        max_coherence = cv
        topic_num = m
        model_list_num = position

# The best-scoring model.
optimal_model = model_list[model_list_num]


# Extract candidate label phrases for the topics.
extractor = tp.label.PMIExtractor(min_cf=1, min_df=1,min_len = 2, max_len=5, max_cand=10000)
cands = extractor.extract(optimal_model)

labels_per_topic = []

labeler = tp.label.FoRelevance(optimal_model, cands, min_df=1, smoothing=0.01, mu=0.25)
print()
print("=====================추출된 토픽======================")
for k in range(optimal_model.k):
    print("== Topic #{} ==".format(k))
    top_labels = [label for label, score in labeler.get_topic_labels(k, top_n=5)]
    print("Labels:", ', '.join(top_labels))
    # The topic's display name joins its two best labels. Slicing instead of
    # indexing [0] and [1] avoids an IndexError when the labeler returns
    # fewer than two labels for a topic.
    labels_per_topic.append(' + '.join(top_labels[:2]))
    for word, prob in optimal_model.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()
    
    
def format_topics_sentences(ldamodel=optimal_model):
    """Build a table pairing each review with the first topic reported for it.

    Reads the module-level ``data_word`` (for the document count),
    ``labels_per_topic`` and ``searched_reviews``.

    Args:
        ldamodel: Trained topic model whose ``docs`` were added in the same
            order as ``data_word``. The default is whatever ``optimal_model``
            was bound to when this ``def`` was executed.

    Returns:
        A DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Label`` and ``Topic_Keywords``, followed by the review columns
        of ``searched_reviews``, joined side by side on the index.
    """
    docs = ldamodel.docs
    rows = []

    for i in range(len(data_word)):
        topics = docs[i].get_topics()
        if not topics:
            continue
        # Only the first entry is used; it is treated as the dominant topic
        # (presumably get_topics() returns topics in descending probability).
        topic_num, prop_topic = topics[0]
        # Keywords come from the model passed in, not the global optimal_model.
        wp = ldamodel.get_topic_words(topic_num, top_n=10)
        topic_keywords = ", ".join(word for word, prop in wp)
        rows.append([int(topic_num), round(prop_topic, 4), labels_per_topic[topic_num], topic_keywords])

    # Rows are collected in a list and the frame is built once, because
    # DataFrame.append no longer exists in pandas 2.x. This also works when
    # there are no rows.
    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution','Topic_Label', 'Topic_Keywords'])
    review_columns = ['지점명', '유저', '메뉴', '리뷰', '총점', '맛', '양', '배달', '시간']
    sent_topics_df = pd.concat([sent_topics_df, searched_reviews[review_columns]], axis=1)

    return sent_topics_df


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model)
df_topic_review = df_topic_sents_keywords.reset_index()
df_topic_review.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib','Topic_Label' ,'Keywords', '지점명','유저', '메뉴','리뷰','총점','맛','양','배달','시간' ]

# Review count and mean overall rating per dominant topic.
# sort=False keeps topics in order of first appearance, so equal averages
# keep that order in the stable sort below.
ratings_by_topic = df_topic_review.groupby('Dominant_Topic', sort=False)['총점']
element_count = ratings_by_topic.size().to_dict()
topic_total = ratings_by_topic.mean().to_dict()


# Order topics by mean rating, highest first.
# NOTE(review): this is the same ordering as the positive-feature cells;
# confirm that highest-first is intended for the weak-points view.
sdict= sorted(topic_total.items(), key= lambda x : x[1],reverse = True)


# Report the top three topics (fewer if fewer topics were assigned).
top_rows = []
for topic_key, mean_score in sdict[:3]:
    topic = int(topic_key)
    res = optimal_model.get_topic_words(topic, top_n=10)
    keywords = ', '.join(w for w, p in res)
    top_rows.append([topic, labels_per_topic[topic], keywords, round(mean_score, 4)])
# Built in one step because DataFrame.append no longer exists in pandas 2.x.
result = pd.DataFrame(top_rows, columns=['Topic_num', 'Topic_name','Keywords','Score'])


print()
print("=====================결과======================")
print()
print("소비자들이 평가한 << {} >> 의 아쉬운점".format(user_input))
result


Num Topics = 4  has Coherence Value of 0.7624119125306607
Num Topics = 6  has Coherence Value of 0.826604950428009
Num Topics = 8  has Coherence Value of 0.8537839680910111
Num Topics = 10  has Coherence Value of 0.9001901179552079
Num Topics = 12  has Coherence Value of 0.9060639346639313
Num Topics = 14  has Coherence Value of 0.9138416456324715
Num Topics = 16  has Coherence Value of 0.9086394879966975
Num Topics = 18  has Coherence Value of 0.9000288056002722
Num Topics = 20  has Coherence Value of 0.8939499065279962

== Topic #0 ==
Labels: 양념 뭍혀 후라이드 기름, 기름 오래되다, 후라이드 기름, 뭍혀 후라이드 기름, 기름 오래되다 튀김 냄새 양념
튀김	0.07932296395301819
아니다	0.07289432734251022
기름	0.06619047373533249
생각	0.061348967254161835
양념	0.05068545788526535
냄새	0.04872279614210129
마리	0.04425749182701111
양은	0.04313413426280022
심하다	0.041779547929763794
오래되다	0.04159277677536011

== Topic #1 ==
Labels: 모름 맛있다 연구 귀찮다 성공, 기본 모름 맛있다 연구 귀찮다, 프렌차이즈 이유 기본 모름 맛있다, 성공 리뷰 이벤트, 기본 프렌차이즈 이유 기본
기본	0.18941688537597656
프렌차이즈	0.0948476865887

Unnamed: 0,Topic_num,Topic_name,Keywords,Score
0,3.0,이상하다 후라이드 간장 + 안나 짠맛 일찍,"간장, 소스, 안나, 살이, 살짝, 느낌, 짠맛, 일찍, 리뷰, 후라이드",2.0
1,13.0,시간 다른 후기 배달 + 이상하다 치킨,"많다, 치킨, 추천, 별로, 가격, 좋다, 조금, 이상하다, 다른, 괜찮다",1.8333
2,12.0,맵다 후라이드 가슴 적당하다 아이듥 + 양념 맵다 후라이드 가슴,"보통, 아이, 후라이드, 음식, 맵다, 는데, 가슴, 적당하다, 아이듥, 양념",1.6667
