In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pylab as plt

import glob
import os
from tqdm import tqdm

import MeCab
import urllib
import gensim
import pyLDAvis
import pyLDAvis.gensim
from IPython.display import HTML

import warnings
# Blanket filter: suppresses every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Put pyLDAvis into notebook mode (presumably so prepared visualizations render inline — confirm in the pyLDAvis docs).
pyLDAvis.enable_notebook()
# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable between runs.
np.random.seed(0)

In [2]:
# Load the sample tweets from a CSV in the working directory. The rendered output shows the
# columns id, date, time, author, text, links and lang, plus an 'Unnamed: 0' column
# (presumably the index written by an earlier DataFrame.to_csv — TODO confirm).
df = pd.read_csv('retty_sample.csv')
# Bare last expression so the notebook renders the frame as a rich table.
df

Unnamed: 0,id,date,time,author,text,links,lang
0,1196632983798571008,2019-11-19,2019-11-19 12:35:10,nryota04150415,もう Rettyや一休のようにGoogle マップがseoで表示されないYahooに身売りし...,,ja
1,1182112036060192769,2019-10-10,2019-10-10 10:54:06,kaitenportalbiz,飲食店集客に役立つ グルメサイト 17選比較 食べログ ホットペッパーグルメ ぐるなび Re...,https://t.co/f9FJJDlqOv,ja
2,1181184285463601152,2019-10-07,2019-10-07 21:27:33,wuv_l,Rettyなるものを教えていただいて早速検索してるけどスンドゥブ食べたくなってきた,,ja
3,1189375395201634305,2019-10-30,2019-10-30 11:56:06,harapeko_wktk,久しぶりにrettyみたら食べログみたいになってて買収でもされたのかと思った,,ja
4,1193735191832883200,2019-11-11,2019-11-11 12:40:22,jacotosh,地元食べRetty 地元のおすすめカフェ＆レストランご紹介第11弾 Cafe A Plus ...,https://t.co/jMml4FxgJb,ja
5,1188060882879700992,2019-10-26,2019-10-26 20:52:42,mmmmmmura,Rettyのアプリがマジくそ ネット予約してもマイページに反映されないし タコ焼きしたくて来...,,ja
6,1207148223694946306,2019-12-18,2019-12-18 12:58:58,hirominnnn,自分はもう離れてしまったのだけど 新卒で採用した子たちが ファーストキャリアをここにしてよか...,https://t.co/THVodZvPFD,ja
7,1187275941174636544,2019-10-24,2019-10-24 16:53:37,Oryzias_07,今お腹ぺこぺこですシンくんの先行販売行ったらなにか暇潰しにカフェでも入ります,,ja
8,1221383000115445760,2020-01-26,2020-01-26 19:42:54,minto0724_,やった れてぃちゃん色んな意味でやばそう,,ja
9,1215994237163958272,2020-01-11,2020-01-11 22:49:52,py1_01,アフリカあたりに輸送しとくね,,ja


In [3]:
_STOPWORDS_URL = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'

# Per-kernel cache for the expensive, call-invariant resources used by `analyzer`
# (the downloaded stop-word set and the MeCab tagger).
_ANALYZER_CACHE = {}


def _get_stopwords():
    """Return the stop-word set from `_STOPWORDS_URL`, downloading it at most once per kernel."""
    if 'stopwords' not in _ANALYZER_CACHE:
        # A bare `import urllib` does not guarantee that the `request` submodule is
        # loaded, so import it explicitly where it is needed.
        import urllib.request

        req = urllib.request.Request(_STOPWORDS_URL)
        with urllib.request.urlopen(req) as res:
            lines = res.read().decode('utf-8').splitlines()
        # A frozenset gives O(1) membership tests and drops blank lines in one pass.
        _ANALYZER_CACHE['stopwords'] = frozenset(w for w in lines if w)
    return _ANALYZER_CACHE['stopwords']


def _get_tagger():
    """Return a shared `MeCab.Tagger('-Ochasen')`, constructing it on first use only."""
    if 'tagger' not in _ANALYZER_CACHE:
        _ANALYZER_CACHE['tagger'] = MeCab.Tagger('-Ochasen')
    return _ANALYZER_CACHE['tagger']


def analyzer(text, target_part_of_speech=['proper_noun', 'noun', 'verb', 'adjective']):
    """Tokenize `text` with MeCab and return the kept words as a list.

    For every token the 7th comma-separated feature field (index 6) is taken as
    the word to emit; index 0 is read as the part of speech and index 1 as its
    sub-category. NOTE(review): this matches an IPAdic-style feature layout where
    index 6 is the base form — confirm if a different dictionary is installed.

    A token is dropped when that field is '*', shorter than two characters, or a
    stop word. Tokens whose feature list has fewer than seven fields are skipped
    instead of raising IndexError.

    Parameters
    ----------
    text : str
        The sentence to analyze.
    target_part_of_speech : sequence of str
        Any of 'proper_noun', 'noun', 'verb', 'adjective'. A token is kept when
        it matches at least one requested category. The default list is never
        mutated.

    Returns
    -------
    list of str
        Kept words in token order.

    Raises
    ------
    urllib.error.URLError
        If the stop-word list cannot be downloaded on the first call.
    """
    stopwords = _get_stopwords()
    words = []

    node = _get_tagger().parseToNode(text)
    while node:
        features = node.feature.split(',')
        # Advance right away so every `continue` below moves to the next token.
        node = node.next

        if len(features) < 7:
            continue

        base = features[6]
        if base == '*' or len(base) < 2 or base in stopwords:
            continue

        pos, sub_pos = features[0], features[1]
        is_noun = pos == '名詞'
        is_proper_noun = is_noun and sub_pos == '固有名詞'
        is_verb = pos == '動詞' and sub_pos == '自立'
        is_adjective = pos == '形容詞' and sub_pos == '自立'

        if (('proper_noun' in target_part_of_speech and is_proper_noun)
                or ('noun' in target_part_of_speech and is_noun)
                or ('verb' in target_part_of_speech and is_verb)
                or ('adjective' in target_part_of_speech and is_adjective)):
            words.append(base)

    return words

In [4]:
# Tokenize every tweet into a list of (proper) nouns, one list per row.
# Iterating over the column values (instead of `df['text'][i]`) avoids a
# label-based lookup that fails whenever the index is not 0..n-1, and
# fillna('')/astype(str) keeps a missing tweet as an empty document rather than
# crashing on `.replace`, so `texts` stays aligned with the rows of `df`.
texts = []
for raw_text in df['text'].fillna('').astype(str):
    text = raw_text.replace('\n', ' ')
    words = analyzer(text, target_part_of_speech=['proper_noun', 'noun'])
    texts.append(words)

# Map each distinct token to an integer id, then encode every document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

In [5]:
# Range of topic counts to evaluate: range(start, limit, step).
start, limit, step = 2, 21, 1

# One entry per evaluated topic count, in sweep order.
perplexity_vals = []
coherence_vals = []

for n_topic in tqdm(range(start, limit, step)):
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_topic,
        random_state=0,
    )

    # Perplexity is taken as 2 ** (-log_perplexity), evaluated on the training corpus itself.
    perplexity_vals.append(np.exp2(-lda_model.log_perplexity(corpus)))

    # Topic coherence using the 'c_v' measure over the tokenized texts.
    coherence_model_lda = gensim.models.CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_vals.append(coherence_model_lda.get_coherence())

100%|██████████| 19/19 [00:08<00:00,  2.35it/s]


![Model evaluation results for each number of topics](model_evals.png)

In [6]:
# Fit the model that gets visualized: 10 topics, fixed random_state for repeatability.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=0,
)

# Prepare the pyLDAvis data structure for the fitted model.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=True)

In [8]:
# Embed the saved visualization. Passing the path via `filename=` makes a missing
# file raise an error instead of silently rendering the literal text
# 'lda_retty.html'. The file is expected to exist already — presumably written by
# something like pyLDAvis.save_html(vis, 'lda_retty.html') in a cell not shown here.
HTML(filename='lda_retty.html')