In [17]:
import pandas as pd
import numpy as np
import time
from MyNLP import WordDividerMecab
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Load the news list (cp932-encoded CSV, read without a header row).
df = pd.read_csv("data/2022_12_19/news_list_12_19.csv", encoding="cp932", header=None)
comment_list = []
# Column 2 holds the text; the first row is skipped
# (presumably the CSV's own header line -- TODO confirm against the file).
raw_texts = df[2].iloc[1:].tolist()

# Keep only non-empty strings.
# The previous check `len(text) >= 0` was always true, so nothing was ever
# filtered, and an empty CSV cell (read by pandas as a float NaN) made
# `len()` raise TypeError. The isinstance guard drops those cells instead.
row_data = [text for text in raw_texts if isinstance(text, str) and len(text) > 0]

# Preprocessing: run every raw text through the project word divider with the
# stop words below removed. The result is appended to comment_list
# (presumably a space-separated token string, since later code splits on " ").
wd = WordDividerMecab()
stop_word_list = ["まだ", "ある", "なる", "なる", "する", "し", "する", "いる", "なっ", "せ", "い", "やる", "ない"]
comment_list.extend(
    wd.wakati_text_delete(text=raw_text, stop_word_list=stop_word_list)
    for raw_text in row_data
)

# Build the vocabulary: fit a TF-IDF vectorizer on the preprocessed texts and
# keep its learned terms as an alphabetically sorted list.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(comment_list)
voc = sorted(vectorizer.vocabulary_.keys())

# Reshape the texts into the list-of-token-lists form passed to hLDA.
# Each preprocessed text is split on single spaces and only tokens present in
# the vectorizer vocabulary are kept.
# A set is used for the membership test: `word in voc` on the sorted list
# scanned the whole vocabulary for every token.
# The temporary name `list` is no longer used, so the builtin `list` is not
# shadowed for the rest of the notebook.
vocab_set = set(voc)
filtered_corpus = [
    [word for word in comment.split(" ") if word in vocab_set]
    for comment in comment_list
]

# Drop documents that have no tokens left after preprocessing and filtering.
corpus = [comment for comment in filtered_corpus if len(comment) != 0]

# Assign every vocabulary word its position in the sorted vocabulary list.
vocab_index = {word: idx for idx, word in enumerate(voc)}

# Express each document as a list of vocabulary indices, the form passed to hLDA.
# The former `comment != ''` filter was removed: every element of corpus is a
# list, which never equals a string, so that filter could not drop anything.
new_corpus = [[vocab_index[word] for word in sentence] for sentence in corpus]

In [19]:
from hlda.sampler import HierarchicalLDA

# Default settings for the hLDA run.

# Sampler schedule
# number of iterations for the sampler
n_samples = 500
# number of iterations between printing a brief summary of the topics so far
display_topics = 50

# Model hyperparameters
# smoothing over level distributions
alpha = 10.0
# CRP smoothing parameter; number of imaginary customers at the next, as yet unused table
gamma = 1.0
# smoothing over topic-word distributions
eta = 0.1
# number of levels in the tree
num_levels = 3

# Reporting
# number of most probable words to print for each topic after model estimation
n_words = 7
# whether to print the words together with their weights
with_weights = False

In [None]:
# Build the hierarchical LDA model from the index-encoded corpus and the
# vocabulary, using the hyperparameters set in the configuration cell.
hlda = HierarchicalLDA(
    new_corpus,
    voc,
    alpha=alpha,
    gamma=gamma,
    eta=eta,
    num_levels=num_levels,
)

# Run the sampler; a topic summary is printed every `display_topics` iterations.
hlda.estimate(
    n_samples,
    display_topics=display_topics,
    n_words=n_words,
    with_weights=with_weights,
)

HierarchicalLDA sampling

.................................................. 50
topic=0 level=0 (documents=279): できる, 思う, いう, 発表, 行う, 見る, 日本, 
    topic=1 level=1 (documents=71): 放送, ケプラー, 選手, 意味, 公開, 終わる, 写真, 
        topic=2 level=2 (documents=11): ランキング, 翻訳, 字幕, 年間, 日本語, 厚切りジェイソン, 言語, 
        topic=10 level=2 (documents=16): アルゼンチン, 優勝, 滅私, 大会, フランス, 決勝, カタール, 
        topic=14 level=2 (documents=7): 直也, 描く, 映画, ノア, 監督, avatar, 紅蓮, 
        topic=15 level=2 (documents=9): 自分, チーム, フェラーリ, 浦和レッズ, 全て, シーズン, 藤原, 
        topic=17 level=2 (documents=7): 時間, レンズ, フルサイズ, 購入, 表示, 時計, シリーズ, 
        topic=28 level=2 (documents=5): アミノ酸, 粒子, 研究, 炭素質コンドライト, 流す, 分子, トイレ, 
        topic=37 level=2 (documents=6): 食べる, ハチ, アカシュモクザメ, 発見, 体内時計, 建物, 花粉, 
        topic=42 level=2 (documents=5): 舞踏, 参加, 少女, 眼差し, 今月, ラジオ日本, 古河, 
        topic=47 level=2 (documents=2): 被告, 遠山, 古典, 見える, 殺害, 懲役, 稚児, 
        topic=48 level=2 (documents=3): アプリ, 禁止, 脅威, 収集, 渋谷, データ, サウナ, 
    topic=3 level=1 (documents=42)

In [None]:
import pickle
import gzip

def save_zipped_pickle(obj, filename, protocol=-1):
    """Pickle ``obj`` into a gzip-compressed file.

    Args:
        obj: Any picklable object.
        filename: Destination path; an existing file is overwritten.
        protocol: Pickle protocol number; -1 selects the highest available.
    """
    with gzip.open(filename, 'wb') as sink:
        pickle.Pickler(sink, protocol).dump(obj)
        
def load_zipped_pickle(filename):
    """Read back an object stored as a gzip-compressed pickle.

    Unpickling can execute arbitrary code, so only open files that come from
    a trusted source.

    Args:
        filename: Path to the gzip-compressed pickle file.

    Returns:
        The unpickled object.
    """
    with gzip.open(filename, 'rb') as source:
        return pickle.Unpickler(source).load()

In [None]:
# Persist the fitted model as a gzip-compressed pickle.
# Alternative path kept from the source: '2022_11_15/yahoo_hlda_default.pickle'
save_zipped_pickle(
    obj=hlda,
    filename='model/2022_12_19/yahoo_topic_news_hlda_default.pickle',
)

In [None]:
# Restore the model from its gzip-compressed pickle.
# Alternative path kept from the source: '2022_11_15/yahoo_hlda_default.pickle'
hlda = load_zipped_pickle(
    filename='model/2022_12_19/yahoo_topic_news_hlda_default.pickle',
)