In [7]:
import sys
sys.path.append('../Data')

import pandas as pd
import numpy as np
import time
from CommentDataset import CommentDataset

In [8]:
# Base name of the comment CSV, without directory or extension.
comment_list_name = "20230627_ichikawa_ennosuke"
# CSV path in the form "<date folder>/<name>.csv".
comment_list_path = f"2023_06_27/{comment_list_name}.csv"

In [9]:
# The encoding that must be specified depends on the file being read.
df = pd.read_csv(
    f"../Data/yahoo/{comment_list_path}",
    encoding="utf-8",
    header=None,
)

# Take the first column as the comment texts, skipping the first row
# (the file is read with header=None, so row 0 is ordinary data here).
comment_list = df.iloc[1:, 0].to_numpy().tolist()

comment_dataset = CommentDataset(comment_list)
# NOTE(review): the meaning of the 50 threshold is defined inside
# CommentDataset and is not visible here — confirm before changing it.
comment_dataset.delete_comment_specified_string(50)
# Part-of-speech tags: keep nouns / adjectives / verbs; stop on particles,
# auxiliary verbs and non-independent words.
comment_dataset.formatted_input_hlda(
    appear_tagging_list=["名詞", "形容詞", "動詞"],
    stop_tagging_list=["助詞", "助動詞", "非自立"],
)

In [10]:
from hlda.sampler import HierarchicalLDA

# hLDA settings ("default" parameter set).

# Number of iterations for the sampler.
n_samples = 500
# Smoothing over level distributions.
alpha = 10.0
# CRP smoothing parameter: number of imaginary customers at the next,
# as yet unused, table.
gamma = 1.0
# Smoothing over topic-word distributions.
eta = 0.1
# Number of levels in the topic tree.
num_levels = 3
# Number of iterations between printing a brief summary of the topics so far.
display_topics = 50
# Number of most probable words to print for each topic after model estimation.
n_words = 10
# Whether to print the words together with their weights.
with_weights = False

In [11]:
# Build the hierarchical LDA model from the prepared comments and vocabulary.
hlda = HierarchicalLDA(
    comment_dataset.comment_list,
    comment_dataset.voc,
    alpha=alpha,
    gamma=gamma,
    eta=eta,
    num_levels=num_levels,
)

# Run the sampler for n_samples iterations with the display settings above.
hlda.estimate(
    n_samples,
    display_topics=display_topics,
    n_words=n_words,
    with_weights=with_weights,
)

HierarchicalLDA sampling

.................................................. 50
topic=0 level=0 (documents=545): 自殺, 思う, 両親, れる, 死ぬ, 猿之助, 自分, 幇助, 殺人, 言う, 
    topic=1 level=1 (documents=119): ビニール, れる, せる, 捨てる, 死因, 窒息, 精神, 死ぬ, 睡眠薬, 見る, 
        topic=2 level=2 (documents=27): 飲む, 種類, 睡眠薬, マネージャー, 死ねる, 中毒, 成分, 眠る, 隠滅, 薬物, 
        topic=18 level=2 (documents=11): サスペンス, 進む, 隠す, 一生, ドラマ, 取り調べ, 終わる, 審判, 静か, マルチ, 
        topic=19 level=2 (documents=9): 結婚, そこ, つくる, 自殺, スタッフ, 迎える, 気がつく, 暴行, 認知, 無理心中, 
        topic=28 level=2 (documents=11): 原因, 正直, 良い, 時代, 載る, 社会, 気味, 倫理, 勢力, 進む, 
        topic=60 level=2 (documents=9): 有力, 焦点, 都合, 裁判, 懸念, ファン, 分量, 効く, 身勝手, やり方, 
        topic=63 level=2 (documents=12): ファン, 応援, 後悔, 関わる, 番組, どこ, 近く, 喜ぶ, 当事者, 方向, 
        topic=68 level=2 (documents=14): 猶予, 執行, 立件, 起訴, 一番, 裏方, 判決, 起こる, 保釈, おる, 
        topic=84 level=2 (documents=6): 全容, 別々, 関与, かかる, 隠す, ここ, 軽傷, 医師, 施設, やばい, 
        topic=160 level=2 (documents=8): 物的, 大丈夫, 書く, 削除, 厳しい, 嘘八百, 悼む, 殺し, 強制, 

In [12]:
from model.HldaModel import ExpandHldaModel
import os

# Save the fitted hLDA model to model/pickle/<create_date>/<model_name>.
create_date = "2023_06_27"
parameter_name = "default"
model_name = f"{comment_list_name}_{parameter_name}.pickle"

# Directory that holds the models created on this date.
create_date_path = f"model/pickle/{create_date}"
# NOTE: despite its name, model_dir is the full path of the pickle *file*;
# the name is kept so other cells that read it keep working.
model_dir = f"{create_date_path}/{model_name}"

# exist_ok=True makes this idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(create_date_path, exist_ok=True)

# NOTE(review): the in-memory model object is passed through the
# `pickle_path` keyword — confirm ExpandHldaModel accepts a model here.
expandHlda = ExpandHldaModel(pickle_path=hlda)
expandHlda.save_zipped_pickle(filename=model_dir)