### Setup

In [116]:
# general
import numpy as np
import pandas as pd
import csv

# tokenization
import json
import MeCab
import import_ipynb
import thesis_preprocess
from stopwords_ja import stop_words
from stopwords_slothlib import stop_words_2

# lda topic modeling
import gensim, logging
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim

In [55]:
# https://lda.readthedocs.io/en/latest/getting_started.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://github.com/deankuo/Japan-Manifesto-Classification/blob/main/topic_modeling.ipynb
# https://github.com/m3yrin/NTM/blob/master/LDA_jp.ipynb
# https://tdual.hatenablog.com/entry/2018/04/09/133000#1LDA%E3%81%AE%E5%89%8D%E3%81%AB%E3%83%88%E3%83%94%E3%83%83%E3%82%AF%E3%83%A2%E3%83%87%E3%83%AB%E3%81%A8%E3%81%AF

### Preprocessing and Tokenization

In [141]:
# tokenize cleaned tweets into words
def tokenize(text):
    mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    parsed = mt.parseToNode(text)
    components = []
    
    while parsed:
        word = parsed.surface
        pos = parsed.feature.split(",")[0]

        # for lda, we only want nouns, verbs, adjectives
        include_pos = ["名詞", "動詞", "形容詞", "副詞"]
        if pos in include_pos: components.append(word)
        parsed = parsed.next
    
    # remove stopwords
    components = [token for token in components if ((not token in stop_words) and (not token in stop_words_2))]
    
    return components

In [154]:
# run preprocessing and tokenization for tweets from given .txt file
def preprocess_tokenize_all(filename, year):
    # store results and exception tweets
    tokens = []
    retweets = []
    not_parsed = []

    # iterate through tweets, preprocess and tokenize
    with open(filename, 'r') as file:
        for line in file:
            tweet = json.loads(line)
            if line == None or tweet == None:
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            # filter out 2024 sponsored(?) tweets
            elif int(tweet['date'].split("-")[0]) < int(year) + 1: 
                tweet_text = tweet['rawContent'] # note: need other prop for over 140 char?
                processed = preprocess(tweet_text)            
                components = tokenize(processed)
                tokens.append(components)

    file.close()
    return tokens, retweets, not_parsed

In [57]:
# pass in filename you want to save data as, including '.csv'
def save_to_csv(tweet_tokens, filename):
    f = open(filename, 'w')
    writer = csv.writer(f)
    for tweet in tweet_tokens:
        writer.writerow(tweet)
    f.close()

In [58]:
# pass in filename of csv you want to load, including '.csv'
def load_from_csv(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        tweet_tokens = list(reader)
    return tweet_tokens

### Latent Dirichlet Allocation

In [294]:
# train and save lda model for given year; data_name is id suffix to save lda model
def run_lda(data_name, tweet_tokens, num_topics=10, no_below=5, no_above=0.2):
    # set up dictionary
    dict = corpora.Dictionary(tweet_tokens)
    dict.filter_extremes(no_below, no_above)
    dict.compactify()

    # set up corpus
    corpus = [dict.doc2bow(w) for w in tweet_tokens]
    test_size = int(len(corpus) * 0.1)
    test_corpus = corpus[:test_size]
    train_corpus = corpus[test_size:]   

    # train and save lda model
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                          id2word=dict, 
                                          num_topics=num_topics, 
                                          random_state=100, 
                                          passes=10, 
                                          update_every=3, 
                                          alpha='auto',
                                          per_word_topics=True)
    lda.save("saved_lda_models/lda_model_" + data_name)
    
    return lda, dict, corpus, train_corpus, test_corpus

In [247]:
# display words comprising topics
def examine_topics(lda, dict, num_topics=10):
    for topic in range(num_topics):
        print("Topic # ",(topic+1))
        for t in lda.get_topic_terms(topic):
            print("{}: {}".format(dict[t[0]], t[1]))
        print("\n")

In [249]:
def examine_topics_plain(lda, dict, num_topics=10):
    for topic in range(num_topics):
        print("Topic # ",(topic+1))
        for t in lda.get_topic_terms(topic):
            print(dict[t[0]])
        print("\n")

In [208]:
# compare train/test perplexity
def analyze_train_test_results(lda, train_corpus, test_corpus):
    # look at train set results
    N = sum(count for doc in train_corpus for _, count in doc)
    print("# of words in train corpus: ",N)
    perplexity = np.exp2(-lda.log_perplexity(train_corpus))
    print("perplexity(train):", perplexity,"\n")

    # look at test set results
    N = sum(count for doc in test_corpus for _, count in doc)
    print("# of words in test corpus: ",N)
    perplexity = np.exp2(-lda.log_perplexity(test_corpus))
    print("perplexity(test):", perplexity)

In [209]:
# look at overall perplexity and coherence score
def analyze_overall_results(lda, tweet_tokens, dict, corpus):
    print('\nPerplexity: ', lda.log_perplexity(corpus))     # lower is better

    coherence_model_lda = CoherenceModel(model=lda, texts=tweet_tokens, dictionary=dict, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)     # higher is better

In [210]:
# open interactive visualization of topics
def visualize_topics(lda, corpus, dict):
    pyLDAvis.enable_notebook(local=True)
    vis = pyLDAvis.gensim.prepare(lda, corpus, dict)
    pyLDAvis.show(vis, local=False)

In [186]:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for _, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([[int(topic_num), round(prop_topic,4), topic_keywords]], columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])], ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


def get_dominant_topics(df_topic_sents_keywords):
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    df_dominant_topic.head(10)
    return df_topic_sents_keywords

In [199]:
# group top 5 sentences under each topic
def get_representative_docs(df_topic_sents_keywords):
    sent_topics_sorteddf = pd.DataFrame()
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
    for _, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                                grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                                axis=0)
    # reset index    
    sent_topics_sorteddf.reset_index(drop=True, inplace=True)
    # format and show
    sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
    sent_topics_sorteddf.head()
    return sent_topics_sorteddf

In [211]:
# show further details about topics
def run_topic_analysis(df_topic_sents_keywords):
    # num documents per topic
    topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

    # percentage documents for each topic
    topic_contribution = round(topic_counts/topic_counts.sum(), 4)

    # topic number, keywords
    topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

    # concatenate column-wise
    df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

    # add column names
    df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

    # show
    df_dominant_topics

    return df_dominant_topics

#### Zainichi Korean LDA
##### 2022


In [291]:
txt_filename = "datasets_minority_groups/zainichi_2022.txt"
tweet_tokens_z22, retweets_z22, error_z22 = preprocess_tokenize_all(txt_filename, "2022")
print("retweets: ", len(retweets_z22))
print("errors: ", len(error_z22))
save_to_csv(tweet_tokens_z22, "save_tokens_zainichi_2022.csv")

retweets:  0
errors:  0


In [255]:
# # preprocess and tokenize
# txt_filename = "datasets_minority_groups/zainichi_2022.txt"
# tweet_tokens_z22, retweets_z22, error_z22 = preprocess_tokenize_all(txt_filename, "2022")
# print("retweets: ", len(retweets_z22))
# print("errors: ", len(error_z22))
# save_to_csv(tweet_tokens_z22, "saved_tokens/save_tokens_zainichi_2022.csv")

tweet_tokens_z22 = load_from_csv("saved_tokens/save_tokens_zainichi_2022.csv")

# run lda
lda_z22, dict_z22, corpus_z22, train_corpus_z22, test_corpus_z22 = run_lda("zainichi_2022", tweet_tokens_z22, num_topics=7)

# analyze results
examine_topics(lda_z22, dict_z22, num_topics=7)
analyze_train_test_results(lda_z22, train_corpus_z22, test_corpus_z22)
analyze_overall_results(lda_z22, tweet_tokens_z22, dict_z22, corpus_z22)
examine_topics_plain(lda_z22, dict_z22, num_topics=7)

# # further topic analysis
# topic_df_z22 = format_topics_sentences(lda_z22, corpus_z22, tweet_tokens_z22)
# get_dominant_topics(topic_df_z22)
# get_representative_docs(topic_df_z22)
# run_topic_analysis(topic_df_z22)

adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<18537 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
adding document #20000 to Dictionary<27445 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
adding document #30000 to Dictionary<33722 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
built Dictionary<34645 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...> from 31901 documents (total 532809 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<34645 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...> from 31901 documents (total 532809 corpus positions)", 'datetime': '2024-03-21T13:54:43.642180', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 24860 tokens: [('在日コリアン', 29374), ('矢張り', 3), ('安保理決議', 1), ('代物', 4), ('赤の他人', 4), ('がんばり', 2), ('新著', 1), ('デモ活動',

Topic #  1
現代ビジネス: 0.026550816372036934
世: 0.021798953413963318
ください: 0.021060174331068993
思う: 0.02056611143052578
ぜひ: 0.018876343965530396
読み: 0.01869032345712185
ヘイトクライム: 0.018243838101625443
いたし: 0.018171582370996475
寄稿: 0.017818674445152283
放火事件: 0.01639495976269245


Topic #  2
差別: 0.041518524289131165
てる: 0.02196437120437622
日本: 0.02005515806376934
日本人: 0.018091298639774323
考え: 0.009142741560935974
言っ: 0.007533208467066288
女性: 0.007289637345820665
アイヌ民族: 0.0063135456293821335
問題: 0.0061370679177343845
言う: 0.006062828004360199


Topic #  3
杉田水脈: 0.04380813613533974
批判: 0.03638438507914543
政務官: 0.03409528732299805
岸田: 0.03381604328751564
更迭: 0.03293599188327789
侮辱: 0.032396938651800156
総務: 0.03014310449361801
首相: 0.029795361682772636
今度: 0.028358370065689087
殺到: 0.027639755979180336


Topic #  4
作品: 0.0331818088889122
ください: 0.03162636607885361
人権: 0.02405248023569584
問題: 0.022942300885915756
差別: 0.02206381782889366
謝罪: 0.020524410530924797
アイヌ民族: 0.018319591879844666
検閲: 0.01814424

-7.740 per-word bound, 213.8 perplexity estimate based on a held-out corpus of 28711 documents with 413683 words


perplexity(train): 213.81303629884772 

# of words in test corpus:  45546


-8.873 per-word bound, 468.7 perplexity estimate based on a held-out corpus of 3190 documents with 45546 words


perplexity(test): 468.7429729491


-7.728 per-word bound, 212.0 perplexity estimate based on a held-out corpus of 31901 documents with 459229 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -7.728155853901521


7 accumulators retrieved from output queue
accumulated word occurrence stats for 31901 virtual documents



Coherence Score:  0.4452799682047624
Topic #  1
現代ビジネス
世
ください
思う
ぜひ
読み
ヘイトクライム
いたし
寄稿
放火事件


Topic #  2
差別
てる
日本
日本人
考え
言っ
女性
アイヌ民族
問題
言う


Topic #  3
杉田水脈
批判
政務官
岸田
更迭
侮辱
総務
首相
今度
殺到


Topic #  4
作品
ください
人権
問題
差別
謝罪
アイヌ民族
検閲
上映
行っ


Topic #  5
デマ
事件
在日
現代ビジネス
読ん
犯人
川崎
思う
いただき
ください


Topic #  6
日本
韓国
日本人
在日
てる
韓国人
北朝鮮
思い
統一教会
いい


Topic #  7
笑
信じ
ヘイトスピチ
生徒
和合
アイヌ民族
ウトロ地区
映画
暮らす
平和




In [256]:
visualize_topics(lda_z22, corpus_z22, dict_z22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 13:56:00] "GET / HTTP/1.1" 200 -



stopping Server...


##### 2015

In [290]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/zainichi_2015.txt"
tweet_tokens_z15, retweets_z15, error_z15 = preprocess_tokenize_all(txt_filename, "2015")
print("retweets: ", len(retweets_z15))
print("errors: ", len(error_z15))
save_to_csv(tweet_tokens_z15, "saved_tokens/save_tokens_zainichi_2015.csv")
# tweet_tokens_z22 = load_from_csv("saved_tokens/save_tokens_zainichi_2015.csv")

# run lda
lda_z15, dict_z15, corpus_z15, train_z15, test_z15 = run_lda("zainichi_2015", tweet_tokens_z15, num_topics=7)

# analyze results
examine_topics(lda_z15, dict_z15, num_topics=7)
analyze_train_test_results(lda_z15, train_z15, test_z15)
analyze_overall_results(lda_z15, tweet_tokens_z15, dict_z15, corpus_z15)
examine_topics_plain(lda_z15, dict_z15, num_topics=7)

adding document #0 to Dictionary<0 unique tokens: []>


retweets:  0
errors:  0


adding document #10000 to Dictionary<13654 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...>
adding document #20000 to Dictionary<19862 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...>
built Dictionary<22136 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...> from 23653 documents (total 365268 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<22136 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...> from 23653 documents (total 365268 corpus positions)", 'datetime': '2024-03-21T15:47:32.628959', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 16484 tokens: [('在日コリアン', 22775), ('とらえ', 2), ('人間なんて', 1), ('居心地', 3), ('良さ', 4), ('裏切る', 1), ('陣営', 3), ('ですか', 3), ('らいい', 2), ('さよなら絶望先生', 1)]...
keeping 5652 tokens which were in no less than 5 and no more than 4730 (=20.0%) documents
resulting dictionary: Dictionary<5

Topic #  1
人権: 0.0188908614218235
デモ: 0.015129738487303257
共生: 0.014695307239890099
年月日: 0.012123403139412403
講師: 0.011598549783229828
啓発: 0.011227344162762165
演題: 0.011203547939658165
大阪: 0.010453215800225735
川崎: 0.010096454061567783
日本: 0.009370153769850731


Topic #  2
韓国: 0.03299598768353462
通名: 0.018645066767930984
世: 0.014036841690540314
判明: 0.013198582455515862
在日特権: 0.012841176241636276
必要: 0.011295854113996029
自民党: 0.011230997741222382
本名: 0.011201216839253902
更新: 0.010008885525166988
圧力: 0.009666013531386852


Topic #  3
通名: 0.07536827772855759
起こす: 0.036746278405189514
しよ: 0.036189235746860504
廃止: 0.03600465878844261
悪用: 0.035103365778923035
詐欺事件: 0.03489375486969948
デマ: 0.029688149690628052
強制送還: 0.02416597492992878
入管: 0.020514190196990967
ネット: 0.016667494550347328


Topic #  4
日本: 0.02235046960413456
てる: 0.020930448547005653
日本人: 0.019482294097542763
万人: 0.015496740117669106
在日: 0.012522543780505657
偏見: 0.012353751808404922
差別: 0.011057876981794834
生活保護: 0.008555085398256

-6.829 per-word bound, 113.7 perplexity estimate based on a held-out corpus of 21288 documents with 283349 words


perplexity(train): 113.72048448956998 

# of words in test corpus:  30022


-8.011 per-word bound, 257.9 perplexity estimate based on a held-out corpus of 2365 documents with 30022 words


perplexity(test): 257.9115872632356


-6.832 per-word bound, 113.9 perplexity estimate based on a held-out corpus of 23653 documents with 313371 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -6.831503491416741


7 accumulators retrieved from output queue
accumulated word occurrence stats for 23653 virtual documents



Coherence Score:  0.395177348359994
Topic #  1
人権
デモ
共生
年月日
講師
啓発
演題
大阪
川崎
日本


Topic #  2
韓国
通名
世
判明
在日特権
必要
自民党
本名
更新
圧力


Topic #  3
通名
起こす
しよ
廃止
悪用
詐欺事件
デマ
強制送還
入管
ネット


Topic #  4
日本
てる
日本人
万人
在日
偏見
差別
生活保護
笑
思う


Topic #  5
ヘイトスピチ
被害
日本人
日本
対象
多民族
ルル
マナ
共生
在住


Topic #  6
在日
大半
強制
現在
年月日
子孫
朝日新聞
使う
連行
密航者


Topic #  7
弁護士
日本人
在日
協会
韓国人
カジノ
日本
多い
反対
設立




In [251]:
visualize_topics(lda_z15, corpus_z15, dict_z15)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 13:36:58] "GET / HTTP/1.1" 200 -



stopping Server...


#### Ainu LDA

In [230]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/ainu_2022.txt"
tweet_tokens_a22, retweets_a22, error_a22 = preprocess_tokenize_all(txt_filename, "2022")
print("# retweets: ", len(retweets_a22))
print("# errors: ", len(error_a22))
save_to_csv(tweet_tokens_a22, "saved_tokens/save_tokens_ainu_2022.csv")

# run lda
lda_a22, dict_a22, corpus_a22, train_corpus_a22, test_corpus_a22 = run_lda("ainu_2022", tweet_tokens_a22)

# analyze perplexity, coherence results
examine_topics(lda_a22, dict_a22)
analyze_train_test_results(lda_a22, train_corpus_a22, test_corpus_a22)
analyze_overall_results(lda_a22, tweet_tokens_a22, dict_a22, corpus_a22)

# further topic analysis
topic_df_a22 = format_topics_sentences(lda_a22, corpus_a22, tweet_tokens_a22)
get_dominant_topics(topic_df_a22)
run_topic_analysis(topic_df_a22)

Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
# retweets:  18
# errors:  0


adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<21877 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #20000 to Dictionary<31754 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #30000 to Dictionary<38819 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #40000 to Dictionary<44789 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #50000 to Dictionary<50058 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #60000 to Dictionary<54965 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #70000 to Dictionary<59376 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #80000 to Dictionary<64066 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #90000 to Dictionary<68150 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #100000 to Dictionary<71377 unique tokens: ['しっかり', 'す', 

Topic #  1
アイヌ新法: 0.024342553690075874
差別: 0.01508571207523346
アイヌ民族: 0.01453197281807661
杉田: 0.010564574040472507
投稿: 0.01046412531286478
北海道新聞: 0.009290744550526142
電子版: 0.008457372896373272
自民党: 0.00838231947273016
議員: 0.008260530419647694
日本: 0.00801127776503563


Topic #  2
北海道: 0.01752557046711445
センタ: 0.010692108422517776
説: 0.009975483641028404
伝承: 0.008545252494513988
推進: 0.0075366683304309845
伝説: 0.007015409879386425
研究: 0.006585562601685524
小さい: 0.005936234723776579
アイヌ民族: 0.005802185740321875
大きい: 0.0057772365398705006


Topic #  3
アイヌ民族: 0.027311652898788452
否定: 0.026683175936341286
氏: 0.023264173418283463
質問: 0.017315536737442017
協会: 0.012873600237071514
発言: 0.01034583430737257
政策: 0.009968404658138752
匿名: 0.00895823072642088
募集中: 0.00879293866455555
抗する: 0.008654167875647545


Topic #  4
神: 0.01948375068604946
熊: 0.009930484928190708
人間: 0.007974233478307724
世界: 0.006895607803016901
妖怪: 0.006607246585190296
アイヌ民族博物館: 0.00645563006401062
国立: 0.005948139354586601
カムイ: 0.00

-8.986 per-word bound, 506.9 perplexity estimate based on a held-out corpus of 160078 documents with 2298404 words


perplexity(train): 506.9021158545095 

# of words in test corpus:  248565


-9.356 per-word bound, 655.3 perplexity estimate based on a held-out corpus of 17786 documents with 248565 words


perplexity(test): 655.2505360055125


-8.951 per-word bound, 495.0 perplexity estimate based on a held-out corpus of 177864 documents with 2546969 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -8.951367862657467


7 accumulators retrieved from output queue
accumulated word occurrence stats for 177864 virtual documents



Coherence Score:  0.44420872028157704


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,6,"北海道, 日本人, アイヌ民族, 歴史, 差別, 文化, 日本, 和人, 民族, ウポポイ",7807.0,0.0439
1,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",5298.0,0.0298
2,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",6082.0,0.0342
3,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",6795.0,0.0382
4,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",8201.0,0.0461
...,...,...,...,...
177859,1,"北海道, センタ, 説, 伝承, 推進, 伝説, 研究, 小さい, アイヌ民族, 大きい",,
177860,9,"北海道, ロシア, 日本, 先住民族, アイヌ民族, 縄文人, 権利, プチン, 民族, 先住民",,
177861,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",,
177862,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",,


In [None]:
lda_a22 = gensim.models.LdaModel.load("saved_lda_models/lda_model_ainu_2022")
examine_topics_plain(lda_a22, dict_a22)

In [240]:
# visualize topics (keep in separate cell so we can interrupt it)
visualize_topics(lda_a22, corpus_a22, dict_a22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 13:04:40] "GET / HTTP/1.1" 200 -



stopping Server...


#### Haafu LDA

In [267]:
# preprocess and tokenize
# txt_filename = "datasets_minority_groups/haafu_2022.txt"
# tweet_tokens_h22, retweets_h22, error_h22 = preprocess_tokenize_all(txt_filename, "2022")
# print("# retweets: ", len(retweets_h22))
# print("# errors: ", len(error_h22))
# save_to_csv(tweet_tokens_h22, "saved_tokens/save_tokens_haafu_2022.csv")
tweet_tokens_h22 = load_from_csv("saved_tokens/save_tokens_haafu_2022.csv")

# run lda
lda_h22, dict_h22, corpus_h22, train_corpus_h22, test_corpus_h22 = run_lda("haafu_2022", tweet_tokens_h22)

# analyze perplexity, coherence results
examine_topics(lda_h22, dict_h22)
analyze_train_test_results(lda_h22, train_corpus_h22, test_corpus_h22)
analyze_overall_results(lda_h22, tweet_tokens_h22, dict_h22, corpus_h22)
examine_topics_plain(lda_h22, dict_h22)

# # further topic analysis
# topic_df_h22 = format_topics_sentences(lda_h22, corpus_h22, tweet_tokens_h22)
# get_dominant_topics(topic_df_h22)
# run_topic_analysis(topic_df_h22)

adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<25349 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...>
adding document #20000 to Dictionary<37599 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...>
built Dictionary<45757 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...> from 28339 documents (total 336290 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<45757 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...> from 28339 documents (total 336290 corpus positions)", 'datetime': '2024-03-21T14:44:01.214275', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 36383 tokens: [('スリクォタ', 2), ('ヒップ', 2), ('フィンガチップ', 1), ('着丈', 3), ('総称', 2), ('ツイゴル', 2), ('ハフ', 11515), ('東京国際', 3), ('コネクタ', 1), ('ハフピッチ', 3)]...
keeping 9374 tokens which were in no less than 5 and no more t

Topic #  1
ハフアップ: 0.07281313836574554
ハフツイン: 0.04404336214065552
好き: 0.025774581357836723
てる: 0.024832043796777725
可愛い: 0.019529644399881363
すぎ: 0.01516166515648365
かわいい: 0.013462583534419537
髪: 0.012923511676490307
いい: 0.01252046786248684
髪型: 0.012137483805418015


Topic #  2
質問: 0.04679687321186066
募集中: 0.02925187535583973
匿名: 0.029094258323311806
てる: 0.02260010503232479
答え: 0.016371281817555428
質問箱: 0.015730256214737892
ハフハフ: 0.015709104016423225
娘: 0.014258287847042084
食べ: 0.013501825742423534
日本人: 0.011580631136894226


Topic #  3
楽天: 0.0230252668261528
無料: 0.02148028463125229
送料: 0.017413552850484848
メンズ: 0.013019255362451077
パンツ: 0.012619173154234886
位: 0.0124123003333807
価格: 0.011813780292868614
サイズ: 0.01099755521863699
人間: 0.01047302596271038
詳細: 0.009011128917336464


Topic #  4
今日: 0.014453435316681862
サイズ: 0.012845532968640327
動画: 0.009657390415668488
ニュハフ: 0.008679565973579884
ハフハフ: 0.008115176111459732
ハフスイング: 0.00776886148378253
食べ: 0.007615769747644663
お願い: 0.0067673833

-8.262 per-word bound, 307.0 perplexity estimate based on a held-out corpus of 25506 documents with 239923 words


perplexity(train): 307.0486954361905 

# of words in test corpus:  26137


-10.204 per-word bound, 1179.4 perplexity estimate based on a held-out corpus of 2833 documents with 26137 words


perplexity(test): 1179.4110394573133


-8.229 per-word bound, 299.9 perplexity estimate based on a held-out corpus of 28339 documents with 266060 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -8.228510922007473


7 accumulators retrieved from output queue
accumulated word occurrence stats for 28339 virtual documents



Coherence Score:  0.4443484265278492
Topic #  1
ハフアップ
ハフツイン
好き
てる
可愛い
すぎ
かわいい
髪
いい
髪型


Topic #  2
質問
募集中
匿名
てる
答え
質問箱
ハフハフ
娘
食べ
日本人


Topic #  3
楽天
無料
送料
メンズ
パンツ
位
価格
サイズ
人間
詳細


Topic #  4
今日
サイズ
動画
ニュハフ
ハフハフ
ハフスイング
食べ
お願い
本日
国際結婚


Topic #  5
位
ニュハフ
シャツ
人気
水着
楽天
通販
速
件
母


Topic #  6
てる
思う
ニュハフ
ハフパンツ
やっ
選手
思っ
出
いい
言わ


Topic #  7
思っ
子供
芸能人
日本
入荷
サイズ
付き
毛布
ラジオ
美女


Topic #  8
ハフバスデ
てる
人間
今日
くれ
写真
頑張っ
これから
箱
もう


Topic #  9
ハフミリオン
連続
オリコン
デビュ
位
日本
写真
達成
曲
カロリハフ


Topic #  10
ハフパンツ
てる
今日
思っ
笑
いい
良い
思い
買っ
練習




In [265]:
# visualize topics
visualize_topics(lda_h22, corpus_h22, dict_h22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 14:23:44] "GET / HTTP/1.1" 200 -



stopping Server...


#### 2015

In [293]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/haafu_2015_2.txt"
tweet_tokens_h15, retweets_h15, error_h15 = preprocess_tokenize_all(txt_filename, "2015")
print("# retweets: ", len(retweets_h15))
print("# errors: ", len(error_h15))
save_to_csv(tweet_tokens_h15, "saved_tokens/save_tokens_haafu_2015_2.csv")
# tweet_tokens_h22 = load_from_csv("saved_tokens/save_tokens_haafu_2015_2.csv")

# run lda
lda_h15, dict_h15, corpus_h15, train_corpus_h15, test_corpus_h15 = run_lda("haafu_2015_2", tweet_tokens_h15)

# analyze perplexity, coherence results
examine_topics(lda_h15, dict_h15)
analyze_train_test_results(lda_h15, train_corpus_h15, test_corpus_h15)
analyze_overall_results(lda_h15, tweet_tokens_h15, dict_h15, corpus_h15)
examine_topics_plain(lda_h15, dict_h15)

Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767143033177518308
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767143033177518308
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767143033177518308
Retweet:  1767143033177518308
Retweet:  1767143033177518308
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767143033177518308
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767143033177518308
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  1768019931457823173
Retweet:  1767457684343951724
Retweet:  1767143033177518308
Retweet:  1767457684343951724
Retweet:  1767143033177518308
Retweet:  1767143033177518308
Retweet:  1767143033177518308
Retweet:  1767457684343951724
Retweet:  1767457684343951724
Retweet:  

adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<18798 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding document #20000 to Dictionary<27679 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding document #30000 to Dictionary<35096 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding document #40000 to Dictionary<41621 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding document #50000 to Dictionary<46834 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding document #60000 to Dictionary<51808 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding document #70000 to Dictionary<56584 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding document #80000 to Dictionary<60993 unique tokens: ['サルエルストリト', 'スウェットトレニングハフパンツ', 'ストッパ', 'タイプ', 'バレボル']...>
adding doc

Topic #  1
印象: 0.014671452343463898
一言: 0.013577415607869625
第一印象: 0.011995990760624409
ベタハフ: 0.011298621073365211
呼び方: 0.011056649498641491
交換: 0.01102546975016594
やる: 0.010833230800926685
好き: 0.010385282337665558
最後: 0.009513512253761292
家族: 0.008785808458924294


Topic #  2
ニュハフ: 0.05032945051789284
動画: 0.01369214616715908
エロ: 0.011691706255078316
美女: 0.00918737705796957
詳細: 0.00858237873762846
下さい: 0.007618717383593321
眉: 0.007239223457872868
犬: 0.006653911434113979
あるある: 0.006493745371699333
ペニクリ: 0.0061466265469789505


Topic #  3
位: 0.030980264768004417
サンプル動画: 0.01808391697704792
女性: 0.012935432605445385
ランキング: 0.012351262383162975
アダルト: 0.011208968237042427
日本: 0.011034030467271805
ミスユニバス: 0.010914333164691925
応募: 0.010430708527565002
専門: 0.009732629172503948
サイト: 0.009205080568790436


Topic #  4
ニュハフ: 0.02098158374428749
娘: 0.01838276907801628
ハフハフ: 0.013040969148278236
機能: 0.010069803334772587
スマスマ: 0.009199390187859535
女優: 0.008989119902253151
ブログ: 0.008237142115831375
付き:

-9.182 per-word bound, 580.8 perplexity estimate based on a held-out corpus of 268131 documents with 2339732 words


perplexity(train): 580.7714364559255 

# of words in test corpus:  254868


-9.670 per-word bound, 814.6 perplexity estimate based on a held-out corpus of 29792 documents with 254868 words


perplexity(test): 814.6386971081697


-9.157 per-word bound, 571.0 perplexity estimate based on a held-out corpus of 297923 documents with 2594600 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -9.157306853403263


Python(4700) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(4702) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(4704) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(4706) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(4708) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(4710) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(4712) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
7 accumulators retrieved from output queue
accumulated word occurrence stats for 297923 virtual documents



Coherence Score:  0.37200172651310365
Topic #  1
印象
一言
第一印象
ベタハフ
呼び方
交換
やる
好き
最後
家族


Topic #  2
ニュハフ
動画
エロ
美女
詳細
下さい
眉
犬
あるある
ペニクリ


Topic #  3
位
サンプル動画
女性
ランキング
アダルト
日本
ミスユニバス
応募
専門
サイト


Topic #  4
ニュハフ
娘
ハフハフ
機能
スマスマ
女優
ブログ
付き
派
無修正動画


Topic #  5
ハフアップ
今日
人間
どう
髪
食べ
明日
色
昨日
髪型


Topic #  6
ハフパンツ
人気
カラコン
無料
大阪
黒
限定
着
シャツ
服


Topic #  7
まとめ
動画
ハフハフ
暴行
画像
アレンジ
基地
米
ヘア
簡単


Topic #  8
サイズ
現在
情報
商品
新品
ランク
水着
正式
セット
メンズ


Topic #  9
ニュハフ
てる
笑
いい
顔
思っ
日本人
可愛い
日本
好き


Topic #  10
新品
楽天
自転車
見る
丈
女児
沖縄
男子
送料
無料




In [None]:
# visualize topics
visualize_topics(lda_h15, corpus_h15, dict_h15)

#### Ryukyujin LDA


In [275]:
# preprocess and tokenize
# txt_filename = "datasets_minority_groups/ryukyujin_2022.txt"
# tweet_tokens_r22, retweets_r22, error_r22 = preprocess_tokenize_all(txt_filename, "2022")
# print("# retweets: ", len(retweets_r22))
# print("# errors: ", len(error_r22))
# save_to_csv(tweet_tokens_r22, "save_tokens_ryukyujin_2022.csv")
tweet_tokens_r22 = load_from_csv("save_tokens_ryukyujin_2022.csv")


# run lda
lda_r22, dict_r22, corpus_r22, train_corpus_r22, test_corpus_r22 = run_lda("ryukyujin_2022", tweet_tokens_r22, num_topics=5)

# analyze perplexity, coherence results
examine_topics(lda_r22, dict_r22, num_topics=5)
analyze_train_test_results(lda_r22, train_corpus_r22, test_corpus_r22)
analyze_overall_results(lda_r22, tweet_tokens_r22, dict_r22, corpus_r22)
examine_topics_plain(lda_r22, dict_r22, num_topics=5)

# # further topic analysis
# topic_df_r22 = format_topics_sentences(lda_r22, corpus_r22, tweet_tokens_r22)
# get_dominant_topics(topic_df_r22)
# run_topic_analysis(topic_df_r22)

adding document #0 to Dictionary<0 unique tokens: []>
built Dictionary<15354 unique tokens: ['ペジ', '下さい', '下さっ', '側', '先住民族']...> from 5454 documents (total 97145 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<15354 unique tokens: ['ペジ', '下さい', '下さっ', '側', '先住民族']...> from 5454 documents (total 97145 corpus positions)", 'datetime': '2024-03-21T15:12:16.932493', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 12652 tokens: [('ペジ', 3), ('下さっ', 3), ('史上', 2), ('尊厳回復', 1), ('尽力', 2), ('日本の裁判所', 1), ('準備書面', 1), ('球人', 4102), ('琉', 4088), ('琉球', 1698)]...
keeping 2702 tokens which were in no less than 5 and no more than 1090 (=20.0%) documents
resulting dictionary: Dictionary<2702 unique tokens: ['下さい', '側', '先住民族', '初めて', '原告']...>
using autotuned alpha, starting with [0.2, 0.2, 0.2, 0.2, 0.2]
using symmetric eta at 0.2
using serial

Topic #  1
日本人: 0.02374880574643612
差別: 0.020312674343585968
日本: 0.015351329930126667
沖縄県民: 0.010037774220108986
沖縄県: 0.008960890583693981
てる: 0.008181980811059475
歴史: 0.00712039927020669
本土: 0.007103926967829466
思い: 0.006879498716443777
米軍: 0.005958393681794405


Topic #  2
言え: 0.08646105229854584
墓: 0.0583437979221344
言っ: 0.030428346246480942
死者: 0.02763770893216133
アレ: 0.026973117142915726
ぶっちゃけ: 0.02680429071187973
書き換え: 0.026639847084879875
客死: 0.026472946628928185
西南戦争: 0.026305828243494034
在日: 0.016350453719496727


Topic #  3
日本人: 0.019023463129997253
てる: 0.012287632562220097
日本: 0.011875526048243046
デニ: 0.010314094834029675
玉城: 0.010061032138764858
国連: 0.00928147230297327
知事: 0.009273777715861797
主張: 0.00776643306016922
琉球国: 0.007369603030383587
中国: 0.007268279325217009


Topic #  4
日本: 0.028978154063224792
日本人: 0.025690916925668716
中国: 0.024407552555203438
アイヌ: 0.022679900750517845
遺骨: 0.011570902541279793
まとめ: 0.01096714474260807
てる: 0.010561279952526093
独立: 0.01017931662499

-6.962 per-word bound, 124.6 perplexity estimate based on a held-out corpus of 4909 documents with 56585 words
-8.793 per-word bound, 443.6 perplexity estimate based on a held-out corpus of 545 documents with 6767 words


perplexity(train): 124.63064677293016 

# of words in test corpus:  6767
perplexity(test): 443.6307011462988


-6.943 per-word bound, 123.0 perplexity estimate based on a held-out corpus of 5454 documents with 63352 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -6.9429959834971156


7 accumulators retrieved from output queue
accumulated word occurrence stats for 5454 virtual documents



Coherence Score:  0.3119631895709615
Topic #  1
日本人
差別
日本
沖縄県民
沖縄県
てる
歴史
本土
思い
米軍


Topic #  2
言え
墓
言っ
死者
アレ
ぶっちゃけ
書き換え
客死
西南戦争
在日


Topic #  3
日本人
てる
日本
デニ
玉城
国連
知事
主張
琉球国
中国


Topic #  4
日本
日本人
中国
アイヌ
遺骨
まとめ
てる
独立
琉球独立
議論


Topic #  5
琉球独立
庶民
本村
安彦
年月日
米軍
日本
つぶやき
米
基地




In [278]:
# visualize topics
visualize_topics(lda_r22, corpus_r22, dict_r22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 15:18:51] "GET / HTTP/1.1" 200 -



stopping Server...


#### Okinawajin LDA

In [288]:
# preprocess and tokenize
# txt_filename = "datasets_minority_groups/okinawajin_2022.txt"
# tweet_tokens_o22, retweets_o22, error_o22 = preprocess_tokenize_all(txt_filename, "2022")
# print("# retweets: ", len(retweets_o22))
# print("# errors: ", len(error_o22))
# save_to_csv(tweet_tokens_o22, "saved_tokens/save_tokens_okinawajin_2022.csv")
tweet_tokens_o22 = load_from_csv("saved_tokens/save_tokens_okinawajin_2022.csv")

# run lda
lda_o22, dict_o22, corpus_o22, train_corpus_o22, test_corpus_o22 = run_lda("okinawajin_2022", tweet_tokens_o22, num_topics=5)

# analyze perplexity, coherence results
examine_topics(lda_o22, dict_o22, num_topics=5)
analyze_train_test_results(lda_o22, train_corpus_o22, test_corpus_o22)
analyze_overall_results(lda_o22, tweet_tokens_o22, dict_o22, corpus_o22)
examine_topics_plain(lda_o22, dict_o22, num_topics=5)

adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<21496 unique tokens: ['こんな', '一部', '学級新聞', '沖縄', '海']...>
adding document #20000 to Dictionary<31565 unique tokens: ['こんな', '一部', '学級新聞', '沖縄', '海']...>
built Dictionary<33925 unique tokens: ['こんな', '一部', '学級新聞', '沖縄', '海']...> from 23142 documents (total 360392 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<33925 unique tokens: ['こんな', '一部', '学級新聞', '沖縄', '海']...> from 23142 documents (total 360392 corpus positions)", 'datetime': '2024-03-21T15:43:11.593617', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 25970 tokens: [('こんな', 4), ('学級新聞', 1), ('沖縄', 22147), ('ジャングル', 2), ('加える', 4), ('危害', 2), ('有ろ', 1), ('やまし', 3), ('公職', 1), ('一般常識', 4)]...
keeping 7955 tokens which were in no less than 5 and no more than 4628 (=20.0%) documents
resu

Topic #  1
ちゅ: 0.0475415401160717
食べ: 0.010914398357272148
うちな: 0.010250006802380085
笑: 0.008545998483896255
大会: 0.008113399147987366
今日: 0.008111551403999329
あい: 0.007849397137761116
好き: 0.007188490126281977
著者: 0.006966648157685995
マラソン: 0.005911864805966616


Topic #  2
てる: 0.02110971510410309
日本人: 0.009228167124092579
日本: 0.007966598495841026
ちむどんどん: 0.006693776696920395
言っ: 0.006499305833131075
本土: 0.006311091594398022
差別: 0.006176247727125883
基地: 0.006162857171148062
思っ: 0.005943520460277796
いい: 0.005611628759652376


Topic #  3
先住民族: 0.03366587311029434
利用: 0.023537665605545044
支援: 0.02230268158018589
アイヌ: 0.022050637751817703
日本: 0.019024698063731194
僕: 0.017847687005996704
日記: 0.016763368621468544
上京: 0.01675029657781124
侵略: 0.01615605689585209
独立: 0.01482328213751316


Topic #  4
踊り: 0.0580485500395298
カチャシ: 0.056762825697660446
顔: 0.032304082065820694
ササ: 0.031203407794237137
良かっ: 0.02912982925772667
とけ: 0.028156090527772903
狂い: 0.027785561978816986
行っ: 0.027550233528017998


-7.453 per-word bound, 175.2 perplexity estimate based on a held-out corpus of 20828 documents with 260832 words


perplexity(train): 175.24235053031063 

# of words in test corpus:  27129


-8.509 per-word bound, 364.3 perplexity estimate based on a held-out corpus of 2314 documents with 27129 words


perplexity(test): 364.32538966412784


-7.421 per-word bound, 171.4 perplexity estimate based on a held-out corpus of 23142 documents with 287961 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -7.421196575454295


7 accumulators retrieved from output queue
accumulated word occurrence stats for 23142 virtual documents



Coherence Score:  0.46977642090403904
Topic #  1
ちゅ
食べ
うちな
笑
大会
今日
あい
好き
著者
マラソン


Topic #  2
てる
日本人
日本
ちむどんどん
言っ
本土
差別
基地
思っ
いい


Topic #  3
先住民族
利用
支援
アイヌ
日本
僕
日記
上京
侵略
独立


Topic #  4
踊り
カチャシ
顔
ササ
良かっ
とけ
狂い
行っ
嫌
せめて


Topic #  5
型
僕
上京
日記
あまり
質問
七月
血液型
人間
募集中




In [289]:
visualize_topics(lda_o22, corpus_o22, dict_o22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 15:44:03] "GET / HTTP/1.1" 200 -



stopping Server...
