### Setup

In [116]:
# general
import numpy as np
import pandas as pd
import csv

# tokenization
import json
import MeCab
import import_ipynb
import thesis_preprocess
from stopwords_ja import stop_words
from stopwords_slothlib import stop_words_2

# lda topic modeling
import gensim, logging
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim

In [55]:
# https://lda.readthedocs.io/en/latest/getting_started.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://github.com/deankuo/Japan-Manifesto-Classification/blob/main/topic_modeling.ipynb
# https://github.com/m3yrin/NTM/blob/master/LDA_jp.ipynb
# https://tdual.hatenablog.com/entry/2018/04/09/133000#1LDA%E3%81%AE%E5%89%8D%E3%81%AB%E3%83%88%E3%83%94%E3%83%83%E3%82%AF%E3%83%A2%E3%83%87%E3%83%AB%E3%81%A8%E3%81%AF

### Preprocessing and Tokenization

In [141]:
# tokenize cleaned tweets into words
def tokenize(text):
    mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    parsed = mt.parseToNode(text)
    components = []
    
    while parsed:
        word = parsed.surface
        pos = parsed.feature.split(",")[0]

        # for lda, we only want nouns, verbs, adjectives
        include_pos = ["名詞", "動詞", "形容詞", "副詞"]
        if pos in include_pos: components.append(word)
        parsed = parsed.next
    
    # remove stopwords
    components = [token for token in components if ((not token in stop_words) and (not token in stop_words_2))]
    
    return components

In [154]:
# run preprocessing and tokenization for tweets from given .txt file
def preprocess_tokenize_all(filename, year):
    # store results and exception tweets
    tokens = []
    retweets = []
    not_parsed = []

    # iterate through tweets, preprocess and tokenize
    with open(filename, 'r') as file:
        for line in file:
            tweet = json.loads(line)
            if line == None or tweet == None:
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            # filter out 2024 sponsored(?) tweets
            elif int(tweet['date'].split("-")[0]) < int(year) + 1: 
                tweet_text = tweet['rawContent'] # note: need other prop for over 140 char?
                processed = preprocess(tweet_text)            
                components = tokenize(processed)
                tokens.append(components)

    file.close()
    return tokens, retweets, not_parsed

In [57]:
# pass in filename you want to save data as, including '.csv'
def save_to_csv(tweet_tokens, filename):
    f = open(filename, 'w')
    writer = csv.writer(f)
    for tweet in tweet_tokens:
        writer.writerow(tweet)
    f.close()

In [58]:
# pass in filename of csv you want to load, including '.csv'
def load_from_csv(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        tweet_tokens = list(reader)
    return tweet_tokens

### Latent Dirichlet Allocation

In [212]:
# train and save lda model for given year; data_name is id suffix to save lda model
def run_lda(data_name, tweet_tokens, no_below=5, no_above=0.2):
    # set up dictionary
    dict = corpora.Dictionary(tweet_tokens)
    dict.filter_extremes(no_below, no_above)
    dict.compactify()

    # set up corpus
    corpus = [dict.doc2bow(w) for w in tweet_tokens]
    test_size = int(len(corpus) * 0.1)
    test_corpus = corpus[:test_size]
    train_corpus = corpus[test_size:]   

    # train and save lda model
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                          id2word=dict, 
                                          num_topics=5, 
                                          random_state=100, 
                                          passes=10, 
                                          update_every=3, 
                                          alpha=0.05,
                                          per_word_topics=True)
    lda.save("save_lda_model_" + data_name)
    
    return lda, dict, corpus, train_corpus, test_corpus

In [207]:
# display words comprising topics
def examine_topics(lda, dict):
    for topic in range(5):
        print("Topic # ",(topic+1))
        for t in lda.get_topic_terms(topic):
            print("{}: {}".format(dict[t[0]], t[1]))
        print("\n")

In [208]:
# compare train/test perplexity
def analyze_train_test_results(lda, train_corpus, test_corpus):
    # look at train set results
    N = sum(count for doc in train_corpus for _, count in doc)
    print("# of words in train corpus: ",N)
    perplexity = np.exp2(-lda.log_perplexity(train_corpus))
    print("perplexity(train):", perplexity,"\n")

    # look at test set results
    N = sum(count for doc in test_corpus for _, count in doc)
    print("# of words in test corpus: ",N)
    perplexity = np.exp2(-lda.log_perplexity(test_corpus))
    print("perplexity(test):", perplexity)

In [209]:
# look at overall perplexity and coherence score
def analyze_overall_results(lda, tweet_tokens, dict, corpus):
    print('\nPerplexity: ', lda.log_perplexity(corpus))     # lower is better

    coherence_model_lda = CoherenceModel(model=lda, texts=tweet_tokens, dictionary=dict, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)     # higher is better

In [210]:
# open interactive visualization of topics
def visualize_topics(lda, corpus, dict):
    pyLDAvis.enable_notebook(local=True)
    vis = pyLDAvis.gensim.prepare(lda, corpus, dict)
    pyLDAvis.show(vis, local=False)

In [186]:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for _, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([[int(topic_num), round(prop_topic,4), topic_keywords]], columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])], ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


def get_dominant_topics(df_topic_sents_keywords):
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    df_dominant_topic.head(10)
    return df_topic_sents_keywords

In [199]:
# group top 5 sentences under each topic
def get_representative_docs(df_topic_sents_keywords):
    sent_topics_sorteddf = pd.DataFrame()
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
    for _, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                                grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                                axis=0)
    # reset index    
    sent_topics_sorteddf.reset_index(drop=True, inplace=True)
    # format and show
    sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
    sent_topics_sorteddf.head()
    return sent_topics_sorteddf

In [211]:
# show further details about topics
def run_topic_analysis(df_topic_sents_keywords):
    # num documents per topic
    topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

    # percentage documents for each topic
    topic_contribution = round(topic_counts/topic_counts.sum(), 4)

    # topic number, keywords
    topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

    # concatenate column-wise
    df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

    # add column names
    df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

    # show
    df_dominant_topics

    return df_dominant_topics

#### General 2022 LDA

In [None]:
# run for 2022
tweet_tokens_2022, retweets_2022, not_parsed_2022 = preprocess_tokenize_all("2022")

# did we get retweets or errors?
print(len(retweets_2022))
print(len(not_parsed_2022))

In [42]:
lda_2022_gen = gensim.models.LdaModel.load("save_lda_model_2022")

loading LdaModel object from thesis_lda_model_2022
loading expElogbeta from thesis_lda_model_2022.expElogbeta.npy with mmap=None
setting ignored attribute id2word to None
setting ignored attribute dispatcher to None
setting ignored attribute state to None
LdaModel lifecycle event {'fname': 'thesis_lda_model_2022', 'datetime': '2024-03-17T10:28:38.384616', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}
loading LdaState object from thesis_lda_model_2022.state
LdaState lifecycle event {'fname': 'thesis_lda_model_2022.state', 'datetime': '2024-03-17T10:28:38.407748', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}


#### Zainichi Korean LDA

In [215]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/zainichi_2022.txt"
tweet_tokens_z22, retweets_z22, error_z22 = preprocess_tokenize_all(txt_filename, "2022")
# did we get retweets or errors?
print("retweets: ", len(retweets_z22))
print("errors: ", len(error_z22))
# save tokens to csv
save_to_csv(tweet_tokens_z22, "save_tokens_zainichi_2022.csv")

retweets:  0
errors:  0


In [216]:
# run lda
lda_z22, dict_z22, corpus_z22, train_corpus_z22, test_corpus_z22 = run_lda("zainichi_2022", tweet_tokens_z22)

adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<18537 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
adding document #20000 to Dictionary<27445 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
adding document #30000 to Dictionary<33722 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
built Dictionary<34645 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...> from 31901 documents (total 532809 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<34645 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...> from 31901 documents (total 532809 corpus positions)", 'datetime': '2024-03-17T21:20:32.549172', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 24860 tokens: [('在日コリアン', 29374), ('矢張り', 3), ('安保理決議', 1), ('代物', 4), ('赤の他人', 4), ('がんばり', 2), ('新著', 1), ('デモ活動',

In [217]:
# analyze results
examine_topics(lda_z22, dict_z22)
analyze_train_test_results(lda_z22, train_corpus_z22, test_corpus_z22)
analyze_overall_results(lda_z22, tweet_tokens_z22, dict_z22, corpus_z22)

Topic #  1
現代ビジネス: 0.02036028355360031
差別: 0.018534637987613678
ください: 0.016673192381858826
思う: 0.016658447682857513
世: 0.016550375148653984
ヘイトクライム: 0.014851619489490986
ぜひ: 0.014640428125858307
読み: 0.014068474993109703
ウトロ: 0.014045638963580132
いたし: 0.013972759246826172


Topic #  2
日本: 0.029021091759204865
差別: 0.022041985765099525
日本人: 0.01994379051029682
てる: 0.017536623403429985
韓国: 0.011796552687883377
言っ: 0.006137709133327007
在日: 0.005817307159304619
言う: 0.004979840945452452
問題: 0.004651606548577547
考え: 0.004624845460057259


Topic #  3
ヘイト: 0.007178770378232002
歴史: 0.006823691073805094
戦後: 0.005888070911169052
考え: 0.005880876909941435
僕: 0.005228658206760883
思い: 0.005211341194808483
差別: 0.004732219502329826
弁護士: 0.004667274188250303
日本: 0.00465357955545187
金嬉老事件: 0.004622648004442453


Topic #  4
岸田: 0.024091046303510666
政務官: 0.02401636727154255
更迭: 0.023340312764048576
批判: 0.022674068808555603
侮辱: 0.02263890579342842
アイヌ民族: 0.021839581429958344
総務: 0.02143535017967224
杉田水脈: 0.02

-7.798 per-word bound, 222.6 perplexity estimate based on a held-out corpus of 28711 documents with 413683 words


perplexity(train): 222.62115476559515 

# of words in test corpus:  45546


-8.872 per-word bound, 468.4 perplexity estimate based on a held-out corpus of 3190 documents with 45546 words


perplexity(test): 468.39876175703836


-7.792 per-word bound, 221.6 perplexity estimate based on a held-out corpus of 31901 documents with 459229 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -7.791754543678939


7 accumulators retrieved from output queue
accumulated word occurrence stats for 31901 virtual documents



Coherence Score:  0.3960619483421115


In [None]:
visualize_topics(lda_z22, corpus_z22, dict_z22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [17/Mar/2024 19:57:00] "GET / HTTP/1.1" 200 -



stopping Server...


In [218]:
topic_df_z22 = format_topics_sentences(lda_z22, corpus_z22, tweet_tokens_z22)

In [219]:
get_dominant_topics(topic_df_z22)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,1,0.9805,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え","[犯罪, 在日コリアン, 朴, 貞子, 恥ずかしい, 恥ずかしい, 感覚, 矢張り, 日本,..."
1,1,0.9943,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え","[前科, 朴, 貞子, 黙っ, 下さい, 日本, 愛せ, 愛さ, 日本, 在日コリアン, あ..."
2,1,0.9784,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え","[日本人, 嫌, 嫌, 嗤, 韓国, 中国, 日本人, 日本, 在日コリアン, 中国人]"
3,1,0.9877,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え","[拉致問題, ミサイル, 問題, 安保理決議, 違反, 関心, 持っ, 在日コリアン, 差別..."
4,2,0.7415,"ヘイト, 歴史, 戦後, 考え, 僕, 思い, 差別, 弁護士, 日本, 金嬉老事件","[お前, 作っ, 言っ, 全く, 赤の他人, 擬態, アカウント, ウヨ, 対峙, 在日コリ..."
...,...,...,...,...
31896,2,0.5331,"ヘイト, 歴史, 戦後, 考え, 僕, 思い, 差別, 弁護士, 日本, 金嬉老事件","[在日台湾人, 在日コリアン, 大日本帝国, 植民地, 日本列島, 台湾, 韓国, 渡っ, ..."
31897,1,0.4137,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え","[国籍, 解消, 否, 解消, 国籍, 法律, 依存, 先方, 国籍法, 一度, 取得, 失..."
31898,3,0.7090,"岸田, 政務官, 更迭, 批判, 侮辱, アイヌ民族, 総務, 杉田水脈, 首相, 今度","[正確, アイヌ民族, フリ, 在日コリアン, 批判]"
31899,1,0.9114,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え","[議員, 人間性, 問題, やはり, 政治家, 資質, あらか, 国会中継, 杉田水脈, ア..."


In [220]:
get_representative_docs(topic_df_z22)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.9936,"現代ビジネス, 差別, ください, 思う, 世, ヘイトクライム, ぜひ, 読み, ウトロ,...","[判決, 要旨, 全文, 社会, 不安, あおっ, 在日コリアン, ねらっ, ウトロ, 放火..."
1,1,0.9953,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え","[韓国, 不法, 脱出, 日本, 不法入国, 滞在, 日本国籍, 拒絶, 在日特権, 貪る,..."
2,2,0.9934,"ヘイト, 歴史, 戦後, 考え, 僕, 思い, 差別, 弁護士, 日本, 金嬉老事件","[ヘイト街宣, 予告, 桜, 本, 守ろ, 支援, 声明, 拡散, 差別主義, 在日コリアン..."
3,3,0.9929,"岸田, 政務官, 更迭, 批判, 侮辱, アイヌ民族, 総務, 杉田水脈, 首相, 今度","[リバティ, 解放, 同盟, モノ, 薬害エイズ, ハンセン病, 水俣病, 在日コリアン, ..."
4,4,0.9932,"事件, デマ, 在日, 現代ビジネス, 思う, 犯人, 読ん, 川崎, ください, 日本","[樋口直人, 日本型排外主義, 在特会, 外国人参政権, 東アジア, 地政学, 名古屋, 大..."


In [221]:
run_topic_analysis(topic_df_z22)

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,1,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え",4679.0,0.1467
1,1,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え",15708.0,0.4924
2,1,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え",5921.0,0.1856
3,1,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え",2372.0,0.0744
4,2,"ヘイト, 歴史, 戦後, 考え, 僕, 思い, 差別, 弁護士, 日本, 金嬉老事件",3221.0,0.1010
...,...,...,...,...
31896,2,"ヘイト, 歴史, 戦後, 考え, 僕, 思い, 差別, 弁護士, 日本, 金嬉老事件",,
31897,1,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え",,
31898,3,"岸田, 政務官, 更迭, 批判, 侮辱, アイヌ民族, 総務, 杉田水脈, 首相, 今度",,
31899,1,"日本, 差別, 日本人, てる, 韓国, 言っ, 在日, 言う, 問題, 考え",,


#### Ainu LDA

In [222]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/ainu_2022.txt"
tweet_tokens_a22, retweets_a22, error_a22 = preprocess_tokenize_all(txt_filename, "2022")
# did we get retweets or errors?
print("# retweets: ", len(retweets_a22))
print("# errors: ", len(error_a22))
# save tokens to csv
save_to_csv(tweet_tokens_a22, "save_tokens_ainu_2022.csv")

# run lda
lda_a22, dict_a22, corpus_a22, train_corpus_a22, test_corpus_a22 = run_lda("ainu_2022", tweet_tokens_a22)

# analyze perplexity, coherence results
examine_topics(lda_a22, dict_a22)
analyze_train_test_results(lda_a22, train_corpus_a22, test_corpus_a22)
analyze_overall_results(lda_a22, tweet_tokens_a22, dict_a22, corpus_a22)

# further topic analysis
topic_df_a22 = format_topics_sentences(lda_a22, corpus_a22, tweet_tokens_a22)
get_dominant_topics(topic_df_a22)
run_topic_analysis(topic_df_a22)

Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
# retweets:  18
# errors:  0


adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<21877 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #20000 to Dictionary<31754 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #30000 to Dictionary<38819 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #40000 to Dictionary<44789 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #50000 to Dictionary<50058 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #60000 to Dictionary<54965 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #70000 to Dictionary<59376 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #80000 to Dictionary<64066 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #90000 to Dictionary<68150 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #100000 to Dictionary<71377 unique tokens: ['しっかり', 'す', 

Topic #  1
日本: 0.010331884026527405
北海道: 0.010054299607872963
てる: 0.00973436888307333
アイヌ民族: 0.007899015210568905
日本人: 0.007763258647173643
沖縄: 0.007519761566072702
縄文人: 0.007251410745084286
アイヌ新法: 0.006389557849615812
反対: 0.00488533778116107
差別: 0.004456777591258287


Topic #  2
アイヌ語: 0.03292619064450264
北海道: 0.01881691813468933
意味: 0.009355717338621616
地名: 0.009075571782886982
由来: 0.007194993551820517
日本語: 0.005900090094655752
説: 0.004701962228864431
杉田: 0.003977031446993351
アイヌ民族: 0.00372358993627131
言語: 0.0035547411534935236


Topic #  3
アイヌ民族: 0.01646190881729126
氏: 0.011190274730324745
アイヌ語: 0.010810202918946743
否定: 0.008597702719271183
質問: 0.007991740480065346
協会: 0.006732426583766937
語: 0.0061555104330182076
投稿: 0.005470494739711285
研究: 0.005018000025302172
やゆ: 0.004587939009070396


Topic #  4
てる: 0.014529531821608543
アイヌ文化: 0.014022273011505604
ゴルデンカムイ: 0.011045513674616814
アイヌ語: 0.009332857094705105
文化: 0.006749141030013561
カム: 0.005577051546424627
思っ: 0.005486170761287212
思

-8.577 per-word bound, 382.0 perplexity estimate based on a held-out corpus of 160078 documents with 2298404 words


perplexity(train): 382.0058283002294 

# of words in test corpus:  248565


-9.198 per-word bound, 587.3 perplexity estimate based on a held-out corpus of 17786 documents with 248565 words


perplexity(test): 587.3169267175116


-8.569 per-word bound, 379.7 perplexity estimate based on a held-out corpus of 177864 documents with 2546969 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -8.568711645170653


7 accumulators retrieved from output queue
accumulated word occurrence stats for 177864 virtual documents



Coherence Score:  0.40843429327793795


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,4,"ロシア, 北海道, アイヌ民族, 先住民族, 日本, 差別, 民族, 権利, 日本人, プチン",27549.0,0.1549
1,0,"日本, 北海道, てる, アイヌ民族, 日本人, 沖縄, 縄文人, アイヌ新法, 反対, 差別",20159.0,0.1133
2,3,"てる, アイヌ文化, ゴルデンカムイ, アイヌ語, 文化, カム, 思っ, 思う, いい, 北海道",12442.0,0.0700
3,3,"てる, アイヌ文化, ゴルデンカムイ, アイヌ語, 文化, カム, 思っ, 思う, いい, 北海道",77894.0,0.4379
4,3,"てる, アイヌ文化, ゴルデンカムイ, アイヌ語, 文化, カム, 思っ, 思う, いい, 北海道",39820.0,0.2239
...,...,...,...,...
177859,1,"アイヌ語, 北海道, 意味, 地名, 由来, 日本語, 説, 杉田, アイヌ民族, 言語",,
177860,3,"てる, アイヌ文化, ゴルデンカムイ, アイヌ語, 文化, カム, 思っ, 思う, いい, 北海道",,
177861,3,"てる, アイヌ文化, ゴルデンカムイ, アイヌ語, 文化, カム, 思っ, 思う, いい, 北海道",,
177862,3,"てる, アイヌ文化, ゴルデンカムイ, アイヌ語, 文化, カム, 思っ, 思う, いい, 北海道",,


In [None]:
# visualize topics (keep in separate cell so we can interrupt it)
visualize_topics(lda_a22, corpus_a22, dict_a22)

#### Haafu LDA

In [223]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/haafu_2022.txt"
tweet_tokens_h22, retweets_h22, error_h22 = preprocess_tokenize_all(txt_filename, "2022")
# check retweets/errors
print("# retweets: ", len(retweets_h22))
print("# errors: ", len(error_h22))
# save tokens to csv
save_to_csv(tweet_tokens_h22, "save_tokens_haafu_2022.csv")

# run lda
lda_h22, dict_h22, corpus_h22, train_corpus_h22, test_corpus_h22 = run_lda("haafu_2022", tweet_tokens_h22)

# analyze perplexity, coherence results
examine_topics(lda_h22, dict_h22)
analyze_train_test_results(lda_h22, train_corpus_h22, test_corpus_h22)
analyze_overall_results(lda_h22, tweet_tokens_h22, dict_h22, corpus_h22)

# further topic analysis
topic_df_h22 = format_topics_sentences(lda_h22, corpus_h22, tweet_tokens_h22)
get_dominant_topics(topic_df_h22)
run_topic_analysis(topic_df_h22)

adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<25349 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...>


# retweets:  0
# errors:  0


adding document #20000 to Dictionary<37599 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...>
built Dictionary<45757 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...> from 28339 documents (total 336290 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<45757 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...> from 28339 documents (total 336290 corpus positions)", 'datetime': '2024-03-17T21:43:02.891827', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 36383 tokens: [('スリクォタ', 2), ('ヒップ', 2), ('フィンガチップ', 1), ('着丈', 3), ('総称', 2), ('ツイゴル', 2), ('ハフ', 11515), ('東京国際', 3), ('コネクタ', 1), ('ハフピッチ', 3)]...
keeping 9374 tokens which were in no less than 5 and no more than 5667 (=20.0%) documents
resulting dictionary: Dictionary<9374 unique tokens: ['ウエスト', 'コト', 'ハフコト', '丈', '中間']...>
using symmetric eta at 0.2
using ser

Topic #  1
ハフアップ: 0.04842441529035568
ハフツイン: 0.03004157729446888
てる: 0.024011677131056786
好き: 0.01608104631304741
可愛い: 0.012150638736784458
いい: 0.011008348315954208
すぎ: 0.00994652695953846
かわいい: 0.009061633609235287
髪: 0.008595802821218967
くん: 0.008541189134120941


Topic #  2
質問: 0.02486533485352993
てる: 0.01982671394944191
ハフミリオン: 0.01669182814657688
募集中: 0.015543737448751926
匿名: 0.015460001304745674
位: 0.01213137898594141
ハフパンツ: 0.01074016373604536
ニュハフ: 0.008923954330384731
答え: 0.008670365437865257
質問箱: 0.008359689265489578


Topic #  3
人間: 0.009507953189313412
思っ: 0.005602049175649881
てる: 0.005087725352495909
やっ: 0.004421718418598175
選手: 0.004139452241361141
いい: 0.0036454133223742247
サイド: 0.003484367160126567
思い: 0.0032396414317190647
英語: 0.0030960168223828077
子供: 0.002971452893689275


Topic #  4
今日: 0.013251381926238537
サイズ: 0.008203381672501564
食べ: 0.007550818845629692
ハフハフ: 0.0074395728297531605
笑: 0.007304506842046976
てる: 0.007284467574208975
ハフバスデ: 0.006763615179806948
いい: 0.

-8.295 per-word bound, 314.1 perplexity estimate based on a held-out corpus of 25506 documents with 239923 words


perplexity(train): 314.1275224150233 

# of words in test corpus:  26137


-9.804 per-word bound, 893.9 perplexity estimate based on a held-out corpus of 2833 documents with 26137 words


perplexity(test): 893.9166643864165


-8.270 per-word bound, 308.6 perplexity estimate based on a held-out corpus of 28339 documents with 266060 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -8.269737292708953


7 accumulators retrieved from output queue
accumulated word occurrence stats for 28339 virtual documents



Coherence Score:  0.4268732663352532


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,4,"ハフパンツ, ニュハフ, 楽天, シャツ, 位, 半袖, 無料, 送料, 人気, 楽天市場",9322.0,0.3289
1,1,"質問, てる, ハフミリオン, 募集中, 匿名, 位, ハフパンツ, ニュハフ, 答え, 質問箱",3594.0,0.1268
2,2,"人間, 思っ, てる, やっ, 選手, いい, サイド, 思い, 英語, 子供",4560.0,0.1609
3,2,"人間, 思っ, てる, やっ, 選手, いい, サイド, 思い, 英語, 子供",7690.0,0.2714
4,2,"人間, 思っ, てる, やっ, 選手, いい, サイド, 思い, 英語, 子供",3173.0,0.1120
...,...,...,...,...
28334,3,"今日, サイズ, 食べ, ハフハフ, 笑, てる, ハフバスデ, いい, 思っ, ハフパンツ",,
28335,3,"今日, サイズ, 食べ, ハフハフ, 笑, てる, ハフバスデ, いい, 思っ, ハフパンツ",,
28336,2,"人間, 思っ, てる, やっ, 選手, いい, サイド, 思い, 英語, 子供",,
28337,2,"人間, 思っ, てる, やっ, 選手, いい, サイド, 思い, 英語, 子供",,


In [225]:
# visualize topics
visualize_topics(lda_h22, corpus_h22, dict_h22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [17/Mar/2024 21:52:24] "GET / HTTP/1.1" 200 -



stopping Server...


#### Ryukyujin LDA


In [224]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/ryukyujin_2022.txt"
tweet_tokens_r22, retweets_r22, error_r22 = preprocess_tokenize_all(txt_filename, "2022")
# check retweets/errors
print("# retweets: ", len(retweets_r22))
print("# errors: ", len(error_r22))
# save tokens to csv
save_to_csv(tweet_tokens_r22, "save_tokens_haafu_2022.csv")

# run lda
lda_r22, dict_r22, corpus_r22, train_corpus_r22, test_corpus_r22 = run_lda("haafu_2022", tweet_tokens_r22)

# analyze perplexity, coherence results
examine_topics(lda_r22, dict_r22)
analyze_train_test_results(lda_r22, train_corpus_r22, test_corpus_r22)
analyze_overall_results(lda_r22, tweet_tokens_r22, dict_r22, corpus_r22)

# further topic analysis
topic_df_r22 = format_topics_sentences(lda_r22, corpus_r22, tweet_tokens_r22)
get_dominant_topics(topic_df_r22)
run_topic_analysis(topic_df_r22)

Retweet:  1768019931457823173


adding document #0 to Dictionary<0 unique tokens: []>
built Dictionary<15354 unique tokens: ['ペジ', '下さい', '下さっ', '側', '先住民族']...> from 5454 documents (total 97145 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<15354 unique tokens: ['ペジ', '下さい', '下さっ', '側', '先住民族']...> from 5454 documents (total 97145 corpus positions)", 'datetime': '2024-03-17T21:47:33.368163', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 12652 tokens: [('ペジ', 3), ('下さっ', 3), ('史上', 2), ('尊厳回復', 1), ('尽力', 2), ('日本の裁判所', 1), ('準備書面', 1), ('球人', 4102), ('琉', 4088), ('琉球', 1698)]...
keeping 2702 tokens which were in no less than 5 and no more than 1090 (=20.0%) documents
resulting dictionary: Dictionary<2702 unique tokens: ['下さい', '側', '先住民族', '初めて', '原告']...>
using symmetric eta at 0.2
using serial LDA version on this node
running online (multi-pass) LDA train

# retweets:  1
# errors:  0


PROGRESS: pass 0, at document #4000/5454
-8.677 per-word bound, 409.2 perplexity estimate based on a held-out corpus of 1454 documents with 17613 words
PROGRESS: pass 0, at document #5454/5454
topic #0 (0.050): 0.024*"差別" + 0.018*"日本人" + 0.013*"日本" + 0.012*"沖縄県民" + 0.008*"米軍" + 0.006*"我" + 0.006*"沖縄県" + 0.006*"思い" + 0.005*"琉球独立" + 0.005*"戦争"
topic #1 (0.050): 0.042*"言え" + 0.028*"墓" + 0.020*"琉球独立" + 0.019*"庶民" + 0.018*"本村" + 0.018*"安彦" + 0.015*"言っ" + 0.014*"死者" + 0.014*"アレ" + 0.014*"書き換え"
topic #2 (0.050): 0.025*"日本人" + 0.013*"日本" + 0.011*"てる" + 0.008*"言っ" + 0.007*"琉球独立" + 0.006*"言う" + 0.006*"知事" + 0.006*"中国" + 0.005*"デニ" + 0.005*"独立"
topic #3 (0.050): 0.027*"日本人" + 0.026*"日本" + 0.020*"琉球独立" + 0.019*"中国" + 0.017*"アイヌ" + 0.013*"庶民" + 0.013*"本村" + 0.013*"安彦" + 0.011*"てる" + 0.008*"まとめ"
topic #4 (0.050): 0.026*"日本" + 0.019*"琉球独立" + 0.018*"本村" + 0.018*"安彦" + 0.018*"庶民" + 0.014*"年月日" + 0.012*"米軍" + 0.011*"つぶやき" + 0.009*"日本人" + 0.009*"基地"
topic diff=1.854747, rho=1.000000
PROGRESS: pass 1, at 

Topic #  1
日本人: 0.024870773777365685
差別: 0.02381037175655365
日本: 0.01534298062324524
沖縄県民: 0.009157363325357437
てる: 0.008480338379740715
沖縄県: 0.008134803734719753
歴史: 0.007580472622066736
本土: 0.007128231227397919
思い: 0.0069444929249584675
戦争: 0.0048554944805800915


Topic #  2
言え: 0.06907447427511215
琉球独立: 0.05569665506482124
庶民: 0.05544215813279152
本村: 0.05475016310811043
安彦: 0.0545014962553978
墓: 0.04567573964595795
言っ: 0.023953162133693695
死者: 0.022086577489972115
アレ: 0.021556498482823372
ぶっちゃけ: 0.0214211568236351


Topic #  3
日本人: 0.023455040529370308
てる: 0.015985198318958282
日本: 0.0138449277728796
中国: 0.009322143159806728
デニ: 0.008665786124765873
玉城: 0.008274910971522331
言っ: 0.007760482374578714
知事: 0.0077031138353049755
国連: 0.007649337872862816
言う: 0.007047815248370171


Topic #  4
日本: 0.027911800891160965
中国: 0.026474766433238983
アイヌ: 0.02484007179737091
日本人: 0.023338964208960533
遺骨: 0.013558333739638329
まとめ: 0.013174746185541153
琉球独立: 0.010288620367646217
独立: 0.0101526258513331

-6.978 per-word bound, 126.1 perplexity estimate based on a held-out corpus of 4909 documents with 56585 words
-8.838 per-word bound, 457.6 perplexity estimate based on a held-out corpus of 545 documents with 6767 words


perplexity(train): 126.06495175796059 

# of words in test corpus:  6767
perplexity(test): 457.61706928291824


-6.959 per-word bound, 124.4 perplexity estimate based on a held-out corpus of 5454 documents with 63352 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -6.959118801586011


7 accumulators retrieved from output queue
accumulated word occurrence stats for 5454 virtual documents



Coherence Score:  0.3042714504656371


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,3,"日本, 中国, アイヌ, 日本人, 遺骨, まとめ, 琉球独立, 独立, てる, 議論",1470.0,0.2695
1,3,"日本, 中国, アイヌ, 日本人, 遺骨, まとめ, 琉球独立, 独立, てる, 議論",751.0,0.1377
2,4,"琉球独立, 庶民, 本村, 安彦, 日本, 米軍, 年月日, 米, つぶやき, 基地",993.0,0.1821
3,2,"日本人, てる, 日本, 中国, デニ, 玉城, 言っ, 知事, 国連, 言う",1375.0,0.2521
4,3,"日本, 中国, アイヌ, 日本人, 遺骨, まとめ, 琉球独立, 独立, てる, 議論",865.0,0.1586
...,...,...,...,...
5449,0,"日本人, 差別, 日本, 沖縄県民, てる, 沖縄県, 歴史, 本土, 思い, 戦争",,
5450,3,"日本, 中国, アイヌ, 日本人, 遺骨, まとめ, 琉球独立, 独立, てる, 議論",,
5451,3,"日本, 中国, アイヌ, 日本人, 遺骨, まとめ, 琉球独立, 独立, てる, 議論",,
5452,0,"日本人, 差別, 日本, 沖縄県民, てる, 沖縄県, 歴史, 本土, 思い, 戦争",,


In [226]:
# visualize topics
visualize_topics(lda_r22, corpus_r22, dict_r22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [17/Mar/2024 21:59:58] "GET / HTTP/1.1" 200 -



stopping Server...


In [None]:
# TEST - increase # of topics from 5 to 10

# run lda
lda, dict, corpus, train, test = run_lda("test_2022", tweet_tokens_z22)

# analyze perplexity, coherence results
examine_topics(lda, dict)
analyze_train_test_results(lda, train, test)
print('\n')
analyze_overall_results(lda, tweet_tokens_z22, dict, corpus)

# visualize topics
visualize_topics(lda, corpus, dict)