### Setup

In [116]:
# general
import numpy as np
import pandas as pd
import csv

# tokenization
import json
import MeCab
import import_ipynb
import thesis_preprocess
from stopwords_ja import stop_words
from stopwords_slothlib import stop_words_2

# lda topic modeling
import gensim, logging
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim

In [55]:
# https://lda.readthedocs.io/en/latest/getting_started.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://github.com/deankuo/Japan-Manifesto-Classification/blob/main/topic_modeling.ipynb
# https://github.com/m3yrin/NTM/blob/master/LDA_jp.ipynb
# https://tdual.hatenablog.com/entry/2018/04/09/133000#1LDA%E3%81%AE%E5%89%8D%E3%81%AB%E3%83%88%E3%83%94%E3%83%83%E3%82%AF%E3%83%A2%E3%83%87%E3%83%AB%E3%81%A8%E3%81%AF

### Preprocessing and Tokenization

In [141]:
# tokenize cleaned tweets into words
def tokenize(text):
    mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    parsed = mt.parseToNode(text)
    components = []
    
    while parsed:
        word = parsed.surface
        pos = parsed.feature.split(",")[0]

        # for lda, we only want nouns, verbs, adjectives
        include_pos = ["名詞", "動詞", "形容詞", "副詞"]
        if pos in include_pos: components.append(word)
        parsed = parsed.next
    
    # remove stopwords
    components = [token for token in components if ((not token in stop_words) and (not token in stop_words_2))]
    
    return components

In [154]:
# run preprocessing and tokenization for tweets from given .txt file
def preprocess_tokenize_all(filename, year):
    # store results and exception tweets
    tokens = []
    retweets = []
    not_parsed = []

    # iterate through tweets, preprocess and tokenize
    with open(filename, 'r') as file:
        for line in file:
            tweet = json.loads(line)
            if line == None or tweet == None:
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            # filter out 2024 sponsored(?) tweets
            elif int(tweet['date'].split("-")[0]) < int(year) + 1: 
                tweet_text = tweet['rawContent'] # note: need other prop for over 140 char?
                processed = preprocess(tweet_text)            
                components = tokenize(processed)
                tokens.append(components)

    file.close()
    return tokens, retweets, not_parsed

In [57]:
# pass in filename you want to save data as, including '.csv'
def save_to_csv(tweet_tokens, filename):
    f = open(filename, 'w')
    writer = csv.writer(f)
    for tweet in tweet_tokens:
        writer.writerow(tweet)
    f.close()

In [58]:
# pass in filename of csv you want to load, including '.csv'
def load_from_csv(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        tweet_tokens = list(reader)
    return tweet_tokens

### Latent Dirichlet Allocation

In [245]:
# train and save lda model for given year; data_name is id suffix to save lda model
def run_lda(data_name, tweet_tokens, num_topics=10, no_below=5, no_above=0.2):
    # set up dictionary
    dict = corpora.Dictionary(tweet_tokens)
    dict.filter_extremes(no_below, no_above)
    dict.compactify()

    # set up corpus
    corpus = [dict.doc2bow(w) for w in tweet_tokens]
    test_size = int(len(corpus) * 0.1)
    test_corpus = corpus[:test_size]
    train_corpus = corpus[test_size:]   

    # train and save lda model
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                          id2word=dict, 
                                          num_topics=num_topics, 
                                          random_state=100, 
                                          passes=10, 
                                          update_every=3, 
                                          alpha='auto',
                                          per_word_topics=True)
    lda.save("save_lda_model_" + data_name)
    
    return lda, dict, corpus, train_corpus, test_corpus

In [247]:
# display words comprising topics
def examine_topics(lda, dict, num_topics=10):
    for topic in range(num_topics):
        print("Topic # ",(topic+1))
        for t in lda.get_topic_terms(topic):
            print("{}: {}".format(dict[t[0]], t[1]))
        print("\n")

In [248]:
def examine_topics_plain(lda, dict, num_topics=10):
    for topic in range(num_topics):
        print("Topic # ",(topic+1))
        for t in lda.get_topic_terms(topic):
            print(dict[t[0]])
        print("\n")

In [208]:
# compare train/test perplexity
def analyze_train_test_results(lda, train_corpus, test_corpus):
    # look at train set results
    N = sum(count for doc in train_corpus for _, count in doc)
    print("# of words in train corpus: ",N)
    perplexity = np.exp2(-lda.log_perplexity(train_corpus))
    print("perplexity(train):", perplexity,"\n")

    # look at test set results
    N = sum(count for doc in test_corpus for _, count in doc)
    print("# of words in test corpus: ",N)
    perplexity = np.exp2(-lda.log_perplexity(test_corpus))
    print("perplexity(test):", perplexity)

In [209]:
# look at overall perplexity and coherence score
def analyze_overall_results(lda, tweet_tokens, dict, corpus):
    print('\nPerplexity: ', lda.log_perplexity(corpus))     # lower is better

    coherence_model_lda = CoherenceModel(model=lda, texts=tweet_tokens, dictionary=dict, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)     # higher is better

In [210]:
# open interactive visualization of topics
def visualize_topics(lda, corpus, dict):
    pyLDAvis.enable_notebook(local=True)
    vis = pyLDAvis.gensim.prepare(lda, corpus, dict)
    pyLDAvis.show(vis, local=False)

In [186]:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for _, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([[int(topic_num), round(prop_topic,4), topic_keywords]], columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])], ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


def get_dominant_topics(df_topic_sents_keywords):
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    df_dominant_topic.head(10)
    return df_topic_sents_keywords

In [199]:
# group top 5 sentences under each topic
def get_representative_docs(df_topic_sents_keywords):
    sent_topics_sorteddf = pd.DataFrame()
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
    for _, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                                grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                                axis=0)
    # reset index    
    sent_topics_sorteddf.reset_index(drop=True, inplace=True)
    # format and show
    sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
    sent_topics_sorteddf.head()
    return sent_topics_sorteddf

In [211]:
# show further details about topics
def run_topic_analysis(df_topic_sents_keywords):
    # num documents per topic
    topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

    # percentage documents for each topic
    topic_contribution = round(topic_counts/topic_counts.sum(), 4)

    # topic number, keywords
    topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

    # concatenate column-wise
    df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

    # add column names
    df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

    # show
    df_dominant_topics

    return df_dominant_topics

#### General 2022 LDA

In [None]:
# run for 2022
tweet_tokens_2022, retweets_2022, not_parsed_2022 = preprocess_tokenize_all("2022")

# did we get retweets or errors?
print(len(retweets_2022))
print(len(not_parsed_2022))

In [42]:
lda_2022_gen = gensim.models.LdaModel.load("save_lda_model_2022")

loading LdaModel object from thesis_lda_model_2022
loading expElogbeta from thesis_lda_model_2022.expElogbeta.npy with mmap=None
setting ignored attribute id2word to None
setting ignored attribute dispatcher to None
setting ignored attribute state to None
LdaModel lifecycle event {'fname': 'thesis_lda_model_2022', 'datetime': '2024-03-17T10:28:38.384616', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}
loading LdaState object from thesis_lda_model_2022.state
LdaState lifecycle event {'fname': 'thesis_lda_model_2022.state', 'datetime': '2024-03-17T10:28:38.407748', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}


#### Zainichi Korean LDA
##### 2022


In [229]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/zainichi_2022.txt"
tweet_tokens_z22, retweets_z22, error_z22 = preprocess_tokenize_all(txt_filename, "2022")
print("retweets: ", len(retweets_z22))
print("errors: ", len(error_z22))
save_to_csv(tweet_tokens_z22, "save_tokens_zainichi_2022.csv")

# run lda
lda_z22, dict_z22, corpus_z22, train_corpus_z22, test_corpus_z22 = run_lda("zainichi_2022", tweet_tokens_z22)

# analyze results
examine_topics(lda_z22, dict_z22)
analyze_train_test_results(lda_z22, train_corpus_z22, test_corpus_z22)
analyze_overall_results(lda_z22, tweet_tokens_z22, dict_z22, corpus_z22)

# further topic analysis
topic_df_z22 = format_topics_sentences(lda_z22, corpus_z22, tweet_tokens_z22)
get_dominant_topics(topic_df_z22)
get_representative_docs(topic_df_z22)
run_topic_analysis(topic_df_z22)

adding document #0 to Dictionary<0 unique tokens: []>


retweets:  0
errors:  0


adding document #10000 to Dictionary<18537 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
adding document #20000 to Dictionary<27445 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
adding document #30000 to Dictionary<33722 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...>
built Dictionary<34645 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...> from 31901 documents (total 532809 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<34645 unique tokens: ['在日', '在日コリアン', '恥ずかしい', '愛さ', '愛せ']...> from 31901 documents (total 532809 corpus positions)", 'datetime': '2024-03-17T22:06:45.566855', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 24860 tokens: [('在日コリアン', 29374), ('矢張り', 3), ('安保理決議', 1), ('代物', 4), ('赤の他人', 4), ('がんばり', 2), ('新著', 1), ('デモ活動', 3), ('察する', 1), ('特会', 2)]...
keeping 9785 tokens whi

Topic #  1
現代ビジネス: 0.03381703048944473
世: 0.028344137594103813
思う: 0.025435030460357666
ください: 0.02533184364438057
ぜひ: 0.024472197517752647
読み: 0.02353762835264206
ヘイトクライム: 0.023219751194119453
いたし: 0.02317471243441105
寄稿: 0.022674623876810074
ウトロ: 0.022224508225917816


Topic #  2
差別: 0.03133709356188774
日本: 0.030221354216337204
日本人: 0.0273088738322258
てる: 0.02561582438647747
言っ: 0.009285970591008663
言う: 0.007920498959720135
思う: 0.006673811003565788
いい: 0.006338497158139944
帰化: 0.0056755319237709045
外国人: 0.005561834666877985


Topic #  3
事件: 0.057337868958711624
デマ: 0.0573095940053463
現代ビジネス: 0.0384482704102993
在日: 0.030255340039730072
読ん: 0.029539935290813446
犯人: 0.028354644775390625
川崎: 0.025646798312664032
思う: 0.02449604868888855
いただき: 0.024488283321261406
悪質: 0.023875750601291656


Topic #  4
アイヌ民族: 0.05558505654335022
首相: 0.0468258298933506
アイヌ: 0.030066298320889473
沖縄: 0.017145786434412003
問題: 0.014441381208598614
日本人: 0.010293419472873211
体制: 0.007969978265464306
小松川事件: 0.007792

-7.785 per-word bound, 220.6 perplexity estimate based on a held-out corpus of 28711 documents with 413683 words


perplexity(train): 220.5815644727267 

# of words in test corpus:  45546


-8.934 per-word bound, 489.1 perplexity estimate based on a held-out corpus of 3190 documents with 45546 words


perplexity(test): 489.1202269191426


-7.760 per-word bound, 216.8 perplexity estimate based on a held-out corpus of 31901 documents with 459229 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -7.759913873385379


7 accumulators retrieved from output queue
accumulated word occurrence stats for 31901 virtual documents



Coherence Score:  0.4370279094335447


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,1,"差別, 日本, 日本人, てる, 言っ, 言う, 思う, いい, 帰化, 外国人",3080.0,0.0965
1,1,"差別, 日本, 日本人, てる, 言っ, 言う, 思う, いい, 帰化, 外国人",13588.0,0.4259
2,1,"差別, 日本, 日本人, てる, 言っ, 言う, 思う, いい, 帰化, 外国人",1080.0,0.0339
3,1,"差別, 日本, 日本人, てる, 言っ, 言う, 思う, いい, 帰化, 外国人",982.0,0.0308
4,1,"差別, 日本, 日本人, てる, 言っ, 言う, 思う, いい, 帰化, 外国人",2909.0,0.0912
...,...,...,...,...
31896,5,"韓国, 日本, 検閲, 在日, 韓国人, 統一教会, 二度と, 北朝鮮, 日本人, 弁護士",,
31897,8,"差別, ください, 作品, 問題, 人権, 謝罪, 行っ, 公開, 上映, 東京都",,
31898,3,"アイヌ民族, 首相, アイヌ, 沖縄, 問題, 日本人, 体制, 小松川事件, 起き, 主張",,
31899,9,"杉田水脈, 政務官, 岸田, 更迭, 批判, 侮辱, 総務, 今度, 殺到, 東京新聞",,


In [None]:
lda_z22 = gensim.models.LdaModel.load("save_lda_model_zainichi_2022")
examine_topics_plain(lda_z22, dict_z22)

In [238]:
visualize_topics(lda_z22, corpus_z22, dict_z22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 12:56:21] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [21/Mar/2024 12:56:22] code 404, message Not Found
127.0.0.1 - - [21/Mar/2024 12:56:22] "GET /favicon.ico HTTP/1.1" 404 -



stopping Server...


##### 2015

In [246]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/zainichi_2015.txt"
tweet_tokens_z15, retweets_z15, error_z15 = preprocess_tokenize_all(txt_filename, "2015")
print("retweets: ", len(retweets_z15))
print("errors: ", len(error_z15))
save_to_csv(tweet_tokens_z15, "save_tokens_zainichi_2022.csv")

# run lda
lda_z15, dict_z15, corpus_z15, train_z15, test_z15 = run_lda("zainichi_2015", tweet_tokens_z15, num_topics=5)

# analyze results
examine_topics(lda_z15, dict_z15)
analyze_train_test_results(lda_z15, train_z15, test_z15)
analyze_overall_results(lda_z15, tweet_tokens_z15, dict_z15, corpus_z15)

# # further topic analysis
# topic_df_z15 = format_topics_sentences(lda_z15, corpus_z15, tweet_tokens_z15)
# get_dominant_topics(topic_df_z15)
# get_representative_docs(topic_df_z15)
# run_topic_analysis(topic_df_z15)

adding document #0 to Dictionary<0 unique tokens: []>


retweets:  0
errors:  0


adding document #10000 to Dictionary<13654 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...>
adding document #20000 to Dictionary<19862 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...>
built Dictionary<22136 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...> from 23653 documents (total 365268 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<22136 unique tokens: ['しよ', '在日コリアン', '廃止', '悪用', '詐欺事件']...> from 23653 documents (total 365268 corpus positions)", 'datetime': '2024-03-21T13:34:26.395820', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 16484 tokens: [('在日コリアン', 22775), ('とらえ', 2), ('人間なんて', 1), ('居心地', 3), ('良さ', 4), ('裏切る', 1), ('陣営', 3), ('ですか', 3), ('らいい', 2), ('さよなら絶望先生', 1)]...
keeping 5652 tokens which were in no less than 5 and no more than 4730 (=20.0%) documents
resulting dictionary: Dictionary<5

Topic #  1
ヘイトスピチ: 0.026580356061458588
差別: 0.011288809590041637
デモ: 0.010862398892641068
日本: 0.010672264732420444
参加: 0.009625867940485477
てる: 0.007963327690958977
川崎: 0.007405382581055164
戦後: 0.006753513123840094
抗議: 0.006167297717183828
規制: 0.00564454635605216


Topic #  2
デマ: 0.036443404853343964
ネット: 0.024918528273701668
強制送還: 0.02243289351463318
入管: 0.01901024393737316
否定: 0.015696583315730095
偏見: 0.015229578129947186
韓国: 0.012453993782401085
可能性: 0.012370530515909195
通報: 0.011880806647241116
保守速報: 0.011788868345320225


Topic #  3
通名: 0.07427722960710526
廃止: 0.035674963146448135
起こす: 0.030081236734986305
しよ: 0.02962517738342285
悪用: 0.02873629704117775
詐欺事件: 0.02856472320854664
万人: 0.012857712805271149
大半: 0.012087764218449593
強制: 0.01127065159380436
カジノ: 0.010475915856659412


Topic #  4
日本人: 0.025348545983433723
日本: 0.02373468689620495
在日: 0.019237900152802467
てる: 0.016113389283418655
弁護士: 0.01047326996922493
韓国: 0.009770265780389309
差別: 0.008688935078680515
協会: 0.0076047680340

IndexError: index 5 is out of bounds for axis 0 with size 5

In [None]:
examine_topics_plain(lda_z15, dict_z15)

Topic #  1
日本
必要
帰化
尊重
民団
本名
触れ
ネトウヨ
連中
人権


Topic #  2
韓国
判明
世
使う
自民党
在日特権
通名
強制
更新
圧力


Topic #  3
通名
起こす
しよ
廃止
悪用
詐欺事件
嫌悪感
万人
速報
密入国


Topic #  4
てる
デマ
偏見
日本人
ネット
日本
万人
笑
在日
入管


Topic #  5
ヘイトスピチ
被害
対象
万人
強制送還
共生
在住
多民族
報告書
センタ


Topic #  6
強制送還
支援
歴史
在日
在日コリアン犯罪
逮捕
日韓
重大
韓国籍
日本


Topic #  7
日本人
在日
弁護士
協会
多い
韓国人
日本
設立
朝鮮
セミナ


Topic #  8
日本
日本人
放題
いっ
ルル
マナ
しろ
韓国人
いい
帰化人


Topic #  9
ヘイトスピチ
日本
差別
韓国
問題
求め
裏
抗議
話題
行動


Topic #  10
年月日
大半
人権
カジノ
共生
強制
声
議員
多民族
子孫




In [244]:
visualize_topics(lda_z15, corpus_z15, dict_z15)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 13:30:08] "GET / HTTP/1.1" 200 -



stopping Server...


#### Ainu LDA

In [230]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/ainu_2022.txt"
tweet_tokens_a22, retweets_a22, error_a22 = preprocess_tokenize_all(txt_filename, "2022")
print("# retweets: ", len(retweets_a22))
print("# errors: ", len(error_a22))
save_to_csv(tweet_tokens_a22, "save_tokens_ainu_2022.csv")

# run lda
lda_a22, dict_a22, corpus_a22, train_corpus_a22, test_corpus_a22 = run_lda("ainu_2022", tweet_tokens_a22)

# analyze perplexity, coherence results
examine_topics(lda_a22, dict_a22)
analyze_train_test_results(lda_a22, train_corpus_a22, test_corpus_a22)
analyze_overall_results(lda_a22, tweet_tokens_a22, dict_a22, corpus_a22)

# further topic analysis
topic_df_a22 = format_topics_sentences(lda_a22, corpus_a22, tweet_tokens_a22)
get_dominant_topics(topic_df_a22)
run_topic_analysis(topic_df_a22)

Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
Retweet:  1768019931457823173
# retweets:  18
# errors:  0


adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<21877 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #20000 to Dictionary<31754 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #30000 to Dictionary<38819 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #40000 to Dictionary<44789 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #50000 to Dictionary<50058 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #60000 to Dictionary<54965 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #70000 to Dictionary<59376 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #80000 to Dictionary<64066 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #90000 to Dictionary<68150 unique tokens: ['しっかり', 'す', 'アイヌ文化', '世界', '何故']...>
adding document #100000 to Dictionary<71377 unique tokens: ['しっかり', 'す', 

Topic #  1
アイヌ新法: 0.024342553690075874
差別: 0.01508571207523346
アイヌ民族: 0.01453197281807661
杉田: 0.010564574040472507
投稿: 0.01046412531286478
北海道新聞: 0.009290744550526142
電子版: 0.008457372896373272
自民党: 0.00838231947273016
議員: 0.008260530419647694
日本: 0.00801127776503563


Topic #  2
北海道: 0.01752557046711445
センタ: 0.010692108422517776
説: 0.009975483641028404
伝承: 0.008545252494513988
推進: 0.0075366683304309845
伝説: 0.007015409879386425
研究: 0.006585562601685524
小さい: 0.005936234723776579
アイヌ民族: 0.005802185740321875
大きい: 0.0057772365398705006


Topic #  3
アイヌ民族: 0.027311652898788452
否定: 0.026683175936341286
氏: 0.023264173418283463
質問: 0.017315536737442017
協会: 0.012873600237071514
発言: 0.01034583430737257
政策: 0.009968404658138752
匿名: 0.00895823072642088
募集中: 0.00879293866455555
抗する: 0.008654167875647545


Topic #  4
神: 0.01948375068604946
熊: 0.009930484928190708
人間: 0.007974233478307724
世界: 0.006895607803016901
妖怪: 0.006607246585190296
アイヌ民族博物館: 0.00645563006401062
国立: 0.005948139354586601
カムイ: 0.00

-8.986 per-word bound, 506.9 perplexity estimate based on a held-out corpus of 160078 documents with 2298404 words


perplexity(train): 506.9021158545095 

# of words in test corpus:  248565


-9.356 per-word bound, 655.3 perplexity estimate based on a held-out corpus of 17786 documents with 248565 words


perplexity(test): 655.2505360055125


-8.951 per-word bound, 495.0 perplexity estimate based on a held-out corpus of 177864 documents with 2546969 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -8.951367862657467


7 accumulators retrieved from output queue
accumulated word occurrence stats for 177864 virtual documents



Coherence Score:  0.44420872028157704


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,6,"北海道, 日本人, アイヌ民族, 歴史, 差別, 文化, 日本, 和人, 民族, ウポポイ",7807.0,0.0439
1,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",5298.0,0.0298
2,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",6082.0,0.0342
3,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",6795.0,0.0382
4,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",8201.0,0.0461
...,...,...,...,...
177859,1,"北海道, センタ, 説, 伝承, 推進, 伝説, 研究, 小さい, アイヌ民族, 大きい",,
177860,9,"北海道, ロシア, 日本, 先住民族, アイヌ民族, 縄文人, 権利, プチン, 民族, 先住民",,
177861,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",,
177862,8,"てる, アイヌ文化, ゴルデンカムイ, 思う, いい, 思っ, カム, 言っ, 出, 読ん",,


In [None]:
lda_a22 = gensim.models.LdaModel.load("save_lda_model_ainu_2022")
examine_topics_plain(lda_a22, dict_a22)

In [240]:
# visualize topics (keep in separate cell so we can interrupt it)
visualize_topics(lda_a22, corpus_a22, dict_a22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [21/Mar/2024 13:04:40] "GET / HTTP/1.1" 200 -



stopping Server...


#### Haafu LDA

In [231]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/haafu_2022.txt"
tweet_tokens_h22, retweets_h22, error_h22 = preprocess_tokenize_all(txt_filename, "2022")
print("# retweets: ", len(retweets_h22))
print("# errors: ", len(error_h22))
save_to_csv(tweet_tokens_h22, "save_tokens_haafu_2022.csv")

# run lda
lda_h22, dict_h22, corpus_h22, train_corpus_h22, test_corpus_h22 = run_lda("haafu_2022", tweet_tokens_h22)

# analyze perplexity, coherence results
examine_topics(lda_h22, dict_h22)
analyze_train_test_results(lda_h22, train_corpus_h22, test_corpus_h22)
analyze_overall_results(lda_h22, tweet_tokens_h22, dict_h22, corpus_h22)

# further topic analysis
topic_df_h22 = format_topics_sentences(lda_h22, corpus_h22, tweet_tokens_h22)
get_dominant_topics(topic_df_h22)
run_topic_analysis(topic_df_h22)

adding document #0 to Dictionary<0 unique tokens: []>
adding document #10000 to Dictionary<25349 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...>


# retweets:  0
# errors:  0


adding document #20000 to Dictionary<37599 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...>
built Dictionary<45757 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...> from 28339 documents (total 336290 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<45757 unique tokens: ['ウエスト', 'コト', 'スリクォタ', 'ハフコト', 'ヒップ']...> from 28339 documents (total 336290 corpus positions)", 'datetime': '2024-03-17T22:29:17.599451', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 36383 tokens: [('スリクォタ', 2), ('ヒップ', 2), ('フィンガチップ', 1), ('着丈', 3), ('総称', 2), ('ツイゴル', 2), ('ハフ', 11515), ('東京国際', 3), ('コネクタ', 1), ('ハフピッチ', 3)]...
keeping 9374 tokens which were in no less than 5 and no more than 5667 (=20.0%) documents
resulting dictionary: Dictionary<9374 unique tokens: ['ウエスト', 'コト', 'ハフコト', '丈', '中間']...>
using autotuned alpha, starting with

Topic #  1
ハフアップ: 0.07281313836574554
ハフツイン: 0.04404336214065552
好き: 0.025774581357836723
てる: 0.024832043796777725
可愛い: 0.019529644399881363
すぎ: 0.01516166515648365
かわいい: 0.013462583534419537
髪: 0.012923511676490307
いい: 0.01252046786248684
髪型: 0.012137483805418015


Topic #  2
質問: 0.04679687321186066
募集中: 0.02925187535583973
匿名: 0.029094258323311806
てる: 0.02260010503232479
答え: 0.016371281817555428
質問箱: 0.015730256214737892
ハフハフ: 0.015709104016423225
娘: 0.014258287847042084
食べ: 0.013501825742423534
日本人: 0.011580631136894226


Topic #  3
楽天: 0.0230252668261528
無料: 0.02148028463125229
送料: 0.017413552850484848
メンズ: 0.013019255362451077
パンツ: 0.012619173154234886
位: 0.0124123003333807
価格: 0.011813780292868614
サイズ: 0.01099755521863699
人間: 0.01047302596271038
詳細: 0.009011128917336464


Topic #  4
今日: 0.014453435316681862
サイズ: 0.012845532968640327
動画: 0.009657390415668488
ニュハフ: 0.008679565973579884
ハフハフ: 0.008115176111459732
ハフスイング: 0.00776886148378253
食べ: 0.007615769747644663
お願い: 0.0067673833

-8.262 per-word bound, 307.0 perplexity estimate based on a held-out corpus of 25506 documents with 239923 words


perplexity(train): 307.0486954361905 

# of words in test corpus:  26137


-10.204 per-word bound, 1179.4 perplexity estimate based on a held-out corpus of 2833 documents with 26137 words


perplexity(test): 1179.4110394573133


-8.229 per-word bound, 299.9 perplexity estimate based on a held-out corpus of 28339 documents with 266060 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -8.228510922007473


7 accumulators retrieved from output queue
accumulated word occurrence stats for 28339 virtual documents



Coherence Score:  0.4443484265278492


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,9,"ハフパンツ, てる, 今日, 思っ, 笑, いい, 良い, 思い, 買っ, 練習",7594.0,0.2680
1,1,"質問, 募集中, 匿名, てる, 答え, 質問箱, ハフハフ, 娘, 食べ, 日本人",1754.0,0.0619
2,9,"ハフパンツ, てる, 今日, 思っ, 笑, いい, 良い, 思い, 買っ, 練習",1328.0,0.0469
3,9,"ハフパンツ, てる, 今日, 思っ, 笑, いい, 良い, 思い, 買っ, 練習",2162.0,0.0763
4,5,"てる, 思う, ニュハフ, ハフパンツ, やっ, 選手, 思っ, 出, いい, 言わ",1475.0,0.0520
...,...,...,...,...
28334,9,"ハフパンツ, てる, 今日, 思っ, 笑, いい, 良い, 思い, 買っ, 練習",,
28335,3,"今日, サイズ, 動画, ニュハフ, ハフハフ, ハフスイング, 食べ, お願い, 本日, ...",,
28336,7,"ハフバスデ, てる, 人間, 今日, くれ, 写真, 頑張っ, これから, 箱, もう",,
28337,9,"ハフパンツ, てる, 今日, 思っ, 笑, いい, 良い, 思い, 買っ, 練習",,


In [225]:
# visualize topics
visualize_topics(lda_h22, corpus_h22, dict_h22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [17/Mar/2024 21:52:24] "GET / HTTP/1.1" 200 -



stopping Server...


#### Ryukyujin LDA


In [232]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/ryukyujin_2022.txt"
tweet_tokens_r22, retweets_r22, error_r22 = preprocess_tokenize_all(txt_filename, "2022")
print("# retweets: ", len(retweets_r22))
print("# errors: ", len(error_r22))
save_to_csv(tweet_tokens_r22, "save_tokens_haafu_2022.csv")

# run lda
lda_r22, dict_r22, corpus_r22, train_corpus_r22, test_corpus_r22 = run_lda("haafu_2022", tweet_tokens_r22)

# analyze perplexity, coherence results
examine_topics(lda_r22, dict_r22)
analyze_train_test_results(lda_r22, train_corpus_r22, test_corpus_r22)
analyze_overall_results(lda_r22, tweet_tokens_r22, dict_r22, corpus_r22)

# further topic analysis
topic_df_r22 = format_topics_sentences(lda_r22, corpus_r22, tweet_tokens_r22)
get_dominant_topics(topic_df_r22)
run_topic_analysis(topic_df_r22)

Retweet:  1768019931457823173


adding document #0 to Dictionary<0 unique tokens: []>
built Dictionary<15354 unique tokens: ['ペジ', '下さい', '下さっ', '側', '先住民族']...> from 5454 documents (total 97145 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<15354 unique tokens: ['ペジ', '下さい', '下さっ', '側', '先住民族']...> from 5454 documents (total 97145 corpus positions)", 'datetime': '2024-03-17T22:30:52.402443', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 12652 tokens: [('ペジ', 3), ('下さっ', 3), ('史上', 2), ('尊厳回復', 1), ('尽力', 2), ('日本の裁判所', 1), ('準備書面', 1), ('球人', 4102), ('琉', 4088), ('琉球', 1698)]...
keeping 2702 tokens which were in no less than 5 and no more than 1090 (=20.0%) documents
resulting dictionary: Dictionary<2702 unique tokens: ['下さい', '側', '先住民族', '初めて', '原告']...>
using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
using symmetri

# retweets:  1
# errors:  0


optimized alpha [0.080226295, 0.08063252, 0.07893601, 0.09436197, 0.08410304, 0.08225085, 0.08049894, 0.08044456, 0.0857325, 0.08564178]
PROGRESS: pass 0, at document #4000/5454
optimized alpha [0.064977475, 0.06668024, 0.06390427, 0.092284895, 0.06977773, 0.06857757, 0.06736619, 0.066213205, 0.07612383, 0.08028803]
-9.158 per-word bound, 571.2 perplexity estimate based on a held-out corpus of 1454 documents with 17613 words
PROGRESS: pass 0, at document #5454/5454
optimized alpha [0.05509683, 0.05629865, 0.054669403, 0.08848841, 0.05937838, 0.06094875, 0.05653076, 0.05636918, 0.07119005, 0.068123445]
topic #2 (0.055): 0.024*"日本人" + 0.014*"てる" + 0.012*"日本" + 0.009*"言っ" + 0.006*"認め" + 0.006*"台湾" + 0.005*"主張" + 0.005*"民族" + 0.005*"アイヌ" + 0.005*"中国"
topic #0 (0.055): 0.016*"日本人" + 0.013*"差別" + 0.012*"沖縄県民" + 0.012*"日本" + 0.011*"米軍" + 0.007*"戦争" + 0.006*"思い" + 0.006*"我" + 0.005*"言語" + 0.004*"基地"
topic #9 (0.068): 0.056*"琉球独立" + 0.050*"庶民" + 0.049*"安彦" + 0.048*"本村" + 0.018*"年月日" + 0.017*"つぶ

Topic #  1
遺骨: 0.014384125359356403
日本人: 0.010956871323287487
返還: 0.009170127101242542
訴訟: 0.00893373228609562
松島泰勝: 0.007980979047715664
思い: 0.0071505033411085606
うちな: 0.006623159162700176
今日: 0.005932514555752277
差別: 0.005860426463186741
戦争: 0.005492880940437317


Topic #  2
在日: 0.02401256561279297
日本: 0.023393813520669937
アイヌ: 0.014407451264560223
事件: 0.011774109676480293
血: 0.011291403323411942
台湾: 0.009761864319443703
住み: 0.00890452228486538
街: 0.008604373782873154
上位: 0.007893682457506657
ランキング: 0.007890650071203709


Topic #  3
日本人: 0.02994367852807045
てる: 0.01917760819196701
日本: 0.01186649315059185
認め: 0.011756932362914085
主張: 0.009721804410219193
言っ: 0.009475497528910637
先住民族: 0.008504999801516533
国連: 0.007528760004788637
琉球民族: 0.0071410974487662315
民族: 0.006876377388834953


Topic #  4
アイヌ: 0.03572263941168785
日本人: 0.03224308043718338
中国: 0.028558967635035515
日本: 0.02615533024072647
てる: 0.01622549071907997
独立: 0.014192627742886543
遺骨: 0.01088765449821949
独立派: 0.00981696788221

-6.972 per-word bound, 125.5 perplexity estimate based on a held-out corpus of 4909 documents with 56585 words
-9.979 per-word bound, 1009.2 perplexity estimate based on a held-out corpus of 545 documents with 6767 words


perplexity(train): 125.54114066103509 

# of words in test corpus:  6767
perplexity(test): 1009.2370984400651


-6.938 per-word bound, 122.6 perplexity estimate based on a held-out corpus of 5454 documents with 63352 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -6.93818099722492


7 accumulators retrieved from output queue
accumulated word occurrence stats for 5454 virtual documents



Coherence Score:  0.30095911457721186


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,0,"遺骨, 日本人, 返還, 訴訟, 松島泰勝, 思い, うちな, 今日, 差別, 戦争",430.0,0.0788
1,0,"遺骨, 日本人, 返還, 訴訟, 松島泰勝, 思い, うちな, 今日, 差別, 戦争",311.0,0.0570
2,9,"琉球独立, 庶民, 本村, 安彦, 年月日, つぶやき, 玉城, 知事, デニ, 記",449.0,0.0823
3,1,"在日, 日本, アイヌ, 事件, 血, 台湾, 住み, 街, 上位, ランキング",1058.0,0.1940
4,1,"在日, 日本, アイヌ, 事件, 血, 台湾, 住み, 街, 上位, ランキング",222.0,0.0407
...,...,...,...,...
5449,1,"在日, 日本, アイヌ, 事件, 血, 台湾, 住み, 街, 上位, ランキング",,
5450,9,"琉球独立, 庶民, 本村, 安彦, 年月日, つぶやき, 玉城, 知事, デニ, 記",,
5451,3,"アイヌ, 日本人, 中国, 日本, てる, 独立, 遺骨, 独立派, 氏, 縄文人",,
5452,3,"アイヌ, 日本人, 中国, 日本, てる, 独立, 遺骨, 独立派, 氏, 縄文人",,


In [226]:
# visualize topics
visualize_topics(lda_r22, corpus_r22, dict_r22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [17/Mar/2024 21:59:58] "GET / HTTP/1.1" 200 -



stopping Server...
