### Setup

In [116]:
# general
import numpy as np
import pandas as pd
import csv

# tokenization
import json
import MeCab
import import_ipynb
import thesis_preprocess
from stopwords_ja import stop_words
from stopwords_slothlib import stop_words_2

# lda topic modeling
import gensim, logging
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim

In [55]:
# https://lda.readthedocs.io/en/latest/getting_started.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://github.com/deankuo/Japan-Manifesto-Classification/blob/main/topic_modeling.ipynb
# https://github.com/m3yrin/NTM/blob/master/LDA_jp.ipynb
# https://tdual.hatenablog.com/entry/2018/04/09/133000#1LDA%E3%81%AE%E5%89%8D%E3%81%AB%E3%83%88%E3%83%94%E3%83%83%E3%82%AF%E3%83%A2%E3%83%87%E3%83%AB%E3%81%A8%E3%81%AF

### Preprocessing and Tokenization

In [141]:
# tokenize cleaned tweets into words
def tokenize(text):
    mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    parsed = mt.parseToNode(text)
    components = []
    
    while parsed:
        word = parsed.surface
        pos = parsed.feature.split(",")[0]

        # for lda, we only want nouns, verbs, adjectives
        include_pos = ["名詞", "動詞", "形容詞", "副詞"]
        if pos in include_pos: components.append(word)
        parsed = parsed.next
    
    # remove stopwords
    components = [token for token in components if ((not token in stop_words) and (not token in stop_words_2))]
    
    return components

In [142]:
# run preprocessing and tokenization for tweets from given .txt file
def preprocess_tokenize_all(filename, year):
    # store results and exception tweets
    tokens = []
    retweets = []
    not_parsed = []

    # iterate through tweets, preprocess and tokenize
    with open(filename, 'r') as file:
        for line in file:
            tweet = json.loads(line)
            if line == None or tweet == None:
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            # filter out 2024 sponsored(?) tweets
            elif int(tweet['date'].split("-")[0]) < int(year) + 1: 
                tweet_text = tweet['rawContent'] # note: need other prop for over 140 char?
                processed = preprocess(tweet_text)            
                components = tokenize(processed)
                tokens.append(components)

    file.close()
    return tokens, retweets, not_parsed

In [12]:
# # run for 2015
# tokens_2015, retweets_2015, not_parsed_2015 = preprocess_tokenize_all("2015")

# # did we get retweets or errors?
# print(len(retweets_2015))
# print(len(not_parsed_2015))

In [None]:
# run for 2022
tweet_tokens_2022, retweets_2022, not_parsed_2022 = preprocess_tokenize_all("2022")

# did we get retweets or errors?
print(len(retweets_2022))
print(len(not_parsed_2022))

In [57]:
# pass in filename you want to save data as, including '.csv'
def save_to_csv(tweet_tokens, filename):
    f = open(filename, 'w')
    writer = csv.writer(f)
    for tweet in tweet_tokens:
        writer.writerow(tweet)
    f.close()

In [58]:
# pass in filename of csv you want to load, including '.csv'
def load_from_csv(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        tweet_tokens = list(reader)
    return tweet_tokens

In [85]:
# train and save word2vec model for given year
# data_name is id suffix to save lda model
def run_lda(data_name, tweet_tokens, no_below=2):
    # set up dictionary
    dict = corpora.Dictionary(tweet_tokens)
    dict.filter_extremes(no_below)
    dict.compactify()

    # set up corpus
    corpus = [dict.doc2bow(w) for w in tweet_tokens]
    test_size = int(len(corpus) * 0.1)
    test_corpus = corpus[:test_size]
    train_corpus = corpus[test_size:]   

    # train and save lda model
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                          id2word=dict, 
                                          num_topics=5, 
                                          random_state=100, 
                                          passes=10, 
                                          update_every=3, 
                                          alpha=0.05,
                                          per_word_topics=True)
    lda.save("save_lda_model_" + data_name)
    
    return lda, dict, corpus, train_corpus, test_corpus

In [42]:
lda_2022_gen = gensim.models.LdaModel.load("save_lda_model_2022")

loading LdaModel object from thesis_lda_model_2022
loading expElogbeta from thesis_lda_model_2022.expElogbeta.npy with mmap=None
setting ignored attribute id2word to None
setting ignored attribute dispatcher to None
setting ignored attribute state to None
LdaModel lifecycle event {'fname': 'thesis_lda_model_2022', 'datetime': '2024-03-17T10:28:38.384616', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}
loading LdaState object from thesis_lda_model_2022.state
LdaState lifecycle event {'fname': 'thesis_lda_model_2022.state', 'datetime': '2024-03-17T10:28:38.407748', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}


In [102]:
def examine_topics(lda, dict):
    for topic in range(5):
        print("Topic # ",(topic+1))
        for t in lda.get_topic_terms(topic):
            print("{}: {}".format(dict[t[0]], t[1]))
        print("\n")

In [50]:
# def analyze_train_test_results(lda, train_corpus, test_corpus):
#     # look at train set results
#     N = sum(count for doc in train_corpus for _, count in doc)
#     print("# of words in train corpus: ",N)
#     perplexity = np.exp2(-lda.log_perplexity(train_corpus))
#     print("perplexity(train):", perplexity,"\n")

#     # look at test set results
#     N = sum(count for doc in test_corpus for _, count in doc)
#     print("# of words in test corpus: ",N)
#     perplexity = np.exp2(-lda.log_perplexity(test_corpus))
#     print("perplexity(test):", perplexity)

In [105]:
def analyze_overall_results(lda, tweet_tokens, dict, corpus):
    # look at overall perplexity and coherence score
    print('\nPerplexity: ', lda.log_perplexity(corpus))     # lower is better

    coherence_model_lda = CoherenceModel(model=lda, texts=tweet_tokens, dictionary=dict, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)     # higher is better

In [123]:
def visualize_topics(lda, corpus, dict):
    pyLDAvis.enable_notebook(local=True)
    vis = pyLDAvis.gensim.prepare(lda, corpus, dict)
    pyLDAvis.show(vis, local=False)

In [136]:
for i, row in enumerate(lda_z22[corpus_z22]):
    print(row)

([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])
([(0, 0.2), (1, 0.2), (2

In [137]:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def format_topics_sentences(lda, corpus, tweet_tokens):
    sent_topics_df = pd.DataFrame()

    # get main topic in each document
    for i, row in enumerate(lda[corpus]):
        print(len(row))
        print(row)
        # row = sorted(row, key=lambda x: (x[0]), reverse=True)
        # for each document, get dominant topic, perc contribution and keywords 
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = lda.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([sent_topics_df,pd.Series([int(topic_num), round(prop_topic,4), topic_keywords])], ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # add original text to the end of the output
    contents = pd.Series(tweet_tokens)
    # sent_topics_df = contents

    # TODO: debug this line
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [138]:
format_topics_sentences(lda_z22, corpus_z22, tweet_tokens_z22)

3
([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], [], [])


ValueError: too many values to unpack (expected 2)

In [None]:
# TODO: run this once we collect a keyword dataset

df_topic_sents_keywords = format_topics_sentences(lda, corpus, tweet_tokens)

# format and show
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
# group top 5 sentences under each topic
sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# reset index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# format and show
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
sent_topics_sorteddf.head()

In [110]:
def topic_analysis():
    # num documents per topic
    topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

    # percentage documents for each topic
    topic_contribution = round(topic_counts/topic_counts.sum(), 4)

    # topic number, keywords
    topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

    # concatenate column-wise
    df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

    # add column names
    df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

    # show
    df_dominant_topics

#### Zainichi Korean LDA

In [97]:
# preprocess and tokenize
txt_filename = "datasets_minority_groups/zainichi_2022.txt"
tweet_tokens_z22, retweets_z22, error_z22 = preprocess_tokenize_all(txt_filename, "2022")
# did we get retweets or errors?
print("retweets: ", len(retweets_z22))
print("errors: ", len(error_z22))
# save tokens to csv
save_to_csv(tweet_tokens_z22, "save_tokens_zainichi_2022.csv")

retweets:  0
errors:  0


In [99]:
len(tweet_tokens_z22)

5015

In [100]:
# run lda
lda_z22, dict_z22, corpus_z22, train_corpus_z22, test_corpus_z22 = run_lda("zainichi_2022", tweet_tokens_z22)

adding document #0 to Dictionary<0 unique tokens: []>
built Dictionary<33 unique tokens: ['ごはん', 'まだ', 'もう', 'クラ', 'ノ']...> from 5015 documents (total 259 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<33 unique tokens: ['ごはん', 'まだ', 'もう', 'クラ', 'ノ']...> from 5015 documents (total 259 corpus positions)", 'datetime': '2024-03-17T11:37:23.795769', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}
discarding 0 tokens: []...
keeping 33 tokens which were in no less than 2 and no more than 2507 (=50.0%) documents


resulting dictionary: Dictionary<33 unique tokens: ['ごはん', 'まだ', 'もう', 'クラ', 'ノ']...>
using symmetric eta at 0.2
using serial LDA version on this node
running online (multi-pass) LDA training, 5 topics, 10 passes over the supplied corpus of 5015 documents, updating model once every 5015 documents, evaluating perplexity every 5015 documents, iterating 50x with a convergence threshold of 0.001000
PROGRESS: pass 0, at document #2000/5015
PROGRESS: pass 0, at document #4000/5015
  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
-inf per-word bound, inf perplexity estimate based on a held-out corpus of 1015 documents with 0 words
PROGRESS: pass 0, at document #5015/5015
topic #0 (0.050): 0.064*"商品" + 0.053*"食べ" + 0.049*"土鍋" + 0.047*"レンジ" + 0.034*"もう" + 0.032*"匠" + 0.031*"詳しく" + 0.031*"本格" + 0.030*"得" + 0.030*"待ち"
topic #1 (0.050): 0.030*"本格" + 0.030*"炊飯" + 0.030*"待ち" + 0.030*"得" + 0.030*"忙しい" + 0.030*"戻れ" + 0.030*"最大" + 0.030*"大人気" + 0.03

In [106]:
# analyze results
examine_topics(lda_z22, dict_z22)
analyze_overall_results(lda_z22, tweet_tokens_z22, dict_z22, corpus_z22)

Topic #  1
商品: 0.037908896803855896
食べ: 0.03536500409245491
土鍋: 0.0345044881105423
レンジ: 0.03415774181485176
もう: 0.031062912195920944
匠: 0.030673755332827568
詳しく: 0.030478989705443382
本格: 0.030354492366313934
得: 0.03022005595266819
待ち: 0.030203618109226227


Topic #  2
本格: 0.03030303120613098
炊飯: 0.03030303120613098
待ち: 0.03030303120613098
得: 0.03030303120613098
忙しい: 0.03030303120613098
戻れ: 0.03030303120613098
最大: 0.03030303120613098
大人気: 0.03030303120613098
炊き: 0.03030303120613098
立て: 0.03030303120613098


Topic #  3
レンジ: 0.032560113817453384
商品: 0.03235068917274475
食べ: 0.0320434533059597
土鍋: 0.031530991196632385
平日: 0.03083186224102974
まだ: 0.030672764405608177
ノ: 0.030432678759098053
大人気: 0.030331376940011978
立て: 0.03032490983605385
もう: 0.030300559476017952


Topic #  4
商品: 0.030423611402511597
土鍋: 0.030376996845006943
食べ: 0.030365031212568283
ごはん: 0.03035399317741394
炊飯: 0.030351746827363968
レンジ: 0.030349384993314743
伊賀: 0.03032645210623741
大人気: 0.030315352603793144
口コミ: 0.0303121916

-3.796 per-word bound, 13.9 perplexity estimate based on a held-out corpus of 5015 documents with 259 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -3.7960171602867745


7 accumulators retrieved from output queue
accumulated word occurrence stats for 5015 virtual documents



Coherence Score:  1.0000000000000004


In [125]:
visualize_topics(lda_z22, corpus_z22, dict_z22)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [17/Mar/2024 19:35:19] "GET / HTTP/1.1" 200 -



stopping Server...


In [129]:
format_topics_sentences(lda_z22, corpus_z22, tweet_tokens_z22)

IndexError: list index out of range

In [140]:
[[(dict_z22[id], freq) for id, freq in cp] for cp in corpus_z22[:2]]


[[], []]

In [143]:
print(corpus_z22[:1])


[[]]


In [144]:
id2word = corpora.Dictionary(tweet_tokens_z22)
texts = tweet_tokens_z22
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])



adding document #0 to Dictionary<0 unique tokens: []>
built Dictionary<33 unique tokens: ['ごはん', 'まだ', 'もう', 'クラ', 'ノ']...> from 5015 documents (total 259 corpus positions)
Dictionary lifecycle event {'msg': "built Dictionary<33 unique tokens: ['ごはん', 'まだ', 'もう', 'クラ', 'ノ']...> from 5015 documents (total 259 corpus positions)", 'datetime': '2024-03-17T19:49:26.754028', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'created'}


[[]]
