### Setup

In [1]:
# general
import numpy as np
import pandas as pd
import csv

# tokenization
import json
import MeCab
import demoji
import mojimoji
import re
from stopwords_ja import stop_words
from stopwords_slothlib import stop_words_2

# lda topic modeling
import gensim, logging
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim

### Preprocessing and Tokenization

In [None]:
# preprocess tweet content
def preprocess(text):    
    # from https://colab.research.google.com/drive/1bX-JyY4xmCm_RFkJg3QNcthUvEJaBghP
    # handle half-width/full-width chars, jp punctuation
    text = text.lower()
    text = mojimoji.zen_to_han(text, kana=False)
    text = mojimoji.han_to_zen(text, digit=False, ascii=False)
    text = text.translate(str.maketrans({
        '!': '！', '"': '”', '#': '＃', '$': '＄', '%': '％', '&': '＆', '\'': '’',
        '(': '（', ')': '）', '*': '＊', '+': '＋', ',': '，', '-': '−', '.': '．',
        '/': '／', ':': '：', ';': '；', '<': '＜', '=': '＝', '>': '＞', '?': '？',
        '@': '＠', '[': '［', '\\': '＼', ']': '］', '^': '＾', '_': '＿', '`': '｀',
        '{': '｛', '|': '｜', '}': '｝'
        }))
    zenkaku_leftsingle = b'\xe2\x80\x98'.decode('utf-8')
    text = re.sub('[’´｀]', zenkaku_leftsingle, text)
    
    # remove twitter-specific strings (handles, hashtags, etc.)
    text = re.sub("@([a-zA-Z0-9_]+)", "", text)
    text = re.sub("#([a-zA-Z0-9_ぁ-んァ-ン一-龠]+)", "", text)
    text = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", text)

    # remove emojis
    text = demoji.replace(text, "")
    text = re.sub("([\uD83E-\uD83E])+", "", text)

    # remove punctuation and whitespace
    text = re.sub("([^一-龯ぁ-んァ-ン])+","",text)  
    text = re.sub("(\s)+", "", text)

    return text

In [10]:
# tokenize cleaned tweets into words
def tokenize(text):
    mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    parsed = mt.parseToNode(text)
    components = []
    
    while parsed:
        word = parsed.surface
        pos = parsed.feature.split(",")[0]

        # for lda, we only want nouns, verbs, adjectives
        include_pos = ["名詞", "動詞", "形容詞"]
        if pos in include_pos: components.append(word)
        parsed = parsed.next
    
    # remove stopwords
    components = [token for token in components if ((not token in stop_words) and (not token in stop_words_2))]
    
    return components

In [11]:
# run preprocessing and tokenization for all tweets from given year dataset
def preprocess_tokenize_all(year):
    # store results and exception tweets
    token_tweets = []
    retweets = []
    not_parsed = []

    # iterate through tweets, preprocess and tokenize
    with open(year + '-all.txt', 'r') as file:
        for line in file:
            tweet = json.loads(line)
            if line == None or tweet == None:
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            else: 
                tweet_text = tweet['rawContent']
                processed = preprocess(tweet_text)            
                components = tokenize(processed)
                if len(components) > 0: token_tweets.append(components)

    file.close()
    return token_tweets, retweets, not_parsed

In [12]:
# # run for 2015
# tokens_2015, retweets_2015, not_parsed_2015 = preprocess_tokenize_all("2015")

# # did we get retweets or errors?
# print(len(retweets_2015))
# print(len(not_parsed_2015))

In [None]:
# run for 2022
token_tweets_2022, retweets_2022, not_parsed_2022 = preprocess_tokenize_all("2022")

# did we get retweets or errors?
print(len(retweets_2022))
print(len(not_parsed_2022))

In [None]:
# save tokenized tweets to csv
myFile = open('token_tweets_2022.csv', 'w')
writer = csv.writer(myFile)
for tweet in token_tweets_2022:
    writer.writerow(tweet)
myFile.close()

In [14]:
# load tokenized tweets from csv
with open('token_tweets_2022.csv', newline='') as f:
    reader = csv.reader(f)
    token_tweets_2022 = list(reader)

In [55]:
# https://lda.readthedocs.io/en/latest/getting_started.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://github.com/deankuo/Japan-Manifesto-Classification/blob/main/topic_modeling.ipynb
# https://github.com/m3yrin/NTM/blob/master/LDA_jp.ipynb
# https://tdual.hatenablog.com/entry/2018/04/09/133000#1LDA%E3%81%AE%E5%89%8D%E3%81%AB%E3%83%88%E3%83%94%E3%83%83%E3%82%AF%E3%83%A2%E3%83%87%E3%83%AB%E3%81%A8%E3%81%AF

In [15]:
# set up dictionary
d = corpora.Dictionary(token_tweets_2022)
d.filter_extremes(no_below=5, no_above=0.2)
d.compactify()

In [16]:
# set up corpus
corpus = [d.doc2bow(w) for w in token_tweets_2022]
test_size = int(len(corpus) * 0.1)
test_corpus = corpus[:test_size]
train_corpus = corpus[test_size:]

In [18]:
logging.basicConfig(format='%(message)s', level=logging.INFO)
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=d, num_topics=20, passes=10, update_every=5)

using symmetric alpha at 0.05
using symmetric eta at 0.05
using serial LDA version on this node
running online (multi-pass) LDA training, 20 topics, 10 passes over the supplied corpus of 530882 documents, updating model once every 10000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
PROGRESS: pass 0, at document #2000/530882
PROGRESS: pass 0, at document #4000/530882
PROGRESS: pass 0, at document #6000/530882
PROGRESS: pass 0, at document #8000/530882
PROGRESS: pass 0, at document #10000/530882
merging changes from 10000 documents into a model of 530882 documents
topic #9 (0.050): 0.019*"今日" + 0.015*"ガチャ" + 0.015*"開催" + 0.010*"月日" + 0.008*"三連休" + 0.008*"読ん" + 0.008*"作品" + 0.007*"当たる" + 0.007*"対象" + 0.007*"てる"
topic #3 (0.050): 0.022*"てる" + 0.012*"ください" + 0.007*"今日" + 0.007*"おはよう" + 0.005*"参加" + 0.004*"み" + 0.004*"すぎ" + 0.004*"くれ" + 0.004*"好き" + 0.004*"フォロ"
topic #18 (0.050): 0.019*"今日" + 0.008*"フォロ" + 0.007*"良い" + 0.006*"お

In [20]:
lda.save("thesis_lda_model_2022")

LdaState lifecycle event {'fname_or_handle': 'thesis_lda_model_2022.state', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-03-12T15:47:01.962818', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'saving'}
saved thesis_lda_model_2022.state
LdaModel lifecycle event {'fname_or_handle': 'thesis_lda_model_2022', 'separately': "['expElogbeta', 'sstats']", 'sep_limit': 10485760, 'ignore': ['id2word', 'dispatcher', 'state'], 'datetime': '2024-03-12T15:47:02.069930', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'saving'}
storing np array 'expElogbeta' to thesis_lda_model_2022.expElogbeta.npy
not storing attribute id2word
not storing attribute dispatcher
not storing attribute state
saved thesis_lda_model_2022


In [21]:
lda = gensim.models.LdaModel.load("thesis_lda_model_2022")

loading LdaModel object from thesis_lda_model_2022
loading expElogbeta from thesis_lda_model_2022.expElogbeta.npy with mmap=None
setting ignored attribute id2word to None
setting ignored attribute dispatcher to None
setting ignored attribute state to None
LdaModel lifecycle event {'fname': 'thesis_lda_model_2022', 'datetime': '2024-03-12T15:47:36.007364', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}
loading LdaState object from thesis_lda_model_2022.state
LdaState lifecycle event {'fname': 'thesis_lda_model_2022.state', 'datetime': '2024-03-12T15:47:36.014890', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'loaded'}


In [22]:
# examine topics
def get_topic_words(topic_id):
    for t in lda.get_topic_terms(topic_id):
        print("{}: {}".format(d[t[0]], t[1]))

for t in range(10):
    print("Topic # ",t)
    get_topic_words(t)
    print("\n")

Topic #  0
やっ: 0.026799719780683517
日本: 0.0246716421097517
てる: 0.02125527523458004
思う: 0.013306519016623497
め: 0.01292614359408617
無い: 0.012509232386946678
神: 0.011579198762774467
世界: 0.01125407312065363
幸せ: 0.010198567993938923
問題: 0.009188718162477016


Topic #  1
好き: 0.09288687258958817
食べ: 0.0533093623816967
っ: 0.047793470323085785
友達: 0.021909410133957863
作っ: 0.015416267327964306
美味しい: 0.014767379499971867
東京: 0.014110102318227291
紹介: 0.013756996020674706
てる: 0.012277146801352501
ポイント: 0.012270711362361908


Topic #  2
結果: 0.044579848647117615
ハズレ: 0.038142524659633636
チャレンジ: 0.035022806376218796
参加: 0.029096540063619614
キャンペン: 0.028365587815642357
可能: 0.02823077328503132
情報: 0.02562461420893669
残念: 0.025255398824810982
応募: 0.024196477606892586
今年: 0.023142917081713676


Topic #  3
写真: 0.040551379323005676
顔: 0.021795472130179405
繋がり: 0.01668873243033886
大事: 0.016189636662602425
大人: 0.014856521971523762
綺麗: 0.013941311277449131
凄い: 0.012946315109729767
フォロワ: 0.012082763016223907
な

In [23]:
# look at train set results
N = sum(count for doc in train_corpus for _, count in doc)
print("# of words in train corpus: ",N)
perplexity = np.exp2(-lda.log_perplexity(train_corpus))
print("perplexity(train):", perplexity,"\n")

# look at test set results
N = sum(count for doc in test_corpus for _, count in doc)
print("# of words in test corpus: ",N)
perplexity = np.exp2(-lda.log_perplexity(test_corpus))
print("perplexity(test):", perplexity)

# of words in train corpus:  3486641


-12.953 per-word bound, 7927.5 perplexity estimate based on a held-out corpus of 477794 documents with 3486641 words


perplexity(train): 7927.479605518746 

# of words in test corpus:  395707


-11.934 per-word bound, 3913.1 perplexity estimate based on a held-out corpus of 53088 documents with 395707 words


perplexity(test): 3913.081819537073


In [120]:
# look at overall perplexity and coherence score
print('\nPerplexity: ', lda.log_perplexity(corpus))     # lower is better

coherence_model_lda = CoherenceModel(model=lda, texts=token_tweets_2022, dictionary=d, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)     # higher is better

-12.785 per-word bound, 7056.6 perplexity estimate based on a held-out corpus of 530882 documents with 3882348 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -12.784766148268396


7 accumulators retrieved from output queue
accumulated word occurrence stats for 553717 virtual documents



Coherence Score:  0.4920924969549903


In [24]:
# visualize topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, d)

In [None]:
vis

In [31]:
pyLDAvis.gensim.prepare(lda, corpus, d)

In [36]:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def format_topics_sentences(ldamodel=lda, corpus=corpus, texts=token_tweets_2022):
    sent_topics_df = pd.DataFrame()

    # get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # for each document, get dominant topic, perc contribution and keywords 
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([sent_topics_df,pd.Series([int(topic_num), round(prop_topic,4), topic_keywords])], ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus, texts=token_tweets_2022)

# format and show
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
# group top 5 sentences under each topic
sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# reset index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# format and show
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
sent_topics_sorteddf.head()

In [None]:
# num documents per topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# percentage documents for each topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# topic number, keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# concatenate column-wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# add column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# show
df_dominant_topics