### Setup

In [102]:
# general
import numpy as np
import pandas as pd
import csv

# tokenization
import json
import MeCab
import demoji
import mojimoji
import re
from stopwords_ja import stop_words
from stopwords_slothlib import stop_words_2

# lda topic modeling
import gensim, logging
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim

In [99]:
import csv

### Preprocessing and Tokenization

In [2]:
# preprocess tweet content
def preprocess(text):    
    # from https://colab.research.google.com/drive/1bX-JyY4xmCm_RFkJg3QNcthUvEJaBghP
    # handle half-width/full-width chars, jp punctuation
    text = text.lower()
    text = mojimoji.zen_to_han(text, kana=False)
    text = mojimoji.han_to_zen(text, digit=False, ascii=False)
    text = text.translate(str.maketrans({
        '!': '！', '"': '”', '#': '＃', '$': '＄', '%': '％', '&': '＆', '\'': '’',
        '(': '（', ')': '）', '*': '＊', '+': '＋', ',': '，', '-': '−', '.': '．',
        '/': '／', ':': '：', ';': '；', '<': '＜', '=': '＝', '>': '＞', '?': '？',
        '@': '＠', '[': '［', '\\': '＼', ']': '］', '^': '＾', '_': '＿', '`': '｀',
        '{': '｛', '|': '｜', '}': '｝'
        }))
    zenkaku_leftsingle = b'\xe2\x80\x98'.decode('utf-8')
    text = re.sub('[’´｀]', zenkaku_leftsingle, text)
    
    # remove twitter-specific strings (handles, hashtags, etc.)
    text = re.sub("@([a-zA-Z0-9_]+)", "", text)
    text = re.sub("#([a-zA-Z0-9_ぁ-んァ-ン一-龠]+)", "", text)
    text = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", text)

    # remove emojis
    text = demoji.replace(text, "")
    text = re.sub("([\uD83E-\uD83E])+", "", text)

    # remove punctuation and whitespace
    text = re.sub("([^一-龯ぁ-んァ-ン])+","",text)  
    text = re.sub("(\s)+", "", text)

    return text

In [3]:
# tokenize cleaned tweets into words
def tokenize(text):
    mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    parsed = mt.parseToNode(text)
    components = []
    
    while parsed:
        word = parsed.surface
        pos = parsed.feature.split(",")[0]

        # for lda, we only want nouns, verbs, adjectives
        include_pos = ["名詞", "動詞", "形容詞"]
        if pos in include_pos: components.append(word)
        parsed = parsed.next
    
    # remove stopwords
    components = [token for token in components if ((not token in stop_words) and (not token in stop_words_2))]
    
    return components

In [109]:
# run preprocessing and tokenization for all tweets from given year dataset
def preprocess_tokenize_all(year):
    # store results and exception tweets
    token_tweets = []
    retweets = []
    not_parsed = []

    # iterate through tweets, preprocess and tokenize
    with open(year + '-all.txt', 'r') as file:
        for line in file:
            tweet = json.loads(line)
            if line == None or tweet == None:
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            else: 
                tweet_text = tweet['rawContent']
                processed = preprocess(tweet_text)            
                components = tokenize(processed)
                if len(components) > 0: token_tweets.append(components)

    file.close()
    return token_tweets, retweets, not_parsed

In [None]:
# # run for 2015
# tokens_2015, retweets_2015, not_parsed_2015 = preprocess_tokenize_all("2015")

# # did we get retweets or errors?
# print(len(retweets_2015))
# print(len(not_parsed_2015))

In [110]:
# run for 2022
token_tweets_2022, retweets_2022, not_parsed_2022 = preprocess_tokenize_all("2022")

# did we get retweets or errors?
print(len(retweets_2022))
print(len(not_parsed_2022))

0
0


In [111]:
# save tokenized tweets to csv
myFile = open('token_tweets_2022.csv', 'w')
writer = csv.writer(myFile)
for tweet in token_tweets_2022:
    writer.writerow(tweet)
myFile.close()

In [112]:
# load tokenized tweets from csv
with open('token_tweets_2022.csv', newline='') as f:
    reader = csv.reader(f)
    token_tweets_2022 = list(reader)

In [55]:
# https://lda.readthedocs.io/en/latest/getting_started.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://github.com/deankuo/Japan-Manifesto-Classification/blob/main/topic_modeling.ipynb
# https://github.com/m3yrin/NTM/blob/master/LDA_jp.ipynb
# https://tdual.hatenablog.com/entry/2018/04/09/133000#1LDA%E3%81%AE%E5%89%8D%E3%81%AB%E3%83%88%E3%83%94%E3%83%83%E3%82%AF%E3%83%A2%E3%83%87%E3%83%AB%E3%81%A8%E3%81%AF

In [113]:
# set up dictionary
d = corpora.Dictionary(token_tweets_2022)
d.filter_extremes(no_below=5, no_above=0.2)
d.compactify()

adding document #0 to Dictionary<0 unique tokens: []>


adding document #10000 to Dictionary<17071 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #20000 to Dictionary<21964 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #30000 to Dictionary<33320 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #40000 to Dictionary<41782 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #50000 to Dictionary<44866 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #60000 to Dictionary<47029 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #70000 to Dictionary<50158 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #80000 to Dictionary<53829 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #90000 to Dictionary<55227 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #100000 to Dictionary<60494 unique tokens: ['め', 'いいね', 'い人', 'くれる', 'ぱい']...>
adding document #110000 to Dictionary<66313 unique tokens: 

In [114]:
# set up corpus
corpus = [d.doc2bow(w) for w in token_tweets_2022]
test_size = int(len(corpus) * 0.1)
test_corpus = corpus[:test_size]
train_corpus = corpus[test_size:]

In [115]:
print(corpus[:1])

[[(0, 1)]]


In [116]:
[[(d[i], freq) for i, freq in doc] for doc in corpus[:1]]

[[('め', 1)]]

In [117]:
logging.basicConfig(format='%(message)s', level=logging.INFO)
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=d, num_topics=20, passes=10, update_every=5)

using symmetric alpha at 0.05
using symmetric eta at 0.05
using serial LDA version on this node
running online (multi-pass) LDA training, 20 topics, 10 passes over the supplied corpus of 530882 documents, updating model once every 10000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
PROGRESS: pass 0, at document #2000/530882
PROGRESS: pass 0, at document #4000/530882
PROGRESS: pass 0, at document #6000/530882
PROGRESS: pass 0, at document #8000/530882
PROGRESS: pass 0, at document #10000/530882
merging changes from 10000 documents into a model of 530882 documents
topic #0 (0.050): 0.016*"今日" + 0.013*"おは" + 0.009*"思う" + 0.007*"いい" + 0.005*"てる" + 0.005*"朝" + 0.004*"元気" + 0.004*"起き" + 0.004*"凸" + 0.004*"おはよう"
topic #2 (0.050): 0.011*"今日" + 0.010*"良い" + 0.006*"ください" + 0.006*"てる" + 0.006*"行く" + 0.004*"来" + 0.004*"子供" + 0.004*"くれ" + 0.004*"食べ" + 0.004*"フォロ"
topic #16 (0.050): 0.018*"今日" + 0.009*"凸" + 0.009*"てる" + 0.006*"くれ" + 0.

In [None]:
# examine topics
def get_topic_words(topic_id):
    for t in lda.get_topic_terms(topic_id):
        print("{}: {}".format(d[t[0]], t[1]))

for t in range(10):
    print("Topic # ",t)
    get_topic_words(t)
    print("\n")

Topic #  0
参加: 0.05125609412789345
ください: 0.04141366109251976
結果: 0.035887837409973145
キャンペン: 0.03277701139450073
応募: 0.02719801478087902
繋がり: 0.027086764574050903
抽選: 0.02519279159605503
当たる: 0.023572368547320366
いただき: 0.021671734750270844
チャンス: 0.01977124996483326


Topic #  1
おは: 0.13742290437221527
くん: 0.08873689919710159
忘れ: 0.01844465173780918
やばい: 0.01704743504524231
買い: 0.011870494112372398
分かる: 0.011627483181655407
っぽい: 0.010741936974227428
休ん: 0.009681729599833488
一生: 0.009210122749209404
楽し: 0.008757698349654675


Topic #  2
お願い: 0.05675635114312172
致し: 0.024356860667467117
交換: 0.024327795952558517
可能: 0.019848929718136787
いたし: 0.01793503575026989
失礼: 0.017739158123731613
本日: 0.016644906252622604
声: 0.014810626395046711
情報: 0.014269010163843632
頂き: 0.014200180768966675


Topic #  3
日本: 0.03888186812400818
感謝: 0.02630898728966713
今年: 0.023176496848464012
ください: 0.019582226872444153
参加: 0.016282381489872932
チャレンジ: 0.016235476359725
結果: 0.015640586614608765
一緒: 0.0147619601339101

In [None]:
# look at results
N = sum(count for doc in train_corpus for _, count in doc)
print("# of words in train corpus: ",N)
perplexity = np.exp2(-lda.log_perplexity(train_corpus))
print("perplexity(train):", perplexity)

print("==============================")
N = sum(count for doc in test_corpus for _, count in doc)
print("# of words in test corpus: ",N)
perplexity = np.exp2(-lda.log_perplexity(test_corpus))
print("perplexity(test):", perplexity)

# of words in train corpus:  3494029


-13.182 per-word bound, 9295.2 perplexity estimate based on a held-out corpus of 508891 documents with 3494029 words


perplexity(train): 9295.246519198545
# of words in test corpus:  388319


-12.177 per-word bound, 4629.9 perplexity estimate based on a held-out corpus of 56543 documents with 388319 words


perplexity(test): 4629.881080432986


In [None]:
# look at overall perplexity and coherence score
print('\nPerplexity: ', lda.log_perplexity(corpus))     # lower is better

coherence_model_lda = CoherenceModel(model=lda, texts=token_tweets_2022, dictionary=d, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)     # higher is better

-13.013 per-word bound, 8268.2 perplexity estimate based on a held-out corpus of 565434 documents with 3882348 words
using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -13.013352744329307


7 accumulators retrieved from output queue
accumulated word occurrence stats for 588269 virtual documents



Coherence Score:  0.45772442303523475


In [None]:
# visualize topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, d)

In [None]:
vis

In [None]:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def format_topics_sentences(ldamodel=lda, corpus=corpus, texts=token_tweets_2022):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus, texts=token_tweets_2022)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf.head()

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics