### Setup

In [1]:
# tokenization
import json
import MeCab
import demoji
import mojimoji
import re
from stopwords_ja import stop_words
from stopwords_slothlib import stop_words_2

# lda topic modelling
import pandas as pd
from pprint import pprint
import gensim
from gensim.corpora import Dictionary
from gensim import corpora
# import gensim.corpora as corpora
# from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
# import pyLDAvis
# import pyLDAvis.sklearn

In [None]:
import numpy as np

### Preprocessing and Tokenization

In [2]:
# preprocess tweet content
def preprocess(text):
    """Reduce a raw tweet to a contiguous string of Japanese characters.

    Steps, in order: lowercase; normalise character widths with mojimoji;
    strip Twitter handles, hashtags and URLs; map ASCII punctuation to
    full-width forms; strip emoji with demoji; finally drop every character
    that is outside the kanji / hiragana / katakana ranges of the last filter.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned text (str). May be empty if nothing survives the filters.
    """
    # from https://colab.research.google.com/drive/1bX-JyY4xmCm_RFkJg3QNcthUvEJaBghP
    # handle half-width/full-width chars
    text = text.lower()
    text = mojimoji.zen_to_han(text, kana=False)
    text = mojimoji.han_to_zen(text, digit=False, ascii=False)

    # remove twitter-specific strings (handles, hashtags, URLs).
    # This must run BEFORE the punctuation table below: that table rewrites
    # '@', '#', ':', '/' and '.' to full-width forms, after which none of these
    # ASCII-anchored patterns can match (hashtag text such as "#東京" would
    # then survive as "東京").
    text = re.sub(r"@([a-zA-Z0-9_]+)", "", text)
    # \u30fc is the katakana long-vowel mark "ー"; it lies outside ァ-ン, so
    # without it a tag like "#コーヒー" would only be cut up to the first "ー".
    text = re.sub(r"#([a-zA-Z0-9_ぁ-んァ-ン\u30fc一-龠]+)", "", text)
    text = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", text)

    # jp punctuation: map remaining ASCII punctuation to full-width forms
    text = text.translate(str.maketrans({
        '!': '！', '"': '”', '#': '＃', '$': '＄', '%': '％', '&': '＆', '\'': '’',
        '(': '（', ')': '）', '*': '＊', '+': '＋', ',': '，', '-': '−', '.': '．',
        '/': '／', ':': '：', ';': '；', '<': '＜', '=': '＝', '>': '＞', '?': '？',
        '@': '＠', '[': '［', '\\': '＼', ']': '］', '^': '＾', '_': '＿', '`': '｀',
        '{': '｛', '|': '｜', '}': '｝'
        }))
    # unify apostrophe-like marks to U+2018 (left single quotation mark)
    zenkaku_leftsingle = b'\xe2\x80\x98'.decode('utf-8')
    text = re.sub('[’´｀]', zenkaku_leftsingle, text)

    # remove emojis
    text = demoji.replace(text, "")
    text = re.sub("([\uD83E-\uD83E])+", "", text)

    # remove punctuation and whitespace: keep only kanji, hiragana, katakana.
    # NOTE(review): "ー" (U+30FC) is outside ァ-ン, so long-vowel marks are
    # dropped here as well ("コーヒー" -> "コヒ") — confirm this is intended.
    text = re.sub("([^一-龯ぁ-んァ-ン])+", "", text)
    text = re.sub(r"(\s)+", "", text)

    return text

In [3]:
# tokenize cleaned tweets into words
_MECAB_ARGS = "-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"
_TAGGER_CACHE = {}


def _get_tagger(args=_MECAB_ARGS):
    """Return a MeCab tagger for `args`, constructing it only on first use."""
    if args not in _TAGGER_CACHE:
        _TAGGER_CACHE[args] = MeCab.Tagger(args)
    return _TAGGER_CACHE[args]


def tokenize(text, tagger=None):
    """Split cleaned text into content words for LDA.

    Keeps only surfaces whose first MeCab feature field (part of speech) is
    noun (名詞), verb (動詞) or adjective (形容詞), then drops any token found
    in `stop_words` or `stop_words_2`.

    Args:
        text: Cleaned tweet text (str).
        tagger: Optional object exposing `parseToNode(text)`. When omitted, a
            tagger built from `_MECAB_ARGS` is created once and reused, so the
            dictionary is not reloaded for every tweet.

    Returns:
        list of token surface strings, in sentence order.
    """
    if tagger is None:
        tagger = _get_tagger()

    # for lda, we only want nouns, verbs, adjectives
    include_pos = ("名詞", "動詞", "形容詞")

    components = []
    node = tagger.parseToNode(text)
    while node:
        # the first comma-separated feature field is the part of speech
        if node.feature.split(",")[0] in include_pos:
            components.append(node.surface)
        node = node.next

    # remove stopwords
    return [
        token for token in components
        if token not in stop_words and token not in stop_words_2
    ]

In [4]:
# run preprocessing and tokenization for all tweets from given year dataset
def preprocess_tokenize_all(year):
    """Preprocess and tokenize every original tweet in a year's dump file.

    Opens the file named ``str(year) + '-all.txt'`` as UTF-8 and treats each
    line as one JSON document. Lines that are not valid JSON, or that decode
    to ``null``, are recorded instead of aborting the run. Tweets with a
    truthy ``'retweetedTweet'`` field are set aside; for the rest,
    ``'rawContent'`` is passed through `preprocess` and then `tokenize`.

    Args:
        year: Dataset prefix, e.g. "2022" (anything accepted by ``str()``).

    Returns:
        (tokens, retweets, not_parsed) where
        tokens is a list of token lists, one per processed tweet;
        retweets is the list of decoded tweets that were skipped as retweets;
        not_parsed is a list of ``(line, None)`` tuples for unparseable lines.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if a decoded tweet lacks 'retweetedTweet' or 'rawContent'.
    """
    # store results and exception tweets
    tokens = []
    retweets = []
    not_parsed = []

    # iterate through tweets, preprocess and tokenize
    with open(str(year) + '-all.txt', 'r', encoding='utf-8') as file:
        for line in file:
            try:
                tweet = json.loads(line)
            except json.JSONDecodeError:
                # malformed or blank line: record it rather than crash the run
                tweet = None

            if tweet is None:
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            else:
                tweet_text = tweet['rawContent']
                # preprocess text
                processed = preprocess(tweet_text)
                # tokenize with mecab
                components = tokenize(processed)
                tokens.append(components)

    # the with-block has already closed the file
    return tokens, retweets, not_parsed

In [None]:
# # run for 2015
# tokens_2015, retweets_2015, not_parsed_2015 = preprocess_tokenize_all("2015")

# # did we get retweets or errors?
# print(len(retweets_2015))
# print(len(not_parsed_2015))

In [5]:
# Build the 2022 corpus: token lists plus the tweets that were set aside
# as retweets or as unparseable lines.
(
    tokens_2022,
    retweets_2022,
    not_parsed_2022,
) = preprocess_tokenize_all("2022")

# Sanity check: number of retweets, then number of parse failures (one per line).
print(len(retweets_2022), len(not_parsed_2022), sep="\n")

In [None]:
# Persist the 2022 token lists so the slow preprocessing cell need not be re-run.
# The per-tweet lists can differ in length, so they are wrapped in an object
# array explicitly (NumPy pickles object arrays when saving).
# np.save appends ".npy" to a file name that does not already end with it.
np.save("thesis_lda_2022_tweets", np.array(tokens_2022, dtype=object))

In [None]:
# np.save appends ".npy" when the given name lacks it, so the file on disk is
# "thesis_lda_2022_tweets.npy" and np.load must be given that full name.
# allow_pickle=True is needed when the stored array has dtype=object (as an
# array of token lists does). Only enable it for files this notebook wrote:
# unpickling an untrusted file can execute arbitrary code.
lda_tokens_2022 = np.load("thesis_lda_2022_tweets.npy", allow_pickle=True)