#### Setup

In [10]:
# import packages
import json
import MeCab
import demoji
import re
from stop_words import stop_words
import gensim, logging

#### Clean and Tokenize Tweets

In [21]:
# tokenize with mecab
mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# store results and exception tweets
tokens = []
retweets = []
not_parsed = []

# iterate through tweets
with open('2022-all.txt', 'r') as file:
    for line in file:
        tweet = json.loads(line)
    
        if line == None or tweet == None:
            not_parsed.append((line, tweet))
            print("Parsing error: ", line, tweet)
        elif tweet['retweetedTweet']:
            retweets.append(tweet)
            print("Retweet: ", tweet['id'])
        else: 
            # clean tweet content
            tweet_text = tweet['rawContent'] # note: need other prop for over 140 char?
            remove_emojis = demoji.replace(tweet_text, "")
            remove_more_emojis = re.sub("([\uD83E-\uD83E])+", "", remove_emojis)
            remove_newlines = re.sub("(\n)+", "", remove_more_emojis)
            remove_usernames = re.sub("@([a-zA-Z0-9_]+)", "", remove_newlines)
            remove_hashtags = re.sub("#([a-zA-Z0-9_ぁ-んァ-ン一-龠]+)", "", remove_usernames)
            remove_links = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", remove_hashtags)
            remove_punc = re.sub("([-.,;\"\'!?~@#$%^&*():\{\}\[\]\/\\\\]+)", "", remove_links)
            remove_jp_punc = re.sub("([\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF9E-\uFFEE\u3000-\u303F]+)", "", remove_punc)
            remove_geo_shapes = re.sub("([\u25A0-\u25FF])+", "", remove_jp_punc)
            remove_misc_symbols = re.sub("([\u2600-\u26FF])+", "", remove_geo_shapes)

            # tokenize with mecab
            parsed = mt.parseToNode(remove_misc_symbols)
            components = []
            while parsed:
                components.append(parsed.surface)
                parsed = parsed.next
            components = [token for token in components if not token in stop_words]
            tokens.append(components)

file.close()

#### Word2Vec

In [24]:
# word2vec

# set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# train word2vec
model = gensim.models.Word2Vec(tokens, min_count=10)

2023-11-06 23:24:31,753 : INFO : collecting all words and their counts
2023-11-06 23:24:31,758 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-11-06 23:24:31,789 : INFO : PROGRESS: at sentence #10000, processed 112838 words, keeping 19834 word types
2023-11-06 23:24:31,815 : INFO : PROGRESS: at sentence #20000, processed 222965 words, keeping 24995 word types
2023-11-06 23:24:31,843 : INFO : PROGRESS: at sentence #30000, processed 338097 words, keeping 38689 word types
2023-11-06 23:24:31,901 : INFO : PROGRESS: at sentence #40000, processed 448583 words, keeping 49200 word types
2023-11-06 23:24:31,927 : INFO : PROGRESS: at sentence #50000, processed 558436 words, keeping 54062 word types
2023-11-06 23:24:31,953 : INFO : PROGRESS: at sentence #60000, processed 672942 words, keeping 56539 word types
2023-11-06 23:24:31,982 : INFO : PROGRESS: at sentence #70000, processed 786832 words, keeping 62302 word types
2023-11-06 23:24:32,018 : INFO : PROGRESS: at 

In [25]:
# check similarity given by trained model
sim = model.wv.most_similar('今日')
print(sim)

[('今週', 0.7725615501403809), ('本日', 0.6239190697669983), ('今日も一日', 0.6208266019821167), ('日曜', 0.5616788268089294), ('昨日', 0.5532968640327454), ('三連休', 0.5496022701263428), ('3連休', 0.5415098667144775), ('土曜日', 0.5411140322685242), ('日曜日', 0.5399312376976013), ('今夜', 0.5187978148460388)]
