#### Setup

In [10]:
# import packages
import json
import MeCab
import demoji
import re
from stop_words import stop_words
import gensim, logging

#### Clean and Tokenize Tweets

In [21]:
# tokenize with mecab
mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# store results and exception tweets
tokens = []
retweets = []
not_parsed = []

# iterate through tweets
with open('2022-all.txt', 'r') as file:
    for line in file:
        tweet = json.loads(line)
    
        if line == None or tweet == None:
            not_parsed.append((line, tweet))
            print("Parsing error: ", line, tweet)
        elif tweet['retweetedTweet']:
            retweets.append(tweet)
            print("Retweet: ", tweet['id'])
        else: 
            # clean tweet content
            tweet_text = tweet['rawContent'] # note: need other prop for over 140 char?
            remove_emojis = demoji.replace(tweet_text, "")
            remove_more_emojis = re.sub("([\uD83E-\uD83E])+", "", remove_emojis)
            remove_newlines = re.sub("(\n)+", "", remove_more_emojis)
            remove_usernames = re.sub("@([a-zA-Z0-9_]+)", "", remove_newlines)
            remove_hashtags = re.sub("#([a-zA-Z0-9_ぁ-んァ-ン一-龠]+)", "", remove_usernames)
            remove_links = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", remove_hashtags)
            remove_punc = re.sub("([-.,;\"\'!?~@#$%^&*():\{\}\[\]\/\\\\]+)", "", remove_links)
            remove_jp_punc = re.sub("([\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF9E-\uFFEE\u3000-\u303F]+)", "", remove_punc)
            remove_geo_shapes = re.sub("([\u25A0-\u25FF])+", "", remove_jp_punc)
            remove_misc_symbols = re.sub("([\u2600-\u26FF])+", "", remove_geo_shapes)

            # tokenize with mecab
            parsed = mt.parseToNode(remove_misc_symbols)
            components = []
            while parsed:
                components.append(parsed.surface)
                parsed = parsed.next
            components = [token for token in components if not token in stop_words]
            tokens.append(components)

file.close()

#### Word2Vec

In [None]:
# word2vec

# set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# train word2vec
model = gensim.models.Word2Vec(tokens, min_count=10)

# check similarity given by trained model
sim = model.wv.most_similar('今日')
print(sim)

2023-11-03 22:13:15,364 : INFO : collecting all words and their counts
2023-11-03 22:13:15,365 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-11-03 22:13:15,366 : INFO : collected 805 word types from a corpus of 1172 raw words and 100 sentences
2023-11-03 22:13:15,366 : INFO : Creating a fresh vocabulary
2023-11-03 22:13:15,367 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 61 unique words (7.58% of original 805, drops 744)', 'datetime': '2023-11-03T22:13:15.367696', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.5-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2023-11-03 22:13:15,369 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 leaves 323 word corpus (27.56% of original 1172, drops 849)', 'datetime': '2023-11-03T22:13:15.369444', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.1

[('来', 0.2849620580673218), ('お願い', 0.24154554307460785), ('おはよう', 0.20102809369564056), ('大切', 0.1925160437822342), ('日本', 0.17427216470241547), ('よかっ', 0.16761992871761322), ('欲しい', 0.13868466019630432), ('だけ', 0.1273307055234909), ('°', 0.10057535767555237), ('ありがとう', 0.09638050198554993)]
