#### Setup

In [11]:
# import packages
import json
import MeCab
import demoji
import re
from stop_words import stop_words
import gensim, logging

#### Clean and Tokenize Tweets

In [2]:
# tokenize with mecab
mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# dataset we are working with
year = "2015"

# store results and exception tweets
tokens = []
retweets = []
not_parsed = []

# iterate through tweets
with open(year + '-all.txt', 'r') as file:
    for line in file:
        tweet = json.loads(line)
    
        if line == None or tweet == None:
            not_parsed.append((line, tweet))
            print("Parsing error: ", line, tweet)
        elif tweet['retweetedTweet']:
            retweets.append(tweet)
            print("Retweet: ", tweet['id'])
        else: 
            # clean tweet content
            tweet_text = tweet['rawContent'] # note: need other prop for over 140 char?
            remove_emojis = demoji.replace(tweet_text, "")
            remove_more_emojis = re.sub("([\uD83E-\uD83E])+", "", remove_emojis)
            remove_newlines = re.sub("(\n)+", "", remove_more_emojis)
            remove_usernames = re.sub("@([a-zA-Z0-9_]+)", "", remove_newlines)
            remove_hashtags = re.sub("#([a-zA-Z0-9_ぁ-んァ-ン一-龠]+)", "", remove_usernames)
            remove_links = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", remove_hashtags)
            remove_punc = re.sub("([-.,;\"\'!?~@#$%^&*():\{\}\[\]\/\\\\]+)", "", remove_links)
            remove_jp_punc = re.sub("([\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF9E-\uFFEE\u3000-\u303F]+)", "", remove_punc)
            remove_geo_shapes = re.sub("([\u25A0-\u25FF])+", "", remove_jp_punc)
            remove_misc_symbols = re.sub("([\u2600-\u26FF])+", "", remove_geo_shapes)

            # tokenize with mecab
            parsed = mt.parseToNode(remove_misc_symbols)
            components = []
            while parsed:
                components.append(parsed.surface)
                parsed = parsed.next
            components = [token for token in components if not token in stop_words]
            tokens.append(components)

file.close()

In [4]:
# did we get retweets, or did twscrape filter them out?
print(len(retweets))

# did any tweets fail parsing?
print(len(not_parsed))

0
0


#### Word2Vec

In [5]:
# set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# train word2vec model
model = gensim.models.Word2Vec(tokens, min_count=10)

2023-11-16 11:56:05,171 : INFO : collecting all words and their counts
2023-11-16 11:56:05,172 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-11-16 11:56:05,205 : INFO : PROGRESS: at sentence #10000, processed 112838 words, keeping 19834 word types
2023-11-16 11:56:05,228 : INFO : PROGRESS: at sentence #20000, processed 222965 words, keeping 24995 word types
2023-11-16 11:56:05,263 : INFO : PROGRESS: at sentence #30000, processed 338097 words, keeping 38689 word types
2023-11-16 11:56:05,287 : INFO : PROGRESS: at sentence #40000, processed 448583 words, keeping 49200 word types
2023-11-16 11:56:05,312 : INFO : PROGRESS: at sentence #50000, processed 558436 words, keeping 54062 word types
2023-11-16 11:56:05,336 : INFO : PROGRESS: at sentence #60000, processed 672942 words, keeping 56539 word types
2023-11-16 11:56:05,361 : INFO : PROGRESS: at sentence #70000, processed 786832 words, keeping 62302 word types
2023-11-16 11:56:05,401 : INFO : PROGRESS: at 

In [6]:
# save word2vec model
model.save("thesis_w2v_" + year + "_tweets")

2023-11-16 11:56:24,323 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'thesis_w2v_2022_tweets', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-11-16T11:56:24.323415', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.5-x86_64-i386-64bit', 'event': 'saving'}
2023-11-16 11:56:24,334 : INFO : not storing attribute cum_table
2023-11-16 11:56:24,421 : INFO : saved thesis_w2v_2022_tweets


In [9]:
# load trained word2vec model
gensim.models.Word2Vec.load("thesis_w2v_" + year + "_tweets")

2023-11-16 11:57:19,137 : INFO : loading Word2Vec object from thesis_w2v_2022_tweets
2023-11-16 11:57:19,198 : INFO : loading wv recursively from thesis_w2v_2022_tweets.wv.* with mmap=None
2023-11-16 11:57:19,199 : INFO : setting ignored attribute cum_table to None
2023-11-16 11:57:19,433 : INFO : Word2Vec lifecycle event {'fname': 'thesis_w2v_2022_tweets', 'datetime': '2023-11-16T11:57:19.433173', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.5-x86_64-i386-64bit', 'event': 'loaded'}


<gensim.models.word2vec.Word2Vec at 0x110e53a90>

In [10]:
# check similarity given by trained model
print(model.wv.most_similar(positive='在日',topn=10))
print(model.wv.most_similar(positive='外国人',topn=10))


[('侵略者', 0.8887750506401062), ('軍隊', 0.8882155418395996), ('民意', 0.883852481842041), ('略奪', 0.8815134167671204), ('統一協会', 0.8762111663818359), ('全土', 0.8744518756866455), ('陰謀', 0.8732355237007141), ('ネオコン', 0.8732097148895264), ('食糧', 0.8714119791984558), ('裁判官', 0.8696999549865723)]
[('中国人', 0.887516975402832), ('ユダヤ教', 0.870999276638031), ('イスラム教', 0.8588070869445801), ('ユダヤ人', 0.8398546576499939), ('留学生', 0.838367760181427), ('組織', 0.834230899810791), ('欧米', 0.8304198980331421), ('日本人', 0.8295359015464783), ('中国', 0.8283986449241638), ('支持', 0.827479898929596)]
