### Setup

In [1]:
# tokenization
import json
import MeCab
import import_ipynb
import thesis_preprocess
from stopwords.stopwords_ja import stop_words
from stopwords.stopwords_slothlib import stop_words_2

# word2vec
import gensim, logging

# plotting
from sklearn.manifold import TSNE               
import numpy as np                
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

importing Jupyter notebook from thesis_preprocess.ipynb


### Clean and Tokenize Tweets

In [10]:
# tokenize cleaned tweets into words
# One MeCab tagger per dictionary path. Building a tagger loads the whole
# dictionary, so it is created lazily once and reused by every later call.
_W2V_TAGGERS = {}

# Parts of speech dropped before training: beginning/end-of-sentence markers,
# particles, fillers, prefixes and auxiliary verbs.
_W2V_EXCLUDED_POS = frozenset(['BOS/EOS', '助詞', 'フィラー', '接頭詞', '助動詞'])


def tokenize_w2v(text, dic_path="/usr/local/lib/mecab/dic/mecab-ipadic-neologd"):
    """Split one cleaned tweet into the word tokens used for word2vec training.

    Args:
        text: the cleaned tweet text. ``None`` or an empty string yields ``[]``.
        dic_path: directory of the MeCab dictionary passed to the tagger via
            ``-d``. Defaults to the path this notebook has always used.

    Returns:
        A list of surface forms, in order of appearance, excluding tokens whose
        first feature field (the part of speech) is in ``_W2V_EXCLUDED_POS``
        and tokens found in ``stop_words`` or ``stop_words_2``.
    """
    # Nothing to parse; also avoids handing None to MeCab.
    if not text:
        return []

    tagger = _W2V_TAGGERS.get(dic_path)
    if tagger is None:
        tagger = MeCab.Tagger("-d " + dic_path)
        _W2V_TAGGERS[dic_path] = tagger

    components = []
    node = tagger.parseToNode(text)
    while node:
        word = node.surface
        # The first comma-separated feature field is the part of speech.
        pos = node.feature.split(",")[0]
        if (pos not in _W2V_EXCLUDED_POS
                and word not in stop_words
                and word not in stop_words_2):
            components.append(word)
        node = node.next

    return components

In [7]:
# preprocess and tokenize with w2v-specific tokenize function
def preprocess_tokenize_all_unique(filename, year):
    """Fetch the unique tweets for one dataset and tokenize each for word2vec.

    Args:
        filename: forwarded to ``thesis_preprocess.get_unique_tweets``.
        year: forwarded to ``thesis_preprocess.get_unique_tweets``.

    Returns:
        A ``(tokens, tweets)`` pair. ``tweets`` is whatever
        ``get_unique_tweets`` returned; ``tokens`` holds one
        ``tokenize_w2v`` result per tweet, in iteration order.
    """
    tweets = thesis_preprocess.get_unique_tweets(filename, year)
    # Each tweet is cleaned by the shared preprocessing step, then tokenized
    # with the word2vec-specific tokenizer.
    tokens = [
        tokenize_w2v(thesis_preprocess.preprocess(raw_tweet))
        for raw_tweet in tweets
    ]
    return tokens, tweets

In [11]:
# Tokenize the 2015 dataset, then persist both the token lists and the
# tweets returned alongside them so later runs can reuse them.
# tokens_15 is also the training input of the 2015 word2vec cell below.
tokens_15, tweets_15 = preprocess_tokenize_all_unique("datasets_general_years/2015-all.txt","2015")
thesis_preprocess.save_to_csv(tokens_15,"saved_tokens_unique/2015-all.csv")
thesis_preprocess.save_to_csv(tweets_15,"saved_tweets_unique/2015-all.csv")

In [None]:
# Same pipeline for the 2022 dataset: tokenize, then save tokens and tweets.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# tokens_22 is required by the 2022 training cell - run it first on a fresh kernel.
tokens_22, tweets_22 = preprocess_tokenize_all_unique("datasets_general_years/2022-all.txt","2022")
thesis_preprocess.save_to_csv(tokens_22,"saved_tokens_unique/2022-all.csv")
thesis_preprocess.save_to_csv(tweets_22,"saved_tweets_unique/2022-all.csv")

### Word2Vec

In [4]:
# train and save word2vec model for given year
def run_word2vec(year, tokens, min_count=5, save_dir="saved_w2v_models_unique", **w2v_kwargs):
    """Train a word2vec model on tokenized tweets and save it to disk.

    Args:
        year: label appended to the saved file name (``w2v_model_<year>``).
            Converted with ``str`` so an int works as well as a string.
        tokens: the training corpus, passed to ``gensim.models.Word2Vec`` as
            its sentences argument (one token list per tweet in this notebook).
        min_count: forwarded to ``Word2Vec``; defaults to the value this
            notebook has always used.
        save_dir: directory the model is written to; created if missing.
        **w2v_kwargs: any further keyword arguments for ``Word2Vec``
            (for example ``seed`` or ``vector_size``).

    Returns:
        The trained ``Word2Vec`` model.
    """
    import os  # function-scope import: the notebook's import cell does not provide os

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Create the output directory before training so that an unusable path
    # fails immediately instead of after the model has been trained.
    os.makedirs(save_dir, exist_ok=True)

    model = gensim.models.Word2Vec(tokens, min_count=min_count, **w2v_kwargs)
    model.save(os.path.join(save_dir, "w2v_model_" + str(year)))

    return model

In [12]:
# Train the 2015 word2vec model on tokens_15 (produced by the 2015
# tokenization cell); run_word2vec also saves the model to disk.
model_2015 = run_word2vec("2015", tokens_15)

2024-04-10 22:28:41,599 : INFO : collecting all words and their counts
2024-04-10 22:28:41,601 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-04-10 22:28:41,616 : INFO : PROGRESS: at sentence #10000, processed 58962 words, keeping 19405 word types
2024-04-10 22:28:41,645 : INFO : PROGRESS: at sentence #20000, processed 117050 words, keeping 30132 word types
2024-04-10 22:28:41,663 : INFO : PROGRESS: at sentence #30000, processed 175559 words, keeping 38413 word types
2024-04-10 22:28:41,694 : INFO : PROGRESS: at sentence #40000, processed 233405 words, keeping 45727 word types
2024-04-10 22:28:41,717 : INFO : PROGRESS: at sentence #50000, processed 292013 words, keeping 52151 word types
2024-04-10 22:28:41,733 : INFO : PROGRESS: at sentence #60000, processed 350103 words, keeping 57989 word types
2024-04-10 22:28:41,749 : INFO : PROGRESS: at sentence #70000, processed 409158 words, keeping 63457 word types
2024-04-10 22:28:41,772 : INFO : PROGRESS: at s

In [None]:
# Train the 2022 word2vec model on tokens_22 (produced by the 2022
# tokenization cell); run_word2vec also saves the model to disk.
model_2022 = run_word2vec("2022", tokens_22)

### Compare Similar Words

In [13]:
def get_similar_words(keyword:str, model, positive=None, negative=None, topn=10, year="2015"):
    """Print the words most similar to a keyword in a single model.

    Args:
        keyword: word shown in the printed header; also used as the query
            when ``positive`` is not given or is empty.
        model: object exposing ``model.wv.most_similar(positive=, negative=, topn=)``.
        positive: words that contribute positively to the query.
        negative: words that contribute negatively to the query.
        topn: number of neighbours to print.
        year: label printed in the header. Defaults to "2015" so existing
            calls print exactly what they did before; pass another label when
            querying a different model.

    Returns:
        None. One word per line is printed, or "Error" followed by a blank
        line when the lookup fails (e.g. a word that is not in the model).
    """
    # None defaults instead of shared mutable lists.
    if positive is None or len(positive) == 0:
        positive = keyword
    if negative is None:
        negative = []

    print("\nSimilar words to " + keyword + ": " + str(year))
    try:
        words = model.wv.most_similar(positive=positive, negative=negative, topn=topn)
    except (KeyError, ValueError):
        # Only lookup failures are reported; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        print("Error\n")
        return

    for w in words:
        print(w[0])

In [14]:
def compare_similar_words(keyword:str, model_2015, model_2022, positive=None, negative=None, topn=10):
    """Print the words most similar to a keyword in the 2015 and 2022 models.

    Args:
        keyword: word shown in the printed headers; also used as the query
            when ``positive`` is not given or is empty.
        model_2015: object exposing ``wv.most_similar``; queried first.
        model_2022: object exposing ``wv.most_similar``; queried second.
        positive: words that contribute positively to the query.
        negative: words that contribute negatively to the query.
        topn: number of neighbours to print per model.

    Returns:
        None. For each model a header is printed, followed by one word per
        line, or by "Error" and a blank line when the lookup fails (e.g. a
        word that is not in that model). A failure in one model does not stop
        the other from being queried.
    """
    # None defaults instead of shared mutable lists.
    if positive is None or len(positive) == 0:
        positive = keyword
    if negative is None:
        negative = []

    for year, model in (("2015", model_2015), ("2022", model_2022)):
        print("\nSimilar words to " + keyword + ": " + year)
        try:
            words = model.wv.most_similar(positive=positive, negative=negative, topn=topn)
        except (KeyError, ValueError):
            # Only lookup failures are reported; anything else (including
            # KeyboardInterrupt) propagates instead of being silently swallowed.
            print("Error\n")
            continue

        for w in words:
            print(w[0])

In [16]:
# Reload both trained models from the paths run_word2vec saves to, so the
# comparison cells below can run without retraining.
# NOTE(review): this rebinds model_2015 / model_2022 from the training cells,
# and requires the 2022 model file to exist on disk already.
model_2015 = gensim.models.Word2Vec.load("saved_w2v_models_unique/w2v_model_2015")
model_2022 = gensim.models.Word2Vec.load("saved_w2v_models_unique/w2v_model_2022")

2024-04-10 22:31:26,421 : INFO : loading Word2Vec object from saved_w2v_models_unique/w2v_model_2015
2024-04-10 22:31:26,687 : INFO : loading wv recursively from saved_w2v_models_unique/w2v_model_2015.wv.* with mmap=None
2024-04-10 22:31:26,689 : INFO : setting ignored attribute cum_table to None
2024-04-10 22:31:27,073 : INFO : Word2Vec lifecycle event {'fname': 'saved_w2v_models_unique/w2v_model_2015', 'datetime': '2024-04-10T22:31:27.073595', 'gensim': '4.3.2', 'python': '3.11.5 (main, Aug 24 2023, 15:18:16) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-14.4.1-x86_64-i386-64bit', 'event': 'loaded'}
2024-04-10 22:31:27,076 : INFO : loading Word2Vec object from saved_w2v_models_unique/w2v_model_2022
2024-04-10 22:31:27,337 : INFO : loading wv recursively from saved_w2v_models_unique/w2v_model_2022.wv.* with mmap=None
2024-04-10 22:31:27,347 : INFO : setting ignored attribute cum_table to None
2024-04-10 22:31:27,670 : INFO : Word2Vec lifecycle event {'fname': 'saved_w2v_m

In [None]:
# Nearest neighbours in the 2015 model only, one query per keyword.
# Queries that pass positive/negative use those word lists instead of the
# bare keyword; the keyword is then only the label in the printed header.
# NOTE(review): "ハフ" is presumably "ハーフ" with the long-vowel mark removed
# by preprocessing - confirm against thesis_preprocess.
get_similar_words("在日", model_2015)
get_similar_words("アイヌ", model_2015)
get_similar_words("沖縄", model_2015, positive=["沖縄","日本人"])
get_similar_words("琉球", model_2015, positive=["琉球","日本人"])
get_similar_words("ハフ", model_2015, positive=["ハフ",'日本人'], negative=["髪","服"])
get_similar_words("ベトナム", model_2015)
get_similar_words("フィリピン", model_2015)
get_similar_words("外人", model_2015)
get_similar_words("外国人", model_2015)

In [18]:
# Side-by-side nearest neighbours for each keyword: 2015 model first, then 2022.
# Queries that pass positive/negative use those word lists instead of the
# bare keyword; the keyword is then only the label in the printed headers.
# NOTE(review): "ハフ" is presumably "ハーフ" with the long-vowel mark removed
# by preprocessing - confirm against thesis_preprocess.
compare_similar_words("在日", model_2015, model_2022)
compare_similar_words("アイヌ", model_2015, model_2022)
compare_similar_words("沖縄", model_2015, model_2022, positive=["沖縄","日本人"])
compare_similar_words("琉球", model_2015, model_2022, positive=["琉球","日本人"])
compare_similar_words("ハフ", model_2015, model_2022, positive=["ハフ","日本人"], negative=["髪","服"])
compare_similar_words("ベトナム", model_2015, model_2022)
compare_similar_words("フィリピン", model_2015, model_2022)
compare_similar_words("外人", model_2015, model_2022)
compare_similar_words("外国人", model_2015, model_2022)


Similar words to 在日: 2015
違憲
発覚
公正
首相
政策
自民
野党
博愛
安倍
集団的自衛権

Similar words to 在日: 2022
植民地
習近平
反日カルト
毀損
我が国
人権問題
日本の政治
反日
留学生
国益

Similar words to アイヌ: 2015
Error


Similar words to アイヌ: 2022
オウム
朝鮮
死後
貶める
表現の自由
公正
言説
一族
仏教
諸悪の根源

Similar words to 沖縄: 2015
韓国
アメリカ
文化
中国
日本
米軍
海外
報道
ドイツ
産経

Similar words to 沖縄: 2022
アメリカ
外国
移住
台湾
中国
住む
留学
日本
イギリス
韓国

Similar words to 琉球: 2015
イスラム
痛烈
韓国人
北朝鮮
外国
中韓
ジャナリスト
フランス
人質事件
訪日

Similar words to 琉球: 2022
朝鮮人
真珠湾攻撃
中国人
中国共産党
軍事作戦
大国
称賛
時代遅れ
生ん
橋下徹

Similar words to ハフ: 2015
国会
団体
成立
安保法案
朝日新聞デジタル
調査
安保
支援
規制
ニュス

Similar words to ハフ: 2022
記者
代表
出身
ウクライナ
維新
国葬
大統領
報道
氏
安倍さん

Similar words to ベトナム: 2015
審
小西
考
刑務所
ダウ平均
名簿
言い渡さ
聖地
特別区
国籍

Similar words to ベトナム: 2022
江戸時代
隈研吾
中東
締結
西欧
半島
起源
内戦
南米
美的

Similar words to フィリピン: 2015
メキシコ
サムスン
愛国
方位
織田信成
放射能汚染
天皇陛下
倒壊
二酸化炭素
闘争

Similar words to フィリピン: 2022
郊外
北方
ロンドン
逃走
沿線
宮殿
人情
発祥
江戸時代
白川郷

Similar words to 外人: 2015
間違える
噛み合っ
通じる
汚く
ウケ
困惑
錯覚
ヘラヘラ
自己嫌悪
叩か

Similar words to 外人: 2022
ヅラ
あらわれ
呼ばわり
引っ掛かる
ジジイ
ホ