### Setup

In [None]:
# tokenization
import json
import MeCab
import import_ipynb
import thesis_preprocess
from stopwords.stopwords_ja import stop_words
from stopwords.stopwords_slothlib import stop_words_2

# word2vec
import gensim, logging

# plotting
from sklearn.manifold import TSNE               
import numpy as np                
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

### Clean and Tokenize Tweets

In [None]:
# tokenize cleaned tweets into words

# Argument string handed to MeCab.Tagger; selects the mecab-ipadic-neologd dictionary directory.
_W2V_MECAB_ARGS = "-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"

# Part-of-speech tags that are dropped from the output: beginning/end-of-sentence
# markers, particles, fillers, prefixes and auxiliary verbs.
_W2V_EXCLUDED_POS = frozenset(['BOS/EOS', '助詞', 'フィラー', '接頭詞', '助動詞'])

# Tagger shared by all tokenize_w2v calls; created on first use (see _get_w2v_tagger).
_w2v_tagger = None


def _get_w2v_tagger():
    """Return the shared MeCab tagger, constructing it the first time it is needed."""
    global _w2v_tagger
    if _w2v_tagger is None:
        _w2v_tagger = MeCab.Tagger(_W2V_MECAB_ARGS)
    return _w2v_tagger


def tokenize_w2v(text):
    """Split a cleaned tweet into word tokens suitable for Word2Vec training.

    The text is parsed with MeCab. Tokens whose part-of-speech tag is in
    ``_W2V_EXCLUDED_POS`` are discarded, and so are tokens found in either
    ``stop_words`` or ``stop_words_2``.

    The tagger is built once and reused across calls instead of being
    re-created for every tweet.

    Args:
        text: the tweet text to tokenize. Empty or ``None`` input yields an
            empty list without invoking MeCab.

    Returns:
        list of surface-form strings, in the order MeCab produced them.
    """
    if not text:
        return []

    node = _get_w2v_tagger().parseToNode(text)
    components = []
    while node:
        # The first comma-separated field of the feature string is used as the POS tag.
        pos = node.feature.split(",")[0]
        if pos not in _W2V_EXCLUDED_POS:
            components.append(node.surface)
        node = node.next

    # remove stopwords
    return [
        token for token in components
        if token not in stop_words and token not in stop_words_2
    ]

In [None]:
# preprocess and tokenize with w2v-specific tokenize function
def preprocess_tokenize_all_unique(filename, year):
    """Fetch the unique tweets for one dataset and tokenize every one of them.

    Args:
        filename: forwarded unchanged to ``thesis_preprocess.get_unique_tweets``.
        year: forwarded unchanged to ``thesis_preprocess.get_unique_tweets``.

    Returns:
        A ``(tokens, tweets)`` pair. ``tokens`` holds one ``tokenize_w2v``
        result per tweet, in iteration order; ``tweets`` is the object
        returned by ``thesis_preprocess.get_unique_tweets``.
    """
    tweets = thesis_preprocess.get_unique_tweets(filename, year)
    # Each tweet is cleaned first, then tokenized.
    tokens = [
        tokenize_w2v(thesis_preprocess.preprocess(raw_tweet))
        for raw_tweet in tweets
    ]
    return tokens, tweets

In [None]:
# Preprocess and tokenize the 2015 dataset, then write the token lists and the
# tweets to separate CSV files through thesis_preprocess.save_to_csv.
# tokens_15 is used again below as the training input for the 2015 Word2Vec model.
tokens_15, tweets_15 = preprocess_tokenize_all_unique("datasets_general_years/2015-all.txt","2015")
thesis_preprocess.save_to_csv(tokens_15,"saved_tokens/2015-all.csv")
thesis_preprocess.save_to_csv(tweets_15,"saved_tweets/2015-all.csv")

In [None]:
# Preprocess and tokenize the 2022 dataset, then write the token lists and the
# tweets to separate CSV files through thesis_preprocess.save_to_csv.
# tokens_22 is used again below as the training input for the 2022 Word2Vec model.
tokens_22, tweets_22 = preprocess_tokenize_all_unique("datasets_general_years/2022-all.txt","2022")
thesis_preprocess.save_to_csv(tokens_22,"saved_tokens/2022-all.csv")
thesis_preprocess.save_to_csv(tweets_22,"saved_tweets/2022-all.csv")

### Word2Vec

In [None]:
# train and save word2vec model for given year
def run_word2vec(year, tokens, min_count=5, output_dir="saved_w2v_models_unique"):
    """Train a Word2Vec model on tokenized tweets and save it to disk.

    Args:
        year: label appended to the saved file name (``w2v_model_<year>``).
            Converted with ``str`` so an int year works as well.
        tokens: the tokenized corpus handed to ``gensim.models.Word2Vec``
            (one list of tokens per tweet in this notebook).
        min_count: forwarded to ``gensim.models.Word2Vec``; defaults to the
            value this notebook has always used.
        output_dir: directory the model file is written into. It is created
            if it does not exist yet.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    import os  # local import so this cell does not depend on an edit to the import cell

    # Create the target directory *before* training: otherwise a missing
    # folder only surfaces at model.save(), after the whole training run.
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, "w2v_model_" + str(year))

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = gensim.models.Word2Vec(tokens, min_count=min_count)
    model.save(model_path)

    return model

In [None]:
# train and save word2vec model for 2015
# (input is tokens_15 from the 2015 tokenization cell; the trained model is kept in model_2015)
model_2015 = run_word2vec("2015", tokens_15)

In [None]:
# train and save word2vec model for 2022
# (input is tokens_22 from the 2022 tokenization cell; the trained model is kept in model_2022)
model_2022 = run_word2vec("2022", tokens_22)

### Compare Similar Words

In [None]:
def get_similar_words(keyword:str, model, positive=None, negative=None, topn=10, label="2015"):
    """Print the words a single model considers most similar to a keyword.

    Args:
        keyword: word shown in the printed header; also used as the query
            when ``positive`` is empty.
        model: object exposing ``model.wv.most_similar(positive=, negative=, topn=)``.
        positive: words that contribute positively to the query. A single
            string is treated as a one-word list. Defaults to ``[keyword]``.
        negative: words that contribute negatively. Defaults to no words.
        topn: number of results requested from the model.
        label: text printed after the keyword in the header. Defaults to
            ``"2015"``, which is what the header previously always said;
            pass another label when querying a different model.

    Returns:
        None. Results are printed one word per line. If the lookup fails,
        a line starting with ``Error`` and naming the exception is printed
        instead.
    """
    # None sentinels replace the mutable list defaults. The keyword is wrapped
    # in a list so it is never handed to most_similar as a bare string.
    if not positive:
        positive = [keyword]
    elif isinstance(positive, str):
        positive = [positive]
    if negative is None:
        negative = []

    print("\nSimilar words to " + keyword + ": " + str(label))
    try:
        words = model.wv.most_similar(positive=positive, negative=negative, topn=topn)
    except Exception as err:
        # Keep the best-effort behaviour (the notebook keeps running) but say
        # what went wrong, and let KeyboardInterrupt/SystemExit through.
        print("Error: " + repr(err) + "\n")
        return

    for w in words:
        print(w[0])

In [None]:
def compare_similar_words(keyword:str, model_2015, model_2022, positive=None, negative=None, topn=10):
    """Print the most similar words for a keyword under the 2015 and 2022 models.

    The same query is run against both models, 2015 first, each under its
    own header.

    Args:
        keyword: word shown in the printed headers; also used as the query
            when ``positive`` is empty.
        model_2015: object exposing ``wv.most_similar``; printed under "2015".
        model_2022: object exposing ``wv.most_similar``; printed under "2022".
        positive: words that contribute positively to the query. A single
            string is treated as a one-word list. Defaults to ``[keyword]``.
        negative: words that contribute negatively. Defaults to no words.
        topn: number of results requested from each model.

    Returns:
        None. Results are printed one word per line. A failed lookup in one
        model prints a line starting with ``Error`` and naming the exception,
        and does not prevent the other model from being queried.
    """
    # None sentinels replace the mutable list defaults. The keyword is wrapped
    # in a list so it is never handed to most_similar as a bare string.
    if not positive:
        positive = [keyword]
    elif isinstance(positive, str):
        positive = [positive]
    if negative is None:
        negative = []

    for label, model in (("2015", model_2015), ("2022", model_2022)):
        print("\nSimilar words to " + keyword + ": " + label)
        try:
            words = model.wv.most_similar(positive=positive, negative=negative, topn=topn)
        except Exception as err:
            # Best-effort per model: report the reason and move on to the next one.
            print("Error: " + repr(err) + "\n")
            continue
        for w in words:
            print(w[0])

In [None]:
# Reload both models from the files written by run_word2vec, rebinding
# model_2015 / model_2022 so the cells below can run without retraining.
model_2015 = gensim.models.Word2Vec.load("saved_w2v_models_unique/w2v_model_2015")
model_2022 = gensim.models.Word2Vec.load("saved_w2v_models_unique/w2v_model_2022")

In [None]:
# Queries run against the 2015 model, as (keyword, extra keyword arguments) pairs.
# They are executed in the listed order.
queries_2015 = [
    ("在日", {}),
    ("アイヌ", {}),
    ("沖縄", {"positive": ["沖縄", "日本人"]}),
    ("琉球", {"positive": ["琉球", "日本人"]}),
    ("ハフ", {"positive": ["ハフ", "日本人"], "negative": ["髪", "服"]}),
    ("ベトナム", {}),
    ("フィリピン", {}),
    ("外人", {}),
    ("外国人", {}),
]
for query_word, query_options in queries_2015:
    get_similar_words(query_word, model_2015, **query_options)

In [None]:
# Queries compared across the 2015 and 2022 models, as (keyword, extra keyword
# arguments) pairs. They are executed in the listed order.
comparison_queries = [
    ("在日", {}),
    ("アイヌ", {}),
    ("沖縄", {"positive": ["沖縄", "日本人"]}),
    ("琉球", {"positive": ["琉球", "日本人"]}),
    ("ハフ", {"positive": ["ハフ", "日本人"], "negative": ["髪", "服"]}),
    ("ベトナム", {}),
    ("フィリピン", {}),
    ("外人", {}),
    ("外国人", {}),
]
for compared_word, compared_options in comparison_queries:
    compare_similar_words(compared_word, model_2015, model_2022, **compared_options)