#### Setup

In [None]:
# tokenization
import json
import MeCab
import import_ipynb
import thesis_preprocess
from stopwords.stopwords_ja import stop_words
from stopwords.stopwords_slothlib import stop_words_2

# word2vec
import gensim, logging

# plotting
from sklearn.manifold import TSNE               
import numpy as np                
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

#### Clean and Tokenize Tweets

In [None]:
# tokenize cleaned tweets into words
# MeCab start-up arguments (NEologd dictionary path) shared by every tokenize() call
_MECAB_TAGGER_ARGS = "-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"
# part-of-speech tags to drop: beg/end tokens, particles, fillers, prefixes, auxiliary verbs
_EXCLUDED_POS = frozenset(['BOS/EOS', '助詞', 'フィラー', '接頭詞', '助動詞'])
# created lazily by _get_mecab_tagger() so the dictionary is loaded only once
_mecab_tagger = None


def _get_mecab_tagger():
    """Return the shared MeCab tagger, creating it on first use."""
    global _mecab_tagger
    if _mecab_tagger is None:
        _mecab_tagger = MeCab.Tagger(_MECAB_TAGGER_ARGS)
    return _mecab_tagger


def tokenize(text):
    """Split *text* into MeCab surface forms and drop noise tokens.

    A node is discarded when the first comma-separated field of its MeCab
    feature string (the part of speech) is listed in ``_EXCLUDED_POS``.
    The remaining surface forms are then filtered against both stop-word
    collections imported at the top of the notebook.

    Args:
        text: The (already cleaned) tweet text to tokenize.

    Returns:
        list[str]: Surface forms in their original order.
    """
    node = _get_mecab_tagger().parseToNode(text)
    components = []

    # walk the linked list of nodes produced by MeCab
    while node:
        pos = node.feature.split(",")[0]
        if pos not in _EXCLUDED_POS:
            components.append(node.surface)
        node = node.next

    # remove stopwords
    return [
        token for token in components
        if token not in stop_words and token not in stop_words_2
    ]

In [None]:
# fetch the 2022 tweets through thesis_preprocess.get_unique_tweets (helper defined outside this notebook)
tweets_22 = thesis_preprocess.get_unique_tweets("datasets_general_years/2022-all.txt", 2022)

In [None]:
# fetch the 2015 tweets through thesis_preprocess.get_unique_tweets (helper defined outside this notebook)
tweets_15 = thesis_preprocess.get_unique_tweets("datasets_general_years/2015-all.txt", 2015)

In [None]:
# run preprocessing and tokenization for tweets from given .txt file
def preprocess_tokenize_all_unique(filename, year):
    """Preprocess and tokenize the tweets returned for *filename* / *year*.

    The tweets come from ``thesis_preprocess.get_unique_tweets``; each one is
    passed through ``thesis_preprocess.preprocess`` and then ``tokenize``.

    Args:
        filename: Path of the dataset file handed to ``get_unique_tweets``.
        year: Year argument handed to ``get_unique_tweets`` unchanged.

    Returns:
        tuple: ``(tokens, tweets)`` where ``tokens`` holds one token list per
        tweet, in the same order as ``tweets``.
    """
    tweets = thesis_preprocess.get_unique_tweets(filename, year)
    tokens = [
        tokenize(thesis_preprocess.preprocess(tweet))
        for tweet in tweets
    ]
    return tokens, tweets

In [None]:
# tokenize the 2015 tweets with the unique-tweet helper; it returns (tokens, tweets),
# so unpack the pair and keep only the token lists under tokens_15_2
tokens_15_2, unique_tweets_15 = preprocess_tokenize_all_unique("datasets_general_years/2015-all.txt", 2015)

In [None]:
# run preprocessing and tokenization for all tweets from given year dataset
def preprocess_tokenize_all(year):
    """Preprocess and tokenize every original tweet of one year's dataset.

    Reads ``datasets_general_years/<year>-all.txt`` as one JSON document per
    line. Blank lines are ignored. Lines that are not valid JSON, or that do
    not decode to a JSON object, are collected in ``not_parsed`` instead of
    aborting the run. Tweets with a truthy ``retweetedTweet`` field are
    collected in ``retweets``. The remaining tweets are tokenized only when
    the year at the start of their ``date`` field is not later than *year*.

    Args:
        year: Dataset year as a string such as ``"2015"``; it is concatenated
            into the file name and used as the upper bound for tweet dates.

    Returns:
        tuple: ``(tokens, retweets, not_parsed)`` where ``tokens`` is a list
        of token lists, ``retweets`` is a list of decoded retweet objects and
        ``not_parsed`` is a list of ``(line, decoded_value)`` pairs.
    """
    # store results and exception tweets
    tokens = []
    retweets = []
    not_parsed = []
    cutoff_year = int(year) + 1

    # iterate through tweets, preprocess and tokenize
    with open('datasets_general_years/' + year + '-all.txt', 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # e.g. a trailing empty line

            try:
                tweet = json.loads(line)
            except json.JSONDecodeError:
                tweet = None

            if not isinstance(tweet, dict):
                not_parsed.append((line, tweet))
                print("Parsing error: ", line, tweet)
            elif tweet['retweetedTweet']:
                retweets.append(tweet)
                print("Retweet: ", tweet['id'])
            # filter out 2024 sponsored(?) tweets
            elif int(tweet['date'].split("-")[0]) < cutoff_year:
                tweet_text = tweet['rawContent']  # note: need other prop for over 140 char?
                processed = thesis_preprocess.preprocess(tweet_text)
                tokens.append(tokenize(processed))

    return tokens, retweets, not_parsed

In [None]:
# run for 2015: reads datasets_general_years/2015-all.txt and tokenizes the non-retweet tweets
tokens_2015, retweets_2015, not_parsed_2015 = preprocess_tokenize_all("2015")

# did we get retweets or errors? (number of retweets set aside, then number of entries flagged as not parsed)
print(len(retweets_2015))
print(len(not_parsed_2015))

In [None]:
# run for 2022: reads datasets_general_years/2022-all.txt and tokenizes the non-retweet tweets
tokens_2022, retweets_2022, not_parsed_2022 = preprocess_tokenize_all("2022")

# did we get retweets or errors? (number of retweets set aside, then number of entries flagged as not parsed)
print(len(retweets_2022))
print(len(not_parsed_2022))

#### Word2Vec

In [None]:
# train and save word2vec model for given year
def run_word2vec(year, tokens):
    """Train a gensim Word2Vec model on *tokens* and save it for *year*.

    The model is written to ``saved_w2v_models/w2v_model_<year>``; an existing
    file with that name is overwritten by gensim's ``save``.

    Args:
        year: Label appended to the saved file name; a string such as
            ``"2015"`` or an int.
        tokens: The training corpus, one list of token strings per tweet.
    """
    import os  # local import: only needed to prepare the output folder

    # make sure the target folder exists before the (slow) training starts,
    # so a missing folder cannot cost us the trained model at save time
    os.makedirs("saved_w2v_models", exist_ok=True)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = gensim.models.Word2Vec(tokens, min_count=5)
    model.save(f"saved_w2v_models/w2v_model_{year}")

In [None]:
# train on tokens_15_2 (built in an earlier cell); the model is saved as saved_w2v_models/w2v_model_2015,
# the same file the other 2015 training cell writes, so whichever of the two cells runs last wins
run_word2vec("2015", tokens_15_2)

In [None]:
# train and save word2vec model for 2015 (written to saved_w2v_models/w2v_model_2015)
run_word2vec("2015", tokens_2015)

In [None]:
# train and save word2vec model for 2022 (written to saved_w2v_models/w2v_model_2022)
run_word2vec("2022", tokens_2022)

In [None]:
# load the trained 2015 word2vec model from the path run_word2vec saves to
model_2015 = gensim.models.Word2Vec.load("saved_w2v_models/w2v_model_2015")

# check similarity given by trained model: top-10 neighbours of
# 在日 (Zainichi) and 外国人 (gaikokujin, "foreigner")
print(model_2015.wv.most_similar(positive='在日',topn=10))
print(model_2015.wv.most_similar(positive='外国人',topn=10))

In [None]:
# same for 2022: load the saved model from the path run_word2vec saves to
model_2022 = gensim.models.Word2Vec.load("saved_w2v_models/w2v_model_2022")

# top-10 neighbours of 在日 (Zainichi) and 外国人 (gaikokujin, "foreigner")
print(model_2022.wv.most_similar(positive='在日',topn=10))
print(model_2022.wv.most_similar(positive='外国人',topn=10))

#### Plot Word2Vec Findings

In [None]:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py
def reduce_dimensions(model):
    """Project every word vector of *model* onto two dimensions with t-SNE.

    Args:
        model: A trained model exposing its vectors as ``model.wv.vectors``.

    Returns:
        tuple: ``(x_vals, y_vals)``, two lists holding the first and second
        t-SNE coordinate of each vector, in the row order of
        ``model.wv.vectors``.
    """
    num_dimensions = 2

    # extract the word vectors as a numpy array
    vectors = np.asarray(model.wv.vectors)

    # reduce using t-SNE (fixed random_state keeps the layout reproducible)
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    x_vals = [point[0] for point in embedded]
    y_vals = [point[1] for point in embedded]
    return x_vals, y_vals

In [None]:
# plot similar words
# https://aneesha.medium.com/using-tsne-to-plot-a-subset-of-similar-words-from-word2vec-bb8eeaea6229 
# https://albertauyeung.github.io/2020/03/15/matplotlib-cjk-fonts.html/
# https://stackoverflow.com/questions/70268270/how-to-plot-tsne-on-word2vec-created-from-gensim-for-the-most-similar-20-cases

def plot_closest_words(word, model, x_vals, y_vals):
    """Scatter-plot the 15 words most similar to *word* on the current figure.

    Args:
        word: Query word passed to ``model.wv.most_similar``.
        model: Trained word2vec model whose vocabulary order matches
            ``x_vals`` / ``y_vals``.
        x_vals: First coordinate of every vocabulary word, indexed by the
            word's position in the model vocabulary.
        y_vals: Second coordinate, indexed the same way.
    """
    close_words = [neighbour for neighbour, _ in model.wv.most_similar(positive=word, topn=15)]
    # direct word -> vocabulary position lookup instead of scanning the key list
    key_to_index = model.wv.key_to_index

    # font with Japanese glyphs so the labels render correctly
    fprop = fm.FontProperties(fname='NotoSansJP-VariableFont_wght.ttf')
    for neighbour in close_words:
        i = key_to_index[neighbour]
        plt.scatter(x_vals[i], y_vals[i])
        plt.annotate(neighbour, xy=(x_vals[i], y_vals[i]), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', fontproperties=fprop)

In [None]:
# t-SNE over the whole 2015 vocabulary, then plot the 15 nearest neighbours of 外人 (gaijin)
x_vals, y_vals = reduce_dimensions(model_2015)
plot_closest_words("外人", model_2015, x_vals, y_vals)

In [None]:
# t-SNE over the whole 2022 vocabulary, then plot the 15 nearest neighbours of 外人 (gaijin)
x_vals, y_vals = reduce_dimensions(model_2022)
plot_closest_words("外人", model_2022, x_vals, y_vals)

### W2V Similar Words - Minority Groups

In [None]:
def _print_neighbours_for_model(keyword, label, model, positive, negative, topn):
    """Print a header and the top-n neighbours for one model, or the lookup error."""
    print("\nSimilar words to " + keyword + ": " + label)
    try:
        words = model.wv.most_similar(positive=positive, negative=negative, topn=topn)
    except Exception as err:  # best-effort display: report the cause, keep going
        print(f"Error: {err}\n")
    else:
        for w in words:
            print(w[0])


def compare_similar_words(keyword:str, model_2015, model_2022, positive=None, negative=None, topn=10):
    """Print the words most similar to a query in the 2015 and the 2022 model.

    Args:
        keyword: Word shown in the printed headers; also used as the query
            when *positive* is empty or omitted.
        model_2015: Model queried first (header suffix "2015").
        model_2022: Model queried second (header suffix "2022").
        positive: Optional words passed as ``positive`` to ``most_similar``.
        negative: Optional words passed as ``negative`` to ``most_similar``.
        topn: Number of neighbours printed per model.

    A failed lookup (for example a word missing from a model's vocabulary) is
    reported with its error message and does not stop the other model's query.
    """
    query = positive if positive else keyword
    excluded = [] if negative is None else negative

    for label, model in (("2015", model_2015), ("2022", model_2022)):
        _print_neighbours_for_model(keyword, label, model, query, excluded, topn)

In [None]:
# load the saved 2015 and 2022 models from disk for the comparison cells
model_2015 = gensim.models.Word2Vec.load("saved_w2v_models/w2v_model_2015")
model_2022 = gensim.models.Word2Vec.load("saved_w2v_models/w2v_model_2022")

In [None]:
# NOTE(review): this cell loads the same two model files as the cell directly before it;
# one of the two cells is redundant
model_2015 = gensim.models.Word2Vec.load("saved_w2v_models/w2v_model_2015")
model_2022 = gensim.models.Word2Vec.load("saved_w2v_models/w2v_model_2022")


In [None]:
# Zainichi Koreans: nearest neighbours of 在日 in the 2015 and 2022 models
compare_similar_words("在日", model_2015, model_2022)

In [None]:
# Ainu: nearest neighbours of アイヌ in the 2015 and 2022 models
compare_similar_words("アイヌ", model_2015, model_2022)

In [None]:
# Okinawa (沖縄) and Ryukyu (琉球): each term is queried together with 日本人 ("Japanese person") as a second positive word
compare_similar_words("沖縄", model_2015, model_2022, positive=["沖縄","日本人"])
compare_similar_words("琉球", model_2015, model_2022, positive=["琉球","日本人"])

In [None]:
# ハフ: presumably ハーフ ("hāfu") as it appears after preprocessing (TODO confirm);
# queried together with 日本人 ("Japanese person") while steering away from 髪 (hair) and 服 (clothes)
compare_similar_words("ハフ", model_2015, model_2022, positive=["ハフ","日本人"
                                                              ], negative=["髪","服"])

In [None]:
# Vietnam
compare_similar_words("ベトナム", model_2015, model_2022)

In [None]:
# Philippines
compare_similar_words("フィリピン", model_2015, model_2022)

In [None]:
# 外人 (gaijin, "foreigner")
compare_similar_words("外人", model_2015, model_2022)

In [None]:
# 外国人 (gaikokujin, "foreigner")
compare_similar_words("外国人", model_2015, model_2022)

In [None]:
def get_similar_words(keyword:str, model, positive=None, negative=None, topn=10, label="2015"):
    """Print the words most similar to a query in a single model.

    Args:
        keyword: Word shown in the printed header; also used as the query
            when *positive* is empty or omitted.
        model: Trained word2vec model to query.
        positive: Optional words passed as ``positive`` to ``most_similar``.
        negative: Optional words passed as ``negative`` to ``most_similar``.
        topn: Number of neighbours printed.
        label: Text printed after the keyword in the header. Defaults to
            ``"2015"``; pass the matching label when querying another model.

    A failed lookup (for example a word missing from the vocabulary) is
    reported with its error message instead of being raised.
    """
    query = positive if positive else keyword
    excluded = [] if negative is None else negative

    print("\nSimilar words to " + keyword + ": " + label)
    try:
        words = model.wv.most_similar(positive=query, negative=excluded, topn=topn)
    except Exception as err:  # best-effort display: report the cause, keep going
        print(f"Error: {err}\n")
    else:
        for w in words:
            print(w[0])

In [None]:
# nearest neighbours in the 2015 model for each group term
get_similar_words("在日", model_2015)
get_similar_words("アイヌ", model_2015)
get_similar_words("沖縄", model_2015, positive=["沖縄","日本人"])
get_similar_words("琉球", model_2015, positive=["琉球","日本人"])
get_similar_words("ハフ", model_2015, positive=["ハフ",'日本人'], negative=["髪","服"])
get_similar_words("ベトナム", model_2015)
get_similar_words("フィリピン", model_2015)
get_similar_words("外人", model_2015)
get_similar_words("外国人", model_2015)