In [15]:
import pandas as pd
import numpy as np
from gensim.models import fasttext, word2vec
from scipy.stats import spearmanr
import json
import torch

## JWSAN

In [16]:
# http://www.utm.inf.uec.ac.jp/JWSAN/
jwsan_data_path = 'evaluate_dataset/jwsan-1400.csv'
jwsan_data = pd.read_csv(jwsan_data_path)

In [17]:
def jwsan_wv_evaluation(model, evaluation_data):
    POS = {1: "名詞", 2: "動詞", 3: "形容詞"}
    predicted_scores = {"名詞": [], "動詞": [], "形容詞": []}
    p_values = {"名詞": [], "動詞": [], "形容詞": []}
    actual_scores = {"名詞": [], "動詞": [], "形容詞": []}
    unknown_words = 0
    for _, row in evaluation_data.iterrows():
        word1 = row["word1"]
        word2 = row["word2"]
        pos = row["POS"]
        actual_score = row["similarity"]

        if word1 in model.wv.key_to_index and word2 in model.wv.key_to_index:
            similarity = model.wv.similarity(word1, word2)
        else:
            similarity = 0.0
            unknown_words += 1

        predicted_scores[POS[pos]].append(similarity)
        actual_scores[POS[pos]].append(actual_score)

    spearman_corr = {}
    for pos in POS.values():
        spearman_corr[pos], p_values[pos] = spearmanr(actual_scores[pos], predicted_scores[pos])
    spearman_corr["all"], p_values["all"] = spearmanr(actual_scores["名詞"] + actual_scores["動詞"] + actual_scores["形容詞"], predicted_scores["名詞"] + predicted_scores["動詞"] + predicted_scores["形容詞"])
    
    return spearman_corr, p_values, unknown_words

In [18]:
def jwsan_ft_evaluation(model, evaluation_data):
    POS = {1: "名詞", 2: "動詞", 3: "形容詞"}
    predicted_scores = {"名詞": [], "動詞": [], "形容詞": []}
    p_values = {"名詞": [], "動詞": [], "形容詞": []}
    actual_scores = {"名詞": [], "動詞": [], "形容詞": []}
    unknown_words = 0
    for _, row in evaluation_data.iterrows():
        word1 = row["word1"]
        word2 = row["word2"]
        pos = row["POS"]
        actual_score = row["similarity"]

        similarity = model.wv.similarity(word1, word2)

        predicted_scores[POS[pos]].append(similarity)
        actual_scores[POS[pos]].append(actual_score)

    spearman_corr = {}
    for pos in POS.values():
        spearman_corr[pos], p_values[pos] = spearmanr(actual_scores[pos], predicted_scores[pos])
    spearman_corr["all"], p_values["all"] = spearmanr(actual_scores["名詞"] + actual_scores["動詞"] + actual_scores["形容詞"], predicted_scores["名詞"] + predicted_scores["動詞"] + predicted_scores["形容詞"])
    return spearman_corr, p_values, unknown_words

In [19]:
from scipy.spatial.distance import cosine

def preprocess(tokenizer, word):
    word_tokens = tokenizer.tokenize(word)
    word_tokens = ["[CLS]"] + word_tokens + ["[SEP]"]
    word_ids = tokenizer.convert_tokens_to_ids(word_tokens)
    word_tensor = torch.tensor([word_ids])
    return word_tensor

def embedding(model, word_tensor):
    with torch.no_grad():
        outputs = model(word_tensor)
        hidden_states = outputs[2]
        # word_embedding = torch.stack(hidden_states[-4:]).mean(0)
        word_embedding = hidden_states[12]
        word_embedding = word_embedding.squeeze(0)[1]
        # word_embedding = hidden_states[-1].squeeze(0)[1]
    return word_embedding

def jwsan_bert_evaluation(model, tokenizer, evaluation_data):
    POS = {1: "名詞", 2: "動詞", 3: "形容詞"}
    predicted_scores = {"名詞": [], "動詞": [], "形容詞": []}
    p_values = {"名詞": [], "動詞": [], "形容詞": []}
    actual_scores = {"名詞": [], "動詞": [], "形容詞": []}

    for _, row in evaluation_data.iterrows():
        word1 = row["word1"]
        word2 = row["word2"]
        pos = row["POS"]
        actual_score = row["similarity"]

        word1_tensor = preprocess(tokenizer, word1)
        word2_tensor = preprocess(tokenizer, word2)

        word1_embedding = embedding(model, word1_tensor)
        word2_embedding = embedding(model, word2_tensor)

        similarity = 1 - cosine(word1_embedding, word2_embedding)

        predicted_scores[POS[pos]].append(similarity)
        actual_scores[POS[pos]].append(actual_score)

    spearman_corr = {}
    for pos in POS.values():
        spearman_corr[pos], p_values[pos] = spearmanr(actual_scores[pos], predicted_scores[pos])
    spearman_corr["all"], p_values["all"] = spearmanr(actual_scores["名詞"] + actual_scores["動詞"] + actual_scores["形容詞"], predicted_scores["名詞"] + predicted_scores["動詞"] + predicted_scores["形容詞"])
    return spearman_corr, p_values


In [20]:
from scipy.spatial.distance import cosine

# 静的にBERTのモデルの単語ベクトルを読み込む

# def jwsan_bert_evaluation(model, tokenizer, evaluation_data):
#     token_embeddings = model.get_input_embeddings().weight.cpu().detach().numpy()

#     POS = {1: "名詞", 2: "動詞", 3: "形容詞"}
#     predicted_scores = {"名詞": [], "動詞": [], "形容詞": []}
#     p_values = {"名詞": [], "動詞": [], "形容詞": []}
#     actual_scores = {"名詞": [], "動詞": [], "形容詞": []}

#     for _, row in evaluation_data.iterrows():
#         word1 = row["word1"]
#         word2 = row["word2"]
#         pos = row["POS"]
#         actual_score = row["similarity"]

#         word1 = tokenizer.tokenize(word1)
#         word1_ids = tokenizer.convert_tokens_to_ids(word1)[1:]
#         word2 = tokenizer.tokenize(word2)
#         word2_ids = tokenizer.convert_tokens_to_ids(word2)[1:]

#         word1_embedding = np.mean(token_embeddings[word1_ids], axis=0)
#         word2_embedding = np.mean(token_embeddings[word2_ids], axis=0)

#         similarity = 1 - cosine(word1_embedding, word2_embedding)

#         predicted_scores[POS[pos]].append(similarity)
#         actual_scores[POS[pos]].append(actual_score)

#     spearman_corr = {}
#     for pos in POS.values():
#         spearman_corr[pos], p_values[pos] = spearmanr(actual_scores[pos], predicted_scores[pos])
#     spearman_corr["all"], p_values["all"] = spearmanr(actual_scores["名詞"] + actual_scores["動詞"] + actual_scores["形容詞"], predicted_scores["名詞"] + predicted_scores["動詞"] + predicted_scores["形容詞"])
#     return spearman_corr, p_values

## JSTS

In [37]:
# https://github.com/yahoojapan/JGLUE/blob/main/datasets/jsts-v1.1/valid-v1.1.json
from datasets import load_dataset

dataset = load_dataset("shunk031/JGLUE", name="JSTS")

jsts_data = pd.DataFrame(dataset['validation'])
print(dataset)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 28.7k/28.7k [00:00<00:00, 62.9MB/s]
Downloading readme: 100%|██████████| 38.6k/38.6k [00:00<00:00, 23.6MB/s]
Downloading data: 3.16MB [00:00, 63.5MB/s]                  
Downloading data: 372kB [00:00, 31.8MB/s]                    
Generating train split: 12451 examples [00:00, 34788.20 examples/s]
Generating validation split: 1457 examples [00:00, 34884.10 examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
        num_rows: 12451
    })
    validation: Dataset({
        features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
        num_rows: 1457
    })
})





In [38]:
# jsts_data_path = 'evaluate_dataset/jsts-valid.json'

# with open(jsts_data_path) as f:
#     jsts_data = [json.loads(line) for line in f]

# jsts_data = pd.DataFrame(jsts_data)

In [39]:
def jsts_wv_evaluation(model, evaluation_data):
    predicted_scores = []
    unknown_words = 0
    for _, row in evaluation_data.iterrows():
        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]

        sentence1_wakati = wakati.parse(sentence1).split()
        sentence1_vecs = np.zeros((len(sentence1_wakati), 200))
        for i, word in enumerate(sentence1_wakati):
            if word in model.wv.key_to_index:
                sentence1_vecs[i] = model.wv[word]
            else:
                unknown_words += 1

        sentence2_wakati = wakati.parse(sentence2).split()
        sentence2_vecs = np.zeros((len(sentence2_wakati), 200))
        for i, word in enumerate(sentence2_wakati):
            if word in model.wv.key_to_index:
                sentence2_vecs[i] = model.wv[word]
            else:
                unknown_words += 1
        
        sen1 = np.mean(sentence1_vecs, axis=0)
        sen2 = np.mean(sentence2_vecs, axis=0)
        similarity = np.dot(sen1, sen2) / (np.linalg.norm(sen1) * np.linalg.norm(sen2))

        predicted_scores.append(similarity)

    actual_scores = evaluation_data["label"].values
    spearman_corr, p = spearmanr(actual_scores, predicted_scores)
    return spearman_corr, p, unknown_words

In [40]:
def jsts_ft_evaluation(model, evaluation_data):
    predicted_scores = []
    for _, row in evaluation_data.iterrows():
        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]

        sentence1_wakati = wakati.parse(sentence1).split()
        sentence1_vecs = np.zeros((len(sentence1_wakati), 200))
        for i, word in enumerate(sentence1_wakati):
            sentence1_vecs[i] = model.wv[word]


        sentence2_wakati = wakati.parse(sentence2).split()
        sentence2_vecs = np.zeros((len(sentence2_wakati), 200))
        for i, word in enumerate(sentence2_wakati):
            sentence2_vecs[i] = model.wv[word]
        
        sen1 = np.mean(sentence1_vecs, axis=0)
        sen2 = np.mean(sentence2_vecs, axis=0)
        similarity = np.dot(sen1, sen2) / (np.linalg.norm(sen1) * np.linalg.norm(sen2))

        predicted_scores.append(similarity)

    actual_scores = evaluation_data["label"].values
    spearman_corr, p = spearmanr(actual_scores, predicted_scores)
    return spearman_corr, p

In [41]:
def preprocess(tokenizer, sentence):
    sentence_tokens = tokenizer.tokenize(sentence)
    sentence_tokens = ["[CLS]"] + sentence_tokens + ["[SEP]"]
    sentence_ids = tokenizer.convert_tokens_to_ids(sentence_tokens)
    sentence_tensor = torch.tensor([sentence_ids])
    return sentence_tensor

def jsts_embedding(model, sentence_tensor):
    with torch.no_grad():
        outputs = model(sentence_tensor)
        hidden_states = outputs[2]
        # sentence_embedding = torch.stack(hidden_states[-4:]).mean(0)
        # sentence_embedding = sentence_embedding.squeeze(0)[1]
        # sentence_embedding = hidden_states[-1].squeeze(0)[0]
        sentence_embedding = hidden_states[-1].squeeze(0).mean(0)
    return sentence_embedding

def jsts_bert_evaluation(model, tokenizer, evaluation_data):
    predicted_scores = []
    for _, row in evaluation_data.iterrows():
        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]

        sentence1_tensor = preprocess(tokenizer, sentence1)
        sentence2_tensor = preprocess(tokenizer, sentence2)

        sentence1_embedding = jsts_embedding(model, sentence1_tensor)
        sentence2_embedding = jsts_embedding(model, sentence2_tensor)

        similarity = 1 - cosine(sentence1_embedding, sentence2_embedding)

        predicted_scores.append(similarity)

    actual_scores = evaluation_data["label"].values
    spearman_corr, p = spearmanr(actual_scores, predicted_scores)
    return spearman_corr, p

## モデルのロード

In [None]:
from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert

config = BertConfig.from_json_file('model/bert/config.json')
bert_model = BertForPreTraining(config)
load_tf_weights_in_bert(bert_model, config, 'model/bert/model.ckpt-1400000')

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
# from bert_japanese.src import tokenization_sentencepiece as tokenization
import tokenization_sentencepiece as tokenization

tokenizer = tokenization.FullTokenizer(
    model_file='model/bert/wiki-ja.model',
    vocab_file='model/bert/wiki-ja.vocab',
    do_lower_case=True
)

Loaded a trained SentencePiece model.


In [None]:
a = tokenizer.tokenize('こんにちは')
print(a)

['▁', 'こんにち', 'は']


In [None]:
a = tokenizer.tokenize('こんにちは')
# a.insert(0, '[CLS]')
# a.insert(-1, '[SEP]')
a = ["[CLS]"] + a + ["[SEP]"]
a = tokenizer.convert_tokens_to_ids(a)
a

[4, 9, 30442, 11, 5]

In [None]:
import torch

a = torch.tensor([a])
bert_model.eval()

with torch.no_grad():
    outputs = bert_model(a)

    last_hidden_states = outputs[0]

    hidden_states = outputs[2]

    word_embed = torch.stack(hidden_states[-4:]).sum(0)

In [None]:
word_embed.shape

torch.Size([1, 5, 768])

In [None]:
outputs.keys()

odict_keys(['prediction_logits', 'seq_relationship_logits', 'hidden_states'])

In [None]:
len(outputs[2])

13

In [None]:
len(outputs[2][12].squeeze(0)[1])

768

In [None]:
token_e = bert_model.get_input_embeddings().weight.cpu().detach().numpy()
token_e.shape

(32000, 768)

In [None]:
aa = token_e[55]
bb = token_e[56]

# cos similarity

from scipy.spatial.distance import cosine
1 - cosine(aa, bb)

0.17818844318389893

In [None]:
token_embeddings = bert_model.get_input_embeddings().weight.cpu().detach().numpy()


for  idx, row in jwsan_data.iterrows():
    if idx == 10:
        break
    word1 = row['word1']
    word2 = row['word2']
    pos = row['POS']
    actual_score = row['similarity']

    word1 = tokenizer.tokenize(word1)
    word1_ids = tokenizer.convert_tokens_to_ids(word1)[1:]
    print(word1)
    print(word1_ids)
    print()
    word2 = tokenizer.tokenize(word2)
    word2_ids = tokenizer.convert_tokens_to_ids(word2)[1:]
    print(word2)
    print(word2_ids)
    print()

    word1_embedding = np.mean(token_embeddings[word1_ids], axis=0)
    word2_embedding = np.mean(token_embeddings[word2_ids], axis=0)
    print(word1_embedding.shape)

['▁', 'か', '細い']
[95, 16945]

['▁', '弱い']
[8808]

(768,)
['▁', 'き', 'つい']
[203, 10805]

['▁', '甚', 'だ', 'しい']
[13708, 314, 3456]

(768,)
['▁', 'き', 'つい']
[203, 10805]

['▁', '悲し', 'い']
[22035, 128]

(768,)
['▁', 'けば', 'けば', 'しい']
[14422, 14422, 3456]

['▁', 'ど', 'ぎ', 'つい']
[1362, 845, 10805]

(768,)
['▁', 'さ', 'も', 'しい']
[338, 30, 3456]

['▁', '醜', 'い']
[25409, 128]

(768,)
['▁', 'と', 'ろ', 'い']
[20, 1406, 128]

['▁', '鈍', 'い']
[18892, 128]

(768,)
['▁', 'や', 'ばい']
[26, 21431]

['▁', '危', 'ない']
[14411, 278]

(768,)
['▁', '暗い']
[15993]

['▁', '湿', 'っぽい']
[9994, 24194]

(768,)
['▁', '暗い']
[15993]

['▁', '重', '苦しい']
[377, 24833]

(768,)
['▁', '暗い']
[15993]

['▁', '物', '悲し', 'い']
[280, 22035, 128]

(768,)


In [None]:
# 平均

torch.stack(hidden_states[-4:]).mean(0).shape

torch.Size([1, 5, 768])

In [None]:
hidden_states[-1].squeeze(0).shape

torch.Size([5, 768])

In [None]:
hidden_states[-1].squeeze(0).mean(0).shape

torch.Size([768])

In [None]:
outputs[2][12][0][2].shape


torch.Size([768])

In [None]:
w2v_model_path = 'model/word2vec/wiki20181220_w2v.model'
w2v_model = word2vec.Word2Vec.load(w2v_model_path)

In [None]:
ft_model_path = 'model/fasttext/jawiki20181220_fasttext.model'
ft_model = fasttext.FastText.load(ft_model_path)

In [42]:
jwsan_data_path = 'evaluate_dataset/jwsan-1400.csv'
jwsan_data = pd.read_csv(jwsan_data_path)

In [43]:
from datasets import load_dataset

dataset = load_dataset("shunk031/JGLUE", name="JSTS")
print(dataset)
jsts_data = pd.DataFrame(dataset['validation'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
        num_rows: 12451
    })
    validation: Dataset({
        features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
        num_rows: 1457
    })
})


In [44]:
jsts_data.head()

Unnamed: 0,sentence_pair_id,yjcaptions_id,sentence1,sentence2,label
0,0,100312_421853-104611-31624,レンガの建物の前を、乳母車を押した女性が歩いています。,厩舎で馬と女性とが寄り添っています。,0.0
1,1,100371-104675-104678,山の上に顔の白い牛が2頭います。,曇り空の山肌で、牛が２匹草を食んでいます。,2.4
2,2,100668-104946-104949,バナナを持った人が道路を通行しています。,道の上をバナナを背負った男性が歩いています。,3.6
3,3,100958-105177-105178,スケートボーダーが手すりを滑っています。,階段の手すりでスケートボードをする男性がいます。,4.0
4,4,101401-105530-105533,ダブルベッドの上で、女性が足を組み横たわっています。,ベッドの上に寝転んで、足を組んでいる人が映っています。,3.0


## 評価

In [45]:
print('word2vec')

spearman_corr, p_values, unknown_words = jwsan_wv_evaluation(w2v_model, jwsan_data)
print('spearman_corr')
print(spearman_corr)
print('p_values')
print(p_values)
# for key, p in p_values.items():
#     print(f'{key}: {p:.4f}')
print(f'JWSAN {spearman_corr["all"]=}, {p_values["all"]}, {unknown_words=}')

spearman_corr, p_value, unknown_words = jsts_wv_evaluation(w2v_model, jsts_data)
print(f'JSTS {spearman_corr=}, {p_value} {unknown_words=}')

word2vec
spearman_corr
{'名詞': 0.521975358679802, '動詞': 0.4627191776866474, '形容詞': 0.3345741662764725, 'all': 0.5014307269959212}
p_values
{'名詞': 1.2586490589398057e-77, '動詞': 1.0359285082898006e-13, '形容詞': 0.004071720617042378, 'all': 5.191758793427666e-90}
JWSAN spearman_corr["all"]=0.5014307269959212, 5.191758793427666e-90, unknown_words=4
JSTS spearman_corr=0.5412707509457221, 1.123398519857243e-111 unknown_words=68


In [46]:
for key, p in p_values.items():
    print(f'{key}: {p:.3e}')
print(f'{p_value:.3e}')

名詞: 1.259e-77
動詞: 1.036e-13
形容詞: 4.072e-03
all: 5.192e-90
1.123e-111


In [47]:
print('fasttext')

spearman_corr, p_values, unknown_words = jwsan_ft_evaluation(ft_model, jwsan_data)

print('spearman_corr')
print(spearman_corr)
print('p_values')
print(p_values)
print(f'JWSAN {spearman_corr["all"]=}, {p_values["all"]=}')

spearman_corr, p_value = jsts_ft_evaluation(ft_model, jsts_data)
print(f'JSTS {spearman_corr=} {p_value=}')

fasttext
spearman_corr
{'名詞': 0.5264894693586655, '動詞': 0.3115584796126147, '形容詞': 0.3119980077984522, 'all': 0.4769133175370952}
p_values
{'名詞': 3.5135399456097005e-79, '動詞': 1.2924265415524242e-06, '形容詞': 0.007630183136417743, 'all': 2.050039430419599e-80}
JWSAN spearman_corr["all"]=0.4769133175370952, p_values["all"]=2.050039430419599e-80
JSTS spearman_corr=0.429704866001759 p_value=1.549590212161683e-66


In [48]:
for key, p in p_values.items():
    print(f'{key}: {p:.3e}')
print(f'{p_value:.3e}')

名詞: 3.514e-79
動詞: 1.292e-06
形容詞: 7.630e-03
all: 2.050e-80
1.550e-66


In [49]:
spearman_corr, p_values = jwsan_bert_evaluation(bert_model, tokenizer, jwsan_data)
print('spearman_corr')
print(spearman_corr)
print('p_values')
print(p_values)

spearman_corr
{'名詞': 0.4406580649294346, '動詞': 0.18879025293603824, '形容詞': 0.19488820566496115, 'all': 0.39198679493225075}
p_values
{'名詞': 2.8115695262553583e-53, '動詞': 0.003901308317660055, '形容詞': 0.10089893191352794, 'all': 1.231078845251381e-52}


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [50]:
for key, p in p_values.items():
    print(f'{key}: {p:.3e}')

名詞: 2.812e-53
動詞: 3.901e-03
形容詞: 1.009e-01
all: 1.231e-52


In [51]:
spearman_corr, p_value = jsts_bert_evaluation(bert_model, tokenizer, jsts_data)
print('spearman_corr')
print(spearman_corr)
print('p_values')
print(p_values)

spearman_corr
0.6918930629710041
p_values
{'名詞': 2.8115695262553583e-53, '動詞': 0.003901308317660055, '形容詞': 0.10089893191352794, 'all': 1.231078845251381e-52}


In [52]:
print(f'{p_value:.3e}')

4.484e-208


おまけ

In [53]:
ft_model_path = 'model/fasttext/min2/jawiki20181220_fasttext_min2.model'
ft_model = fasttext.FastText.load(ft_model_path)

print('fasttext min2')

spearman_corr, p_values, unknown_words = jwsan_ft_evaluation(ft_model, jwsan_data)

print('spearman_corr')
print(spearman_corr)
print('p_values')
print(p_values)
print(f'JWSAN {spearman_corr["all"]=}, {p_values["all"]=}')

spearman_corr, p_value = jsts_ft_evaluation(ft_model, jsts_data)
print(f'JSTS {spearman_corr=} {p_value=}')

fasttext min2
spearman_corr
{'名詞': 0.5577880945732592, '動詞': 0.26193300356131755, '形容詞': 0.24573280189537428, 'all': 0.4196253599898706}
p_values
{'名詞': 1.2500276027523094e-90, '動詞': 5.3717233540074785e-05, '形容詞': 0.03746700677636986, 'all': 8.062109253070062e-61}
JWSAN spearman_corr["all"]=0.4196253599898706, p_values["all"]=8.062109253070062e-61
JSTS spearman_corr=0.45533090746475025 p_value=1.7873599361056635e-75


In [54]:
w2v_model_path = 'more_wiki/word2vec/w2v.model'
w2v_model = word2vec.Word2Vec.load(w2v_model_path)

print('word2vec')

spearman_corr, p_values, unknown_words = jwsan_wv_evaluation(w2v_model, jwsan_data)
print('spearman_corr')
print(spearman_corr)
print('p_values')
print(p_values)
print(f'JWSAN {spearman_corr["all"]=}, {p_values["all"]}, {unknown_words=}')

spearman_corr, p_value, unknown_words = jsts_wv_evaluation(w2v_model, jsts_data)
print(f'JSTS {spearman_corr=}, {p_value} {unknown_words=}')

word2vec
spearman_corr
{'名詞': 0.5225075101054287, '動詞': 0.4623227533857069, '形容詞': 0.33645551281630753, 'all': 0.5015475615271434}
p_values
{'名詞': 8.277643031303288e-78, '動詞': 1.0939242064343544e-13, '形容詞': 0.003855946565197696, 'all': 4.6525207213432e-90}
JWSAN spearman_corr["all"]=0.5015475615271434, 4.6525207213432e-90, unknown_words=5
JSTS spearman_corr=0.544468283981086, 3.1100765709820357e-113 unknown_words=69


In [55]:
ft_model_path = 'more_wiki/fasttext/ft.model'
ft_model = fasttext.FastText.load(ft_model_path)

print('fasttext')

spearman_corr, p_values, unknown_words = jwsan_ft_evaluation(ft_model, jwsan_data)

print('spearman_corr')
print(spearman_corr)
print('p_values')
print(p_values)
print(f'JWSAN {spearman_corr["all"]=}, {p_values["all"]=}')

spearman_corr, p_value = jsts_ft_evaluation(ft_model, jsts_data)
print(f'JSTS {spearman_corr=} {p_value=}')

fasttext
spearman_corr
{'名詞': 0.5267398377971378, '動詞': 0.30901512351110494, '形容詞': 0.32743469735607295, 'all': 0.47773262196515603}
p_values
{'名詞': 2.8764548426402363e-79, '動詞': 1.5919282296046322e-06, '形容詞': 0.0049913198149807835, 'all': 1.008149032047325e-80}
JWSAN spearman_corr["all"]=0.47773262196515603, p_values["all"]=1.008149032047325e-80
JSTS spearman_corr=0.43139756601878343 p_value=4.199539355201169e-67
